In [23]:
import pandas as pd
import os

In [24]:
# Load the combined player results CSV for the 2024-2025 season
results_csv_path = r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\combined_player_results_df.csv"
df = pd.read_csv(results_csv_path)

#### Create a new column that removes A/B from Division

In [25]:
# Create the division-number column by removing the A/B letters from Division.
# str.strip takes a *set of characters*, not a regex, so the set is written as
# "AB" (the previous "A|B" additionally stripped literal "|" characters).
# The column is still a string here; it is converted to int after filtering.
df["div_int"] = df["Division"].str.strip("AB")

#### Filter out non-main-league divisions or those that still play to 15

In [26]:
# Divisions to exclude (labels that are not plain numbers, so astype(int) below would fail on them)
divisions = ['L2', 'L3', 'L4', 'M2', 'M3', 'M4', 'Premier Ladies',
       'Premier Main', 'Premier Masters']

# Filter out the divisions to exclude; .copy() yields an independent frame so the
# column assignment below does not trigger SettingWithCopyWarning
df = df[~df["div_int"].isin(divisions)].copy()

# Convert the div_int column to integer
df["div_int"] = df["div_int"].astype(int)

# Keep only rows where the div_int column is 2 to 8 (inclusive on both ends);
# copy again so later column assignments act on an independent frame
df = df[df["div_int"].between(2, 8)].copy()

In [27]:
# Drop every row whose Score is exactly 'CR' or 'WO'
excluded_scores = ['CR', 'WO']
is_excluded = df["Score"].isin(excluded_scores)
df = df[~is_excluded]

In [28]:
# Fold each mirrored score line into one label: 0-3 -> 3-0, 1-3 -> 3-1, 2-3 -> 3-2
mirrored_scores = {'0-3': '3-0', '1-3': '3-1', '2-3': '3-2'}
df["Score"] = df["Score"].replace(mirrored_scores)

# Proportion of rows for each score line (3-0, 3-1, 3-2)
df["Score"].value_counts(normalize=True)

Score
3-0    0.485616
3-1    0.317990
3-2    0.196394
Name: proportion, dtype: float64

In [29]:
# Fraction of rows whose Score contains 3-2 or 2-3
is_five_games = df["Score"].str.contains("3-2|2-3")
five_game_matches_perc = is_five_games.mean()
print(f"Percentage of 5 game matches: {five_game_matches_perc:.2%}")

Percentage of 5 game matches: 19.64%


In [30]:
# Rank the Player Name values by how many of their rows have a 3-2 or 2-3 score
five_game_rows = df.loc[df["Score"].str.contains("3-2|2-3")]
five_game_rows["Player Name"].value_counts()

Player Name
Unknown                 11
Osmund Ronnie            7
Au Alex Ka Man           7
Chan Ming Yan Calvin     6
Au Pak Ki                6
                        ..
Mao Junhong              1
Chiang Ieng Long         1
Chung Ka Yau             1
Choi Angel Chung Yin     1
Man Kin Cheong           1
Name: count, Length: 477, dtype: int64

In [32]:
# Show the players in Player Name column where count of 3-2 or 2-3 results is 5 or more
# (the per-player counts are computed once and then filtered, rather than
# repeating the same value_counts() expression twice)
five_game_counts = df[df["Score"].str.contains("3-2|2-3")]["Player Name"].value_counts()
five_game_counts[five_game_counts >= 5]

Player Name
Unknown                 11
Osmund Ronnie            7
Au Alex Ka Man           7
Chan Ming Yan Calvin     6
Au Pak Ki                6
Siu Chen Dong            6
Yip Tsz Ho               6
Liou King Long           6
Mohammad Gull Ilyas      5
Sze James Wang Cho       5
Lee Tin Hang             5
Yeung Kai San            5
Chow Hester Yuk Ting     5
Amjad Zain               5
Tanner Julian            5
Chan Siu Kwan            5
Chan Cheuk Fung          5
Choy Chun Hei Bosco      5
Chan Yan Yu              5
Lee Tsz Long Max         5
Leung Sum Nam            5
Lam Timmy Tin Yi         5
Chan Keiko Tsun Ling     5
Chan Chun Ching          5
Tam Kai Chung Hades      5
Yiu Sai Ngoi             5
Name: count, dtype: int64

In [38]:
# Per-player count of rows whose Score does not contain 3-0 or 0-3 (top 25 shown)
not_three_nil = df["Score"].str.contains("3-0|0-3").eq(False)
df.loc[not_three_nil, "Player Name"].value_counts().head(25)

Player Name
Unknown                   32
Chan Ming Yan Calvin      16
Cheng Tak Kwong           14
Sze James Wang Cho        14
Leung Chun Ho             13
Chan Cheuk Fung           13
Mak Chun Wai              12
Au Alex Ka Man            12
Osmund Ronnie             12
Lee Tsz Long Max          11
Raducki Stephen           11
Lai Michael Cheuk Fung    11
Chow Hester Yuk Ting      11
Tanner Julian             11
Wong Sze Yu               10
Hathiramani Dinesh        10
Yiu Sai Ngoi              10
Chow Ming Hei Quincy      10
Achouch Frank             10
Salahuddin                10
Bhandari Raghav           10
Wong Chun Fai             10
Yip Tsz Ho                10
Siu Chen Dong             10
Bidal Jay                 10
Name: count, dtype: int64

#### Get Equivalent divisions from last season

In [None]:
# Div 8 equals former Div 12

In [None]:
# Directory holding the 2023-2024 season schedule files.
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere as-is.
schedules_directory = r"C:\Users\bpali\PycharmProjects\SquashApp\previous_seasons\2023-2024\schedules_df"

In [None]:
# Import all CSVs in the schedules directory.
# The per-file frame is deliberately NOT named `df`, so the player-results frame
# loaded earlier in the notebook is not overwritten by this loop.
schedules_list = []
# sorted() makes the concatenation order deterministic across runs/machines
for file in sorted(os.listdir(schedules_directory)):
    # Skip anything that is not a CSV (e.g. stray system or checkpoint files)
    if not file.lower().endswith(".csv"):
        continue
    schedule_df = pd.read_csv(os.path.join(schedules_directory, file))
    # The division label is the part of the file name before the first underscore
    schedule_df["Division"] = file.split("_")[0]
    # str.strip takes a set of characters (not a regex), hence "AB" rather than "A|B"
    schedule_df["div_int"] = schedule_df["Division"].str.strip("AB")
    schedules_list.append(schedule_df)

schedules_df = pd.concat(schedules_list, ignore_index=True)

In [None]:
# Filter dataframe for relevant divisions
divisions_to_keep = ['2', '3', '4', '5', '6', '7', '8', "10", "11", "12"]
# Compare as strings so this cell can be re-run after div_int has already been
# converted to int (otherwise the int-vs-str isin() would drop every row)
keep_mask = schedules_df["div_int"].astype(str).isin(divisions_to_keep)
# .copy() yields an independent frame so the assignment below does not trigger
# SettingWithCopyWarning
schedules_df = schedules_df[keep_mask].copy()
schedules_df["div_int"] = schedules_df["div_int"].astype(int)

In [None]:
# Preview the first rows of schedules_df
schedules_df.head()

In [None]:
def parse_result(result):
    """
    Split a 'result' string into the overall score and the list of rubber scores.

    The expected shape is ``"<overall>(<rubber>,<rubber>,...)"``, for example
    ``"3-2(3-0,3-1,3-2,0-3,1-3)"``. Surrounding whitespace is removed from the
    overall score and from every rubber, so tokens can be compared exactly
    (e.g. with ``list.count('3-0')`` or ``isin``).

    Parameters
    ----------
    result : str
        Raw result string containing an opening bracket.

    Returns
    -------
    tuple
        ``(overall, rubbers)`` where ``overall`` is a str and ``rubbers`` is a
        list of str.

    Raises
    ------
    ValueError
        If ``result`` contains no opening bracket.
    """
    # Split on the first bracket only, so a stray second "(" inside the rubber
    # list cannot break the two-way unpacking
    overall, bracket, rubbers = result.partition('(')
    if not bracket:
        raise ValueError(f"Result has no bracketed rubber scores: {result!r}")
    rubbers = [rubber.strip() for rubber in rubbers.strip().strip(')').split(',')]
    return overall.strip(), rubbers

In [None]:
# Exclude rows where 'Away Team' is '[BYE]' (indicative of a bye week)
results_df = schedules_df[schedules_df['Away Team'] != '[BYE]'].copy()

# Replace NaN values in 'Result' with an empty string before applying str.contains
results_df['Result'] = results_df['Result'].fillna('')

# Keep rows where 'Result' contains brackets (indicative of a played match);
# .copy() so the column assignments below act on an independent frame rather
# than a filtered slice (avoids SettingWithCopyWarning)
results_df = results_df[results_df['Result'].str.contains(r'\(')].copy()

# Parse every result once into an (overall, rubbers) tuple, then pull the two
# tuple positions out into their own columns. This avoids building a pd.Series
# per row, which the previous apply(lambda x: pd.Series(...)) did.
parsed_results = results_df['Result'].map(parse_result)
results_df['Overall Score'] = parsed_results.str[0]
results_df['Rubbers'] = parsed_results.str[1]

In [None]:
# Preview the first rows of results_df
results_df.head()

In [None]:
# For each match, tally how many entries of its Rubbers list equal each score line
for score_line in ('3-0', '3-1', '3-2', '0-3', '1-3', '2-3'):
    results_df[score_line] = results_df['Rubbers'].apply(
        lambda rubbers, target=score_line: rubbers.count(target)
    )

# Add each score line to its mirror to get totals by number of games played
games_columns = {
    '3 games': ('3-0', '0-3'),
    '4 games': ('3-1', '1-3'),
    '5 games': ('3-2', '2-3'),
}
for total_column, (first_line, mirrored_line) in games_columns.items():
    results_df[total_column] = results_df[first_line] + results_df[mirrored_line]

In [None]:
# Get percentage of 5 games matches (rubbers) by Division:
# five-game rubbers divided by all rubbers with a 3-, 4- or 5-game score line.
# The previous value_counts(normalize=True).loc[:, 1] returned the share of team
# fixtures that contained exactly one five-game rubber, which is a different quantity.
games_by_division = results_df.groupby('div_int')[['3 games', '4 games', '5 games']].sum()
(games_by_division['5 games'] / games_by_division.sum(axis=1)).rename('5 games share')

In [None]:
# Preview the first rows of results_df again
results_df.head()

In [None]:
# One row per rubber score, keeping the match's div_int alongside it so the
# distribution can also be broken down by division.
# explode() gives the same one-row-per-list-element result as
# apply(pd.Series).stack() without building a wide intermediate frame row by row.
rubbers_df = results_df[['div_int', 'Rubbers']].explode('Rubbers')

# Group 3-0 with 0-3, 3-1 with 1-3, and 3-2 with 2-3
rubbers_df['Rubbers'] = rubbers_df['Rubbers'].replace({'0-3': '3-0', '1-3': '3-1', '2-3': '3-2'})

# Drop rows where 'Rubbers' is 'CR', 'WO', '1-2', '4-1', '2-1'
rubbers_df = rubbers_df[~rubbers_df['Rubbers'].isin(['CR', 'WO', '1-2', '4-1', '2-1'])]

# Proportion of each remaining score line across all divisions
rubbers_df['Rubbers'].value_counts(normalize=True)

In [None]:
##rubbers_df = results_df['Rubbers'].apply(pd.Series).stack().reset_index(level=1, drop=True).to_frame('Rubbers')