In [145]:
import pandas as pd
import os

In [146]:
df = pd.read_csv(r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\combined_player_results_df.csv")

#### Create a new column that removes A/B from Division

In [147]:
# Create Integer Division column
df["div_int"] = df["Division"].str.strip("A|B")

#### Filter out non main league divisions or those that still play to 15

In [148]:
# Divisions to exclude
divisions = ['L2', 'L3', 'L4', 'M2', 'M3', 'M4', 'Premier Ladies',
       'Premier Main', 'Premier Masters']

# Filter out the divisions to exclude
df = df[~df["div_int"].isin(divisions)]

# Convert the div_int column to integer
df["div_int"] = df["div_int"].astype(int)

# Keep only rows where the div_int column is 2 to 8
df = df[df["div_int"].between(2, 8)]

In [149]:
# Exclude rows where the score is 'CR' or 'WO'
df = df[~df["Score"].isin(['CR', 'WO'])]

In [150]:
# Group 3-0 with 0-3, 3-1 with 1-3, 3-2 with 2-3
df["Score"] = df["Score"].replace({'0-3': '3-0', '1-3': '3-1', '2-3': '3-2'})

# Get percentage value counts for 3-0, 3-1, 3-2
df["Score"].value_counts(normalize=True)

Score
3-0    0.489221
3-1    0.323383
3-2    0.187396
Name: proportion, dtype: float64

In [151]:
# Count the percentage of scores that are 3-2 or 2-3
five_game_matches_perc = df["Score"].str.contains("3-2|2-3").mean()
print(f"Percentage of 5 game matches: {five_game_matches_perc:.2%}")

Percentage of 5 game matches: 18.74%


In [152]:
# Show the players in Player Name column with most 3-2 or 2-3 scores
df[df["Score"].str.contains("3-2|2-3")]["Player Name"].value_counts()

Player Name
Chan Keiko Tsun Ling    3
Chan Ming Yan Calvin    3
Au Alex Ka Man          3
Wong Kin Tung           2
Lin Charles Lan Kay     2
                       ..
Tam Thomas Siu Ying     1
Ng Tsz Chun             1
Li James King Hei       1
Leung Sum Nam           1
Yum Chun Ting           1
Name: count, Length: 188, dtype: int64

In [153]:
# Show the players in Player Name column where count of 3-2 or 2-3 results is 3 or more
df[df["Score"].str.contains("3-2|2-3")]["Player Name"].value_counts()[df[df["Score"].str.contains("3-2|2-3")]["Player Name"].value_counts() >= 3]

Player Name
Chan Keiko Tsun Ling    3
Chan Ming Yan Calvin    3
Au Alex Ka Man          3
Name: count, dtype: int64

#### Get Equivalent divisions from last season

In [154]:
# Div 8 equals former Div 12

In [155]:
schedules_directory = r"C:\Users\bpali\PycharmProjects\SquashApp\previous_seasons\2023-2024\schedules_df"

In [156]:
# Import all CSVs in the schedules directory
schedules_list = []
for file in os.listdir(schedules_directory):
    df = pd.read_csv(os.path.join(schedules_directory, file))
    df["Division"] = file.split("_")[0]
    df["div_int"] = df["Division"].str.strip("A|B")
    schedules_list.append(df)

schedules_df = pd.concat(schedules_list, ignore_index=True)

In [157]:
# Filter dataframe for relevant divisions
divisions_to_keep = ['2', '3', '4', '5', '6', '7', '8', "10", "11", "12"]
schedules_df = schedules_df[schedules_df["div_int"].isin(divisions_to_keep)]
schedules_df["div_int"] = schedules_df["div_int"].astype(int)

In [158]:
schedules_df.head()

Unnamed: 0,Home Team,vs,Away Team,Venue,Time,Result,Match Week,Date,Division,div_int
0,Kowloon Cricket Club 10,v,Twister 2,Kowloon Cricket Club,19:00,"5-0(3-0,3-0,3-0,3-0,3-1)",1,09/10/2023,10,10
1,The Best Group,v,FC 3,Cornwall Street Squash Centre,19:00,"1-4(0-3,3-0,0-3,0-3,0-3)",1,09/10/2023,10,10
2,HKU,v,The Hong Kong Jockey Club Div 10,Cornwall Street Squash Centre,19:00,"3-2(3-1,0-3,3-0,3-1,0-3)",1,09/10/2023,10,10
3,Hong Kong Football Club 10A,v,Hong Kong Football Club 10B,Hong Kong Football Club,19:00,"4-1(3-1,3-2,2-3,3-0,3-0)",1,09/10/2023,10,10
4,Social Squash Team,v,[BYE],,,,1,09/10/2023,10,10


In [159]:
def parse_result(result):
    """
    Function to parse the 'result' string
    """
    overall, rubbers = result.split('(')
    rubbers = rubbers.strip(')').split(',')
    return overall, rubbers

In [160]:
# Exclude rows where 'Away Team' is '[BYE]' (indicative of a bye week)
results_df = schedules_df[schedules_df['Away Team'] != '[BYE]'].copy()

# Replace NaN values in 'Result' with an empty string before applying str.contains
results_df['Result'] = results_df['Result'].fillna('')

# Keep rows where 'Result' contains brackets (indicative of a played match)
results_df = results_df[results_df['Result'].str.contains(r'\(')]

# Apply the function to the 'Result' column
results_df[['Overall Score', 'Rubbers']] = results_df['Result'].apply(lambda x: pd.Series(parse_result(x)))

In [161]:
results_df.head()

Unnamed: 0,Home Team,vs,Away Team,Venue,Time,Result,Match Week,Date,Division,div_int,Overall Score,Rubbers
0,Kowloon Cricket Club 10,v,Twister 2,Kowloon Cricket Club,19:00,"5-0(3-0,3-0,3-0,3-0,3-1)",1,09/10/2023,10,10,5-0,"[3-0, 3-0, 3-0, 3-0, 3-1]"
1,The Best Group,v,FC 3,Cornwall Street Squash Centre,19:00,"1-4(0-3,3-0,0-3,0-3,0-3)",1,09/10/2023,10,10,1-4,"[0-3, 3-0, 0-3, 0-3, 0-3]"
2,HKU,v,The Hong Kong Jockey Club Div 10,Cornwall Street Squash Centre,19:00,"3-2(3-1,0-3,3-0,3-1,0-3)",1,09/10/2023,10,10,3-2,"[3-1, 0-3, 3-0, 3-1, 0-3]"
3,Hong Kong Football Club 10A,v,Hong Kong Football Club 10B,Hong Kong Football Club,19:00,"4-1(3-1,3-2,2-3,3-0,3-0)",1,09/10/2023,10,10,4-1,"[3-1, 3-2, 2-3, 3-0, 3-0]"
5,FC 3,v,HKU,Cornwall Street Squash Centre,19:00,"4-1(2-3,3-0,3-0,3-0,3-0)",2,16/10/2023,10,10,4-1,"[2-3, 3-0, 3-0, 3-0, 3-0]"


In [162]:
# Create new columns to get counts of 3-0 or 0-3, 3-1 or 1-3, 3-2 or 2-3 for each match
results_df['3-0'] = results_df['Rubbers'].apply(lambda x: x.count('3-0'))
results_df['3-1'] = results_df['Rubbers'].apply(lambda x: x.count('3-1'))
results_df['3-2'] = results_df['Rubbers'].apply(lambda x: x.count('3-2'))
results_df["0-3"] = results_df["Rubbers"].apply(lambda x: x.count("0-3"))
results_df["1-3"] = results_df["Rubbers"].apply(lambda x: x.count("1-3"))
results_df["2-3"] = results_df["Rubbers"].apply(lambda x: x.count("2-3"))

# Group 3-0 columns with 0-3, 3-1 columns with 1-3, 3-2 columns with 2-3
results_df['3 games'] = results_df['3-0'] + results_df["0-3"]
results_df['4 games'] = results_df['3-1'] + results_df["1-3"]
results_df['5 games'] = results_df['3-2'] + results_df["2-3"]

In [163]:
# Get percentage of 5 games matches by Division
results_df.groupby('div_int')['5 games'].value_counts(normalize=True).loc[:, 1]

div_int
2     0.300000
3     0.375000
4     0.316667
5     0.347222
7     0.325758
8     0.486111
10    0.402778
11    0.349206
12    0.272109
Name: proportion, dtype: float64

In [164]:
results_df.head()

Unnamed: 0,Home Team,vs,Away Team,Venue,Time,Result,Match Week,Date,Division,div_int,...,Rubbers,3-0,3-1,3-2,0-3,1-3,2-3,3 games,4 games,5 games
0,Kowloon Cricket Club 10,v,Twister 2,Kowloon Cricket Club,19:00,"5-0(3-0,3-0,3-0,3-0,3-1)",1,09/10/2023,10,10,...,"[3-0, 3-0, 3-0, 3-0, 3-1]",4,1,0,0,0,0,4,1,0
1,The Best Group,v,FC 3,Cornwall Street Squash Centre,19:00,"1-4(0-3,3-0,0-3,0-3,0-3)",1,09/10/2023,10,10,...,"[0-3, 3-0, 0-3, 0-3, 0-3]",1,0,0,4,0,0,5,0,0
2,HKU,v,The Hong Kong Jockey Club Div 10,Cornwall Street Squash Centre,19:00,"3-2(3-1,0-3,3-0,3-1,0-3)",1,09/10/2023,10,10,...,"[3-1, 0-3, 3-0, 3-1, 0-3]",1,2,0,2,0,0,3,2,0
3,Hong Kong Football Club 10A,v,Hong Kong Football Club 10B,Hong Kong Football Club,19:00,"4-1(3-1,3-2,2-3,3-0,3-0)",1,09/10/2023,10,10,...,"[3-1, 3-2, 2-3, 3-0, 3-0]",2,1,1,0,0,1,2,1,2
5,FC 3,v,HKU,Cornwall Street Squash Centre,19:00,"4-1(2-3,3-0,3-0,3-0,3-0)",2,16/10/2023,10,10,...,"[2-3, 3-0, 3-0, 3-0, 3-0]",4,0,0,0,0,1,4,0,1


In [165]:
# Get counts of 3-0, 3-1, 3-2, 2-3, 1-3, 0-3 in the 'Rubbers' column, with Division column
rubbers_df = results_df['Rubbers'].apply(pd.Series).stack().reset_index(level=1, drop=True).to_frame('Rubbers')

# Group 3-0 with 0-3, 3-1 with 1-3, and 3-2 with 2-3
rubbers_df['Rubbers'] = rubbers_df['Rubbers'].replace({'0-3': '3-0', '1-3': '3-1', '2-3': '3-2'})

# Drop rows where 'Rubbers' is 'CR', 'WO', '1-2', '4-1', '2-1'
rubbers_df = rubbers_df[~rubbers_df['Rubbers'].isin(['CR', 'WO', '1-2', '4-1', '2-1'])]

rubbers_df['Rubbers'].value_counts(normalize=True)

Rubbers
3-0    0.542214
3-1    0.292683
3-2    0.165103
Name: proportion, dtype: float64

In [166]:
##rubbers_df = results_df['Rubbers'].apply(pd.Series).stack().reset_index(level=1, drop=True).to_frame('Rubbers')