In [77]:
import numpy as np
import pandas as pd
import os
import glob
import scipy.stats as stats

In [58]:
# Root folder whose entries are treated as one sub-folder per previous season.
# NOTE(review): hard-coded absolute local path — the notebook only runs on this machine as-is.
base_directory = r"C:\Users\bpali\PycharmProjects\SquashApp\previous_seasons"


### Load Schedules Data

#### Load Previous Seasons

In [59]:
# Read every previous season's schedules CSVs, tagging each frame with the
# division (file-name prefix before the first "_") and the season (folder name)
schedules_df_list = []
for season_name in os.listdir(base_directory):
    season_schedules_dir = os.path.join(base_directory, season_name, "schedules_df")
    for csv_name in os.listdir(season_schedules_dir):
        # Only the per-division schedules files are of interest
        if csv_name.endswith("schedules_df.csv"):
            season_schedules = pd.read_csv(os.path.join(season_schedules_dir, csv_name))
            schedules_df_list.append(
                season_schedules.assign(
                    Division=csv_name.split("_")[0],
                    Season=season_name,
                )
            )

# Stack all per-division, per-season frames into one dataframe with a fresh index
all_previous_schedules_df = pd.concat(schedules_df_list, ignore_index=True)

#### Load Current Season

In [60]:
# Division name -> integer mapping for the current season.
# Only the keys (division names) are used by the loading loops in this notebook;
# the integers are presumably the league website's division IDs — TODO confirm.
all_divisions = {
    "Premier Main": 424,
    "2": 425,
    "3": 426,
    "4": 427,
    "5": 428,
    "6": 429,
    "7A": 430,
    "7B": 431,
    "8A": 432,
    "8B": 433,
    "9": 434,
    "10": 435,
    "11": 436,
    "12": 437,
    "13A": 438,
    "13B": 439,
    "14": 440,
    "15A": 441,
    "15B": 442,
    "Premier Masters": 443,
    "M2": 444,
    "M3": 445,
    "M4": 446,
    "Premier Ladies": 447,
    "L2": 448,
    "L3": 449,
    "L4": 450,
    }

In [61]:
# 2024-2025 schedules_df directory; the loader expects week_<N> sub-folders
# containing <division>_schedules_df.csv files.
# NOTE(review): hard-coded absolute local path — not portable to other machines.
schedules_df_dir = r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\schedules_df"

In [62]:
# For each division keep only the most recent weekly snapshot: scan weeks from
# 30 down to 1 and stop at the first week whose CSV exists.
# Divisions with no snapshot in any week are skipped without a message.
schedules_dfs = []
for division in all_divisions:
    for week in range(30, 0, -1):
        # os.path.join instead of hand-written "\\" so the separator is not hard-coded
        schedules_path = os.path.join(
            schedules_df_dir, f"week_{week}", f"{division}_schedules_df.csv"
        )
        try:
            schedules_df = pd.read_csv(schedules_path)
        except FileNotFoundError:
            # No snapshot for this week; fall back to the previous week
            continue
        schedules_df["Season"] = "2024-2025"
        schedules_df["Division"] = division
        schedules_dfs.append(schedules_df)
        break

In [63]:
# Concatenate the per-division 2024-2025 schedules dataframes into one
schedules_df_2024_2025 = pd.concat(schedules_dfs)

In [64]:
# Concatenate the schedules dataframes from all seasons (previous seasons + 2024-2025);
# ignore_index=True gives the combined frame a fresh 0..n-1 index
all_schedules_df = pd.concat([all_previous_schedules_df, schedules_df_2024_2025], ignore_index=True)

### Create Results Dataframe

In [65]:
def parse_result(result):
    """
    Split a match 'Result' string into its overall score and per-rubber entries.

    The expected layout is ``"<overall>(<rubber>,<rubber>,...)"``, for example
    ``"1-4(0-3,1-3,0-3,3-2,2-3)"``.

    Parameters
    ----------
    result : str
        Raw result string containing exactly one opening bracket.

    Returns
    -------
    tuple[str, list[str]]
        The overall score (text before the bracket) and the list of
        comma-separated entries inside the brackets, each with surrounding
        whitespace removed.

    Raises
    ------
    ValueError
        If ``result`` does not contain exactly one ``'('``.
    """
    parts = result.split('(')
    if len(parts) != 2:
        raise ValueError(f"Unexpected result format: {result!r}")
    overall, rubbers = parts
    # Whitespace is stripped from every token so that exact-match checks on the
    # entries (such as counting 'CR' or 'WO') are not defeated by values like ' CR'.
    rubbers = [rubber.strip() for rubber in rubbers.strip().strip(')').split(',')]
    return overall.strip(), rubbers

In [66]:
# Drop unnecessary columns.
# The drop is in place, so errors='ignore' keeps this cell re-runnable: on a
# second run the columns are already gone and a plain drop would raise KeyError.
all_schedules_df.drop(columns=['vs', 'Time'], inplace=True, errors='ignore')

# Exclude rows where 'Away Team' is '[BYE]' (indicative of a bye week)
results_df = all_schedules_df[all_schedules_df['Away Team'] != '[BYE]'].copy()

# Replace NaN values in 'Result' with an empty string before applying str.contains
results_df['Result'] = results_df['Result'].fillna('')

# Keep rows where 'Result' contains brackets (indicative of a played match).
# .copy() makes the filtered frame independent of the one it was sliced from, so
# the column assignments below do not trigger SettingWithCopyWarning.
results_df = results_df[results_df['Result'].str.contains(r'\(')].copy()

# Split each 'Result' into the overall score string and the list of rubber entries
results_df[['Overall Score', 'Rubbers']] = results_df['Result'].apply(lambda x: pd.Series(parse_result(x)))

# Splitting the 'Overall Score' ("<home>-<away>") into two integer columns
results_df[['Home Score', 'Away Score']] = results_df['Overall Score'].str.split('-', expand=True).astype(int)

# Count rubber entries that are exactly 'CR' or 'WO' ('Rubbers' holds a list,
# so list.count matches whole entries, not substrings)
results_df['CR'] = results_df['Rubbers'].apply(lambda x: x.count('CR'))
results_df['WO'] = results_df['Rubbers'].apply(lambda x: x.count('WO'))

# Create combined CR and WO column
results_df['CR_WO'] = results_df['CR'] + results_df['WO']

### Load Teams dataframes

In [67]:
# Read every previous season's teams CSVs, tagging each frame with the
# division (file-name prefix before the first "_") and the season (folder name)
teams_df_list = []
for season_name in os.listdir(base_directory):
    season_teams_dir = os.path.join(base_directory, season_name, "teams_df")
    for csv_name in os.listdir(season_teams_dir):
        # Only the per-division teams files are of interest
        if csv_name.endswith("teams_df.csv"):
            season_teams = pd.read_csv(os.path.join(season_teams_dir, csv_name))
            teams_df_list.append(
                season_teams.assign(
                    Division=csv_name.split("_")[0],
                    Season=season_name,
                )
            )

# Stack all per-division, per-season frames into one dataframe with a fresh index
all_previous_teams_df = pd.concat(teams_df_list, ignore_index=True)

In [68]:
# 2024-2025 teams_df directory
teams_df_dir = r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\teams_df"

# For each division keep only the most recent weekly snapshot: scan weeks from
# 30 down to 1 and stop at the first week whose CSV exists.
# Divisions with no snapshot in any week are skipped without a message.
teams_dfs = []
for division in all_divisions:
    for week in range(30, 0, -1):
        # os.path.join instead of hand-written "\\" so the separator is not hard-coded
        teams_path = os.path.join(teams_df_dir, f"week_{week}", f"{division}_teams_df.csv")
        try:
            df = pd.read_csv(teams_path)
        except FileNotFoundError:
            # No snapshot for this week; fall back to the previous week
            continue
        df["Season"] = "2024-2025"
        df["Division"] = division
        teams_dfs.append(df)
        break

# Concatenate all the teams dataframes into one
teams_df_2024_2025 = pd.concat(teams_dfs)

# Concatenate the teams dataframes from all seasons
all_teams_df = pd.concat([all_previous_teams_df, teams_df_2024_2025], ignore_index=True)

### Merge the results and teams dataframes

In [70]:
# Left-join each result onto the away team's row in all_teams_df: results_df keys
# ("Away Team", "Division", "Season") are matched against all_teams_df keys
# ("Team Name", "Division", "Season"). Results with no matching team row are kept,
# with NaN in the columns coming from all_teams_df.
merged_df = pd.merge(results_df, all_teams_df, left_on=['Away Team', 'Division', 'Season'], right_on=['Team Name', 'Division', 'Season'], how='left')

In [71]:
# Remove the columns that are not needed downstream, then relabel the team's
# "Home" column as "Away Team Home" (the merge was keyed on the away team)
columns_to_drop = ["Email", "Convenor", "Team Name"]
merged_df = (
    merged_df
    .drop(columns=columns_to_drop)
    .rename(columns={"Home": "Away Team Home"})
)

### Remove neutral-venue matches (venue is also the away team's home)

In [72]:
# Keep only matches whose venue differs from the away team's own home venue
venue_is_not_away_home = merged_df["Venue"].ne(merged_df["Away Team Home"])
filtered_merged_df = merged_df.loc[venue_is_not_away_home].copy()

#### Create a dataframe of 5-rubber matches only (5-player teams)

In [73]:
# Keep only matches in which exactly five rubbers were scored in total
total_rubbers = filtered_merged_df["Home Score"] + filtered_merged_df["Away Score"]
filtered_merged_5_df = filtered_merged_df.loc[total_rubbers == 5].copy()

In [74]:
# Average number of rubbers won by the home side per match, by season
# (these are 5-rubber matches, so 2.5 would be an even split)
filtered_merged_5_df.groupby("Season")["Home Score"].mean()

Season
2016-2017    2.642896
2017-2018    2.654420
2018-2019    2.642656
2019-2020    2.726250
2021-2022    2.656296
2022-2023    2.711636
2023-2024    2.689470
2024-2025    2.725768
Name: Home Score, dtype: float64

In [None]:
# Flag each match with 1 when the home side scored more than the away side, else 0
home_outscored_away = filtered_merged_5_df["Home Score"] > filtered_merged_5_df["Away Score"]
filtered_merged_5_df["Home Win"] = home_outscored_away.astype(int)

Unnamed: 0,Home Team,Away Team,Venue,Result,Match Week,Date,Division,Season,Overall Score,Rubbers,Home Score,Away Score,CR,WO,CR_WO,Away Team Home,Home Win
0,The Hong Kong Jockey Club,i-MASK Advance Squash Club 7,HKJC Sha Tin Club House (HKJC),"1-4(0-3,1-3,0-3,3-2,2-3)",1,03/10/2016,10,2016-2017,1-4,"[0-3, 1-3, 0-3, 3-2, 2-3]",1,4,0,0,0,Cornwall Street (CSPSC),0
2,Royal Hong Kong Yacht Club 10,Banbil,Royal Hong Kong Yacht Club (RHKYC),"4-1(0-3,3-0,3-0,3-0,3-0)",1,03/10/2016,10,2016-2017,4-1,"[0-3, 3-0, 3-0, 3-0, 3-0]",4,1,0,0,0,HK Squash Centre (HKSC),1
3,Xavier,Perrier KCC 7,Cornwall Street (CSPSC),"3-2(3-2,3-0,3-2,0-3,2-3)",1,03/10/2016,10,2016-2017,3-2,"[3-2, 3-0, 3-2, 0-3, 2-3]",3,2,0,0,0,Kowloon Cricket Club (KCC),1
4,i-MASK Advance Squash Club 7,Royal Hong Kong Yacht Club 10,Cornwall Street (CSPSC),"3-2(3-0,3-0,3-1,1-3,2-3)",2,17/10/2016,10,2016-2017,3-2,"[3-0, 3-0, 3-1, 1-3, 2-3]",3,2,0,0,0,Royal Hong Kong Yacht Club (RHKYC),1
5,Banbil,Perrier KCC 7,HK Squash Centre (HKSC),"1-4(1-3,1-3,3-2,1-3,0-3)",2,17/10/2016,10,2016-2017,1-4,"[1-3, 1-3, 3-2, 1-3, 0-3]",1,4,0,0,0,Kowloon Cricket Club (KCC),0


In [76]:
# Share of matches won by the home team in each season (mean of the 0/1 "Home Win" flag)
filtered_merged_5_df.groupby("Season")["Home Win"].mean()

Season
2016-2017    0.545255
2017-2018    0.529277
2018-2019    0.532921
2019-2020    0.573750
2021-2022    0.539259
2022-2023    0.540472
2023-2024    0.550346
2024-2025    0.550827
Name: Home Win, dtype: float64

In [None]:
# One-sided, one-sample z-test for a proportion:
#   H0: P(home win) = 0.5    vs    H1: P(home win) > 0.5

# Observed data
home_win_mean = filtered_merged_5_df["Home Win"].mean()  # Observed proportion
n = len(filtered_merged_5_df)  # Sample size
p_null = 0.5  # Null hypothesis proportion

# Calculate z-score using the standard error of the proportion under H0
z = (home_win_mean - p_null) / ((p_null * (1 - p_null)) / n) ** 0.5

# Calculate p-value (one-tailed test).
# The survival function is used instead of 1 - cdf(z): for large z, cdf(z)
# rounds to 1.0 in floating point and the subtraction collapses to 0.
p_value = stats.norm.sf(z)

# Print results (the p-value uses significant digits so very small values
# are not displayed as 0.0000)
print(f"Observed Home Win Rate: {home_win_mean:.4f}")
print(f"Z-Score: {z:.4f}")
print(f"P-Value: {p_value:.4g}")

# Decision
alpha = 0.01
if p_value < alpha:
    print("Reject the null hypothesis: Home advantage is statistically significant.")
else:
    print("Fail to reject the null hypothesis: No significant home advantage.")

Observed Home Win Rate: 0.5424
Z-Score: 8.3650
P-Value: 0.0000
Reject the null hypothesis: Home advantage is statistically significant.


In [79]:
# Two-sided one-sample t-test of the mean home score against an even split
mu_null = 2.5  # Mean home score under the null hypothesis
alpha = 0.01   # Significance level

# Rubbers won by the home side in each match
home_scores = filtered_merged_5_df["Home Score"]

t_stat, p_value = stats.ttest_1samp(home_scores, mu_null)

# Report the sample mean and the test statistics
print(f"Mean Home Score: {home_scores.mean():.4f}")
print(f"T-Statistic: {t_stat:.4f}")
print(f"P-Value: {p_value:.4f}")

# Report the decision at the chosen significance level
print(
    "Reject the null hypothesis: The average Home Score is significantly different from 2.5."
    if p_value < alpha
    else "Fail to reject the null hypothesis: No significant difference from 2.5."
)

Mean Home Score: 2.6709
T-Statistic: 10.4109
P-Value: 0.0000
Reject the null hypothesis: The average Home Score is significantly different from 2.5.


### Create dataframe for only 2023-2024 season

In [80]:
# Restrict the 5-rubber matches to the 2023-2024 season
is_2023_2024 = filtered_merged_5_df["Season"].eq("2023-2024")
filtered_merged_5_2023_df = filtered_merged_5_df.loc[is_2023_2024].copy()

In [81]:
# Number of 2023-2024 matches per venue (count of non-null "Home Win" values), largest first
filtered_merged_5_2023_df.groupby("Venue")["Home Win"].count().sort_values(ascending=False)

Venue
Cornwall Street Squash Centre          367
HK Squash Centre                       243
Hong Kong Football Club                178
Kowloon Cricket Club                    91
Hong Kong Cricket Club                  77
United Services Recreation Club         54
Kwun Chung Sports Centre                39
Pei Ho Street Sports Centre             37
Ladies Recreation Club                  29
Masters - HK Squash Centre              24
Harbour Road Sports Centre              22
Wai Tsuen Sports Centre                 17
Yuen Long Jockey Club Squash Courts     17
Royal Hong Kong Yacht Club              16
Tai Po Sports Ground                    16
Java Road Sports Centre                 12
Craigengower Cricket Club               11
Sir Denys Roberts Squash Court          10
Tsing Yi Sports Centre                   9
HKJC Sha Tin Club House                  8
HK CLUB                                  8
MTR Club House                           8
Tai Kiu Market Squash Courts             4
Yeung

In [82]:
# Home win rate per venue in 2023-2024, highest first.
# NOTE(review): no minimum-matches filter is applied, so venues with only a
# handful of matches can land at either extreme of this ranking.
filtered_merged_5_2023_df.groupby("Venue")["Home Win"].mean().sort_values(ascending=False)

Venue
Kowloon Cricket Club                   0.824176
Wai Tsuen Sports Centre                0.764706
Pei Ho Street Sports Centre            0.756757
Tai Po Sports Ground                   0.750000
Hong Kong Football Club                0.679775
Sir Denys Roberts Squash Court         0.600000
United Services Recreation Club        0.574074
Tsing Yi Sports Centre                 0.555556
Cornwall Street Squash Centre          0.544959
Hong Kong Cricket Club                 0.519481
Harbour Road Sports Centre             0.500000
Masters - HK Squash Centre             0.500000
Ladies Recreation Club                 0.482759
HK Squash Centre                       0.465021
Craigengower Cricket Club              0.454545
Kwun Chung Sports Centre               0.358974
Yuen Long Jockey Club Squash Courts    0.352941
Royal Hong Kong Yacht Club             0.312500
Java Road Sports Centre                0.250000
HK CLUB                                0.250000
Tai Kiu Market Squash Courts      

In [83]:
# Preview the first five rows of the 2023-2024 subset
filtered_merged_5_2023_df.head()

Unnamed: 0,Home Team,Away Team,Venue,Result,Match Week,Date,Division,Season,Overall Score,Rubbers,Home Score,Away Score,CR,WO,CR_WO,Away Team Home,Home Win
10985,Kowloon Cricket Club 10,Twister 2,Kowloon Cricket Club,"5-0(3-0,3-0,3-0,3-0,3-1)",1,09/10/2023,10,2023-2024,5-0,"[3-0, 3-0, 3-0, 3-0, 3-1]",5,0,0,0,0,Cornwall Street Squash Centre,1
10987,HKU,The Hong Kong Jockey Club Div 10,Cornwall Street Squash Centre,"3-2(3-1,0-3,3-0,3-1,0-3)",1,09/10/2023,10,2023-2024,3-2,"[3-1, 0-3, 3-0, 3-1, 0-3]",3,2,0,0,0,HKJC Sha Tin Club House,1
10990,Hong Kong Football Club 10B,The Best Group,Hong Kong Football Club,"4-1(3-1,3-0,3-1,3-1,0-3)",2,16/10/2023,10,2023-2024,4-1,"[3-1, 3-0, 3-1, 3-1, 0-3]",4,1,0,0,0,Cornwall Street Squash Centre,1
10991,Kowloon Cricket Club 10,Hong Kong Football Club 10A,Kowloon Cricket Club,"4-1(3-1,3-0,3-0,0-3,3-1)",2,16/10/2023,10,2023-2024,4-1,"[3-1, 3-0, 3-0, 0-3, 3-1]",4,1,0,0,0,Hong Kong Football Club,1
10992,Social Squash Team,Twister 2,HK Squash Centre,"3-2(0-3,3-2,3-0,3-1,2-3)",2,16/10/2023,10,2023-2024,3-2,"[0-3, 3-2, 3-0, 3-1, 2-3]",3,2,0,0,0,Cornwall Street Squash Centre,1


In [None]:
# Home win rate for home teams whose name contains "Hong Kong Cricket Club"
# against away teams whose home venue contains "Cornwall"
filtered_merged_5_df.loc[
    filtered_merged_5_df["Home Team"].str.contains("Hong Kong Cricket Club")
    & filtered_merged_5_df["Away Team Home"].str.contains("Cornwall"),
    "Home Win",
].mean()

In [None]:
# Overall home win rate for home teams whose name contains "Hong Kong Cricket Club"
filtered_merged_5_df.loc[
    filtered_merged_5_df["Home Team"].str.contains("Hong Kong Cricket Club"),
    "Home Win",
].mean()

In [84]:
# Mean number of 'CR' + 'WO' rubber entries per match at each venue in 2023-2024, highest first
# (CR / WO presumably stand for conceded rubber / walkover — TODO confirm)
filtered_merged_5_2023_df.groupby("Venue")["CR_WO"].mean().sort_values(ascending=False)

Venue
Sir Denys Roberts Squash Court         2.200000
Tai Po Sports Ground                   1.937500
Wai Tsuen Sports Centre                1.823529
Tsing Yi Sports Centre                 1.333333
Kwun Chung Sports Centre               0.846154
Pei Ho Street Sports Centre            0.810811
Java Road Sports Centre                0.750000
HK Squash Centre                       0.707819
HK CLUB                                0.625000
HKJC Sha Tin Club House                0.625000
Yuen Long Jockey Club Squash Courts    0.588235
Cornwall Street Squash Centre          0.577657
Kowloon Cricket Club                   0.483516
Ladies Recreation Club                 0.482759
Masters - HK Squash Centre             0.458333
Hong Kong Cricket Club                 0.428571
Hong Kong Football Club                0.314607
United Services Recreation Club        0.240741
Harbour Road Sports Centre             0.090909
Craigengower Cricket Club              0.000000
Royal Hong Kong Yacht Club        

In [85]:
# Mean number of 'CR' + 'WO' rubber entries per match in each division in 2023-2024, highest first
filtered_merged_5_2023_df.groupby("Division")["CR_WO"].mean().sort_values(ascending=False)

Division
17A    1.436620
18     1.250000
12B    1.060606
19     1.000000
17B    0.886364
13     0.800000
16     0.650943
3      0.617647
7      0.490909
5      0.480769
2      0.472222
12A    0.447368
11     0.333333
4      0.272727
M3     0.264706
14     0.128205
10     0.120690
8      0.092308
15     0.088889
M2     0.015152
Name: CR_WO, dtype: float64