In [169]:
import pandas as pd
import numpy as np

In [170]:
all_divisions = {
    "Premier Main": 424,
    "2": 425,
    "3": 426,
    "4": 427,
    "5": 428,
    "6": 429,
    "7A": 430,
    "7B": 431,
    "8A": 432,
    "8B": 433,
    "9": 434,
    "10": 435,
    "11": 436,
    "12": 437,
    "13A": 438,
    "13B": 439,
    "14": 440,
    "15A": 441,
    "15B": 442,
    "Premier Masters": 443,
    "M2": 444,
    "M3": 445,
    "M4": 446,
    "Premier Ladies": 447,
    "L2": 448,
    "L3": 449,
    "L4": 450,
    }

In [171]:
summary_directory = r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\summary_df"
schedules_directory = r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\schedules_df"

In [172]:
awaiting_results_directory = r"C:\Users\bpali\PycharmProjects\SquashApp\2024-2025\awaiting_results"

In [173]:
# In the the schedules directory are week_x folders (eg week_4). First check the most recent folder (ie the highest number), then check the next highest, etc, 
# using the all_divisions dictionary to make sure we load every division's most recent summary file and avoid loading an older file for a division that has a more recent file.

schedules_dfs = []
for division in all_divisions.keys():
    for week in range(25, 0, -1):
        try:
            # Load csv file
            schedules_df = pd.read_csv(f"{schedules_directory}\\week_{week}\\{division}_schedules_df.csv")
            # Create division column
            schedules_df["Division"] = division
            # Append to list
            schedules_dfs.append(schedules_df)
            break
        except FileNotFoundError:
            continue

# Concatenate all the dataframes
schedules_df = pd.concat(schedules_dfs, ignore_index=True)

# Sort schedules_df by 'Division', then by 'Match Week', then by 'Date'
schedules_df = schedules_df.sort_values(by=['Division', 'Match Week', 'Date'])

# Save schedules_df to CSV
schedules_df.to_csv("updated_schedule_2024_2025.csv", index=False)

# Convert Date column to datetime
schedules_df['Date'] = pd.to_datetime(schedules_df['Date'], dayfirst=True)

In [174]:
# In the the summary directory are week_x folders (eg week_4). First check the most recent folder (ie the highest number), then check the next highest, etc, 
# using the all_divisions dictionary to make sure we load every division's most recent summary file and avoid loading an older file for a division that has a more recent file.

summary_dfs = []
for division in all_divisions.keys():
    for week in range(25, 0, -1):
        try:
            summary_df = pd.read_csv(f"{summary_directory}\\week_{week}\\{division}_summary_df.csv")
            summary_df["Division"] = division
            # Create Rank column based on Points column
            summary_df["Rank"] = summary_df["Points"].rank(ascending=False, method="min")
            # Make sure Rank column is an integer
            summary_df["Rank"] = summary_df["Rank"].astype(int)
            # Create column for number of teams in the division
            summary_df["Teams"] = len(summary_df)
            summary_dfs.append(summary_df)
            break
        except FileNotFoundError:
            continue

# Concatenate all the summary dataframes into one
summary_df = pd.concat(summary_dfs, ignore_index=True)

In [175]:
# In the the awaiting_results directory are week_x folders (eg week_4). First check the most recent folder (ie the highest number), then check the next highest, etc, 
# using the all_divisions dictionary to make sure we load every division's most recent summary file and avoid loading an older file for a division that has a more recent file.

awaiting_results_dfs = []
for division in all_divisions.keys():
    for week in range(25, 0, -1):
        try:
            awaiting_results_df = pd.read_csv(f"{awaiting_results_directory}\\week_{week}\\{division}_awaiting_results.csv")
            awaiting_results_df["Division"] = division
            awaiting_results_dfs.append(awaiting_results_df)
            break
        except FileNotFoundError:
            continue

# Concatenate all the awaiting_results dataframes into one
awaiting_results_df = pd.concat(awaiting_results_dfs)

### Create dataframe just for HKCC teams

In [176]:
hkcc = "Hong Kong Cricket Club"

# Filter rows where Team column contains "Hong Kong Cricket Club"
hkcc_summary_df = summary_df[(summary_df["Team"].str.contains(hkcc)) | 
                             (summary_df["Team"].str.contains("hkcc", case=False))].reset_index()

hkcc_summary_df[["Division", "Team", "Played", "Won", "Lost", "Points", "Rank", "Teams"]]

Unnamed: 0,Division,Team,Played,Won,Lost,Points,Rank,Teams
0,Premier Main,Hong Kong Cricket Club 1,7,6,1,20,2,6
1,2,Hong Kong Cricket Club 2,9,1,8,10,4,5
2,4,Hong Kong Cricket Club 3,12,4,8,26,6,8
3,6,Hong Kong Cricket Club 4,12,8,4,42,3,10
4,7A,Hong Kong Cricket Club 5,11,6,5,33,4,7
5,7B,Hong Kong Cricket Club 6,11,7,4,35,4,7
6,11,HKCC Tuesday Night Rockers,11,5,6,33,7,11
7,15A,Hong Kong Cricket Club 8,10,3,7,27,3,7
8,Premier Masters,Hong Kong Cricket Club M1B,10,6,4,23,3,5
9,Premier Masters,Hong Kong Cricket Club M1A,12,3,9,15,4,5


In [218]:
# Count the number of rows where Rank is 4 or less
len(hkcc_summary_df[hkcc_summary_df["Rank"] <= 4])

12

In [177]:
kcc = "Kowloon Cricket Club"

# Filter rows where Team column contains "Hong Kong Cricket Club"
kcc_summary_df = summary_df[summary_df["Team"].str.contains(kcc)].reset_index()

kcc_summary_df["Win %"] = kcc_summary_df["Won"] / kcc_summary_df["Played"] * 100

kcc_summary_df[["Division", "Team", "Played", "Won", "Lost", "Points", "Rank", "Teams", "Win %"]]

Unnamed: 0,Division,Team,Played,Won,Lost,Points,Rank,Teams,Win %
0,Premier Main,Kowloon Cricket Club 1A,7,7,0,25,1,6,100.0
1,Premier Main,Kowloon Cricket Club 1B,7,0,7,4,6,6,0.0
2,2,Kowloon Cricket Club 2,10,10,0,47,1,5,100.0
3,3,Kowloon Cricket Club 3A,11,8,3,44,2,9,72.727273
4,3,Kowloon Cricket Club 3B,10,7,3,36,4,9,70.0
5,4,Kowloon Cricket Club 4,12,10,2,53,2,8,83.333333
6,6,Kowloon Cricket Club 6B,12,12,0,60,1,10,100.0
7,6,Kowloon Cricket Club 6A,12,6,6,39,4,10,50.0
8,7B,Kowloon Cricket Club 7,12,9,3,44,1,7,75.0
9,9,Kowloon Cricket Club 9,13,9,4,45,3,8,69.230769


### Filter schedules_df for HKCC teams

In [178]:
# Filter schedules_df for rows where Home Team column or Away Team column contains "Hong Kong Cricket Club" or "HKCC"
hkcc_schedules_df = schedules_df[(schedules_df["Home Team"].str.contains(hkcc)) |
                                    (schedules_df["Home Team"].str.contains("hkcc", case=False)) |
                                    (schedules_df["Away Team"].str.contains(hkcc)) |
                                    (schedules_df["Away Team"].str.contains("hkcc", case=False))]

### Create results_df from hkcc_schedules_df

In [179]:
def parse_result(result):
    """
    Function to parse the 'result' string
    """
    overall, rubbers = result.split('(')
    rubbers = rubbers.strip(')').split(',')
    return overall, rubbers

In [180]:
def count_games_won(row):
    """
    Function to count the number of games won by each team in a match,
    handling walkovers (WO) and conceded rubbers (CR) by referring to the 'Overall Score'.
    """
    home_games_won = 0
    away_games_won = 0

    # Calculate the games won from the rubbers, excluding 'CR' and 'WO'
    for rubber in row['Rubbers']:
        if rubber == 'CR' or rubber == 'WO':
            continue
        home, away = map(int, rubber.split('-'))
        home_games_won += home
        away_games_won += away

    # Now handle the 'WO' and 'CR' rubbers by referring to the 'Overall Score'
    if 'WO' in row['Rubbers'] or 'CR' in row['Rubbers']:
        home_overall_score, away_overall_score = map(int, row['Overall Score'].split('-'))
        
        # If the home team has a higher overall score, award the missing games to them
        # Otherwise, award the missing games to the away team
        for rubber in row['Rubbers']:
            if rubber == 'WO' or rubber == 'CR':
                if home_overall_score > away_overall_score:
                    home_games_won += 3
                else:
                    away_games_won += 3

    return home_games_won, away_games_won

In [181]:
# Exclude rows where 'Away Team' is '[BYE]' (indicative of a bye week)
results_df = hkcc_schedules_df[hkcc_schedules_df['Away Team'] != '[BYE]'].copy()

# Replace NaN values in 'Result' with an empty string before applying str.contains
results_df['Result'] = results_df['Result'].fillna('')

# Keep rows where 'Result' contains brackets (indicative of a played match)
results_df = results_df[results_df['Result'].str.contains(r'\(')]

# Apply the function to the 'Result' column
results_df[['Overall Score', 'Rubbers']] = results_df['Result'].apply(lambda x: pd.Series(parse_result(x)))

# Drop the original 'Result' column
results_df.drop(columns=['Result'], inplace=True)

# Replace 'CR' with NaN
results_df.replace('CR', np.nan, inplace=True)
results_df.replace('WO', np.nan, inplace=True)

# Count the number of Rubbers For and Against for each team

# Splitting the 'Overall Score' into two separate columns
results_df[['Home Score', 'Away Score']] = results_df['Overall Score'].str.split('-', expand=True).astype(int)

# Initialize dictionaries to keep track of won and conceded rubbers
rubbers_won = {}
rubbers_conceded = {}

# Create Games Won columns
results_df[['Home Games Won', 'Away Games Won']] = results_df.apply(count_games_won, axis=1, result_type='expand')

In [182]:
# Create Home Win column, using Home Games Won vs Away Games Won as tiebreaker
for i, row in results_df.iterrows():
    if row['Home Score'] > row['Away Score']:
        results_df.loc[i, 'Home Win'] = 1
    elif row['Home Score'] < row['Away Score']:
        results_df.loc[i, 'Home Win'] = 0
    else:
        if row["Home Games Won"] > row["Away Games Won"]:
            results_df.loc[i, 'Home Win'] = 1
        elif row["Home Games Won"] < row["Away Games Won"]:
            results_df.loc[i, 'Home Win'] = 0
        else:
            results_df.loc[i, 'Home Win'] = "Don't know"


  results_df.loc[i, 'Home Win'] = "Don't know"


In [183]:
# Find the row where Home Win is "Don't know"
results_df[results_df["Home Win"] == "Don't know"]

Unnamed: 0,Home Team,vs,Away Team,Venue,Time,Match Week,Date,Division,Overall Score,Rubbers,Home Score,Away Score,Home Games Won,Away Games Won,Home Win
55,Hong Kong Cricket Club 2,v,Hong Kong Football Club 2B,Hong Kong Cricket Club,19:00,9,2024-12-16,2,2-2,"[3-0, 3-1, 1-3, 0-3]",2,2,7,7,Don't know


In [184]:
# For the row where Home Win is "Don't know", change Home Win to 0, Home Points to 2, and Away Points to 3
results_df.loc[results_df["Home Win"] == "Don't know", "Home Win"] = 0
results_df.loc[results_df["Home Win"] == "Don't know", "Home Points"] = 2
results_df.loc[results_df["Home Win"] == "Don't know", "Away Points"] = 3

In [185]:
results_df["Home Win"] = results_df["Home Win"].astype(int) # if this leads to error, we have an unknown 'tied' result to investigate

In [186]:
# Function to create Home Points and Away Points columns
def create_points_columns(df):

    df['Home Points'] = 0
    df['Away Points'] = 0

    for i, row in df.iterrows():
        if row['Home Score'] > row['Away Score']:
            df.at[i, 'Home Points'] = row["Home Score"] + 1
            df.at[i, 'Away Points'] = row["Away Score"]
        elif row['Home Score'] < row['Away Score']:
            df.at[i, 'Home Points'] = row["Home Score"]
            df.at[i, 'Away Points'] = row["Away Score"] + 1
        else:
            if row["Home Games Won"] > row["Away Games Won"]:
                df.at[i, 'Home Points'] = row["Home Score"] + 1
                df.at[i, 'Away Points'] = row["Away Score"]
            elif row["Home Games Won"] < row["Away Games Won"]:
                df.at[i, 'Home Points'] = row["Home Score"]
                df.at[i, 'Away Points'] = row["Away Score"] + 1
            else:
                print(f"Error: No winner found for {row['Home Team']} vs {row['Away Team']}")

    return df

# Apply the function to the dataframe
results_df = create_points_columns(results_df)

Error: No winner found for Hong Kong Cricket Club 2 vs Hong Kong Football Club 2B


In [187]:
# Convert Date column to datetime format and then filter for rows that are in November
results_df['Date'] = pd.to_datetime(results_df['Date'], dayfirst=True)
november_results_df = results_df[results_df['Date'].dt.month == 11]
november_results_df.head()

Unnamed: 0,Home Team,vs,Away Team,Venue,Time,Match Week,Date,Division,Overall Score,Rubbers,Home Score,Away Score,Home Games Won,Away Games Won,Home Win,Home Points,Away Points
982,HKCC Tuesday Night Rockers,v,Sha Tin 5,Hong Kong Cricket Club,19:00,5,2024-11-05,11,1-4,"[3-1, 0-3, 0-3, 1-3, 1-3]",1,4,5,13,0,1,5
984,Global Squash 2,v,HKCC Tuesday Night Rockers,Harbour Road Sports Centre,19:00,6,2024-11-12,11,4-1,"[3-0, 2-3, 3-0, 3-0, 3-0]",4,1,14,3,1,5,1
993,HKCC Tuesday Night Rockers,v,Invader Squash,Hong Kong Cricket Club,19:00,7,2024-11-19,11,5-0,"[3-2, 3-0, 3-0, 3-0, 3-2]",5,0,15,4,1,6,0
997,HKCC Tuesday Night Rockers,v,7NT,Hong Kong Cricket Club,19:00,8,2024-11-26,11,5-0,"[3-2, 3-1, 3-0, 3-0, 3-0]",5,0,15,3,1,6,0
1582,Hong Kong Cricket Club 8,v,Squashathon One,Hong Kong Cricket Club,17:00,5,2024-11-23,15A,2-3,"[0-3, 0-3, 0-3, 3-0, CR]",2,3,3,12,0,2,4


In [189]:
dec_jan_results_df = results_df[(results_df['Date'].dt.month == 12) 
                                | (results_df['Date'].dt.month == 1)]
len(dec_jan_results_df)

59

In [217]:
# Create a dataframe similar to hkcc_summary_df but based on the results in november_results_df
# To do this, we aggregate the Home Team Home Points won and Away Team Away Points won and combine them
# We also aggregate the number of games played by each team

# Create a dictionary to store the points won by each team
points_won = {}
games_played = {}
games_won = {}

# Iterate over the rows in november_results_df
for i, row in dec_jan_results_df.iterrows():
    home_team = row['Home Team']
    away_team = row['Away Team']
    home_points = row['Home Points']
    away_points = row['Away Points']

    # If the team is not in the dictionary, add it
    if home_team not in points_won:
        points_won[home_team] = 0
    if away_team not in points_won:
        points_won[away_team] = 0

    # Add the points won to the dictionary
    points_won[home_team] += home_points
    points_won[away_team] += away_points

    # Add games played to the dictionary
    if home_team not in games_played:
        games_played[home_team] = 0
    if away_team not in games_played:
        games_played[away_team] = 0

    games_played[home_team] += 1
    games_played[away_team] += 1

    # Add games won to the dictionary
    if home_team not in games_won:
        games_won[home_team] = 0
    if away_team not in games_won:
        games_won[away_team] = 0
    
    if row["Home Win"] == 1:
        games_won[home_team] += 1
    else:
        games_won[away_team] += 1

# Create a dataframe from the dictionary
monthly_summary_df = pd.DataFrame(list(points_won.items()), columns=['Team', 'Points'])
monthly_summary_df['Played'] = monthly_summary_df['Team'].map(games_played)
monthly_summary_df['Won'] = monthly_summary_df['Team'].map(games_won)
monthly_summary_df['Lost'] = monthly_summary_df['Played'] - monthly_summary_df['Won']

# Drop Team rows that don't contain "Hong Kong Cricket Club" or "HKCC"
monthly_summary_df = monthly_summary_df[(monthly_summary_df["Team"].str.contains(hkcc)) |
                                          (monthly_summary_df["Team"].str.contains("hkcc", case=False))]

# Create Division column by using Team column of november_summary_df and hkcc_summary_df
monthly_summary_df["Division"] = monthly_summary_df["Team"].map(hkcc_summary_df.set_index("Team")["Division"])

monthly_summary_df = monthly_summary_df[["Division", "Team", "Played", "Won", "Lost", "Points"]]

# Sort by Division in this order: Premier Main, 2, 3, 4, 5, 6, 7A, 7B, 8A, 8B, 9, 10, 11, 12, 13A, 13B, 14, 15A, 15B, Premier Masters, M2, M3, M4, Premier Ladies, L2, L3, L4
monthly_summary_df["Division"] = pd.Categorical(monthly_summary_df["Division"], 
                                               categories=["Premier Main", "2", "3", "4", "5", "6", "7A", "7B", "8A", "8B", "9", "10", "11", "12", "13A", "13B", "14", "15A", "15B", "Premier Masters", "M2", "M3", "M4", "Premier Ladies", "L2", "L3", "L4"],
                                               ordered=True)

monthly_summary_df = monthly_summary_df.sort_values(by=["Division", "Team"], ascending=[True, True])

monthly_summary_df

Unnamed: 0,Division,Team,Played,Won,Lost,Points
62,Premier Main,Hong Kong Cricket Club 1,2,2,0,7
10,2,Hong Kong Cricket Club 2,3,0,3,0
15,4,Hong Kong Cricket Club 3,4,1,3,6
20,6,Hong Kong Cricket Club 4,4,3,1,14
24,7A,Hong Kong Cricket Club 5,4,3,1,14
30,7B,Hong Kong Cricket Club 6,4,4,0,17
1,11,HKCC Tuesday Night Rockers,4,3,1,15
6,15A,Hong Kong Cricket Club 8,4,2,2,12
65,Premier Masters,Hong Kong Cricket Club M1A,4,0,4,3
66,Premier Masters,Hong Kong Cricket Club M1B,3,2,1,8


In [191]:
# If 'Home Team' contains 'Hong Kong Cricket Club' or 'HKCC', then summarize the Home Points won for each Home Team. Do the same for the Away Team.
# Then combine the two dataframes into one.

hkcc_home_points = results_df[results_df['Home Team'].str.contains(hkcc) 
                              | results_df['Home Team'].str.contains('hkcc', case=False)].groupby('Home Team')['Home Points'].sum()

hkcc_away_points = results_df[results_df['Away Team'].str.contains(hkcc) 
                                | results_df['Away Team'].str.contains('hkcc', case=False)].groupby('Away Team')['Away Points'].sum()

hkcc_points_df = pd.concat([hkcc_home_points, hkcc_away_points], axis=1).fillna(0)

In [192]:
hkcc_points_df

Unnamed: 0,Home Points,Away Points
HKCC Tuesday Night Rockers,24,9
Hong Kong Cricket Club 1,14,6
Hong Kong Cricket Club 2,6,2
Hong Kong Cricket Club 3,16,10
Hong Kong Cricket Club 4,24,18
Hong Kong Cricket Club 5,21,12
Hong Kong Cricket Club 6,16,19
Hong Kong Cricket Club 8,15,12
Hong Kong Cricket Club L1,0,2
Hong Kong Cricket Club L2,5,9


In [193]:
results_df[results_df["Home Score"] == 
            results_df["Away Score"]]

Unnamed: 0,Home Team,vs,Away Team,Venue,Time,Match Week,Date,Division,Overall Score,Rubbers,Home Score,Away Score,Home Games Won,Away Games Won,Home Win,Home Points,Away Points
55,Hong Kong Cricket Club 2,v,Hong Kong Football Club 2B,Hong Kong Cricket Club,19:00,9,2024-12-16,2,2-2,"[3-0, 3-1, 1-3, 0-3]",2,2,7,7,0,0,0
2062,Hong Kong Football Club L2A,v,Hong Kong Cricket Club L2,Hong Kong Football Club,19:00,4,2024-10-29,L2,2-2,"[3-1, 0-3, 3-1, 0-3]",2,2,6,8,0,2,3
2080,Hong Kong Football Club L2C,v,Hong Kong Cricket Club L2,Hong Kong Football Club,19:00,10,2025-01-07,L2,2-2,"[0-3, 1-3, 3-0, 3-0]",2,2,7,6,1,3,2
1796,Hong Kong Cricket Club M2A,v,Hong Kong Cricket Club M2B,Hong Kong Cricket Club,19:00,1,2024-10-02,M2,2-2,"[3-0, 1-3, 3-0, 0-3]",2,2,7,6,1,3,2
1801,Ladies Recreation Club M2A,v,Hong Kong Cricket Club M2A,Ladies Recreation Club,19:00,3,2024-10-16,M2,2-2,"[3-1, 3-1, 0-3, 0-3]",2,2,6,8,0,2,3
1804,Hong Kong Cricket Club M2B,v,Ladies Recreation Club M2A,Hong Kong Cricket Club,19:00,4,2024-10-23,M2,2-2,"[0-3, 3-2, 3-0, 1-3]",2,2,7,8,0,2,3
1816,Hong Kong Cricket Club M2A,v,Ladies Recreation Club M2A,Hong Kong Cricket Club,19:00,8,2024-11-20,M2,2-2,"[3-0, 2-3, 2-3, 3-2]",2,2,10,8,1,3,2
1819,Ladies Recreation Club M2A,v,Hong Kong Cricket Club M2B,Ladies Recreation Club,19:00,9,2024-11-27,M2,2-2,"[3-2, 1-3, 1-3, CR]",2,2,5,11,0,2,3
1825,Hong Kong Football Club M2B,v,Hong Kong Cricket Club M2A,Hong Kong Football Club,19:00,11,2025-01-08,M2,2-2,"[3-1, 1-3, 1-3, 3-0]",2,2,8,7,1,3,2
1829,Hong Kong Cricket Club M2B,v,Hong Kong Football Club M2B,Hong Kong Cricket Club,19:00,12,2025-01-15,M2,2-2,"[1-3, 3-1, 3-1, 0-3]",2,2,7,8,0,2,3


In [194]:
round(hkcc_summary_df["Won"].sum() / hkcc_summary_df["Played"].sum(), 3)

0.446

### Show HKCC results that haven't been uploaded yet

In [195]:
# Filter rows where Home Team column or Away Team column contains "Hong Kong Cricket Club" or "hkcc" (case-insensitive)
hkcc_awaiting_results_df = awaiting_results_df[(awaiting_results_df["Home Team"].str.contains(hkcc, case=False)) | 
                                               (awaiting_results_df["Away Team"].str.contains(hkcc, case=False)) |
                                               (awaiting_results_df["Home Team"].str.contains("hkcc", case=False)) |
                                                (awaiting_results_df["Away Team"].str.contains("hkcc", case=False))]


In [196]:
hkcc_awaiting_results_df.sort_values("Date")

Unnamed: 0,Home Team,Away Team,Venue,Match Week,Date,Division


In [197]:
hkcc_schedules_df

Unnamed: 0,Home Team,vs,Away Team,Venue,Time,Result,Match Week,Date,Division
957,Star River 1,v,HKCC Tuesday Night Rockers,Cornwall Street Squash Centre,19:00,"4-1(2-3,3-0,3-1,3-1,3-1)",1,2024-10-08,11
960,Tuen Mun Squash Club,v,HKCC Tuesday Night Rockers,Cornwall Street Squash Centre,19:00,"3-2(3-0,3-0,3-0,2-3,0-3)",2,2024-10-15,11
970,BOSS,v,HKCC Tuesday Night Rockers,Cornwall Street Squash Centre,19:00,"4-1(0-3,3-1,3-0,3-0,3-0)",3,2024-10-22,11
977,HKCC Tuesday Night Rockers,v,[BYE],,,,4,2024-10-29,11
982,HKCC Tuesday Night Rockers,v,Sha Tin 5,Hong Kong Cricket Club,19:00,"1-4(3-1,0-3,0-3,1-3,1-3)",5,2024-11-05,11
...,...,...,...,...,...,...,...,...,...
1786,Hong Kong Cricket Club M1A,v,Hong Kong Football Club M1A,Hong Kong Cricket Club,19:00,,18,2025-03-13,Premier Masters
1788,Hong Kong Football Club M1A,v,Hong Kong Cricket Club M1B,Hong Kong Football Club,19:00,,19,2025-03-20,Premier Masters
1790,Hong Kong Cricket Club M1A,v,[BYE],,,,19,2025-03-20,Premier Masters
1791,Hong Kong Football Club M1A,v,Hong Kong Cricket Club M1B,Hong Kong Football Club,19:00,,20,2025-03-27,Premier Masters


In [198]:
# In the the schedules directory are week_x folders (eg week_4). First check the most recent folder (ie the highest number), then check the next highest, etc, 
# using the all_divisions dictionary to make sure we load every division's most recent summary file and avoid loading an older file for a division that has a more recent file.

old_schedules_dfs = []
for division in all_divisions.keys():
    for week in range(0, 5, 1):
        try:
            # Load csv file
            schedules_df = pd.read_csv(f"{schedules_directory}\\week_{week}\\{division}_schedules_df.csv")
            # Create division column
            schedules_df["Division"] = division
            # Append to list
            old_schedules_dfs.append(schedules_df)
            break
        except FileNotFoundError:
            continue

# Concatenate all the dataframes
old_schedules_df = pd.concat(old_schedules_dfs, ignore_index=True)

# Convert Date column to datetime
old_schedules_df['Date'] = pd.to_datetime(old_schedules_df['Date'], dayfirst=True)

# Convert all rows where 'Home Team' or 'Away Team' contains 'Hong Kong Cricket Club 7' to 'HKCC Tuesday Night Rockers'2024-2025'
old_schedules_df.loc[old_schedules_df['Home Team'].str.contains('Hong Kong Cricket Club 7'), 'Home Team'] = 'HKCC Tuesday Night Rockers'
old_schedules_df.loc[old_schedules_df['Away Team'].str.contains('Hong Kong Cricket Club 7'), 'Away Team'] = 'HKCC Tuesday Night Rockers'

In [199]:
# Create hkcc_old_schedules_df
hkcc_old_schedules_df = old_schedules_df[(old_schedules_df["Home Team"].str.contains(hkcc)) |
                                         (old_schedules_df["Away Team"].str.contains(hkcc)) |
                                         (old_schedules_df["Home Team"].str.contains("hkcc", case=False)) |
                                         (old_schedules_df["Away Team"].str.contains("hkcc", case=False))]

In [200]:
hkcc_schedules_df = hkcc_schedules_df.sort_values("Date", ascending=True)

In [201]:
hkcc_old_schedules_df = hkcc_old_schedules_df.sort_values("Date", ascending=True)

In [202]:
# Drop unnecessary columns
columns_to_drop = ["vs", "Time", "Result"]
hkcc_schedules_df.drop(columns=columns_to_drop, inplace=True)
hkcc_old_schedules_df.drop(columns=columns_to_drop, inplace=True)

# Drop rows where 'Away Team' is '[BYE]'
hkcc_schedules_df = hkcc_schedules_df[hkcc_schedules_df['Away Team'] != '[BYE]']
hkcc_old_schedules_df = hkcc_old_schedules_df[hkcc_old_schedules_df['Away Team'] != '[BYE]']

In [203]:
# Merge the two dataframes on 'Home Team' and 'Away Team', and "Date"
merged_df = pd.merge(hkcc_schedules_df, hkcc_old_schedules_df, on=["Home Team", "Away Team", "Date", "Division", "Venue"], suffixes=('', '_old'), how="outer")

In [204]:
# Sort the dataframe by 'Date'
merged_df = merged_df.sort_values("Date")

In [205]:
# Show rows where Match Week is NaN or Match Week_old is NaN
problem_df_new = merged_df[merged_df["Match Week"].isna()]
problem_df_old = merged_df[merged_df["Match Week_old"].isna()]



In [206]:
# Merge the two dataframes on 'Home Team' and 'Away Team', and "Venue"
merged_problem_df = pd.merge(problem_df_new, problem_df_old, on=["Home Team", "Away Team"], suffixes=('', '_new'), how="outer")

In [207]:
# Rename Date to "Old Date" and "Date_new" to "New Date"
merged_problem_df.rename(columns={"Date": "Old Date", "Date_new": "New Date"}, inplace=True)

In [208]:
merged_problem_df = merged_problem_df[["Home Team", "Away Team", "Old Date", "New Date", "Division"]]

In [209]:
# Filter merged_problem_df for rows where 'Home Team' contains "hkcc" or "Hong Kong Cricket Club"
hkcc_problem_df = merged_problem_df[(merged_problem_df["Home Team"].str.contains(hkcc)) | 
                                    (merged_problem_df["Home Team"].str.contains("hkcc", case=False))]
hkcc_problem_df

Unnamed: 0,Home Team,Away Team,Old Date,New Date,Division
1,Hong Kong Cricket Club 4,United Services Recreation Club 2,2024-12-09,2025-03-17,6
2,HKCC Tuesday Night Rockers,i-Mask Advance Squash Club 4,2024-12-10,2025-04-29,11
3,Hong Kong Cricket Club L2,Hong Kong Football Club L2A,2024-12-10,2025-04-01,L2
6,Hong Kong Cricket Club 5,The Best Group 1,2024-12-11,2025-04-02,7A
7,Hong Kong Cricket Club 6,Physical Chess 2,2024-12-11,2025-04-02,7B
9,Hong Kong Cricket Club M3,Hong Kong Football Club M3A,2024-12-12,2025-04-03,M3
11,Hong Kong Cricket Club Rockettes,Hong Kong Football Club L3C,2024-12-13,2025-03-28,L3


In [210]:
merged_problem_df

Unnamed: 0,Home Team,Away Team,Old Date,New Date,Division
0,Young Player 2,Hong Kong Cricket Club 2,2024-12-09,2025-03-31,2
1,Hong Kong Cricket Club 4,United Services Recreation Club 2,2024-12-09,2025-03-17,6
2,HKCC Tuesday Night Rockers,i-Mask Advance Squash Club 4,2024-12-10,2025-04-29,11
3,Hong Kong Cricket Club L2,Hong Kong Football Club L2A,2024-12-10,2025-04-01,L2
4,X-Alpha 1,Hong Kong Cricket Club 3,2024-12-10,2025-04-08,4
5,Ladies Recreation Club M2B,Hong Kong Cricket Club M2A,2024-12-11,2025-03-26,M2
6,Hong Kong Cricket Club 5,The Best Group 1,2024-12-11,2025-04-02,7A
7,Hong Kong Cricket Club 6,Physical Chess 2,2024-12-11,2025-04-02,7B
8,Hong Kong Football Club M2A,Hong Kong Cricket Club M2B,2024-12-11,2025-03-26,M2
9,Hong Kong Cricket Club M3,Hong Kong Football Club M3A,2024-12-12,2025-04-03,M3
