In [72]:
import pandas as pd
import os

In [73]:
base_directory = r"C:\Users\bpali\PycharmProjects\SquashApp\previous_seasons"

In [74]:
# In the base directory are seasons folders (eg. 2019-2020, 2020-2021, etc.). We want to go through each season folder, go into the ranking_df folder for that season, 
# and read in all the ranking_df files (determined by whether the file ends in ranking_df) into a list of dataframes. We will then concatenate all the dataframes into one big dataframe.
# Add a Season column to the dataframe to keep track of which season the data is from.


df_list = []
for season_folder in os.listdir(base_directory):
    season_folder_path = os.path.join(base_directory, season_folder)
    ranking_df_folder = os.path.join(season_folder_path, "ranking_df")
    for file in os.listdir(ranking_df_folder):
        if not file.endswith("ranking_df.csv"):
            continue
        file_path = os.path.join(ranking_df_folder, file)
        df = pd.read_csv(file_path)
        df["Season"] = season_folder
        df_list.append(df)


In [75]:
df_list[0]

Unnamed: 0,Position,Name of Player,Team,Average Points,Total Game Points,Games Played,Won,Lost,Division,Win Percentage,Season
0,1.0,Thanner Claus,Banbil,24.27,267,11,10,1,10,0.909091,2016-2017
1,2.0,Tong Sze Chai,Perrier KCC 7,23.25,465,20,16,4,10,0.800000,2016-2017
2,3.0,Gates Stuart,Hong Kong Football Club 10A,21.14,444,21,16,5,10,0.761905,2016-2017
3,4.0,Tam Kai Chung Hades,Xavier,20.57,288,14,9,5,10,0.642857,2016-2017
4,5.0,Murakami Akifumi,i-MASK Advance Squash Club 7,20.18,222,11,10,1,10,0.909091,2016-2017
...,...,...,...,...,...,...,...,...,...,...,...
81,,Chan Ka Yue Justin,Perrier KCC 7,0.00,0,0,0,0,10,,2016-2017
82,,Fraser Alexander Leigh,Perrier KCC 7,0.00,0,0,0,0,10,,2016-2017
83,,Chung Kristy Cheuk Ki,Hong Kong Football Club 10A,0.00,0,0,0,0,10,,2016-2017
84,,Mehta Jai,Royal Hong Kong Yacht Club 10,0.00,0,0,0,0,10,,2016-2017


In [76]:
# Concatenate all the dataframes in df_list into one big dataframe
big_df = pd.concat(df_list)

In [77]:
# Get total number of players in each season
total_players = big_df.groupby("Season")["Name of Player"].nunique()

In [78]:
total_players

Season
2016-2017    1890
2017-2018    1946
2018-2019    1900
2019-2020    1925
2021-2022    1592
2022-2023    1519
2023-2024    1657
Name: Name of Player, dtype: int64

In [79]:
# Filter big_df to only include players who have played 5+ games
big_df_filtered = big_df[big_df["Games Played"] >= 5]

In [80]:
# Get total number of players in each season who have played at least 5 matches
total_players_5 = big_df[big_df["Games Played"] >= 5].groupby("Season")["Name of Player"].nunique()
total_players_5

Season
2016-2017    1597
2017-2018    1593
2018-2019    1584
2019-2020    1080
2021-2022     980
2022-2023    1252
2023-2024    1336
Name: Name of Player, dtype: int64

In [81]:
big_df.sort_values(by="Won", ascending=False)

Unnamed: 0,Position,Name of Player,Team,Average Points,Total Game Points,Games Played,Won,Lost,Division,Win Percentage,Season
0,1.0,Liang Chun Wai,i-Mask Advance Squash Club 4,25.17,579,23,22,1,13,0.956522,2022-2023
0,1.0,Dai Kwun San Harry,Perrier Sergio Tacchini KCC 7,25.91,570,22,22,0,11,1.000000,2017-2018
0,1.0,Lin Chun Wah,Alpha CUHK,26.18,576,22,22,0,14,1.000000,2022-2023
0,1.0,Lam Ka Lok Gary,Hong Kong Football Club 7B,25.96,597,23,22,1,7,0.956522,2023-2024
0,1.0,Seung Wai Ching,Happy Squash 1,26.05,573,22,22,0,4,1.000000,2022-2023
...,...,...,...,...,...,...,...,...,...,...,...
31,31.0,Rittmayer Mark Calvin,Royal Hong Kong Yacht Club 10,6.43,45,7,0,7,10,0.000000,2019-2020
30,31.0,Sullivan Andrew,Royal Hong Kong Yacht Club 10,6.43,45,7,0,7,10,0.000000,2019-2020
115,,Cassidy Dennis,HK Club Blue,0.00,0,0,0,0,M4,,2018-2019
114,,Isler Joachim,HK Club Blue,0.00,0,0,0,0,M4,,2018-2019


In [82]:
# Find players in Name of Player column who have played in all 7 seasons of Season column
players = big_df_filtered.groupby("Name of Player")["Season"].nunique().sort_values(ascending=False)

In [83]:
# Filter players who have played in all 7 seasons
players = players[players == 7]

In [84]:
players

Name of Player
To Shing Yuk             7
Olsen Sven               7
Chu Raymond Wai Man      7
Chan Keiko Tsun Ling     7
Nihalani Manoj           7
                        ..
Lau Si Chun              7
Lam Dennis Wun Cheung    7
Lau Yan Kiu              7
Cheung Peter Tat Kei     7
Brettell David           7
Name: Season, Length: 255, dtype: int64

In [85]:
# Drop rows where Division is NaN
big_df_filtered = big_df_filtered.dropna(subset=["Division"])
# Filter out rows where Division contains "M" or "L"
big_df_filtered = big_df_filtered[~big_df_filtered["Division"].str.contains("M|L", na=False)]
# Aggregate the data to ensure no duplicate entries
big_df_filtered_unique = big_df_filtered.groupby(["Name of Player", "Season"])["Division"].first().reset_index()
# Create a new dataframe where the seasons are the columns, the rows are the players, and the values are the division the player was in that season
player_divisions = big_df_filtered_unique.pivot(index="Name of Player", columns="Season", values="Division")

In [86]:
player_divisions

Season,2016-2017,2017-2018,2018-2019,2019-2020,2021-2022,2022-2023,2023-2024
Name of Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
* Chak Tony Kin Chung,15,12B,12A,,,,
* Chan Kwok Wa,,15B,16,15,15A,12A,
* Cheung Tony Cheuk Lim,15,12B,12A,,,,
* Ho Kin Fung,,12B,,,17A,17A,
* Hui Tak Cheung,,15B,16,15,,12A,
...,...,...,...,...,...,...,...
^ West Steven,,,8,8,6,5,8
^ Windmill Mark,6,4,5,4,,,
^ Yu Vincent Hok Yan,2,2,2,,,2,2
^ Zhang James,,5,3,,3,3,


In [87]:
# Add a column to player_divisions that contains the number of seasons each player has played in, then sort by that column
player_divisions["Seasons Played"] = player_divisions.count(axis=1)
player_divisions_sorted = player_divisions.sort_values(by="Seasons Played", ascending=False)

In [88]:
player_divisions_sorted

Season,2016-2017,2017-2018,2018-2019,2019-2020,2021-2022,2022-2023,2023-2024,Seasons Played
Name of Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mak Tsun Hei,12A,12A,12B,4,3,3,3,7
Luen Tsz Yu,2,2,3,2,2,3,3,7
Lau Yan Kiu,8A,6,6,6,8,8,8,7
Leung Eugene Chi Hang,6,7B,7,9,8,10,10,7
Wong Wing Kuen,17A,11,6,6,4,3,2,7
...,...,...,...,...,...,...,...,...
Suri Shivam,,,,,,,16,1
Sutcliffe Mark,13,,,,,,,1
Chiu Damon,,,19A,,,,,1
Li Regan Chek Heng,13,,,,,,,1


In [133]:
# Filter player_divisions_sorted for player whose name contains 'Heron'
filtered = player_divisions_sorted[player_divisions_sorted.index.str.contains("jessica", case=False)]

In [134]:
filtered

Season,2016-2017,2017-2018,2018-2019,2019-2020,2021-2022,2022-2023,2023-2024,Seasons Played
Name of Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Wong Jessica Po Sze,6,6,4,6,6,5,8,7
