In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("./crawled/merged_premier_league_stats.csv")
df.replace('N/a', np.nan, inplace=True)
df

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,Playing Time-MP,Playing Time-Starts,Playing Time-Min,Playing Time-90s,...,Performance-Crs,Performance-Int,Performance-TklW,Performance-PKwon,Performance-PKcon,Performance-OG,Performance-Recov,Aerial Duels-Won,Aerial Duels-Lost,Aerial Duels-Won%
0,Aaron Cresswell,eng ENG,"DF,FW",West Ham,33,1989,11,4,436,4.8,...,11,3,2,0,0,0,18,6,3,66.7
1,Aaron Hickey,sct SCO,DF,Brentford,21,2002,9,9,713,7.9,...,5,3,9,0,0,0,42,1,9,10.0
2,Aaron Ramsdale,eng ENG,GK,Arsenal,25,1998,6,6,540,6.0,...,0,0,0,0,0,0,6,0,0,
3,Aaron Ramsey,eng ENG,"MF,FW",Burnley,20,2003,14,5,527,5.9,...,2,1,14,0,1,0,24,4,5,44.4
4,Aaron Wan-Bissaka,eng ENG,DF,Manchester Utd,25,1997,22,20,1780,19.8,...,14,41,25,0,1,0,94,21,19,52.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,Yves Bissouma,ml MLI,MF,Tottenham,26,1996,28,26,2068,23.0,...,1,37,52,0,0,0,128,14,15,48.3
489,Zeki Amdouni,ch SUI,FW,Burnley,22,2000,34,27,1953,21.7,...,8,9,18,1,0,0,79,15,56,21.1
490,Álex Moreno,es ESP,DF,Aston Villa,30,1993,21,11,1031,11.5,...,37,13,12,0,0,1,40,4,10,28.6
491,Đorđe Petrović,rs SRB,GK,Chelsea,23,1999,23,22,1987,22.1,...,0,0,0,0,0,0,17,5,0,100.0


In [4]:
exclude_cols = ['Player', 'Nation', 'Pos', 'Squad']
for col in df.columns:
    if df[col].dtype == 'object' and col not in exclude_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("Non-numeric:", non_numeric_cols)

Non-numeric: ['Player', 'Nation', 'Pos', 'Squad']


In [5]:
def get_top_bottom_players(df, stat):
    top_3 = df.nlargest(3, stat)[['Player', 'Squad', stat]]
    bottom_3 = df.nsmallest(3, stat)[['Player', 'Squad', stat]]
    return top_3, bottom_3

# Identify numeric columns 
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

stats_cols = [col for col in numeric_cols if col not in exclude_cols]

In [6]:
results = {}

# Analyze each statistic
for stat in stats_cols:
    # Skip columns with NaN values
    if df[stat].isna().all():
        continue
    
    # Get top and bottom 3 players
    top_3, bottom_3 = get_top_bottom_players(df, stat)
    
    results[stat] = {
        'top_3': top_3.to_dict('records'),
        'bottom_3': bottom_3.to_dict('records')
    }

In [7]:
output_rows = []
for stat, data in results.items():
    # Add top 3 players
    for i in range(3):
        output_rows.append({
            'Statistic': stat,
            'Rank': f'Top {i+1}',
            'Player': data['top_3'][i]['Player'],
            'Squad': data['top_3'][i]['Squad'],
            'Value': data['top_3'][i][stat]
        })
    # Add bottom 3 players
    for i in range(3):
        output_rows.append({
            'Statistic': stat,
            'Rank': f'Bottom {i+1}',
            'Player': data['bottom_3'][i]['Player'],
            'Squad': data['bottom_3'][i]['Squad'],
            'Value': data['bottom_3'][i][stat]
        })

In [8]:
output_df = pd.DataFrame(output_rows)
output_df

Unnamed: 0,Statistic,Rank,Player,Squad,Value
0,Age,Top 1,Ashley Young,Everton,38.0
1,Age,Top 2,Thiago Silva,Chelsea,38.0
2,Age,Top 3,Łukasz Fabiański,West Ham,38.0
3,Age,Bottom 1,Leon Chiwome,Wolves,17.0
4,Age,Bottom 2,Lewis Miley,Newcastle Utd,17.0
...,...,...,...,...,...
1117,Aerial Duels-Won%,Top 2,Bart Verbruggen,Brighton,100.0
1118,Aerial Duels-Won%,Top 3,Caoimhín Kelleher,Liverpool,100.0
1119,Aerial Duels-Won%,Bottom 1,Anass Zaroury,Burnley,0.0
1120,Aerial Duels-Won%,Bottom 2,Andros Townsend,Luton Town,0.0


In [9]:
output_df.to_csv("./crawled/player_stats_analysis.csv", index=False)
print("\nDetailed results saved to 'player_stats_analysis.csv'")


Detailed results saved to 'player_stats_analysis.csv'


In [10]:
results = []

# Overall statistics
overall_stats = df[stats_cols].agg(['median', 'mean', 'std']).T
overall_stats.reset_index(inplace=True)
overall_stats.columns = ['Attribute', 'Median of Attribute 1', 'Mean of Attribute 1', 'Std of Attribute 1']
results.append(['All'] + overall_stats[['Median of Attribute 1', 'Mean of Attribute 1', 'Std of Attribute 1']].values.flatten().tolist())

results

[['All',
  25.0,
  25.49898580121704,
  4.127355078609052,
  1997.0,
  1997.1115618661258,
  4.134640520095159,
  23.0,
  22.657200811359026,
  10.136975196396461,
  16.0,
  16.941176470588236,
  11.167178800079329,
  1419.0,
  1518.369168356998,
  949.2410584329765,
  15.8,
  16.86997971602434,
  10.54645893832613,
  1.0,
  2.4239350912778903,
  3.81599052286232,
  1.0,
  1.7363083164300204,
  2.473321094819899,
  2.0,
  4.1602434077079105,
  5.62737587959306,
  1.0,
  2.22920892494929,
  3.3523136627191548,
  0.0,
  0.1947261663286004,
  0.8425984394403213,
  0.0,
  0.21703853955375255,
  0.9280814158064703,
  3.0,
  3.3204868154158214,
  2.842535183000222,
  0.0,
  0.11764705882352941,
  0.34089930858535655,
  1.1,
  2.436105476673428,
  3.5713907594005634,
  1.1,
  2.263894523326572,
  3.1127254293638167,
  1.0,
  1.7545638945233266,
  2.193422461818703,
  2.3,
  4.020892494929006,
  4.785716129967442,
  17.0,
  28.851926977687626,
  32.30420039482681,
  44.0,
  59.33265720081136,


In [11]:
# Team-wise statistics
team_stats = df.groupby('Squad')[stats_cols].agg(['median', 'mean', 'std']).reset_index()
team_stats.columns = ['Team'] + [f'{stat} {metric}' for stat, metric in team_stats.columns[1:]]

# Append team statistics to results
for _, row in team_stats.iterrows():
    team_row = [row['Team']]
    for stat in stats_cols:
        team_row.extend(row[[f'{stat} median', f'{stat} mean', f'{stat} std']].values.flatten().tolist())
    results.append(team_row)

results

[['All',
  25.0,
  25.49898580121704,
  4.127355078609052,
  1997.0,
  1997.1115618661258,
  4.134640520095159,
  23.0,
  22.657200811359026,
  10.136975196396461,
  16.0,
  16.941176470588236,
  11.167178800079329,
  1419.0,
  1518.369168356998,
  949.2410584329765,
  15.8,
  16.86997971602434,
  10.54645893832613,
  1.0,
  2.4239350912778903,
  3.81599052286232,
  1.0,
  1.7363083164300204,
  2.473321094819899,
  2.0,
  4.1602434077079105,
  5.62737587959306,
  1.0,
  2.22920892494929,
  3.3523136627191548,
  0.0,
  0.1947261663286004,
  0.8425984394403213,
  0.0,
  0.21703853955375255,
  0.9280814158064703,
  3.0,
  3.3204868154158214,
  2.842535183000222,
  0.0,
  0.11764705882352941,
  0.34089930858535655,
  1.1,
  2.436105476673428,
  3.5713907594005634,
  1.1,
  2.263894523326572,
  3.1127254293638167,
  1.0,
  1.7545638945233266,
  2.193422461818703,
  2.3,
  4.020892494929006,
  4.785716129967442,
  17.0,
  28.851926977687626,
  32.30420039482681,
  44.0,
  59.33265720081136,


In [12]:
num_attributes = len(stats_cols)

In [13]:
# Generate column names
columns = ['Team'] + [f'{stat} {metric}' for stat in stats_cols for metric in ['Median', 'Mean', 'Std']]
final_stats_df = pd.DataFrame(results, columns=columns)

In [14]:
# Add an index column starting from 1
final_stats_df.index += 1

In [15]:
final_stats_df.to_csv('./crawled/team_stats_analysis.csv', index=True)
final_stats_df.to_csv('./crawled/results2.csv', index=True)
print("Results have been saved to 'team_stats_analysis'")

Results have been saved to 'team_stats_analysis'


In [16]:
df['Team'] = df['Squad']
df_all_players = df.copy()
df_all_players['Team'] = 'All Players'

df_combined = pd.concat([df, df_all_players])
df_combined

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,Playing Time-MP,Playing Time-Starts,Playing Time-Min,Playing Time-90s,...,Performance-Int,Performance-TklW,Performance-PKwon,Performance-PKcon,Performance-OG,Performance-Recov,Aerial Duels-Won,Aerial Duels-Lost,Aerial Duels-Won%,Team
0,Aaron Cresswell,eng ENG,"DF,FW",West Ham,33,1989,11,4,436,4.8,...,3,2,0,0,0,18,6,3,66.7,West Ham
1,Aaron Hickey,sct SCO,DF,Brentford,21,2002,9,9,713,7.9,...,3,9,0,0,0,42,1,9,10.0,Brentford
2,Aaron Ramsdale,eng ENG,GK,Arsenal,25,1998,6,6,540,6.0,...,0,0,0,0,0,6,0,0,,Arsenal
3,Aaron Ramsey,eng ENG,"MF,FW",Burnley,20,2003,14,5,527,5.9,...,1,14,0,1,0,24,4,5,44.4,Burnley
4,Aaron Wan-Bissaka,eng ENG,DF,Manchester Utd,25,1997,22,20,1780,19.8,...,41,25,0,1,0,94,21,19,52.5,Manchester Utd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,Yves Bissouma,ml MLI,MF,Tottenham,26,1996,28,26,2068,23.0,...,37,52,0,0,0,128,14,15,48.3,All Players
489,Zeki Amdouni,ch SUI,FW,Burnley,22,2000,34,27,1953,21.7,...,9,18,1,0,0,79,15,56,21.1,All Players
490,Álex Moreno,es ESP,DF,Aston Villa,30,1993,21,11,1031,11.5,...,13,12,0,0,1,40,4,10,28.6,All Players
491,Đorđe Petrović,rs SRB,GK,Chelsea,23,1999,23,22,1987,22.1,...,0,0,0,0,0,17,5,0,100.0,All Players


In [17]:
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [18]:
safe_numeric_cols = [stat.replace('/', '').replace(':', '') for stat in numeric_cols]

In [19]:
df['Team'] = df['Squad']
df_all_players = df.copy()
df_all_players['Team'] = 'All Players'
df_combined = pd.concat([df, df_all_players])
df_combined

Unnamed: 0,Player,Nation,Pos,Squad,Age,Born,Playing Time-MP,Playing Time-Starts,Playing Time-Min,Playing Time-90s,...,Performance-Int,Performance-TklW,Performance-PKwon,Performance-PKcon,Performance-OG,Performance-Recov,Aerial Duels-Won,Aerial Duels-Lost,Aerial Duels-Won%,Team
0,Aaron Cresswell,eng ENG,"DF,FW",West Ham,33,1989,11,4,436,4.8,...,3,2,0,0,0,18,6,3,66.7,West Ham
1,Aaron Hickey,sct SCO,DF,Brentford,21,2002,9,9,713,7.9,...,3,9,0,0,0,42,1,9,10.0,Brentford
2,Aaron Ramsdale,eng ENG,GK,Arsenal,25,1998,6,6,540,6.0,...,0,0,0,0,0,6,0,0,,Arsenal
3,Aaron Ramsey,eng ENG,"MF,FW",Burnley,20,2003,14,5,527,5.9,...,1,14,0,1,0,24,4,5,44.4,Burnley
4,Aaron Wan-Bissaka,eng ENG,DF,Manchester Utd,25,1997,22,20,1780,19.8,...,41,25,0,1,0,94,21,19,52.5,Manchester Utd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,Yves Bissouma,ml MLI,MF,Tottenham,26,1996,28,26,2068,23.0,...,37,52,0,0,0,128,14,15,48.3,All Players
489,Zeki Amdouni,ch SUI,FW,Burnley,22,2000,34,27,1953,21.7,...,9,18,1,0,0,79,15,56,21.1,All Players
490,Álex Moreno,es ESP,DF,Aston Villa,30,1993,21,11,1031,11.5,...,13,12,0,0,1,40,4,10,28.6,All Players
491,Đorđe Petrović,rs SRB,GK,Chelsea,23,1999,23,22,1987,22.1,...,0,0,0,0,0,17,5,0,100.0,All Players


In [20]:
if not os.path.exists('histogram'):
    os.makedirs('histogram')

In [21]:
for stat in numeric_cols:
    plt.figure(figsize=(12, 8))

    # Calculate mean for team
    team_stats = df_combined.groupby('Team')[stat].mean().reset_index()

    # Create a bar plot 
    sns.barplot(data=team_stats, x='Team', y=stat, hue='Team', palette='Set2', errorbar=None, legend=False)

    # Add title and labels
    plt.title(f"Mean {stat} for All Players and Each Team", fontsize=16)
    plt.xlabel("Team", fontsize=14)
    plt.ylabel(stat, fontsize=14)

    # Rotate x-axis labels 
    plt.xticks(rotation=90)
    
    # Save the figure 
    safe_stat_name = stat.replace('/', '').replace(':', '')  # Safe for file names
    plt.savefig(f'histogram/{safe_stat_name}barplot.png', bbox_inches='tight', dpi=300)
    print(f"Saved plot for {stat}")

    # Close the plot 
    plt.close()

Saved plot for Age
Saved plot for Born
Saved plot for Playing Time-MP
Saved plot for Playing Time-Starts
Saved plot for Playing Time-Min
Saved plot for Playing Time-90s
Saved plot for Performance-Gls
Saved plot for Performance-Ast
Saved plot for Performance-G+A
Saved plot for Performance-G-PK
Saved plot for Performance-PK
Saved plot for Performance-PKatt
Saved plot for Performance-CrdY
Saved plot for Performance-CrdR
Saved plot for Expected-xG
Saved plot for Expected-npxG
Saved plot for Expected-xAG
Saved plot for Expected-npxG+xAG
Saved plot for Progression-PrgC
Saved plot for Progression-PrgP
Saved plot for Progression-PrgR
Saved plot for Per 90 Minutes-Gls
Saved plot for Per 90 Minutes-Ast
Saved plot for Per 90 Minutes-G+A
Saved plot for Per 90 Minutes-G-PK
Saved plot for Per 90 Minutes-G+A-PK
Saved plot for Per 90 Minutes-xG
Saved plot for Per 90 Minutes-xAG
Saved plot for Per 90 Minutes-xG+xAG
Saved plot for Per 90 Minutes-npxG
Saved plot for Per 90 Minutes-npxG+xAG
Saved plot for

In [22]:
final_stats_df = pd.read_csv('./crawled/team_stats_analysis.csv', index_col=0)
final_stats_df

Unnamed: 0,Team,Age Median,Age Mean,Age Std,Born Median,Born Mean,Born Std,Playing Time-MP Median,Playing Time-MP Mean,Playing Time-MP Std,...,Performance-Recov Std,Aerial Duels-Won Median,Aerial Duels-Won Mean,Aerial Duels-Won Std,Aerial Duels-Lost Median,Aerial Duels-Lost Mean,Aerial Duels-Lost Std,Aerial Duels-Won% Median,Aerial Duels-Won% Mean,Aerial Duels-Won% Std
1,All,25.0,25.498986,4.127355,1997.0,1997.111562,4.134641,23.0,22.657201,10.136975,...,55.842238,12.0,20.488844,23.827172,16.0,20.444219,20.97471,46.9,48.338809,21.536946
2,Arsenal,24.0,24.761905,2.547641,1998.0,1997.761905,2.718543,27.0,26.809524,10.191266,...,60.381288,15.0,23.809524,28.002891,16.0,23.619048,24.830377,48.8,48.83,21.108868
3,Aston Villa,26.0,25.956522,3.548089,1997.0,1996.695652,3.508882,27.0,24.173913,11.109587,...,52.137057,13.0,14.478261,12.943477,14.0,15.695652,16.457755,50.0,53.656522,22.613669
4,Bournemouth,24.5,25.038462,3.538144,1997.5,1997.653846,3.393433,25.5,22.076923,11.852166,...,66.1474,15.0,22.192308,23.901497,21.0,25.653846,29.389716,42.1,45.768,21.677518
5,Brentford,26.0,25.8,3.593976,1997.0,1996.92,3.661967,26.0,22.96,10.346014,...,51.125434,14.0,25.24,26.833251,23.0,24.0,16.849827,45.6,46.05,17.682957
6,Brighton,23.5,24.785714,5.698324,1999.5,1997.857143,5.628725,20.0,20.928571,8.751417,...,45.848655,7.5,14.464286,17.045651,11.5,12.214286,8.808065,45.7,47.935714,23.503683
7,Burnley,24.0,24.071429,3.838678,1999.0,1998.428571,3.795012,16.0,20.392857,9.346575,...,47.773759,13.5,20.035714,26.141328,22.0,24.714286,23.325169,43.75,40.807143,22.099069
8,Chelsea,22.0,23.0,3.905125,2001.0,1999.56,4.093491,23.0,21.88,9.404432,...,61.512384,9.0,17.12,17.965986,14.0,15.88,13.860375,47.2,49.976,22.813707
9,Crystal Palace,25.5,25.166667,4.280051,1997.5,1997.5,4.413418,22.5,22.458333,9.477567,...,60.453345,12.5,22.916667,24.207287,17.0,23.125,20.749673,48.5,50.266667,17.860514
10,Everton,26.0,26.347826,4.858064,1997.0,1996.304348,4.912274,28.0,23.304348,11.561829,...,68.186068,16.0,29.391304,40.626614,19.0,25.086957,31.137398,47.2,47.827273,21.788489


In [23]:
higher_is_better = [
    'Playing Time-MP Mean', 'Playing Time-Starts Mean', 'Playing Time-Min Mean', 
    'Playing Time-90s Mean', 'Performance-Gls Mean', 'Performance-Ast Mean', 
    'Performance-G+A Mean', 'Performance-G-PK Mean', 'Performance-PK Mean', 
    'Performance-PKatt Mean', 'Expected-xG Mean', 'Expected-npxG Mean', 
    'Expected-xAG Mean', 'Expected-npxG+xAG Mean', 'Progression-PrgC Mean', 
    'Progression-PrgP Mean', 'Progression-PrgR Mean', 'Per 90 Minutes-Gls Mean', 
    'Per 90 Minutes-Ast Mean', 'Per 90 Minutes-G+A Mean', 'Per 90 Minutes-G-PK Mean', 
    'Per 90 Minutes-G+A-PK Mean', 'Per 90 Minutes-xG Mean', 'Per 90 Minutes-xAG Mean', 
    'Per 90 Minutes-xG+xAG Mean', 'Per 90 Minutes-npxG Mean', 
    'Performance-GA Mean', 'Performance-GA90 Mean', 'Performance-SoTA Mean', 
    'Performance-Saves Mean', 'Performance-Save% Mean', 'Performance-W Mean', 
    'Performance-D Mean', 'Performance-L Mean', 'Performance-CS Mean', 
    'Performance-CS% Mean', 'Penalty Kicks-PKatt Mean', 'Penalty Kicks-PKA Mean', 
    'Penalty Kicks-PKsv Mean', 'Penalty Kicks-PKm Mean', 
    'Penalty Kicks-Save% Mean', '90s Mean', 'Standard-Gls Mean', 
    'Standard-Sh Mean', 'Standard-SoT Mean', 'Standard-SoT% Mean', 
    'Standard-Sh/90 Mean', 'Standard-SoT/90 Mean', 'Standard-G/Sh Mean', 
    'Standard-G/SoT Mean', 'Standard-Dist Mean', 'Standard-FK Mean', 
    'Standard-PK Mean', 'Standard-PKatt Mean', 'Expected-npxG/Sh Mean', 
    'Expected-G-xG Mean', 'Expected-np:G-xG Mean', 'Total-Cmp Mean', 
    'Total-Att Mean', 'Total-Cmp% Mean', 'Total-TotDist Mean', 
    'Total-PrgDist Mean', 'Short-Cmp Mean', 'Short-Att Mean', 
    'Short-Cmp% Mean', 'Medium-Cmp Mean', 'Medium-Att Mean', 
    'Medium-Cmp% Mean', 'Long-Cmp Mean', 'Long-Att Mean', 'Long-Cmp% Mean', 
    'Ast Mean', 'xAG Mean', 'Expected-xA Mean', 'Expected-A-xAG Mean', 
    'KP Mean', '1/3 Mean', 'PPA Mean', 'CrsPA Mean', 'PrgP Mean', 
    'Att Mean', 'Pass Types-Live Mean', 'Pass Types-Dead Mean', 
    'Pass Types-FK Mean', 'Pass Types-TB Mean', 'Pass Types-Sw Mean', 
    'Pass Types-Crs Mean', 'Pass Types-TI Mean', 'Pass Types-CK Mean', 
    'Corner Kicks-In Mean', 'Corner Kicks-Out Mean', 'Corner Kicks-Str Mean', 
    'Outcomes-Cmp Mean', 'Outcomes-Off Mean', 'Outcomes-Blocks Mean', 
    'SCA-SCA Mean', 'SCA-SCA90 Mean', 'SCA Types-PassLive Mean', 
    'SCA Types-PassDead Mean', 'SCA Types-TO Mean', 'SCA Types-Sh Mean', 
    'SCA Types-Fld Mean', 'SCA Types-Def Mean', 'GCA-GCA Mean', 
    'GCA-GCA90 Mean', 'GCA Types-PassLive Mean', 'GCA Types-PassDead Mean', 
    'GCA Types-TO Mean', 'GCA Types-Sh Mean', 'GCA Types-Fld Mean', 
    'GCA Types-Def Mean', 'Tackles-Tkl Mean', 'Tackles-TklW Mean', 
    'Tackles-Def 3rd Mean', 'Tackles-Mid 3rd Mean', 'Tackles-Att 3rd Mean', 
    'Challenges-Tkl Mean', 'Challenges-Att Mean', 'Challenges-Tkl% Mean', 
    'Challenges-Lost Mean', 'Blocks-Blocks Mean', 'Blocks-Sh Mean', 
    'Blocks-Pass Mean', 'Int Mean', 'Tkl+Int Mean', 'Clr Mean', 
    'Touches-Touches Mean', 'Touches-Def Pen Mean', 'Touches-Def 3rd Mean', 
    'Touches-Mid 3rd Mean', 'Touches-Att 3rd Mean', 'Touches-Att Pen Mean', 
    'Touches-Live Mean', 'Take-Ons-Att Mean', 'Take-Ons-Succ Mean', 
    'Take-Ons-Succ% Mean', 'Take-Ons-Tkld Mean', 'Take-Ons-Tkld% Mean', 
    'Carries-Carries Mean', 'Carries-TotDist Mean', 'Carries-PrgDist Mean', 
    'Carries-PrgC Mean', 'Carries-1/3 Mean', 'Carries-CPA Mean', 
    'Carries-Mis Mean', 'Carries-Dis Mean', 'Receiving-Rec Mean', 
    'Receiving-PrgR Mean', 'Playing Time-Mn/MP Mean', 'Playing Time-Min% Mean', 
    'Starts-Starts Mean', 'Starts-Mn/Start Mean', 'Starts-Compl Mean', 
    'Subs-Subs Mean', 'Subs-Mn/Sub Mean', 'Subs-unSub Mean', 
    'Team Success-PPM Mean', 'Team Success-onG Mean', 'Team Success-onGA Mean', 
    'Team Success-+/- Mean', 'Team Success-+/-90 Mean', 'Team Success-On-Off Mean', 
    'Team Success (xG)-onxG Mean', 'Team Success (xG)-onxGA Mean', 
    'Team Success (xG)-xG+/- Mean', 'Team Success (xG)-xG+/-90 Mean', 
    'Team Success (xG)-On-Off Mean', 'Performance-PKwon Mean', 
    'Performance-OG Mean', 'Performance-Recov Mean', 'Aerial Duels-Won Mean', 
    'Aerial Duels-Lost Mean', 'Aerial Duels-Won% Mean'
]

lower_is_better = [
    'Performance-CrdY Mean', 'Performance-CrdR Mean', 'Performance-2CrdY Mean', 
    'Performance-Fls Mean', 'Performance-Fld Mean', 'Performance-Off Mean', 
    'Performance-Crs Mean', 'Performance-Int Mean', 'Performance-TklW Mean', 
    'Performance-PKcon Mean', 'Err Mean'
]

In [24]:
normalized_df = final_stats_df.copy()

In [25]:
# Normalize higher is better stats 
for stat in higher_is_better:
    normalized_df[stat] = (normalized_df[stat] - normalized_df[stat].min()) / (normalized_df[stat].max() - normalized_df[stat].min())

# Normalize lower is better stats 
for stat in lower_is_better:
    normalized_df[stat] = 1 - ((normalized_df[stat] - normalized_df[stat].min()) / (normalized_df[stat].max() - normalized_df[stat].min()))

# Initialize a total score 
final_stats_df['Total_Score'] = 0

In [26]:
# Calculate the total score 
for stat in higher_is_better:
    final_stats_df['Total_Score'] += normalized_df[stat]

for stat in lower_is_better:
    final_stats_df['Total_Score'] += normalized_df[stat]

In [27]:
sorted_teams_df = final_stats_df.sort_values(by='Total_Score', ascending=False)
sorted_teams_df[['Team', 'Total_Score']]

Unnamed: 0,Team,Total_Score
14,Manchester City,134.287272
12,Liverpool,124.756984
2,Arsenal,121.787046
11,Fulham,103.681985
19,Tottenham,101.243226
16,Newcastle Utd,99.356396
8,Chelsea,94.071945
3,Aston Villa,91.648256
20,West Ham,89.750068
10,Everton,82.932848


In [28]:
sorted_teams_df.to_csv('./crawled/teams_scores.csv', index=True)

In [29]:
best_team = sorted_teams_df.loc[sorted_teams_df['Total_Score'].idxmax(), 'Team']
highest_total_score = sorted_teams_df['Total_Score'].max()
print(f"\nThe best team is {best_team} with a score of {highest_total_score}")


The best team is Manchester City with a score of 134.28727164164565
