In [100]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [101]:
# Load the grouped player data; the preview below shows one row per player per season.
# The path is relative, so the notebook must be run from the directory that
# contains output_files/.
ml_df = pd.read_csv("output_files/correctly_grouped_data.csv")

In [102]:
# Remove the stray "Unnamed: 0" column (presumably the index written out by an
# earlier to_csv call — confirm against the export step).
# drop(..., errors="ignore") keeps this cell re-runnable: the original `del`
# raised KeyError the second time the cell was executed.
ml_df = ml_df.drop(columns=["Unnamed: 0"], errors="ignore")

In [103]:
ml_df

Unnamed: 0,Player,Pos,Squad,Age,MP,Starts,Min,90s,Gls,Ast,...,year,season,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,poy_winner
0,Aaron Connolly,FW,Brighton,19,24,14,1258,14.0,3,1,...,2019,2019/2020,4.0,4.0,0.5,4.5,21,10,86,0
1,Aaron Connolly,FW,Brighton,20,17,9,791,8.8,2,1,...,2020,2020/2021,4.0,4.0,0.2,4.2,12,4,58,0
2,Aaron Connolly,"FW,MF",Brighton,21,4,1,156,1.7,0,0,...,2021,2021/2022,0.4,0.4,0.1,0.5,3,1,7,0
3,Aaron Cresswell,"DF,MF",West Ham,24,38,38,3420,38.0,2,4,...,2014,2014/2015,0.0,0.0,0.0,0.0,0,0,0,0
4,Aaron Cresswell,DF,West Ham,25,37,37,3314,36.8,2,4,...,2015,2015/2016,0.0,0.0,0.0,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17523,Łukasz Fabiański,GK,West Ham,34,25,25,2117,23.5,0,0,...,2019,2019/2020,0.0,0.0,0.0,0.0,0,0,0,0
17524,Łukasz Fabiański,GK,West Ham,35,35,35,3150,35.0,0,0,...,2020,2020/2021,0.0,0.0,0.0,0.0,0,0,0,0
17525,Łukasz Fabiański,GK,West Ham,36,37,37,3330,37.0,0,0,...,2021,2021/2022,0.0,0.0,0.0,0.0,0,0,0,0
17526,Łukasz Fabiański,GK,West Ham,37,36,36,3111,34.6,0,0,...,2022,2022/2023,0.0,0.0,0.0,0.0,0,0,0,0


In [104]:
# Keep an independent copy of the loaded frame.
# NOTE(review): duplicate_df is not referenced again in the cells visible here —
# confirm it is still needed.
duplicate_df = ml_df.copy()

In [105]:
ml_df[ml_df["poy_winner"] == 1]

Unnamed: 0,Player,Pos,Squad,Age,MP,Starts,Min,90s,Gls,Ast,...,year,season,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,poy_winner
412,Alan Shearer,FW,Blackburn,23,42,42,3770,41.9,34,13,...,1994,1994/1995,0.0,0.0,0.0,0.0,0,0,0,1
3026,Cristiano Ronaldo,"FW,MF",Manchester Utd,22,34,31,2747,30.5,31,6,...,2007,2007/2008,0.0,0.0,0.0,0.0,0,0,0,1
3027,Cristiano Ronaldo,"FW,MF",Manchester Utd,23,33,31,2742,30.5,18,6,...,2008,2008/2009,0.0,0.0,0.0,0.0,0,0,0,1
4589,Dwight Yorke,"FW,MF",Manchester Utd,26,32,32,2781,30.9,18,11,...,1998,1998/1999,1.0,0.0,0.0,0.0,0,0,0,1
4648,Eden Hazard,"FW,MF",Chelsea,23,38,38,3367,37.4,14,9,...,2014,2014/2015,0.0,0.0,0.0,0.0,0,0,0,1
4923,Eric Cantona,FW,Manchester Utd,27,34,34,2987,33.2,18,12,...,1993,1993/1994,0.0,0.0,0.0,0.0,0,0,0,1
4980,Erling Haaland,FW,Manchester City,22,35,33,2769,30.8,36,8,...,2022,2022/2023,28.4,23.0,5.3,28.3,35,58,151,1
5273,Frank Lampard,MF,Chelsea,26,38,38,3413,37.9,13,18,...,2004,2004/2005,0.0,0.0,0.0,0.0,0,0,0,1
5343,Freddie Ljungberg,"FW,MF",Arsenal,24,25,24,1904,21.2,12,4,...,2001,2001/2002,0.0,0.0,0.0,0.0,0,0,0,1
5452,Gareth Bale,"FW,MF",Tottenham,23,33,33,2921,32.5,21,4,...,2012,2012/2013,0.0,0.0,0.0,0.0,0,0,0,1


Step 1: Rank the candidates for player of the season with a Ridge regression model trained on the columns listed in `predictors`

In [106]:
# Feature columns passed to the model, in the order they are fed to fit/predict.
# Not currently used: 'CrdY', 'CrdR'.
predictors = [
    'Age',
    'MP',
    'Starts',
    'Min',
    '90s',
    'Gls',
    'Ast',
    'G+A',
    'xG',
    'npxG',
    'xAG',
    'npxG+xAG',
    'PrgC',
    'PrgP',
    'PrgR',
]

In [107]:
# Training set: every season before 2022/2023.
# The comparison is on strings; it orders seasons chronologically only because
# the labels use the fixed-width "YYYY/YYYY" form seen in the data preview.
train = ml_df[ml_df["season"] < '2022/2023']

In [108]:
# Hold-out set: only the 2022/2023 season, which is excluded from `train`.
test = ml_df[ml_df["season"] == '2022/2023']

In [109]:
# Ridge regression model; alpha is the regularisation strength.
# NOTE(review): 0.1 is hard-coded with no tuning shown in this notebook.
reg = Ridge(alpha = 0.1)

In [110]:
# Fit on the 0/1 poy_winner label. This is a regression, so the outputs are
# unbounded scores (negative values appear in the predictions below), not
# probabilities; they are only used to rank players.
reg.fit(train[predictors], train["poy_winner"])

In [111]:
predictions = reg.predict(test[predictors])

In [112]:
# Wrap the raw prediction array in a one-column frame that reuses test's index,
# so it can be joined back onto the test rows by index label.
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [113]:
predictions

Unnamed: 0,predictions
11,0.012060
14,-0.010417
65,-0.002505
85,-0.006739
100,0.010923
...,...
17389,-0.007258
17448,0.003660
17466,0.002716
17509,0.012374


In [114]:
# Put the identifying columns and the model score side by side.
# axis=1 joins column-wise and aligns rows on the shared index.
combination = pd.concat([test[["Player", "poy_winner", "season"]],predictions], axis=1)

In [115]:
combination

Unnamed: 0,Player,poy_winner,season,predictions
11,Aaron Cresswell,0,2022/2023,0.012060
14,Aaron Hickey,0,2022/2023,-0.010417
65,Aaron Ramsdale,0,2022/2023,-0.002505
85,Aaron Wan-Bissaka,0,2022/2023,-0.006739
100,Abdoulaye Doucouré,0,2022/2023,0.010923
...,...,...,...,...
17389,Yves Bissouma,0,2022/2023,-0.007258
17448,Álex Moreno,0,2022/2023,0.003660
17466,Çağlar Söyüncü,0,2022/2023,0.002716
17509,İlkay Gündoğan,0,2022/2023,0.012374


In [116]:
combination.sort_values("predictions", ascending=False)

Unnamed: 0,Player,poy_winner,season,predictions
2019,Bruno Fernandes,0,2022/2023,0.092829
9102,Kevin De Bruyne,0,2022/2023,0.074452
12778,Pascal Groß,0,2022/2023,0.068344
9321,Kieran Trippier,0,2022/2023,0.067939
6303,Harry Kane,0,2022/2023,0.065524
...,...,...,...,...
7227,Jamie Vardy,0,2022/2023,-0.017965
1285,Antonee Robinson,0,2022/2023,-0.018284
7067,James Milner,0,2022/2023,-0.019071
12803,Patrick Bamford,0,2022/2023,-0.020300


In [117]:
# Order the players from highest to lowest score and number them 1..N in a
# new 'Predicted_Rk' column (1 = the model's top pick).
combination = (
    combination
    .sort_values(by='predictions', ascending=False)
    .assign(Predicted_Rk=lambda ranked: list(range(1, len(ranked) + 1)))
)

In [118]:
combination

Unnamed: 0,Player,poy_winner,season,predictions,Predicted_Rk
2019,Bruno Fernandes,0,2022/2023,0.092829,1
9102,Kevin De Bruyne,0,2022/2023,0.074452,2
12778,Pascal Groß,0,2022/2023,0.068344,3
9321,Kieran Trippier,0,2022/2023,0.067939,4
6303,Harry Kane,0,2022/2023,0.065524,5
...,...,...,...,...,...
7227,Jamie Vardy,0,2022/2023,-0.017965,565
1285,Antonee Robinson,0,2022/2023,-0.018284,566
7067,James Milner,0,2022/2023,-0.019071,567
12803,Patrick Bamford,0,2022/2023,-0.020300,568


In [99]:
# Requires the 'Predicted_Rk' column added by the ranking cell above.
# It raises KeyError if `combination` has since been rebound to a frame without
# that column — the backtest loop further down does exactly that — so run the
# notebook top to bottom.
combination.sort_values('Predicted_Rk', ascending=True)

KeyError: 'Predicted_Rk'

In [89]:
predicted = combination.sort_values('predictions', ascending=False)

In [90]:
predicted

Unnamed: 0,Player,poy_winner,season,predictions,Predicted_Rk
2019,Bruno Fernandes,0,2022/2023,0.092829,1
9102,Kevin De Bruyne,0,2022/2023,0.074452,2
12778,Pascal Groß,0,2022/2023,0.068344,3
9321,Kieran Trippier,0,2022/2023,0.067939,4
6303,Harry Kane,0,2022/2023,0.065524,5
...,...,...,...,...,...
7227,Jamie Vardy,0,2022/2023,-0.017965,565
1285,Antonee Robinson,0,2022/2023,-0.018284,566
7067,James Milner,0,2022/2023,-0.019071,567
12803,Patrick Bamford,0,2022/2023,-0.020300,568


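Based on the season statistics, matches played and age in `predictors`, the model ranks Bruno Fernandes first for the 2022/2023 season. Note that the player actually labelled as the winner for that season in the data (`poy_winner` = 1) is Erling Haaland.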

Step 2: Determine error margin using mean squared error

In [91]:
# Mean squared error between the 0/1 award label and the model scores, for
# whichever frame `combination` currently holds.
# NOTE(review): almost every row has poy_winner == 0 and the scores sit close to
# zero, so a small MSE says little about whether the winner is ranked first —
# consider also reporting the actual winner's predicted rank.
mean_squared_error(combination['poy_winner'], combination['predictions'])

0.0016850007645311518

In [98]:
combination.sort_values("poy_winner", ascending = False)

Unnamed: 0,Player,poy_winner,season,predictions
13417,Phil Foden,1,2023/2024,0.056579
12,Aaron Cresswell,0,2023/2024,-0.003576
11455,Michael Olise,0,2023/2024,0.036655
10969,Matt Doherty,0,2023/2024,-0.007903
11051,Matt Ritchie,0,2023/2024,-0.006857
...,...,...,...,...
5412,Gabriel Jesus,0,2023/2024,0.000952
5416,Gabriel Magalhães,0,2023/2024,0.010088
5421,Gabriel Martinelli,0,2023/2024,-0.010230
5429,Gabriel Osho,0,2023/2024,0.003846


Step 3: Create a loop that backtests the model season by season: for each season it trains on all earlier seasons and predicts that season, so we can view the predicted ranking for any chosen year

In [93]:
# First calendar year of the earliest and latest seasons to cover (inclusive).
start_year, end_year = 1992, 2023
# Season labels in chronological order, e.g. "1992/1993" ... "2023/2024".
seasons = ["{}/{}".format(first, first + 1) for first in range(start_year, end_year + 1)]

In [94]:
seasons[21]

'2013/2014'

In [95]:
# Walk-forward backtest: for each season, refit on all earlier seasons and score
# that season. One frame per scored season is collected in all_predictions.
#
# NOTE: the loop rebinds train, test, predictions and combination; when it
# finishes, `combination` holds the last scored season, not the 2022/2023 frame
# built earlier in the notebook.

# Number of leading seasons that are never scored (presumably so the first fit
# has a few seasons of history — confirm the intended warm-up length).
MIN_HISTORY_SEASONS = 5

all_predictions = []
for season in seasons[MIN_HISTORY_SEASONS:]:
    # String comparison is chronological because labels are "YYYY/YYYY".
    train = ml_df[ml_df["season"] < season]
    test = ml_df[ml_df["season"] == season]
    if train.empty or test.empty:
        # fit/predict raise ValueError on zero rows, so skip seasons the data
        # does not cover. List positions then no longer map one-to-one onto
        # seasons[MIN_HISTORY_SEASONS:]; use each frame's "season" column to
        # identify it.
        continue
    reg.fit(train[predictors], train["poy_winner"])
    predictions = reg.predict(test[predictors])
    # Reuse test's index so the scores line up with the identifying columns.
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "poy_winner", "season"]], predictions], axis=1)
    all_predictions.append(combination)

In [96]:
def add_ranks(combination):
    """Return ``combination`` ordered by model score with a rank column added.

    Rows are sorted by the 'predictions' column, highest first, and a
    'predicted_Rk' column is added holding 1 for the top row, 2 for the
    next, and so on. The frame passed in is not modified.
    """
    ordered = combination.sort_values(by='predictions', ascending=False)
    n_rows = len(ordered)
    return ordered.assign(predicted_Rk=list(range(1, n_rows + 1)))
add_ranks(combination)

Unnamed: 0,Player,poy_winner,season,predictions,predicted_Rk
2820,Cole Palmer,0,2023/2024,0.076016,1
4981,Erling Haaland,0,2023/2024,0.066244,2
12614,Ollie Watkins,0,2023/2024,0.061845,3
15291,Son Heung-Min,0,2023/2024,0.058561,4
13417,Phil Foden,1,2023/2024,0.056579,5
...,...,...,...,...,...
17177,William Osula,0,2023/2024,-0.015407,576
7548,Jeremy Doku,0,2023/2024,-0.016055,577
7791,Joe Gomez,0,2023/2024,-0.016288,578
1311,Antony,0,2023/2024,-0.016407,579


With the `add_ranks` function you can call up the model's projected player ranking for any backtested season by changing the index into `all_predictions` (for example, index 13 is the 2010/2011 season)

In [97]:
add_ranks(all_predictions[13])

Unnamed: 0,Player,poy_winner,season,predictions,predicted_Rk
2256,Carlos Tevez,0,2010/2011,0.050099,1
4384,Dimitar Berbatov,0,2010/2011,0.047913,2
14212,Robin Van Persie,0,2010/2011,0.046518,3
13335,Peter Odemwingie,0,2010/2011,0.039914,4
4433,Dirk Kuyt,0,2010/2011,0.032765,5
...,...,...,...,...,...
17229,Wilson Palacios,0,2010/2011,-0.009687,559
13179,Paulo Ferreira,0,2010/2011,-0.010221,560
15336,Steed Malbranque,0,2010/2011,-0.010648,561
12439,Nile Ranger,0,2010/2011,-0.011127,562
