In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the player-season table used for modelling; the path is relative to the
# notebook's working directory.
ml_df = pd.read_csv("output_files/correctly_grouped_data.csv")

In [4]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV - confirm).
# `errors="ignore"` keeps this cell re-runnable: the original `del` raised
# KeyError the second time it was executed on the same kernel.
ml_df = ml_df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Display the loaded frame to eyeball its columns and rows.
ml_df

In [7]:
# Keep an independent copy of the frame; `duplicate_df` is not referenced again
# in the cells visible here.
duplicate_df = ml_df.copy()

In [6]:
# Show only the rows flagged as Player of the Year winners (poy_winner == 1).
ml_df.loc[ml_df["poy_winner"] == 1]

Unnamed: 0,Player,Pos,Squad,Age,MP,Starts,Min,90s,Gls,Ast,...,year,season,xG,npxG,xAG,npxG+xAG,PrgC,PrgP,PrgR,poy_winner
412,Alan Shearer,FW,Blackburn,23,42,42,3770,41.9,34,13,...,1994,1994/1995,0.0,0.0,0.0,0.0,0,0,0,1
3026,Cristiano Ronaldo,"FW,MF",Manchester Utd,22,34,31,2747,30.5,31,6,...,2007,2007/2008,0.0,0.0,0.0,0.0,0,0,0,1
3027,Cristiano Ronaldo,"FW,MF",Manchester Utd,23,33,31,2742,30.5,18,6,...,2008,2008/2009,0.0,0.0,0.0,0.0,0,0,0,1
4589,Dwight Yorke,"FW,MF",Manchester Utd,26,32,32,2781,30.9,18,11,...,1998,1998/1999,1.0,0.0,0.0,0.0,0,0,0,1
4648,Eden Hazard,"FW,MF",Chelsea,23,38,38,3367,37.4,14,9,...,2014,2014/2015,0.0,0.0,0.0,0.0,0,0,0,1
4923,Eric Cantona,FW,Manchester Utd,27,34,34,2987,33.2,18,12,...,1993,1993/1994,0.0,0.0,0.0,0.0,0,0,0,1
4980,Erling Haaland,FW,Manchester City,22,35,33,2769,30.8,36,8,...,2022,2022/2023,28.4,23.0,5.3,28.3,35,58,151,1
5273,Frank Lampard,MF,Chelsea,26,38,38,3413,37.9,13,18,...,2004,2004/2005,0.0,0.0,0.0,0.0,0,0,0,1
5343,Freddie Ljungberg,"FW,MF",Arsenal,24,25,24,1904,21.2,12,4,...,2001,2001/2002,0.0,0.0,0.0,0.0,0,0,0,1
5452,Gareth Bale,"FW,MF",Tottenham,23,33,33,2921,32.5,21,4,...,2012,2012/2013,0.0,0.0,0.0,0.0,0,0,0,1


Step 1: Rank each season's players with a ridge regression fitted on the columns listed in `predictors`; the top-ranked player is the model's predicted Player of the Season.

In [8]:
# Feature columns passed to the regression model.
# NOTE(review): the winners table shows xG / npxG / xAG / Prg* as 0 for most
# older seasons - presumably missing data stored as zero; confirm before
# relying on these features.
predictors = [
    # player age and playing time
    'Age',
    'MP',
    'Starts',
    'Min',
    '90s',
    # goals and assists
    'Gls',
    'Ast',
    'G+A',
    # expected-goal style metrics
    'xG',
    'npxG',
    'xAG',
    'npxG+xAG',
    # progression metrics
    'PrgC',
    'PrgP',
    'PrgR',
]
# Card counts ('CrdY', 'CrdR') are currently left out of the feature list.

In [9]:
# Training rows: every season that sorts before 2022/2023. The comparison is a
# plain string comparison, which matches chronological order only while every
# season label has the same "YYYY/YYYY" layout.
train = ml_df.loc[ml_df["season"] < '2022/2023']

In [10]:
# Hold-out rows: the single season we want to rank.
test = ml_df.loc[ml_df["season"] == '2022/2023']

In [11]:
# Ridge regression with regularisation strength alpha = 0.1.
reg = Ridge(alpha=0.1)

In [12]:
# Fit the model on the training seasons; the target is the 0/1 winner flag.
reg.fit(X=train[predictors], y=train["poy_winner"])

In [13]:
# Score every row of the hold-out season with the fitted model.
predictions = reg.predict(test[predictors])

In [14]:
# Wrap the raw scores in a one-column frame that shares the hold-out rows'
# index, so it can be lined up with `test` column-wise later on.
predictions = pd.DataFrame(
    data=predictions,
    index=test.index,
    columns=["predictions"],
)

In [15]:
# Display the per-row prediction scores.
predictions

Unnamed: 0,predictions
11,0.012060
14,-0.010417
65,-0.002505
85,-0.006739
100,0.010923
...,...
17389,-0.007258
17448,0.003660
17466,0.002716
17509,0.012374


In [51]:
# Put the identifying columns of the hold-out rows next to their scores;
# the two pieces are aligned on the shared row index.
combination = pd.concat(
    [
        test[["Player", "poy_winner", "season"]],
        predictions,
    ],
    axis=1,
)

In [52]:
# Display players, the actual winner flag, season and predicted score together.
combination

Unnamed: 0,Player,poy_winner,season,predictions
12,Aaron Cresswell,0,2023/2024,-0.003576
15,Aaron Hickey,0,2023/2024,-0.003003
66,Aaron Ramsdale,0,2023/2024,0.000396
78,Aaron Ramsey,0,2023/2024,-0.004437
86,Aaron Wan-Bissaka,0,2023/2024,-0.001545
...,...,...,...,...
17392,Zack Nelson,0,2023/2024,0.000290
17411,Zeki Amdouni,0,2023/2024,-0.005828
17449,Álex Moreno,0,2023/2024,-0.004683
17502,Đorđe Petrović,0,2023/2024,-0.001432


In [53]:
# Preview the ranking: highest predicted score first (does not modify `combination`).
combination.sort_values(by="predictions", ascending=False)

Unnamed: 0,Player,poy_winner,season,predictions
2820,Cole Palmer,0,2023/2024,0.076016
4981,Erling Haaland,0,2023/2024,0.066244
12614,Ollie Watkins,0,2023/2024,0.061845
15291,Son Heung-Min,0,2023/2024,0.058561
13417,Phil Foden,1,2023/2024,0.056579
...,...,...,...,...
17177,William Osula,0,2023/2024,-0.015407
7548,Jeremy Doku,0,2023/2024,-0.016055
7791,Joe Gomez,0,2023/2024,-0.016288
1311,Antony,0,2023/2024,-0.016407


In [54]:
# Order players from highest to lowest predicted score, then number them
# 1..N in that order to obtain the predicted rank.
combination = combination.sort_values(by='predictions', ascending=False)
rank_count = len(combination)
combination['Predicted_Rk'] = list(range(1, rank_count + 1))

In [55]:
# Display the frame with the new Predicted_Rk column.
combination

Unnamed: 0,Player,poy_winner,season,predictions,Predicted_Rk
2820,Cole Palmer,0,2023/2024,0.076016,1
4981,Erling Haaland,0,2023/2024,0.066244,2
12614,Ollie Watkins,0,2023/2024,0.061845,3
15291,Son Heung-Min,0,2023/2024,0.058561,4
13417,Phil Foden,1,2023/2024,0.056579,5
...,...,...,...,...,...
17177,William Osula,0,2023/2024,-0.015407,576
7548,Jeremy Doku,0,2023/2024,-0.016055,577
7791,Joe Gomez,0,2023/2024,-0.016288,578
1311,Antony,0,2023/2024,-0.016407,579


In [56]:
# View the rows ordered by predicted rank. Predicted_Rk was assigned 1..N after
# sorting by score, so when the cells run in order this shows the same
# ordering as the frame already has.
combination.sort_values('Predicted_Rk', ascending=True)

Unnamed: 0,Player,poy_winner,season,predictions,Predicted_Rk
2820,Cole Palmer,0,2023/2024,0.076016,1
4981,Erling Haaland,0,2023/2024,0.066244,2
12614,Ollie Watkins,0,2023/2024,0.061845,3
15291,Son Heung-Min,0,2023/2024,0.058561,4
13417,Phil Foden,1,2023/2024,0.056579,5
...,...,...,...,...,...
17177,William Osula,0,2023/2024,-0.015407,576
7548,Jeremy Doku,0,2023/2024,-0.016055,577
7791,Joe Gomez,0,2023/2024,-0.016288,578
1311,Antony,0,2023/2024,-0.016407,579


In [57]:
# Keep a score-ordered copy of the ranking under its own name.
predicted = combination.sort_values(by='predictions', ascending=False)

In [58]:
# Display the score-ordered ranking.
predicted

Unnamed: 0,Player,poy_winner,season,predictions,Predicted_Rk
2820,Cole Palmer,0,2023/2024,0.076016,1
4981,Erling Haaland,0,2023/2024,0.066244,2
12614,Ollie Watkins,0,2023/2024,0.061845,3
15291,Son Heung-Min,0,2023/2024,0.058561,4
13417,Phil Foden,1,2023/2024,0.056579,5
...,...,...,...,...,...
17177,William Osula,0,2023/2024,-0.015407,576
7548,Jeremy Doku,0,2023/2024,-0.016055,577
7791,Joe Gomez,0,2023/2024,-0.016288,578
1311,Antony,0,2023/2024,-0.016407,579


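Based on overall statistics, number of matches played and age, the model ranked Bruno Fernandes as the most likely Player of the Season for 2022/2023. Note: the tables displayed above show 2023/2024 rows (Cole Palmer ranked first, the actual winner Phil Foden fifth), so they appear to come from a later re-run and do not match this statement; re-run the cells in order to confirm.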

Step 2: Measure the prediction error using mean squared error

In [59]:
# Mean squared error between the 0/1 winner flag and the predicted score.
# NOTE(review): the displayed season has a single winner among several hundred
# rows, so a small MSE says little about whether the winner was ranked first;
# consider also reporting the winner's predicted rank.
mean_squared_error(
    y_true=combination['poy_winner'],
    y_pred=combination['predictions'],
)

0.001675953377521606

In [60]:
# Bring the actual winner (poy_winner == 1) to the top to see where the model
# ranked them; only the first ten rows are shown.
combination.sort_values(by="poy_winner", ascending=False).head(10)

Unnamed: 0,Player,poy_winner,season,predictions,Predicted_Rk
13417,Phil Foden,1,2023/2024,0.056579,5
2820,Cole Palmer,0,2023/2024,0.076016,1
205,Adam Smith,0,2023/2024,-0.002223,391
5018,Ezri Konsa,0,2023/2024,-0.00205,384
1348,Armando Broja,0,2023/2024,-0.002052,385
9818,Lewis Dobbin,0,2023/2024,-0.002062,386
16943,Vitaliy Mykolenko,0,2023/2024,-0.002096,387
699,Alisson,0,2023/2024,-0.002125,388
9527,Leander Dendoncker,0,2023/2024,-0.002181,389
3698,David Brooks,0,2023/2024,-0.002195,390


Step 3: Loop over the seasons to generate predictions for each one (backtesting), so that the predicted ranking can be inspected for any chosen season.

In [61]:
# Season labels in "YYYY/YYYY+1" form, one per starting year from
# start_year through end_year inclusive.
start_year = 1992
end_year = 2023
first_years = range(start_year, end_year + 1)
seasons = [f"{year}/{year+1}" for year in first_years]

In [62]:
# Spot-check one generated label (position 21 in the list).
seasons[21]

'2013/2014'

In [65]:
# Backtest: for each target season, refit on all seasons that sort before it
# and score that season's players. The first five labels in `seasons` are
# skipped as targets, so every fit has earlier seasons to learn from.
#
# NOTE: the loop rebinds the notebook-level names `train`, `test`,
# `predictions` and `combination`; after it finishes they hold the values
# for the last season processed.
all_predictions = []
for season in seasons[5:]:
    train = ml_df.loc[ml_df["season"] < season]
    test = ml_df.loc[ml_df["season"] == season]

    reg.fit(train[predictors], train["poy_winner"])

    predictions = pd.DataFrame(
        data=reg.predict(test[predictors]),
        index=test.index,
        columns=["predictions"],
    )
    combination = pd.concat(
        [test[["Player", "poy_winner", "season"]], predictions],
        axis=1,
    )
    all_predictions.append(combination)

In [66]:
def add_ranks(combination):
    """Return a copy of ``combination`` ordered by score with a rank column.

    The rows are sorted by the ``predictions`` column, highest first, and a
    ``predicted_Rk`` column numbering them 1..N in that order is added.
    Rows with equal scores still receive distinct consecutive ranks.
    The frame passed in is not modified.
    """
    ordered = combination.sort_values(by='predictions', ascending=False)
    positions = list(range(1, len(ordered) + 1))
    return ordered.assign(predicted_Rk=positions)
# Rank whatever frame `combination` currently holds; when the cells run in
# order this is the last season processed by the backtest loop.
add_ranks(combination)

Unnamed: 0,Player,poy_winner,season,predictions,predicted_Rk
2820,Cole Palmer,0,2023/2024,0.076016,1
4981,Erling Haaland,0,2023/2024,0.066244,2
12614,Ollie Watkins,0,2023/2024,0.061845,3
15291,Son Heung-Min,0,2023/2024,0.058561,4
13417,Phil Foden,1,2023/2024,0.056579,5
...,...,...,...,...,...
17177,William Osula,0,2023/2024,-0.015407,576
7548,Jeremy Doku,0,2023/2024,-0.016055,577
7791,Joe Gomez,0,2023/2024,-0.016288,578
1311,Antony,0,2023/2024,-0.016407,579


In [68]:
# all_predictions is positional: entry i belongs to seasons[5 + i], so entry 13
# is seasons[18], i.e. the 2010/2011 season.
add_ranks(all_predictions[13])

Unnamed: 0,Player,poy_winner,season,predictions,predicted_Rk
2256,Carlos Tevez,0,2010/2011,0.050099,1
4384,Dimitar Berbatov,0,2010/2011,0.047913,2
14212,Robin Van Persie,0,2010/2011,0.046518,3
13335,Peter Odemwingie,0,2010/2011,0.039914,4
4433,Dirk Kuyt,0,2010/2011,0.032765,5
...,...,...,...,...,...
17229,Wilson Palacios,0,2010/2011,-0.009687,559
13179,Paulo Ferreira,0,2010/2011,-0.010221,560
15336,Steed Malbranque,0,2010/2011,-0.010648,561
12439,Nile Ranger,0,2010/2011,-0.011127,562
