In [1]:
import warnings
import itertools

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

# Looking at our Dataset

In [2]:
df = pd.read_excel('data/final_dataset.xlsx')

In [3]:
df.head(10)

Unnamed: 0,Player,Club,Age,Position,Nation,Value,Contract Years Left,League,Squad (20/21),MP (20/21),...,Offsides (17/18),Crosses (17/18),Interceptions (17/18),Penalty Kicks Won (17/18),Penalties Conceded (17/18),Own Goals (17/18),Total Loose Balls Recovered (17/18),Aerial Duel Won (17/18),Aerial Duel Lost (17/18),% Aerial Duels Won (17/18)
0,Kylian Mbappe,Paris Saint-Germain,22,attack,France,144000000,1,Ligue 1,Paris S-G,31.0,...,23.0,62.0,1.0,1.0,0.0,0.0,98.0,1.0,4.0,20.0
1,Erling Haaland,Borussia Dortmund,21,attack,Norway,117000000,3,Bundesliga,Dortmund,28.0,...,,,,,,,,,,
2,Harry Kane,Tottenham Hotspur,28,attack,England,108000000,3,Premier League,Tottenham,35.0,...,43.0,24.0,7.0,1.0,0.0,0.0,124.0,69.0,111.0,38.3
3,Jadon Sancho,Manchester United,21,attack,England,90000000,5,Premier League,Dortmund,26.0,...,1.0,15.0,6.0,0.0,0.0,0.0,57.0,3.0,14.0,17.6
4,Mohamed Salah,Liverpool FC,29,attack,Egypt,90000000,2,Premier League,Liverpool,37.0,...,18.0,50.0,13.0,1.0,0.0,0.0,219.0,19.0,58.0,24.7
5,Neymar,Paris Saint-Germain,29,attack,Brazil,90000000,4,Ligue 1,Paris S-G,18.0,...,10.0,58.0,3.0,2.0,0.0,0.0,110.0,2.0,2.0,50.0
6,Kevin De Bruyne,Manchester City,30,midfield,Belgium,90000000,4,Premier League,Manchester City,25.0,...,2.0,146.0,42.0,1.0,0.0,0.0,278.0,14.0,18.0,43.8
7,Romelu Lukaku,Chelsea FC,28,attack,Belgium,90000000,5,Premier League,Inter,36.0,...,19.0,37.0,6.0,0.0,0.0,0.0,101.0,127.0,122.0,51.0
8,Joshua Kimmich,Bayern Munich,26,midfield,Germany,81000000,4,Bundesliga,Bayern Munich,27.0,...,2.0,107.0,17.0,1.0,1.0,0.0,253.0,16.0,18.0,47.1
9,Bruno Fernandes,Manchester United,26,midfield,Portugal,81000000,4,Premier League,Manchester Utd,37.0,...,,,,,,,,,,


# Baseline Linear Regression Model

In [4]:
def baseline_linear_regression(df):

    attack = df[df['Position']=='attack']
    midfield = df[df['Position']=='midfield']
    defence = df[df['Position']=='Defender']

    positions =  [attack,midfield,defence]

    scores_train = []
    #scores_train_std = []
    scores_train_max = []
    scores_train_min = []
    scores_test = []
    #scores_test_std = []
    scores_test_max = []
    scores_test_min = []

    for position in positions:

        top_features = [a for a in position.corr()['Value'].sort_values(ascending=False)[:6].keys()]

        #Using top features identified earlier
        model_df = position[top_features]
        model_df = model_df.dropna()

        X = model_df.drop('Value',axis=1)
        y = model_df['Value']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

        ss= StandardScaler()
        lr = LinearRegression()

        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)

        lr.fit(X_train_scaled,y_train);

        cross_val_train = cross_val_score(lr, X_train_scaled, y_train, scoring="neg_root_mean_squared_error",cv=10)
        cross_val_test = cross_val_score(lr, X_test_scaled, y_test, scoring="neg_root_mean_squared_error",cv=10)
        
        #Appending min to Max list and max to Min list because they are taken as negative values
        scores_train.append(-(cross_val_train.mean()))
        #scores_train_std.append(cross_val_train.std())
        scores_train_min.append(-(cross_val_train).max())
        scores_train_max.append(-(cross_val_train).min())
        
        scores_test.append(-(cross_val_test.mean()))
        #scores_test_std.append(cross_val_test.std())
        scores_test_min.append(-(cross_val_test).max())
        scores_test_max.append(-(cross_val_test).min())
        
    
    print("Attackers Train:")
    print(f'Attackers Train Mean RMSE = ${round(scores_train[0],2)}')
    #print(f'Attackers Train RMSE Std = ${round(scores_train_std[0],2)}')
    print(f'Attackers Train Max RMSE = ${round(scores_train_max[0],2)}')
    print(f'Attackers Train Min RMSE = ${round(scores_train_min[0],2)}')
    print("")
    print("Attackers Test:")
    print(f'Attackers Test Mean RMSE = ${round(scores_test[0],2)}')
    #print(f'Attackers Test RMSE Std = ${round(scores_test_std[0],2)}')
    print(f'Attackers Test Max RMSE = ${round(scores_test_max[0],2)}')
    print(f'Attackers Test Min RMSE = ${round(scores_test_min[0],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Midfielders Train:")
    print(f'Midfielders Train Mean RMSE = ${round(scores_train[1],2)}')
    #print(f'Midfielders Train RMSE Std = ${round(scores_train_std[1],2)}')
    print(f'Midfielders Train Max RMSE = ${round(scores_train_max[1],2)}')
    print(f'Midfielders Train Min RMSE = ${round(scores_train_min[1],2)}')
    print("")
    print("Midfielders Test:")
    print(f'Midfielders Test Mean RMSE = ${round(scores_test[1],2)}')
    #print(f'Midfielders Test RMSE Std = ${round(scores_test_std[1],2)}')
    print(f'Midfielders Test Max RMSE = ${round(scores_test_max[1],2)}')
    print(f'Midfielders Test Min RMSE = ${round(scores_test_min[1],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Defenders Train:")
    print(f'Defenders Train Mean RMSE = ${round(scores_train[2],2)}')
   # print(f'Defenders Train RMSE Std = ${round(scores_train_std[2],2)}')
    print(f'Defenders Train Max RMSE = ${round(scores_train_max[2],2)}')
    print(f'Defenders Train Min RMSE = ${round(scores_train_min[2],2)}')
    print("")
    print("Defenders Test:")
    print(f'Defenders Test Mean RMSE = ${round(scores_test[2],2)}')
    #print(f'Defenders Test RMSE Std = ${round(scores_test_std[2],2)}')
    print(f'Defenders Test Max RMSE = ${round(scores_test_max[2],2)}')
    print(f'Defenders Test Min RMSE = ${round(scores_test_min[2],2)}')

In [5]:
baseline_linear_regression(df)

Attackers Train:
Attackers Train Mean RMSE = $13513486.28
Attackers Train Max RMSE = $17751596.8
Attackers Train Min RMSE = $9069237.13

Attackers Test:
Attackers Test Mean RMSE = $11394265.66
Attackers Test Max RMSE = $17642165.76
Attackers Test Min RMSE = $7576161.37
----------------------------------------
----------------------------------------
Midfielders Train:
Midfielders Train Mean RMSE = $12256730.25
Midfielders Train Max RMSE = $14555602.84
Midfielders Train Min RMSE = $10184552.77

Midfielders Test:
Midfielders Test Mean RMSE = $12201753.92
Midfielders Test Max RMSE = $20493903.76
Midfielders Test Min RMSE = $6802634.06
----------------------------------------
----------------------------------------
Defenders Train:
Defenders Train Mean RMSE = $9423928.21
Defenders Train Max RMSE = $10917174.33
Defenders Train Min RMSE = $7578970.85

Defenders Test:
Defenders Test Mean RMSE = $12006190.48
Defenders Test Max RMSE = $17813898.61
Defenders Test Min RMSE = $6457786.97


# Lasso Regression Model

In [6]:
def lasso_regression(df):

    attack = df[df['Position']=='attack']
    midfield = df[df['Position']=='midfield']
    defence = df[df['Position']=='Defender']

    positions =  [attack,midfield,defence]

    scores_train = []
    #scores_train_std = []
    scores_train_max = []
    scores_train_min = []
    scores_test = []
    #scores_test_std = []
    scores_test_max = []
    scores_test_min = []

    for position in positions:
        
        warnings.filterwarnings('ignore')
        
        top_features = [a for a in position.corr()['Value'].sort_values(ascending=False)[:11].keys()]

        #Using top features identified earlier
        model_df = position[top_features]
        model_df = model_df.dropna()

        X = model_df.drop('Value',axis=1)
        y = model_df['Value']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

        ss= StandardScaler()
        l1 = Lasso()

        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)

        l1.fit(X_train_scaled,y_train);

        cross_val_train = cross_val_score(l1, X_train_scaled, y_train, scoring="neg_root_mean_squared_error",cv=10)
        cross_val_test = cross_val_score(l1, X_test_scaled, y_test, scoring="neg_root_mean_squared_error",cv=10)
        
        #Appending min to Max list and max to Min list because they are taken as negative values
        scores_train.append(-(cross_val_train.mean()))
        #scores_train_std.append(cross_val_train.std())
        scores_train_min.append(-(cross_val_train).max())
        scores_train_max.append(-(cross_val_train).min())
        
        scores_test.append(-(cross_val_test.mean()))
        #scores_test_std.append(cross_val_test.std())
        scores_test_min.append(-(cross_val_test).max())
        scores_test_max.append(-(cross_val_test).min())
        
    
    print("Attackers Train:")
    print(f'Attackers Train Mean RMSE = ${round(scores_train[0],2)}')
    #print(f'Attackers Train RMSE Std = ${round(scores_train_std[0],2)}')
    print(f'Attackers Train Max RMSE = ${round(scores_train_max[0],2)}')
    print(f'Attackers Train Min RMSE = ${round(scores_train_min[0],2)}')
    print("")
    print("Attackers Test:")
    print(f'Attackers Test Mean RMSE = ${round(scores_test[0],2)}')
    #print(f'Attackers Test RMSE Std = ${round(scores_test_std[0],2)}')
    print(f'Attackers Test Max RMSE = ${round(scores_test_max[0],2)}')
    print(f'Attackers Test Min RMSE = ${round(scores_test_min[0],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Midfielders Train:")
    print(f'Midfielders Train Mean RMSE = ${round(scores_train[1],2)}')
    #print(f'Midfielders Train RMSE Std = ${round(scores_train_std[1],2)}')
    print(f'Midfielders Train Max RMSE = ${round(scores_train_max[1],2)}')
    print(f'Midfielders Train Min RMSE = ${round(scores_train_min[1],2)}')
    print("")
    print("Midfielders Test:")
    print(f'Midfielders Test Mean RMSE = ${round(scores_test[1],2)}')
    #print(f'Midfielders Test RMSE Std = ${round(scores_test_std[1],2)}')
    print(f'Midfielders Test Max RMSE = ${round(scores_test_max[1],2)}')
    print(f'Midfielders Test Min RMSE = ${round(scores_test_min[1],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Defenders Train:")
    print(f'Defenders Train Mean RMSE = ${round(scores_train[2],2)}')
   # print(f'Defenders Train RMSE Std = ${round(scores_train_std[2],2)}')
    print(f'Defenders Train Max RMSE = ${round(scores_train_max[2],2)}')
    print(f'Defenders Train Min RMSE = ${round(scores_train_min[2],2)}')
    print("")
    print("Defenders Test:")
    print(f'Defenders Test Mean RMSE = ${round(scores_test[2],2)}')
    #print(f'Defenders Test RMSE Std = ${round(scores_test_std[2],2)}')
    print(f'Defenders Test Max RMSE = ${round(scores_test_max[2],2)}')
    print(f'Defenders Test Min RMSE = ${round(scores_test_min[2],2)}')

In [7]:
lasso_regression(df)

Attackers Train:
Attackers Train Mean RMSE = $12613040.93
Attackers Train Max RMSE = $17452297.03
Attackers Train Min RMSE = $8349163.29

Attackers Test:
Attackers Test Mean RMSE = $15034054.81
Attackers Test Max RMSE = $23310322.05
Attackers Test Min RMSE = $7754340.68
----------------------------------------
----------------------------------------
Midfielders Train:
Midfielders Train Mean RMSE = $12255933.79
Midfielders Train Max RMSE = $15750909.48
Midfielders Train Min RMSE = $9775987.12

Midfielders Test:
Midfielders Test Mean RMSE = $12271809.89
Midfielders Test Max RMSE = $21362422.31
Midfielders Test Min RMSE = $5420799.89
----------------------------------------
----------------------------------------
Defenders Train:
Defenders Train Mean RMSE = $10001612.23
Defenders Train Max RMSE = $11911000.2
Defenders Train Min RMSE = $6198900.99

Defenders Test:
Defenders Test Mean RMSE = $9876425.4
Defenders Test Max RMSE = $15956340.51
Defenders Test Min RMSE = $4244632.88


# Ridge Regression Model

In [8]:
def ridge_regression(df):

    attack = df[df['Position']=='attack']
    midfield = df[df['Position']=='midfield']
    defence = df[df['Position']=='Defender']

    positions =  [attack,midfield,defence]

    scores_train = []
    #scores_train_std = []
    scores_train_max = []
    scores_train_min = []
    scores_test = []
    #scores_test_std = []
    scores_test_max = []
    scores_test_min = []

    for position in positions:
        
        warnings.filterwarnings('ignore')

        top_features = [a for a in position.corr()['Value'].sort_values(ascending=False)[:11].keys()]

        #Using top features identified earlier
        model_df = position[top_features]
        model_df = model_df.dropna()

        X = model_df.drop('Value',axis=1)
        y = model_df['Value']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

        ss= StandardScaler()
        l2 = Ridge()

        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)

        l2.fit(X_train_scaled,y_train);

        cross_val_train = cross_val_score(l2, X_train_scaled, y_train, scoring="neg_root_mean_squared_error",cv=10)
        cross_val_test = cross_val_score(l2, X_test_scaled, y_test, scoring="neg_root_mean_squared_error",cv=10)
        
        #Appending min to Max list and max to Min list because they are taken as negative values
        scores_train.append(-(cross_val_train.mean()))
        #scores_train_std.append(cross_val_train.std())
        scores_train_min.append(-(cross_val_train).max())
        scores_train_max.append(-(cross_val_train).min())
        
        scores_test.append(-(cross_val_test.mean()))
        #scores_test_std.append(cross_val_test.std())
        scores_test_min.append(-(cross_val_test).max())
        scores_test_max.append(-(cross_val_test).min())
        
    
    print("Attackers Train:")
    print(f'Attackers Train Mean RMSE = ${round(scores_train[0],2)}')
    #print(f'Attackers Train RMSE Std = ${round(scores_train_std[0],2)}')
    print(f'Attackers Train Max RMSE = ${round(scores_train_max[0],2)}')
    print(f'Attackers Train Min RMSE = ${round(scores_train_min[0],2)}')
    print("")
    print("Attackers Test:")
    print(f'Attackers Test Mean RMSE = ${round(scores_test[0],2)}')
    #print(f'Attackers Test RMSE Std = ${round(scores_test_std[0],2)}')
    print(f'Attackers Test Max RMSE = ${round(scores_test_max[0],2)}')
    print(f'Attackers Test Min RMSE = ${round(scores_test_min[0],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Midfielders Train:")
    print(f'Midfielders Train Mean RMSE = ${round(scores_train[1],2)}')
    #print(f'Midfielders Train RMSE Std = ${round(scores_train_std[1],2)}')
    print(f'Midfielders Train Max RMSE = ${round(scores_train_max[1],2)}')
    print(f'Midfielders Train Min RMSE = ${round(scores_train_min[1],2)}')
    print("")
    print("Midfielders Test:")
    print(f'Midfielders Test Mean RMSE = ${round(scores_test[1],2)}')
    #print(f'Midfielders Test RMSE Std = ${round(scores_test_std[1],2)}')
    print(f'Midfielders Test Max RMSE = ${round(scores_test_max[1],2)}')
    print(f'Midfielders Test Min RMSE = ${round(scores_test_min[1],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Defenders Train:")
    print(f'Defenders Train Mean RMSE = ${round(scores_train[2],2)}')
   # print(f'Defenders Train RMSE Std = ${round(scores_train_std[2],2)}')
    print(f'Defenders Train Max RMSE = ${round(scores_train_max[2],2)}')
    print(f'Defenders Train Min RMSE = ${round(scores_train_min[2],2)}')
    print("")
    print("Defenders Test:")
    print(f'Defenders Test Mean RMSE = ${round(scores_test[2],2)}')
    #print(f'Defenders Test RMSE Std = ${round(scores_test_std[2],2)}')
    print(f'Defenders Test Max RMSE = ${round(scores_test_max[2],2)}')
    print(f'Defenders Test Min RMSE = ${round(scores_test_min[2],2)}')

In [9]:
ridge_regression(df)

Attackers Train:
Attackers Train Mean RMSE = $12408173.11
Attackers Train Max RMSE = $15031314.82
Attackers Train Min RMSE = $8921770.62

Attackers Test:
Attackers Test Mean RMSE = $14988878.12
Attackers Test Max RMSE = $33119113.09
Attackers Test Min RMSE = $7040839.37
----------------------------------------
----------------------------------------
Midfielders Train:
Midfielders Train Mean RMSE = $12058328.23
Midfielders Train Max RMSE = $15999176.9
Midfielders Train Min RMSE = $8267813.73

Midfielders Test:
Midfielders Test Mean RMSE = $13214668.2
Midfielders Test Max RMSE = $23130367.51
Midfielders Test Min RMSE = $6471949.92
----------------------------------------
----------------------------------------
Defenders Train:
Defenders Train Mean RMSE = $9888204.92
Defenders Train Max RMSE = $12051378.6
Defenders Train Min RMSE = $8067575.09

Defenders Test:
Defenders Test Mean RMSE = $10419642.68
Defenders Test Max RMSE = $16761320.26
Defenders Test Min RMSE = $5110081.27
