In [1]:
# Csv and Array manuplation libraries
import numpy as np
import pandas as pd

# Visual Libraries
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns 
sns.set_theme()

# Sklearn 
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.model_selection as ms
import sklearn.linear_model as lm
import sklearn.neighbors as ng
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.ensemble as en

# Data loading and parsing

In [2]:
# Load the data
data = pd.read_csv("../input/epl-football-dataset/football.csv")

# Handle the single missing value in the data with most probable value 
data["region"] = data["region"].fillna(data["region"].value_counts().index[0])

# Printer
print(data.shape)
data.head()

(461, 17)


Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3.0,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2.0,Germany,0,4,1,1,0
2,Petr Cech,Arsenal,35,GK,4,7.0,1529,5.5,5.90%,134,2.0,Czech Republic,0,6,1,1,0
3,Theo Walcott,Arsenal,28,RW,1,20.0,2393,7.5,1.50%,122,1.0,England,0,4,1,1,0
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2.0,France,0,4,1,1,0


# Data Preprocessing

In [3]:
# Necessary Cols 
drop_cols = ["name", "club_id", "market_value", "nationality"]
target_col = "market_value"

# Drop the unnecessary cols
target = data[target_col]
data = data.drop(drop_cols, axis=1)

In [4]:
def extract_percent(val):
    return float(val[:-1])


def pre_process_original(data):
    """
    Preprocesss the data given the columns and the data
    """
    for col in data.columns:
        if col in ["age", "page_views", "fpl_val", "fpl_points"]:
            # Perform standardization
            mean = np.mean(data[col])
            std  = np.std(data[col])
            data[col] = (data[col] - mean) / std
            print(mean, std, col)
            
        if col in ["club", "position", "position_cat", "region", "age_cat"]:
            # Get the dummies and concat them
            dummies = pd.get_dummies(data[col], prefix=col)
            data = pd.concat((data, dummies), axis=1)

            # Drop the data 
            data = data.drop(col, axis=1)
            
        if col == "fpl_sel":
            # Extract the feature
            data[col] = data[col].apply(extract_percent)
            mean = np.mean(data[col])
            std  = np.std(data[col])
            data[col] = (data[col] - mean) / std
            print(mean, std, col)
         
    return data

def pre_process_tree(data):
    """
    Preprocesss the data given the columns and the data
    """
    for col in data.columns:
        if col in ["age", "page_views", "fpl_val", "fpl_points"]:
            # Perform standardization
            mean = np.mean(data[col])
            std  = np.std(data[col])
            data[col] = (data[col] - mean) / std
            print(mean, std, col)
            
        if col == "fpl_sel":
            # Extract the feature
            data[col] = data[col].apply(extract_percent)
            mean = np.mean(data[col])
            std  = np.std(data[col])
            data[col] = (data[col] - mean) / std
            print(mean, std, col)
        
        # Ordinal Encoded columns
        ord_cols = ["club", "position", "position_cat", "region", "age_cat"]
        encoder = OrdinalEncoder()
        data[ord_cols] = encoder.fit_transform(data[ord_cols])
         
    return data

In [5]:
data_normal = pre_process_original(data=data.copy())
print(data_normal.shape)
data_normal.head(2)

26.80477223427332 3.957592308299133 age
763.7765726681127 930.7945734332158 page_views
3.2442516268980475 5.6694977276052985 fpl_sel
57.31453362255965 53.05617275805522 fpl_points
(461, 55)


Unnamed: 0,age,page_views,fpl_value,fpl_sel,fpl_points,new_foreign,big_club,new_signing,club_Arsenal,club_Bournemouth,...,region_1.0,region_2.0,region_3.0,region_4.0,age_cat_1,age_cat_2,age_cat_3,age_cat_4,age_cat_5,age_cat_6
0,0.302009,3.830301,12.0,2.443911,3.895597,0,1,0,1,0,...,0,0,1,0,0,0,0,1,0,0
1,0.302009,3.901208,9.5,0.415513,2.067346,0,1,0,1,0,...,0,1,0,0,0,0,0,1,0,0


In [6]:
data_ordinal = pre_process_tree(data=data.copy())
print(data_ordinal.shape)
data_ordinal.head(2)

26.80477223427332 3.957592308299133 age
763.7765726681127 930.7945734332158 page_views
3.2442516268980475 5.6694977276052985 fpl_sel
57.31453362255965 53.05617275805522 fpl_points
(461, 13)


Unnamed: 0,club,age,position,position_cat,page_views,fpl_value,fpl_sel,fpl_points,region,new_foreign,age_cat,big_club,new_signing
0,0.0,0.302009,8.0,0.0,3.830301,12.0,2.443911,3.895597,2.0,0,3.0,1,0
1,0.0,0.302009,0.0,0.0,3.901208,9.5,0.415513,2.067346,1.0,0,3.0,1,0


In [7]:
# One-Hot encoded data
train_dn, val_dn, train_ln, val_ln = train_test_split(data_normal.values,
                                                      target.values,
                                                      test_size=0.2,
                                                      random_state=7)

# Ordinal encoded data
train_do, val_do, train_lo, val_lo = train_test_split(data_ordinal.values,
                                                      target.values,
                                                      test_size=0.2,
                                                      random_state=7)

# Modelling and Predictions

In [8]:
class Model_Predictor:
    """
    Predicts the model, prints the scores, and model predictions
    """
    def __init__(self, model_instance, train_data, train_label,
                 val_data, val_label, name="lasso"):
        
        # Init the model and the params to optimize
        self.model_instance = model_instance
        self.train_data = train_data
        self.train_label = train_label
        self.val_data = val_data
        self.val_label = val_label
        self.name = name
    
    def predict(self):
        # Collect the model
        model = self.model_instance()
        model.fit(self.train_data, self.train_label)
        
        # Print the scores
        mse_train = mean_squared_error(y_true=self.train_label, 
                                               y_pred=model.predict(self.train_data))
        mse_val = mean_squared_error(y_true=self.val_label, 
                                             y_pred=model.predict(self.val_data))
        
        r2_train = r2_score(y_true=self.train_label, 
                                    y_pred=model.predict(self.train_data))
        r2_val = r2_score(y_true=self.val_label, 
                                  y_pred=model.predict(self.val_data))
        
        # Print the results
        print("%s mse train ==> %.5f" % (self.name, mse_train))
        print("%s mse val ==> %.5f" % (self.name, mse_val))
        print("%s r2 train ==> %.5f" % (self.name, r2_train))
        print("%s r2 val ==> %.5f" % (self.name, r2_val))
        print("Ouptut :", model.predict(self.val_data)[:50])

In [9]:
linear_reg = Model_Predictor(model_instance=LinearRegression, train_data=train_dn,
                                   train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                   name="Linear Regression")
linear_reg.predict()

Linear Regression mse train ==> 25.18266
Linear Regression mse val ==> 19.69924
Linear Regression r2 train ==> 0.84263
Linear Regression r2 val ==> 0.81068
Ouptut : [22.7109375   3.10546875  7.29882812 -7.36523438  3.03515625 58.4296875
 14.45898438  4.58203125 -0.42578125  9.32226562  7.05273438 16.11914062
 -2.390625    1.2734375   3.68554688  3.81640625  3.38867188  9.43164062
 26.63671875 -2.24609375  1.55273438 -0.28710938 25.00390625  2.5546875
  6.078125    8.4609375   3.15820312 15.7109375   3.60546875 15.078125
  3.87304688 -1.80859375  5.83203125  5.52929688  3.4609375   8.0859375
  7.33203125  3.15820312 30.796875   -0.3515625   5.26171875  7.80078125
  2.75390625  4.58398438  3.61914062  8.390625    5.36328125  7.48632812
 20.56445312 -0.5859375 ]


In [10]:
lasso_reg = Model_Predictor(model_instance=lm.Lasso, train_data=train_dn,
                                  train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                  name="Lasso Regression")
lasso_reg.predict()

Lasso Regression mse train ==> 44.84268
Lasso Regression mse val ==> 30.37217
Lasso Regression r2 train ==> 0.71978
Lasso Regression r2 val ==> 0.70810
Ouptut : [10.18708954  4.14273732 12.15210155  1.71182175  6.50622602 57.79424406
  9.32823723  8.80477536  3.05787345 10.08986898  7.23699168 10.95291619
  7.91867106  1.31739832  6.99682084  7.42257473  6.93250959  5.57184796
 22.65542762  4.38612239  3.11140763  1.1346731  10.46681093  9.34660246
  6.66294953  8.34120752  5.57976372 16.93100061  3.09291764 13.73120719
  7.30131292  3.34741566  3.03938347  9.29644449 10.17416838  8.4713894
  8.36476255  3.92278348 21.02644881  3.43430075  9.0508036  11.64210364
  6.06262883  5.25724185  7.20606673 10.29179815  7.44569034 10.71709699
 13.60755456  7.71661614]


In [11]:
ridge_reg = Model_Predictor(model_instance=lm.Ridge, train_data=train_dn,
                                  train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                  name="Ridge Regression")
ridge_reg.predict()

Ridge Regression mse train ==> 25.39708
Ridge Regression mse val ==> 18.34165
Ridge Regression r2 train ==> 0.84129
Ridge Regression r2 val ==> 0.82372
Ouptut : [ 2.18957488e+01  3.22106444e+00  7.62022857e+00 -6.11708209e+00
  3.04890430e+00  5.84494130e+01  1.40893137e+01  4.41769283e+00
 -2.85858572e-01  9.26575130e+00  7.24677534e+00  1.56086777e+01
 -5.98537985e-01  1.46612962e+00  3.52930467e+00  3.60303106e+00
  3.93376770e+00  8.58882620e+00  2.63519847e+01 -1.70372990e+00
  1.22453317e+00  2.53985692e-01  2.40736566e+01  2.46297162e+00
  6.60090210e+00  8.42768107e+00  2.98307601e+00  1.51854726e+01
  3.81016139e+00  1.47839730e+01  3.84443828e+00 -1.58286721e+00
  5.60692915e+00  6.15634879e+00  3.89602075e+00  8.18794491e+00
  7.31765514e+00  2.83762456e+00  3.05776077e+01 -4.63130809e-02
  5.68888872e+00  7.35860462e+00  2.90195880e+00  4.51222346e+00
  4.07636155e+00  8.26536019e+00  5.38595015e+00  8.66015946e+00
  2.03440089e+01  3.02593360e-01]


In [12]:
knn = Model_Predictor(model_instance=ng.KNeighborsRegressor, train_data=train_dn,
                                  train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                  name="K-Nearest Neighbors")
knn.predict()

K-Nearest Neighbors mse train ==> 28.47428
K-Nearest Neighbors mse val ==> 24.52769
K-Nearest Neighbors r2 train ==> 0.82206
K-Nearest Neighbors r2 val ==> 0.76427
Ouptut : [11.2   2.6   9.2   2.4   3.6  54.    7.    5.4   2.45  5.2   6.2  14.8
  6.7   4.4   4.3   4.6   4.25  2.55 19.    1.5   1.8   3.5   7.9   4.5
  3.9   6.8   3.5  11.1   2.9   9.7   5.6   3.95  3.    7.1   9.    2.
  4.4   7.3  18.2   0.9   0.9   6.    1.9   2.05  5.6   9.2   5.3  10.
 20.4   3.3 ]


In [13]:
svr = Model_Predictor(model_instance=svm.SVR, train_data=train_dn,
                                  train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                  name="Support Vector Regressor")
svr.predict()

Support Vector Regressor mse train ==> 71.76643
Support Vector Regressor mse val ==> 42.67468
Support Vector Regressor r2 train ==> 0.55153
Support Vector Regressor r2 val ==> 0.58987
Ouptut : [ 8.45286114  3.52976116  8.66946795  1.93783708  4.72170191 21.63730298
  7.41622306  6.09498015  1.26633201  7.92866127  6.43000337  8.73291709
  6.10171053  2.49991881  5.05322979  4.76769612  5.20944562  4.73437501
 15.75691108  1.97944631  1.9785635   1.36598592  9.0509113   5.78431248
  6.02825763  6.68970469  3.33816507 13.27777956  2.71662861 12.23168125
  5.89178567  1.8834317   3.17486337  7.34439623  6.95399417  4.88771462
  6.16955524  3.05062875 19.20669373  2.69933169  6.08337114  7.47116436
  4.4691364   3.38361675  5.85177803 10.98041753  5.66356257  9.15348055
 11.87257719  6.08054464]


In [14]:
dt = Model_Predictor(model_instance=tree.DecisionTreeRegressor, train_data=train_do,
                                  train_label=train_lo, val_data=val_do, val_label=val_lo,
                                  name="Decision Tree Regressor")
dt.predict()

Decision Tree Regressor mse train ==> 0.00000
Decision Tree Regressor mse val ==> 43.55661
Decision Tree Regressor r2 train ==> 1.00000
Decision Tree Regressor r2 val ==> 0.58139
Ouptut : [10.    3.   11.    2.5   3.   45.    4.    2.5   0.25 30.    2.    4.
 15.    1.75  1.5   2.5   5.   12.   12.    1.5   1.5   1.5  22.    7.
  2.5   9.    1.25 25.    1.   10.   12.    0.5   1.5   2.5  10.   18.
  3.    5.5  40.    0.5  10.    2.5   4.    4.5   2.5   1.5  10.    8.
 30.    1.  ]


In [15]:
random_forest = Model_Predictor(model_instance=en.RandomForestRegressor, train_data=train_do,
                                  train_label=train_lo, val_data=val_do, val_label=val_lo,
                                  name="Random Forest Regressor")
random_forest.predict()

Random Forest Regressor mse train ==> 5.25218
Random Forest Regressor mse val ==> 26.40214
Random Forest Regressor r2 train ==> 0.96718
Random Forest Regressor r2 val ==> 0.74626
Ouptut : [11.3     4.2775  9.65    1.1725  4.308  53.1     5.0575  6.88    0.455
 15.455   4.3075 11.925   6.935   1.8675  4.269   5.92    5.92    5.446
 21.6     2.9975  1.294   2.0225 17.44    6.211   6.165   7.825   3.6925
 13.745   1.2395 12.11    6.935   1.1925  2.772   5.18    9.955   4.7325
  5.75    3.4305 32.84    1.6175 12.795   8.325   5.2505  2.6955  5.94
  4.585   7.185  11.18   23.195   4.1045]


In [16]:
gradient_boost = Model_Predictor(model_instance=en.GradientBoostingRegressor, train_data=train_do,
                                       train_label=train_lo, val_data=val_do, val_label=val_lo,
                                       name="Gradient Boosting Regressor")
gradient_boost.predict()

Gradient Boosting Regressor mse train ==> 5.75137
Gradient Boosting Regressor mse val ==> 22.23996
Gradient Boosting Regressor r2 train ==> 0.96406
Gradient Boosting Regressor r2 val ==> 0.78626
Ouptut : [22.28274359  3.34980568  7.23746322  0.57965466  3.5002313  58.0046134
 12.47009419  9.16461042  0.29617008 10.42761579  6.71758603 13.24039781
  6.81519723  2.96394329  3.92897631  5.91969656  6.19453423  6.65153443
 22.93224028  1.99814276  2.18371149  1.1493993  16.3921912  11.5461811
  5.9808685   6.90944541  3.19552715 13.70806937  2.2881358  11.73100651
  6.79090796 -0.29581744  3.09570998  6.48019124  7.80953986  5.94255077
  5.81565613  4.14489623 35.55526885  1.28714706  9.67452013  6.45325492
  2.16486565  1.92576518  5.79641694  3.34219919  7.54314113 10.06593265
 23.85123484  1.75245219]


In [17]:
ada_boost = Model_Predictor(model_instance=en.AdaBoostRegressor, train_data=train_do,
                                  train_label=train_lo, val_data=val_do, val_label=val_lo,
                                  name="Ada Boost Regressor")
ada_boost.predict()

Ada Boost Regressor mse train ==> 24.01156
Ada Boost Regressor mse val ==> 38.23950
Ada Boost Regressor r2 train ==> 0.84995
Ada Boost Regressor r2 val ==> 0.63249
Ouptut : [16.35833333  4.9204918  13.44680851  4.9204918   8.0826087  56.55172414
 10.30294118 10.37244898  4.79444444 15.          7.54615385 15.78571429
 10.37244898  4.79444444  7.10211268  7.15        7.10211268  6.26309524
 22.60382514  5.41029412  4.79444444  4.79444444 19.8045977  19.20934959
  6.503125    9.99236641  5.5469697  10.55        4.79444444 16.8875969
  7.30758929  4.79444444  4.79444444 10.37244898 10.83125     7.54772727
  8.33661417  4.9204918  29.69392523  4.79444444 13.96683938 13.44680851
  7.30758929  5.5469697   7.30758929  8.0826087   7.15       10.55
 23.75462963  7.15      ]
