In [1]:
# Csv and Array manuplation libraries
import numpy as np
import pandas as pd

# Visual Libraries
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns 
sns.set_theme()

# Sklearn 
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.model_selection as ms
import sklearn.linear_model as lm
import sklearn.neighbors as ng
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.ensemble as en

In [2]:
data = pd.read_csv("../input/epl-football-dataset/football.csv")
print(data.shape)
data.head()

(461, 17)


Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3.0,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2.0,Germany,0,4,1,1,0
2,Petr Cech,Arsenal,35,GK,4,7.0,1529,5.5,5.90%,134,2.0,Czech Republic,0,6,1,1,0
3,Theo Walcott,Arsenal,28,RW,1,20.0,2393,7.5,1.50%,122,1.0,England,0,4,1,1,0
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2.0,France,0,4,1,1,0


In [3]:
# Handle the single missing value in the data with most probable value 
data["region"] = data["region"].fillna(data["region"].value_counts().index[0])

# Data Preprocessing

In [4]:
# Necessary Cols 
drop_cols = ["name", "club_id", "market_value", "nationality"]
target_col = "market_value"

In [5]:
# Drop the unnecessary cols
target = data[target_col]
data = data.drop(drop_cols, axis=1)

In [6]:
def extract_percent(val):
    return float(val[:-1])


def pre_process_original(data):
    """
    Preprocesss the data given the columns and the data
    """
    for col in data.columns:
        if col in ["age", "page_views", "fpl_val", "fpl_points"]:
            # Perform standardization
            mean = np.mean(data[col])
            std  = np.std(data[col])
            data[col] = (data[col] - mean) / std
            print(mean, std, col)
            
        if col in ["club", "position", "position_cat", "region", "age_cat"]:
            # Get the dummies and concat them
            dummies = pd.get_dummies(data[col], prefix=col)
            data = pd.concat((data, dummies), axis=1)

            # Drop the data 
            data = data.drop(col, axis=1)
            
        if col == "fpl_sel":
            # Extract the feature
            data[col] = data[col].apply(extract_percent)
            mean = np.mean(data[col])
            std  = np.std(data[col])
            data[col] = (data[col] - mean) / std
            print(mean, std, col)
         
    return data

def pre_process_tree(data):
    """
    Preprocesss the data given the columns and the data
    """
    for col in data.columns:
        if col in ["age", "page_views", "fpl_val", "fpl_points"]:
            # Perform standardization
            mean = np.mean(data[col])
            std  = np.std(data[col])
            data[col] = (data[col] - mean) / std
            print(mean, std, col)
            
        if col == "fpl_sel":
            # Extract the feature
            data[col] = data[col].apply(extract_percent)
            mean = np.mean(data[col])
            std  = np.std(data[col])
            data[col] = (data[col] - mean) / std
            print(mean, std, col)
        
        # Ordinal Encoded columns
        ord_cols = ["club", "position", "position_cat", "region", "age_cat"]
        encoder = OrdinalEncoder()
        data[ord_cols] = encoder.fit_transform(data[ord_cols])
         
    return data

In [7]:
data_normal = pre_process_original(data=data.copy())
print(data_normal.shape)
data_normal.head(2)

26.80477223427332 3.957592308299133 age
763.7765726681127 930.7945734332158 page_views
3.2442516268980475 5.6694977276052985 fpl_sel
57.31453362255965 53.05617275805522 fpl_points
(461, 55)


Unnamed: 0,age,page_views,fpl_value,fpl_sel,fpl_points,new_foreign,big_club,new_signing,club_Arsenal,club_Bournemouth,...,region_1.0,region_2.0,region_3.0,region_4.0,age_cat_1,age_cat_2,age_cat_3,age_cat_4,age_cat_5,age_cat_6
0,0.302009,3.830301,12.0,2.443911,3.895597,0,1,0,1,0,...,0,0,1,0,0,0,0,1,0,0
1,0.302009,3.901208,9.5,0.415513,2.067346,0,1,0,1,0,...,0,1,0,0,0,0,0,1,0,0


In [8]:
data_ordinal = pre_process_tree(data=data.copy())
print(data_ordinal.shape)
data_ordinal.head(2)

26.80477223427332 3.957592308299133 age
763.7765726681127 930.7945734332158 page_views
3.2442516268980475 5.6694977276052985 fpl_sel
57.31453362255965 53.05617275805522 fpl_points
(461, 13)


Unnamed: 0,club,age,position,position_cat,page_views,fpl_value,fpl_sel,fpl_points,region,new_foreign,age_cat,big_club,new_signing
0,0.0,0.302009,8.0,0.0,3.830301,12.0,2.443911,3.895597,2.0,0,3.0,1,0
1,0.0,0.302009,0.0,0.0,3.901208,9.5,0.415513,2.067346,1.0,0,3.0,1,0


In [9]:
# One-Hot encoded data
train_dn, val_dn, train_ln, val_ln = train_test_split(data_normal.values,
                                                      target.values,
                                                      test_size=0.2,
                                                      random_state=7)

# Ordinal encoded data
train_do, val_do, train_lo, val_lo = train_test_split(data_ordinal.values,
                                                      target.values,
                                                      test_size=0.2,
                                                      random_state=7)

# MODELLING

In [10]:
class HyperParamTuner:
    def __init__(self, model_instance, params,train_data, train_label, val_data, val_label, cv=5, name="lasso"):
        # Init the model and the params to optimize
        self.model_instance = model_instance
        self.params = params
        self.cv = cv
        self.new_params = None
        self.train_data = train_data
        self.train_label = train_label
        self.val_data = val_data
        self.val_label = val_label
        self.name = name
    
    def optimize_gscv(self):
        # Activate the grid search cv instance
        param_grid = GridSearchCV(estimator=self.model_instance(),
                                  param_grid=self.params,
                                  verbose=2,
                                  n_jobs=-1,
                                  cv=self.cv,
                                  scoring="r2")
        
        # Fit the model to grid search
        param_grid.fit(X=self.train_data, y=self.train_label)
        
        # Assign the params
        self.new_params = param_grid.best_params_
        
        # Return the best fitted params 
        return param_grid.best_params_
    
    def present_comparison(self):
        model_default = self.model_instance()
        model_default.fit(self.train_data, self.train_label)
        # Print the old scores 
        mse_train_default = mean_squared_error(y_true=self.train_label, 
                                               y_pred=model_default.predict(self.train_data))
        mse_val_default = mean_squared_error(y_true=self.val_label, 
                                             y_pred=model_default.predict(self.val_data))
        
        r2_train_default = r2_score(y_true=self.train_label, 
                                    y_pred=model_default.predict(self.train_data))
        r2_val_default = r2_score(y_true=self.val_label, 
                                  y_pred=model_default.predict(self.val_data))
        print("\n")
        print("%s mse train default ==> %.5f" % (self.name, mse_train_default))
        print("%s mse val default ==> %.5f" % (self.name, mse_val_default))
        print("%s r2 train default ==> %.5f" % (self.name, r2_train_default))
        print("%s r2 val default ==> %.5f" % (self.name, r2_val_default))
        
        
        model_optimized = self.model_instance(**self.new_params)
        model_optimized.fit(self.train_data, self.train_label)
        # Print the old scores 
        mse_train_optimized = mean_squared_error(y_true=self.train_label, 
                                               y_pred=model_optimized.predict(self.train_data))
        mse_val_optimized = mean_squared_error(y_true=self.val_label, 
                                             y_pred=model_optimized.predict(self.val_data))
        
        r2_train_optimized = r2_score(y_true=self.train_label, 
                                      y_pred=model_optimized.predict(self.train_data))
        r2_val_optimized = r2_score(y_true=self.val_label, 
                                    y_pred=model_optimized.predict(self.val_data))
        print("\n")
        print("%s mse train optimized ==> %.5f" % (self.name, mse_train_optimized))
        print("%s mse val optimized ==> %.5f" % (self.name, mse_val_optimized))
        print("%s r2 train optimized ==> %.5f" % (self.name, r2_train_optimized))
        print("%s r2 val optimized ==> %.5f" % (self.name, r2_val_optimized))

In [11]:
params = {
    "fit_intercept": [True, False]
}
linear_reg_tuner = HyperParamTuner(model_instance=LinearRegression, params=params, train_data=train_dn,
                                   train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                   name="Linear Regression")
print(linear_reg_tuner.optimize_gscv())
linear_reg_tuner.present_comparison()

Fitting 5 folds for each of 2 candidates, totalling 10 fits
{'fit_intercept': False}


Linear Regression mse train default ==> 25.18266
Linear Regression mse val default ==> 19.69924
Linear Regression r2 train default ==> 0.84263
Linear Regression r2 val default ==> 0.81068


Linear Regression mse train optimized ==> 25.18251
Linear Regression mse val optimized ==> 19.68932
Linear Regression r2 train optimized ==> 0.84263
Linear Regression r2 val optimized ==> 0.81077


In [12]:
params = {
    "alpha": [0.001, 0.1, 1, 10, 100],
    "selection": ["cyclic", "random"],
    "max_iter": [500, 1000, 1500],
}
lasso_reg_tuner = HyperParamTuner(model_instance=lm.Lasso, params=params, train_data=train_dn,
                                  train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                  name="Lasso Regression")
print(lasso_reg_tuner.optimize_gscv())
lasso_reg_tuner.present_comparison()

Fitting 5 folds for each of 30 candidates, totalling 150 fits
{'alpha': 0.1, 'max_iter': 1500, 'selection': 'random'}


Lasso Regression mse train default ==> 44.84268
Lasso Regression mse val default ==> 30.37217
Lasso Regression r2 train default ==> 0.71978
Lasso Regression r2 val default ==> 0.70810


Lasso Regression mse train optimized ==> 29.32603
Lasso Regression mse val optimized ==> 17.88464
Lasso Regression r2 train optimized ==> 0.81674
Lasso Regression r2 val optimized ==> 0.82812


In [13]:
params = {
    "alpha": [0.001, 0.1, 1, 10, 100],
    "max_iter": [500, 1000, 1500],
    "solver": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
}
lasso_reg_tuner = HyperParamTuner(model_instance=lm.Ridge, params=params, train_data=train_dn,
                                  train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                  name="Ridge Regression", cv=5)
print(lasso_reg_tuner.optimize_gscv())
lasso_reg_tuner.present_comparison()

Fitting 5 folds for each of 105 candidates, totalling 525 fits
{'alpha': 10, 'max_iter': 500, 'solver': 'svd'}


Ridge Regression mse train default ==> 25.39708
Ridge Regression mse val default ==> 18.34165
Ridge Regression r2 train default ==> 0.84129
Ridge Regression r2 val default ==> 0.82372


Ridge Regression mse train optimized ==> 26.96904
Ridge Regression mse val optimized ==> 16.62527
Ridge Regression r2 train optimized ==> 0.83147
Ridge Regression r2 val optimized ==> 0.84022


In [14]:
params = {
    "n_neighbors": [2, 3, 6, 8, 10],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 20, 30, 40, 50]
}
knn_tuner = HyperParamTuner(model_instance=ng.KNeighborsRegressor, params=params, train_data=train_dn,
                                  train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                  name="K-Nearest Neighbors")
print(knn_tuner.optimize_gscv())
knn_tuner.present_comparison()

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
{'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 8, 'weights': 'distance'}


K-Nearest Neighbors mse train default ==> 28.47428
K-Nearest Neighbors mse val default ==> 24.52769
K-Nearest Neighbors r2 train default ==> 0.82206
K-Nearest Neighbors r2 val default ==> 0.76427


K-Nearest Neighbors mse train optimized ==> 0.00000
K-Nearest Neighbors mse val optimized ==> 22.81316
K-Nearest Neighbors r2 train optimized ==> 1.00000
K-Nearest Neighbors r2 val optimized ==> 0.78075


In [15]:
params = {
    "kernel": ["linear", "rbf", "sigmoid"],
    "degree": [1, 2, 3, 4, 5],
    "gamma": ["auto", "scale"],
    "C": [0.001, 0.1, 1, 10, 100]
}
svr_tuner = HyperParamTuner(model_instance=svm.SVR, params=params, train_data=train_dn,
                                  train_label=train_ln, val_data=val_dn, val_label=val_ln,
                                  name="Support Vector Regressor")
print(svr_tuner.optimize_gscv())
svr_tuner.present_comparison()

Fitting 5 folds for each of 150 candidates, totalling 750 fits
{'C': 100, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf'}


Support Vector Regressor mse train default ==> 71.76643
Support Vector Regressor mse val default ==> 42.67468
Support Vector Regressor r2 train default ==> 0.55153
Support Vector Regressor r2 val default ==> 0.58987


Support Vector Regressor mse train optimized ==> 10.85431
Support Vector Regressor mse val optimized ==> 13.39847
Support Vector Regressor r2 train optimized ==> 0.93217
Support Vector Regressor r2 val optimized ==> 0.87123


In [16]:
params = {
    "criterion": ["mse", "friedman_mse", "mae", "poisson"],
    "splitter": ["best", "random"],
    "max_depth": [2, 4, 6, 8, 10],
    "min_samples_leaf": [1, 2, 3, 4]
}
dt_tuner = HyperParamTuner(model_instance=tree.DecisionTreeRegressor, params=params, train_data=train_do,
                                  train_label=train_lo, val_data=val_do, val_label=val_lo,
                                  name="Decision Tree Regressor")
print(dt_tuner.optimize_gscv())
dt_tuner.present_comparison()

Fitting 5 folds for each of 160 candidates, totalling 800 fits
{'criterion': 'mae', 'max_depth': 6, 'min_samples_leaf': 3, 'splitter': 'random'}


Decision Tree Regressor mse train default ==> 0.00000
Decision Tree Regressor mse val default ==> 35.38995
Decision Tree Regressor r2 train default ==> 1.00000
Decision Tree Regressor r2 val default ==> 0.65988


Decision Tree Regressor mse train optimized ==> 30.26789
Decision Tree Regressor mse val optimized ==> 30.01642
Decision Tree Regressor r2 train optimized ==> 0.81086
Decision Tree Regressor r2 val optimized ==> 0.71152


In [17]:
params = {
    "n_estimators": [50, 100, 150, 200],
    "max_features": ["auto", "sqrt", "log2"],
    "max_depth": [2, 4, 6, 8, 10],
    "min_samples_leaf": [1, 2, 3, 4]
}
random_forest_tuner = HyperParamTuner(model_instance=en.RandomForestRegressor, params=params,train_data=train_do,
                                  train_label=train_lo, val_data=val_do, val_label=val_lo,
                                  name="Random Forest Regressor")
print(random_forest_tuner.optimize_gscv())
random_forest_tuner.present_comparison()

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200}


Random Forest Regressor mse train default ==> 5.46292
Random Forest Regressor mse val default ==> 24.96168
Random Forest Regressor r2 train default ==> 0.96586
Random Forest Regressor r2 val default ==> 0.76010


Random Forest Regressor mse train optimized ==> 5.79417
Random Forest Regressor mse val optimized ==> 19.53876
Random Forest Regressor r2 train optimized ==> 0.96379
Random Forest Regressor r2 val optimized ==> 0.81222


In [18]:
params = {
    "loss": ["ls", "lad", "huber"],
    "learning_rate": [0.01, 0.1],
    "min_samples_leaf": [3, 4, 5, 6],
    "max_depth": [2, 3, 4, 5],
    "max_features": ["auto", "sqrt", "log2"],
}
gradient_boost_tuner = HyperParamTuner(model_instance=en.GradientBoostingRegressor, params=params,train_data=train_do,
                                       train_label=train_lo, val_data=val_do, val_label=val_lo,
                                       name="Gradient Boosting Regressor")
print(gradient_boost_tuner.optimize_gscv())
gradient_boost_tuner.present_comparison()

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
{'learning_rate': 0.1, 'loss': 'huber', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5}


Gradient Boosting Regressor mse train default ==> 5.75137
Gradient Boosting Regressor mse val default ==> 22.13504
Gradient Boosting Regressor r2 train default ==> 0.96406
Gradient Boosting Regressor r2 val default ==> 0.78727


Gradient Boosting Regressor mse train optimized ==> 4.73390
Gradient Boosting Regressor mse val optimized ==> 14.00940
Gradient Boosting Regressor r2 train optimized ==> 0.97042
Gradient Boosting Regressor r2 val optimized ==> 0.86536


In [19]:
params = {
    "learning_rate": [0.01, 0.1, 1],
    "loss": ["linear", "square", "exponential"],
    "n_estimators": [50, 100, 150, 200]
}
ada_boost_tuner = HyperParamTuner(model_instance=en.AdaBoostRegressor, params=params,train_data=train_do,
                                  train_label=train_lo, val_data=val_do, val_label=val_lo,
                                  name="Ada Boost Regressor")
print(ada_boost_tuner.optimize_gscv())
ada_boost_tuner.present_comparison()

Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'learning_rate': 1, 'loss': 'square', 'n_estimators': 50}


Ada Boost Regressor mse train default ==> 23.68108
Ada Boost Regressor mse val default ==> 34.43772
Ada Boost Regressor r2 train default ==> 0.85202
Ada Boost Regressor r2 val default ==> 0.66903


Ada Boost Regressor mse train optimized ==> 21.62466
Ada Boost Regressor mse val optimized ==> 27.49887
Ada Boost Regressor r2 train optimized ==> 0.86487
Ada Boost Regressor r2 val optimized ==> 0.73572
