Firstly, we need to import all the important modules and functions needed for our project.

In [1]:
# For data preprocessing and feature engineering
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# For model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# For evaluation
from sklearn.metrics import mean_absolute_error

# For saving the model
import pickle

Next, we will import the needed datasets for our project.

In [2]:
# Import datasets
fifa_21 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Intro to AI/players_21.csv")
fifa_22 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Intro to AI/players_22.csv")

  fifa_22 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Intro to AI/players_22.csv")


Now, we can start data preprocessing.

# <center>__Data Preprocessing__</center>
This entails:
*   Data Cleaning [removing useless variables]
*   Exploratory Data Analysis
*   Imputation
*   Encoding

Firstly, we will examine the dataset to have a general idea of how the features look like. This will help us know which columns to remove, impute, and encode.

In [3]:
# Examining the dataset
fifa_21.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,103500000.0,560000.0,33,...,52+3,52+3,52+3,62+3,19+3,https://cdn.sofifa.net/players/158/023/21_120.png,https://cdn.sofifa.net/teams/241/60.png,https://cdn.sofifa.net/flags/es.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",92,92,63000000.0,220000.0,35,...,54+3,54+3,54+3,61+3,20+3,https://cdn.sofifa.net/players/020/801/21_120.png,https://cdn.sofifa.net/teams/45/60.png,https://cdn.sofifa.net/flags/it.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
2,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,91,91,111000000.0,240000.0,31,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/21_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,,https://cdn.sofifa.net/flags/pl.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,132000000.0,270000.0,28,...,49+3,49+3,49+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/21_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CAM, CM",91,91,129000000.0,370000.0,29,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/21_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [4]:
# Checking for more information about the dataset
fifa_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Columns: 110 entries, sofifa_id to nation_flag_url
dtypes: float64(16), int64(44), object(50)
memory usage: 15.9+ MB


From the dataset, we see that there are 110 column entries and the dataset contains both numeric and non-numeric columns.

In [5]:
# Checking all the column entries
fifa_21.info(verbose=True) # more detailed description of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 110 columns):
 #    Column                       Dtype  
---   ------                       -----  
 0    sofifa_id                    int64  
 1    player_url                   object 
 2    short_name                   object 
 3    long_name                    object 
 4    player_positions             object 
 5    overall                      int64  
 6    potential                    int64  
 7    value_eur                    float64
 8    wage_eur                     float64
 9    age                          int64  
 10   dob                          object 
 11   height_cm                    int64  
 12   weight_kg                    int64  
 13   club_team_id                 float64
 14   club_name                    object 
 15   league_name                  object 
 16   league_level                 float64
 17   club_position                object 
 18   club_jersey_number      

The first thing to do after examining the dataset is to remove columns with more than 30% of their values missing.

In [6]:
# Define the percentage threshold
percent = 30

# Calculate the percentage of missing values in each column
missing_percent_per_column = (fifa_21.isnull().sum() / len(fifa_21)) * 100

# Drop columns exceeding the threshold
columns_to_drop = missing_percent_per_column[missing_percent_per_column > percent].index
fifa_21.drop(columns=columns_to_drop, inplace=True)

In [7]:
# Examining the new dataframe
fifa_21.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 102 columns):
 #    Column                       Dtype  
---   ------                       -----  
 0    sofifa_id                    int64  
 1    player_url                   object 
 2    short_name                   object 
 3    long_name                    object 
 4    player_positions             object 
 5    overall                      int64  
 6    potential                    int64  
 7    value_eur                    float64
 8    wage_eur                     float64
 9    age                          int64  
 10   dob                          object 
 11   height_cm                    int64  
 12   weight_kg                    int64  
 13   club_team_id                 float64
 14   club_name                    object 
 15   league_name                  object 
 16   league_level                 float64
 17   club_position                object 
 18   club_jersey_number      

Now we have 102 columns from 110, meaning 8 of the columns had more than 30% of their values missing.

We will split the remaining columns into numeric and non-numeric for imputation and encoding.



In [8]:
# Splitting into numeric columns
numeric_columns = fifa_21.select_dtypes(include=['number'])
numeric_columns.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    18944 non-null  int64  
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   value_eur                    18707 non-null  float64
 4   wage_eur                     18719 non-null  float64
 5   age                          18944 non-null  int64  
 6   height_cm                    18944 non-null  int64  
 7   weight_kg                    18944 non-null  int64  
 8   club_team_id                 18719 non-null  float64
 9   league_level                 18719 non-null  float64
 10  club_jersey_number           18719 non-null  float64
 11  club_contract_valid_until    18719 non-null  float64
 12  nationality_id               18944 non-null  int64  
 13  weak_foot       

We notice that all the int types are filled whereas the float types contain missing values. We will impute the missing values with the mean of the non-missing values.

In [9]:
# Imputing the missing values in the numeric dataframe
imp=SimpleImputer()
imp.fit(numeric_columns)
imputed_data=imp.fit_transform(numeric_columns)
numeric_columns=pd.DataFrame(imputed_data, columns=numeric_columns.columns)

In [10]:
# Examining the new dataframe
numeric_columns.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    18944 non-null  float64
 1   overall                      18944 non-null  float64
 2   potential                    18944 non-null  float64
 3   value_eur                    18944 non-null  float64
 4   wage_eur                     18944 non-null  float64
 5   age                          18944 non-null  float64
 6   height_cm                    18944 non-null  float64
 7   weight_kg                    18944 non-null  float64
 8   club_team_id                 18944 non-null  float64
 9   league_level                 18944 non-null  float64
 10  club_jersey_number           18944 non-null  float64
 11  club_contract_valid_until    18944 non-null  float64
 12  nationality_id               18944 non-null  float64
 13  weak_foot       

Now that we have imputed missing values and the numeric dataframe is complete, we can move on to the non-numeric columns.

In [11]:
# Splitting into non-numeric columns
categorical_columns = fifa_21.select_dtypes(exclude=['number'])
categorical_columns.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 45 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   player_url        18944 non-null  object
 1   short_name        18944 non-null  object
 2   long_name         18944 non-null  object
 3   player_positions  18944 non-null  object
 4   dob               18944 non-null  object
 5   club_name         18719 non-null  object
 6   league_name       18719 non-null  object
 7   club_position     18719 non-null  object
 8   club_joined       17961 non-null  object
 9   nationality_name  18944 non-null  object
 10  preferred_foot    18944 non-null  object
 11  work_rate         18944 non-null  object
 12  body_type         18944 non-null  object
 13  real_face         18944 non-null  object
 14  ls                18944 non-null  object
 15  st                18944 non-null  object
 16  rs                18944 non-null  object
 17  lw          

We will fill in the missing values of this dataframe with the values of the non-missing values ahead of it using forward fill.

In [12]:
# Imputing the missing object values
categorical_columns = categorical_columns.fillna(method='ffill', axis=0)
categorical_columns.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 45 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   player_url        18944 non-null  object
 1   short_name        18944 non-null  object
 2   long_name         18944 non-null  object
 3   player_positions  18944 non-null  object
 4   dob               18944 non-null  object
 5   club_name         18944 non-null  object
 6   league_name       18944 non-null  object
 7   club_position     18944 non-null  object
 8   club_joined       18944 non-null  object
 9   nationality_name  18944 non-null  object
 10  preferred_foot    18944 non-null  object
 11  work_rate         18944 non-null  object
 12  body_type         18944 non-null  object
 13  real_face         18944 non-null  object
 14  ls                18944 non-null  object
 15  st                18944 non-null  object
 16  rs                18944 non-null  object
 17  lw          

Now that the dataframe is complete, we can encode the non-numeric columns to convert categorical data into a numeric format that machine learning algorithms can process.

In [13]:
# Encoding the object values to numeric data types
le = LabelEncoder()
for col in categorical_columns.columns:
    categorical_columns[col] = le.fit_transform(categorical_columns[col])
categorical_columns.head()

Unnamed: 0,player_url,short_name,long_name,player_positions,dob,club_name,league_name,club_position,club_joined,nationality_name,...,lb,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_flag_url
0,270,10059,10302,539,942,238,41,0,8,6,...,110,59,59,59,110,18,313,494,15,7
1,4338,3261,3332,587,391,361,24,13,1135,123,...,104,66,66,66,104,20,14,557,23,127
2,1631,14329,14687,560,1271,240,19,27,289,122,...,104,96,96,96,104,18,1672,476,12,124
3,1978,13007,12964,331,2438,461,17,14,888,20,...,110,52,52,52,110,20,2019,639,17,21
4,2209,8987,9626,10,2229,407,15,19,487,14,...,185,158,158,158,185,22,2250,1,18,14


After imputing and encoding the numerical and non-numerical data types, we will combine them and check their correlation with the target variable using a linear regression model. This is where the feature engineering process starts.

In [14]:
# Combining the two dataframes
combined_df = pd.concat([numeric_columns, categorical_columns], axis=1)
combined_df.head()

Unnamed: 0,sofifa_id,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,lb,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_flag_url
0,158023.0,93.0,93.0,103500000.0,560000.0,33.0,170.0,72.0,241.0,1.0,...,110,59,59,59,110,18,313,494,15,7
1,20801.0,92.0,92.0,63000000.0,220000.0,35.0,187.0,83.0,45.0,1.0,...,104,66,66,66,104,20,14,557,23,127
2,188545.0,91.0,91.0,111000000.0,240000.0,31.0,184.0,80.0,21.0,1.0,...,104,96,96,96,104,18,1672,476,12,124
3,190871.0,91.0,91.0,132000000.0,270000.0,28.0,175.0,68.0,73.0,1.0,...,110,52,52,52,110,20,2019,639,17,21
4,192985.0,91.0,91.0,129000000.0,370000.0,29.0,181.0,70.0,10.0,1.0,...,185,158,158,158,185,22,2250,1,18,14


# <center>__Feature Engineering__</center>
This entails:
*   Correlation Analysis
*   Feature Importance
*   Feature Selection
*   Feature Scaling



Since we are using a RandomForest model to check the importance of the feature variables with the target variable, we will need to split the dataset.

In [19]:
# Split target and feature variables
y = combined_df['overall']
X = combined_df.drop('overall',axis=1)

Then, we will scale these features to ensure that they have consistent scales and to prevent features with larger magnitudes from dominating the modeling process.

In [20]:
# Scaling the data values for training
sc=StandardScaler()
scaled_data = sc.fit_transform(X)

# Transform it into a dataframe
X = pd.DataFrame(scaled_data, columns=X.columns)
X.head()

Unnamed: 0,sofifa_id,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,club_jersey_number,club_contract_valid_until,nationality_id,weak_foot,skill_moves,international_reputation,release_clause_eur,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,player_url,short_name,long_name,player_positions,dob,club_name,league_name,club_position,club_joined,nationality_name,preferred_foot,work_rate,body_type,real_face,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_flag_url
0,-2.510848,3.586563,13.073165,27.857178,1.655055,-1.639556,-0.427506,-0.889589,-0.483897,-0.624556,-0.784854,-0.104894,1.594026,2.135802,10.801035,13.695696,1.67247,3.009606,3.493625,3.433013,-0.860418,0.058838,1.949428,2.511461,1.046427,2.216206,2.566683,2.15332,2.51294,2.995738,2.522017,2.260399,1.789816,1.067021,1.891773,3.554438,2.208298,2.119026,0.286795,0.592251,0.340037,2.447123,-0.667856,-0.301386,2.198411,3.000047,1.719677,3.137573,-0.717436,-0.587881,-1.028304,-0.594307,-0.310862,-0.066793,-0.13081,-0.478324,-1.682587,0.223634,0.153247,1.568777,-1.582441,-0.515386,1.195339,-2.554737,-2.742977,-1.506,-1.79263,0.439865,2.986216,2.760207,3.333445,3.333445,3.333445,3.224305,3.32786,3.32786,3.32786,3.224305,3.221565,3.221565,3.221565,3.032407,3.312891,3.312891,3.312891,3.032407,1.088219,0.885213,0.885213,0.885213,1.088219,0.575984,-0.395538,-0.395538,-0.395538,0.575984,0.084727,-1.674724,0.772871,-0.462311,-1.388148
1,-7.562115,3.422893,7.809992,10.662997,2.080838,0.851107,1.13124,-0.893246,-0.483897,-0.80149,0.013303,-0.392398,1.594026,3.440521,10.801035,7.312715,2.058457,3.085367,2.461857,2.800094,-1.05426,1.363898,1.894339,2.511461,2.201078,1.598583,2.453439,1.727485,1.85471,1.9516,1.60126,2.019401,1.521828,1.815879,1.618265,3.664174,0.503128,2.719163,2.557962,1.347996,1.058919,2.395322,0.436628,-0.831824,2.301274,2.052781,2.293975,3.055051,-0.91576,-0.728055,-1.028304,-0.537414,-0.310862,-0.066793,-0.13081,-0.310517,-0.938712,-1.096741,-1.122359,1.831158,-1.933771,0.108869,-0.007105,-0.971621,-0.256372,0.907801,0.55784,-1.649167,2.986216,2.760207,3.357262,3.357262,3.357262,3.15001,3.287564,3.287564,3.287564,3.15001,3.165377,3.165377,3.165377,2.974117,2.846936,2.846936,2.846936,2.974117,0.936808,0.402104,0.402104,0.402104,0.936808,0.449876,-0.268013,-0.268013,-0.268013,0.449876,0.206313,-1.729399,1.092491,0.287455,1.253217
2,-1.387305,3.259222,14.047827,11.67442,1.229273,0.411579,0.706127,-0.893694,-0.483897,-0.683534,0.811461,-0.412934,1.594026,2.135802,8.037314,13.042079,0.996993,2.933845,2.152326,2.378148,-0.537348,1.907673,1.178188,2.460419,1.912415,1.735833,2.623304,1.567797,1.745006,2.473669,1.140882,1.778402,0.851859,0.930865,0.934495,3.444701,1.284664,2.344077,1.632672,0.844166,1.697925,1.980918,1.482982,0.13261,2.249843,1.834181,2.549219,2.477402,-0.568694,-0.260807,-1.266928,-0.08227,-0.607686,-0.248403,-0.483401,-0.366452,-1.433715,1.052997,0.955762,1.683569,-1.372662,-0.505235,-0.360766,0.733272,-2.122981,0.88717,0.55784,-1.300995,2.986216,2.760207,3.309628,3.309628,3.309628,3.001421,3.166675,3.166675,3.166675,3.001421,2.996812,2.996812,2.996812,2.721528,2.601696,2.601696,2.601696,2.721528,0.785397,0.885213,0.885213,0.885213,0.785397,0.449876,0.27852,0.27852,0.27852,0.449876,0.084727,-1.426217,0.68155,-0.743474,1.187183
3,-1.301683,3.259222,16.776879,13.191553,0.590598,-0.907008,-0.994323,-0.892724,-0.483897,-0.624556,0.013303,-0.063822,3.09302,3.440521,10.801035,16.565484,2.25145,2.47928,2.977741,3.327526,-0.989646,-0.593692,1.949428,2.103126,0.584566,1.941707,2.510061,2.100091,2.238678,2.7057,1.864334,2.20015,1.990807,1.679723,2.233658,3.225227,1.355713,1.668922,-0.217908,1.15906,-1.177602,1.929117,-0.260941,-0.494272,1.88982,2.635714,2.804462,2.890009,-0.568694,-0.821505,-0.78968,-0.423628,-0.429592,-0.066793,-0.072045,-0.310517,-1.370262,0.796225,0.640429,0.431791,-0.628555,0.616394,-0.50223,-0.849843,-0.801351,-1.217169,0.55784,-1.300995,2.986216,2.760207,3.119091,3.119091,3.119091,3.187157,3.247268,3.247268,3.247268,3.187157,3.202836,3.202836,3.202836,3.012977,3.067652,3.067652,3.067652,3.012977,1.218,0.513591,0.513591,0.513591,1.218,0.575984,-0.523062,-0.523062,-0.523062,0.575984,0.206313,-1.362765,1.508505,-0.27487,-1.079989
4,-1.223865,3.259222,16.387015,18.248665,0.80349,-0.02795,-0.710915,-0.893899,-0.483897,-0.211711,0.811461,-1.029016,3.09302,2.135802,8.037314,16.003782,0.804,2.55504,3.699979,2.694607,0.819544,1.472653,2.445225,1.847916,0.180439,2.42208,2.226953,1.727485,2.07412,2.357654,2.653554,2.019401,0.851859,0.794709,1.002872,3.225227,0.858372,2.494112,-0.133791,1.662889,0.739416,2.291721,1.192328,0.952378,1.941252,2.92718,2.293975,2.724966,1.067476,0.813864,0.355717,-0.08227,-0.192133,-0.672158,-0.365871,-0.198645,-1.328021,0.01542,0.02953,-1.322884,-0.761819,0.342331,-0.643694,-0.240953,-1.686115,-1.340954,0.55784,-1.997338,2.986216,2.760207,3.04764,3.04764,3.04764,3.112863,3.206972,3.206972,3.206972,3.112863,3.184106,3.184106,3.184106,2.993547,3.337415,3.337415,3.337415,2.993547,2.623959,2.668997,2.668997,2.668997,2.623959,2.152338,1.408023,1.408023,1.408023,2.152338,0.3279,-1.320524,-1.728285,-0.181149,-1.234069


After scaling, we will train the model with these variables.

In [21]:
# Train the model
model = RandomForestRegressor()
model.fit(X,y)

Next, we will retrieve the important features and sort them in descending order.

In [22]:
# Retrieving important features
feature_names = X.columns
feature_importance = model.feature_importances_

# Sorting the feature importance
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Adjusting the dataframe to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Sorting the features in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

Unnamed: 0,Feature,Importance
2,value_eur,0.679787
15,release_clause_eur,0.129947
60,dob,0.110778
1,potential,0.050704
35,movement_reactions,0.019478
4,age,0.002091
96,gk,0.0007
56,player_url,0.000267
3,wage_eur,0.000225
91,lb,0.000206


Since most of the variables show low importance with the target variable, we will pick the top 5 variables, which have an importance greater than 0.01.

In [23]:
# Select the top 5 features based on importance
top_5_features = feature_importance_df['Feature'].values[:6]
top_5_features

array(['value_eur', 'release_clause_eur', 'dob', 'potential',
       'movement_reactions', 'age'], dtype=object)

Now, we will replace X with just the feature variables.

In [24]:
# Assign the top 5 features to X
X = X[top_5_features]

# Since age and dob are used to find a player's age, we will drop dob and use just age
X.drop('dob', axis=1, inplace=True)

X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('dob', axis=1, inplace=True)


Unnamed: 0,value_eur,release_clause_eur,potential,movement_reactions,age
0,13.073165,13.695696,3.586563,3.554438,1.655055
1,7.809992,7.312715,3.422893,3.664174,2.080838
2,14.047827,13.042079,3.259222,3.444701,1.229273
3,16.776879,16.565484,3.259222,3.225227,0.590598
4,16.387015,16.003782,3.259222,3.225227,0.80349


We are now ready to train the dataset.

# <center>__Training Models/Evaluation__</center>
This entails:
*   Data Splitting
*   Model Selection
*   Hyperparameter Tuning
*   Cross-Validation
*   Metrics - Mean Absolute Error (MAE)



In [25]:
# Splitting the dataset for training and testing
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,y,test_size=0.2,random_state=42)

**Model 1: Random Forest**

In [29]:
# Creating a RandomForestRegressor object
rf = RandomForestRegressor()
rf.fit(Xtrain,Ytrain)

In [27]:
# Evaluating the model
y_pred=rf.predict(Xtest)
mean_absolute_error(y_pred,Ytest)

0.24686145672309592

**Model 2: XGBRegressor**

In [28]:
# Creating an XGBRegressor object
xgb = XGBRegressor()
xgb.fit(Xtrain,Ytrain)

In [30]:
# Evaluating the model
y_pred=xgb.predict(Xtest)
mean_absolute_error(y_pred,Ytest)

0.3498297918122876

**Model 3: Gradient Boosting**

In [31]:
# Creating a Gradient Boosting Regressor object
gb = GradientBoostingRegressor()
gb.fit(Xtrain,Ytrain)

In [32]:
# Evaluating the model
y_pred=gb.predict(Xtest)
mean_absolute_error(y_pred,Ytest)

0.5761730664359772

After evaluating the model, we see that the Random Forest performs best with an MAE value of **0.246**. Next, we will use the **GridSearchCV** to combine cross-validation with the grid search for hyper-parameter tuning and optimization of the three models.

**Model 1: Random Forest**

In [33]:
# Defining the parameters
cv=KFold(n_splits=5)
PARAMETERS = {
    'n_estimators': [100,200],
    'max_depth': [10,20,30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True],
    'random_state': [0,45],
    'n_jobs': [-1]
}

In [None]:
# Defining and fitting the GridSearchCV object
gs_rf = GridSearchCV(rf, param_grid=PARAMETERS, cv=cv, scoring='neg_mean_squared_error')
gs_rf.fit(Xtrain,Ytrain)

In [None]:
# Checking the best model
best_regressor = gs_rf.best_estimator_
best_regressor

In [None]:
# Checking the best parameters
best_params = gs_rf.best_params_
best_params

{'bootstrap': True,
 'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200,
 'n_jobs': -1,
 'random_state': 0}

In [None]:
# Evaluating the RandomForestRegressorModel using MAE
y_pred = gs_rf.predict(Xtest)
mean_absolute_error(y_pred,Ytest)

0.24383171209893295

**Model 2: XGBRegressor**

In [None]:
# Defining the parameters
cv=KFold(n_splits=5)
PARAMETERS = {
    'n_estimators':[30,50,100,120],
    'max_depth':[2,5,10,12],
    'random_state':[0,2,15,45],
}

In [None]:
# Defining and fitting the GridSearchCV object
gs_xgb = GridSearchCV(xgb, param_grid=PARAMETERS, cv=cv, scoring='neg_mean_squared_error')
gs_xgb.fit(Xtrain,Ytrain)

In [None]:
# Checking the best model
best_regressor = gs_xgb.best_estimator_
best_regressor

In [None]:
# Checking the best parameters
best_params = gs_xgb.best_params_
best_params

{'max_depth': 10, 'n_estimators': 30, 'random_state': 0}

In [None]:
# Evaluating the XGBRegressorModel using MAE
y_pred = gs_xgb.predict(Xtest)
mean_absolute_error(y_pred,Ytest)

0.29063553255131014

**Model 3: Gradient Boosting**

In [None]:
cv=KFold(n_splits=5)
PARAMETERS = {
    'n_estimators':[30,50,100,120],
    'max_depth':[2,5,10,12],
    'random_state':[0,2,15,45],
}

In [None]:
# Defining and fitting the GridSearchCV object
gs_gb = GridSearchCV(gb, param_grid=PARAMETERS, cv=cv, scoring='neg_mean_squared_error')
gs_gb.fit(Xtrain,Ytrain)

In [None]:
# Checking the best model
best_regressor = gs_gb.best_estimator_
best_regressor

In [None]:
# Checking the best parameters
best_params = gs_gb.best_params_
best_params

{'max_depth': 10, 'n_estimators': 50, 'random_state': 15}

In [None]:
# Evaluating the GradientBoostingRegressorModel using MAE
y_pred = gs_gb.predict(Xtest)
mean_absolute_error(y_pred,Ytest)

0.26319730103070066

It seems the **Random Forest Model** still produces the best result, with an MAE score of **0.243**. Still, we will test the three fine tuned models on a completely new data set to see which one performs best.

# <center>__Test with new data set__</center>

First we need to make sure the new dataset contains only the columns used for training and testing the models.

In [34]:
# Selecting the important columns from the fifa 22 dataset
used_columns = ['overall','value_eur','release_clause_eur','potential','movement_reactions','age']
fifa_22 = fifa_22[used_columns]
fifa_22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   overall             19239 non-null  int64  
 1   value_eur           19165 non-null  float64
 2   release_clause_eur  18063 non-null  float64
 3   potential           19239 non-null  int64  
 4   movement_reactions  19239 non-null  int64  
 5   age                 19239 non-null  int64  
dtypes: float64(2), int64(4)
memory usage: 902.0 KB


From the 2022 FIFA dataset, we see that there are missing values in some of the columns. We need to impute those missing values. Since all the columns are numeric, we do not need to split the dataset.

In [35]:
# Imputing the missing data values
imp=SimpleImputer()
imp.fit(fifa_22)
imputed_data=imp.fit_transform(fifa_22)
fifa_22=pd.DataFrame(imputed_data, columns=fifa_22.columns)

In [36]:
# Examining the cleaned dataset
fifa_22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   overall             19239 non-null  float64
 1   value_eur           19239 non-null  float64
 2   release_clause_eur  19239 non-null  float64
 3   potential           19239 non-null  float64
 4   movement_reactions  19239 non-null  float64
 5   age                 19239 non-null  float64
dtypes: float64(6)
memory usage: 902.0 KB


Next, we will split the dataset into training and testing.

In [37]:
# Storing the target variable
Ytest_22 = fifa_22['overall']

# Drop the target variable from the database
fifa_22.drop('overall', axis=1, inplace=True)

Then we will scale the trained variables to ensure that they have consistent scales and to prevent features with larger magnitudes from dominating the modeling process.

In [38]:
# Scaling the data values for training
sc=StandardScaler()
scaled_data = sc.fit_transform(fifa_22)

# Save the scaled_data
with open('scaled_data.pkl', 'wb') as file:
    pickle.dump(sc, file)

fifa_22 = pd.DataFrame(scaled_data, columns=fifa_22.columns)

In [39]:
# Assign the feature variables to X
Xtest_22 = fifa_22

Testing with the fine tuned models.

In [None]:
# Using the RandomForestRegressor model to test the 2022 dataset
y_pred = gs_rf.predict(Xtest_22)
mean_absolute_error(y_pred,Ytest_22)

0.5677288057616102

In [None]:
# Using the XGBRegressor model to test the 2022 dataset
y_pred = gs_xgb.predict(Xtest_22)
mean_absolute_error(y_pred,Ytest_22)

0.6901379895622936

In [None]:
# Using the GradientBoostingRegressor model to test the 2022 dataset
y_pred = gs_gb.predict(Xtest_22)
mean_absolute_error(y_pred,Ytest_22)

0.6229223838210521

The **Random Forest Model** performs best with the new dataframe, giving an MAE of **0.567**, so we will save this model and use it for deployment.

In [None]:
# Saving the RandomForestRegressorModel
filename = 'rf_model.pkl'
pickle.dump(gs_rf, open(filename, 'wb'))

# <center>__Deployment__</center>

The deployment part of the code will be ran on a different file.

In [40]:
# Saving the ytest and ypred to load in our python file
combined_values = {'Ytest':Ytest, 'y_pred' : y_pred}
values_df = pd.DataFrame(combined_values)

# Changing the dataframe to a csv file to load
values_df.to_csv('Ytest_and_y_pred.csv',index = False)