## Import Modules

In [240]:
# import modules
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

## Loading Data

In [241]:
# load the training file (players_21) and the testing file (players_22), in that order
training_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/players_21.csv', 'data/players_22.csv')
)

  testing_data = pd.read_csv('data/players_22.csv')


## Data Preprocessing

### Remove columns where more than 30% of the values are missing

In [242]:
# Share of missing values per column, expressed as a percentage of the training rows
na_percentages = training_data.isna().sum().div(len(training_data)).mul(100)

# Columns whose missing share in the training data is above 30%
cols_to_drop = na_percentages.loc[na_percentages.gt(30)].index

# Remove those columns from both frames; the decision is based on the training data only
for frame in (training_data, testing_data):
    frame.drop(columns=cols_to_drop, inplace=True)

### Remove columns whose values do not obviously contribute to a player's overall rating

In [243]:
# URL columns carry no information about a player's rating:
# collect every training column whose name contains 'url' ...
cols_to_drop = training_data.filter(like='url').columns.tolist()

# ... and remove them from both frames
for frame in (training_data, testing_data):
    frame.drop(columns=cols_to_drop, inplace=True)

In [244]:
# columns judged to have no bearing on a player's rating
cols_to_drop = ["age", "sofifa_id", "short_name", "long_name", "real_face"]

# remove them from the training and the testing frame alike
for frame in (training_data, testing_data):
    frame.drop(columns=cols_to_drop, inplace=True)

In [245]:
# Columns whose values would not be known when a prediction is requested.
# presumably per-position columns (names look like pitch positions) — TODO confirm
position_cols = [
    "gk", "rb", "rcb", "cb", "lcb", "lb", "rwb",
    "rdm", "cdm", "ldm", "lwb",
    "rm", "rcm", "cm", "lcm", "lm",
    "ram", "cam", "lam",
    "rw", "rf", "cf", "lf", "lw",
    "rs", "st", "ls",
]
contract_cols = ["club_joined", "club_contract_valid_until"]
cols_to_drop = position_cols + contract_cols

# remove them from the training and the testing frame alike
for frame in (training_data, testing_data):
    frame.drop(columns=cols_to_drop, inplace=True)

In [246]:
# Club, league and nationality details (plus the value / release-clause columns)
# are treated as not explicitly determining a player's rating, so they are removed.
cols_to_drop = [
    "club_name",
    "league_name",
    "league_level",
    "club_jersey_number",
    "nationality_id",
    "nationality_name",
    "value_eur",
    "release_clause_eur",
    "club_team_id",
]

# remove them from the training and the testing frame alike
for frame in (training_data, testing_data):
    frame.drop(columns=cols_to_drop, inplace=True)

#### Encoding data

In [247]:
# list the remaining columns with their dtypes and non-null counts before encoding
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player_positions             18944 non-null  object 
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   wage_eur                     18719 non-null  float64
 4   dob                          18944 non-null  object 
 5   height_cm                    18944 non-null  int64  
 6   weight_kg                    18944 non-null  int64  
 7   club_position                18719 non-null  object 
 8   preferred_foot               18944 non-null  object 
 9   weak_foot                    18944 non-null  int64  
 10  skill_moves                  18944 non-null  int64  
 11  international_reputation     18944 non-null  int64  
 12  work_rate                    18944 non-null  object 
 13  body_type       

In [248]:
# Convert categorical (object-dtype) columns to integer codes.
#
# The category -> code mapping is learned from the training data only and then
# re-used for the testing data, so a given category receives the same code in
# both frames. Factorizing each frame independently numbers the categories by
# their order of first appearance within that frame, which gives the same
# category different codes in training and testing data.

# get categorical columns (decided on the training frame)
cat_cols = [col for col in training_data.columns if training_data[col].dtype == 'object']

for col in cat_cols:
    # codes follow the order of first appearance in the training data;
    # pd.factorize encodes missing values as -1
    train_codes, categories = pd.factorize(training_data[col])

    # look the testing values up in the training categories; values that are
    # missing or were never seen in the training data are encoded as -1
    test_codes = pd.Categorical(testing_data[col], categories=categories).codes

    training_data[col] = train_codes
    # .codes uses the smallest integer dtype that fits; widen it to int64
    testing_data[col] = test_codes.astype('int64')

# NOTE(review): -1 is an ordinary integer, not NaN, so later NaN-based
# imputation will not treat these entries as missing — confirm this is intended.

In [249]:
# re-inspect the dtypes after factorizing (the former object columns should now be integer-coded)
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player_positions             18944 non-null  int64  
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   wage_eur                     18719 non-null  float64
 4   dob                          18944 non-null  int64  
 5   height_cm                    18944 non-null  int64  
 6   weight_kg                    18944 non-null  int64  
 7   club_position                18944 non-null  int64  
 8   preferred_foot               18944 non-null  int64  
 9   weak_foot                    18944 non-null  int64  
 10  skill_moves                  18944 non-null  int64  
 11  international_reputation     18944 non-null  int64  
 12  work_rate                    18944 non-null  int64  
 13  body_type       

#### Imputing Data

In [250]:
# Fill missing values with each column's most frequent value.
# The imputer is fitted on the training data only and then applied to the testing data.
imputer = SimpleImputer(strategy='most_frequent')
train_filled = imputer.fit_transform(training_data)
test_filled = imputer.transform(testing_data)

# the imputer output is re-wrapped in DataFrames to restore the column labels
training_data = pd.DataFrame(train_filled, columns=training_data.columns)
testing_data = pd.DataFrame(test_filled, columns=testing_data.columns)

In [251]:
# check the non-null counts and dtypes after imputation
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player_positions             18944 non-null  float64
 1   overall                      18944 non-null  float64
 2   potential                    18944 non-null  float64
 3   wage_eur                     18944 non-null  float64
 4   dob                          18944 non-null  float64
 5   height_cm                    18944 non-null  float64
 6   weight_kg                    18944 non-null  float64
 7   club_position                18944 non-null  float64
 8   preferred_foot               18944 non-null  float64
 9   weak_foot                    18944 non-null  float64
 10  skill_moves                  18944 non-null  float64
 11  international_reputation     18944 non-null  float64
 12  work_rate                    18944 non-null  float64
 13  body_type       

### Setup training and testing data

In [252]:
# separate the target column ('overall') from the predictor columns in both data sets
trainX, trainY = training_data.drop(columns='overall'), training_data['overall']
testX, testY = testing_data.drop(columns='overall'), testing_data['overall']

#### Scaling the independent variables

In [253]:
# Standardise the predictors. The scaler statistics are learned from the training
# predictors only and the same transformation is then applied to both sets.
scaler = StandardScaler().fit(trainX)
train_scaled = scaler.transform(trainX)
test_scaled = scaler.transform(testX)

# the scaler output is re-wrapped in DataFrames to restore the column labels
trainX = pd.DataFrame(train_scaled, columns=trainX.columns)
testX = pd.DataFrame(test_scaled, columns=testX.columns)

In [254]:
# inspect the scaled predictor frame (the target column is no longer part of it)
trainX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 53 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player_positions             18944 non-null  float64
 1   potential                    18944 non-null  float64
 2   wage_eur                     18944 non-null  float64
 3   dob                          18944 non-null  float64
 4   height_cm                    18944 non-null  float64
 5   weight_kg                    18944 non-null  float64
 6   club_position                18944 non-null  float64
 7   preferred_foot               18944 non-null  float64
 8   weak_foot                    18944 non-null  float64
 9   skill_moves                  18944 non-null  float64
 10  international_reputation     18944 non-null  float64
 11  work_rate                    18944 non-null  float64
 12  body_type                    18944 non-null  float64
 13  pace            

#### Create feature subsets that better correlate with the overall rating

In [256]:
# Build the feature subset that correlates best with the overall rating.

# correlation of every predictor with the target, computed once
target_corr = trainX.corrwith(trainY)

# keep the predictors whose absolute correlation is greater than 0.5
feature_cols = target_corr[target_corr.abs() > 0.5].index.tolist()

# show the selected features and how many there are
print(feature_cols, len(feature_cols), sep='\n')

['potential', 'wage_eur', 'dob', 'passing', 'dribbling', 'attacking_short_passing', 'movement_reactions', 'power_shot_power', 'mentality_vision', 'mentality_composure']
10


In [257]:
# set trainX and testX to the new feature subset
trainX = trainX[feature_cols]
testX = testX[feature_cols]

In [258]:
# confirm that only the selected feature columns remain
trainX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   potential                18944 non-null  float64
 1   wage_eur                 18944 non-null  float64
 2   dob                      18944 non-null  float64
 3   passing                  18944 non-null  float64
 4   dribbling                18944 non-null  float64
 5   attacking_short_passing  18944 non-null  float64
 6   movement_reactions       18944 non-null  float64
 7   power_shot_power         18944 non-null  float64
 8   mentality_vision         18944 non-null  float64
 9   mentality_composure      18944 non-null  float64
dtypes: float64(10)
memory usage: 1.4 MB


## Training Models

Using cross-validation to train the Gradient Boosting Classifier


# Grid-search a Gradient Boosting Classifier using 3-fold cross-validation.
from sklearn.ensemble import GradientBoostingClassifier
# KFold and GridSearchCV were used without being imported anywhere in the notebook
from sklearn.model_selection import GridSearchCV, KFold

# NOTE(review): this estimator is not the one handed to the grid search below;
# it is kept only in case a later cell refers to it.
gdc = GradientBoostingClassifier(learning_rate=0.001, random_state=42)

# three consecutive (unshuffled) folds
cv = KFold(n_splits=3)

# hyper-parameter grid: 4 * 3 * 3 = 36 combinations, each fitted once per fold
PARAMETERS = {
    "max_depth": [2, 5, 6, 12],
    # "min_child_weight": [1, 5, 15],  # not a GradientBoostingClassifier parameter
    "learning_rate": [0.3, 0.1, 0.03],
    "n_estimators": [100, 500, 1000],
}

# fixed seed so repeated runs of the search are reproducible
gd = GradientBoostingClassifier(random_state=42)
model_gs = GridSearchCV(gd, param_grid=PARAMETERS, cv=cv, scoring="accuracy")

# NOTE(review): 'overall' is a numeric rating and the following cell asks for
# regressors; a GradientBoostingRegressor with a regression metric may be the
# better fit here — confirm before relying on the accuracy score.
# The notebook defines trainX / trainY (Xtrain / Ytrain do not exist).
model_gs.fit(trainX, trainY)

In [None]:
# do cross validation training with either RandomForest, XGBoost, Gradient Boost Regressors that can predict a player rating.

# RandomForestRegressor cross validation training


cv = KFold(n_splits=3)
PARAMETERS = {
    "max_depth": [2, 5, 6, 12],
    "learning_rate": [0.3, 0.1, 0.03],