# MACHINE LEARNING PROJECT

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import xgboost as xgb

# Importing Regressor Models
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

#Importing metrics
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc, roc_auc_score, roc_curve, confusion_matrix, f1_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, f1_score, precision_score, recall_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.17.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.2 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.0


In [None]:
import tensorflow_addons as tfa

# Running Models

In [None]:
# Load the modelling table from the CSV that sits next to the notebook.
data_cleaned = pd.read_csv(filepath_or_buffer='Final_Data_Modeling.csv')

## Splitting the data

We will split the data into train and test sets by season: the players of FIFA 22 (i.e. the 2020/21 season) are held out as the test set, and all other seasons are used for training.

In [None]:
# Season-based split: rows tagged 2020/21 are the test set, every other row is training data.
test1 = data_cleaned.loc[data_cleaned['Season_fifa'] == '2020/21']
train1 = data_cleaned.loc[data_cleaned['Season_fifa'] != '2020/21']

In [None]:
# Sanity check: (rows, columns) of the test split followed by the training split.
print(f"{test1.shape} {train1.shape}")

(1522, 116) (7431, 116)


## Creating functions to execute the different models

In [None]:
def run_model(X_train, y_train, epochs):
  """Build, compile and fit a fully connected regression network.

  The network is a stack of Dense(relu) + BatchNormalization pairs with widths
  16 -> 64 -> 256 -> 1024 -> 2048 -> 1024 -> 256 -> 64 -> 16, followed by a
  single linear output unit. It is trained with the Adam optimizer on MSE loss
  and reports RMSE and R^2 as metrics.

  Args:
    X_train: 2-D feature table. ``X_train.shape[1]`` sizes the input layer and
      the object is passed unchanged to ``model.fit``.
    y_train: Regression targets aligned row-by-row with ``X_train``.
    epochs: Upper bound on the number of training epochs; early stopping may
      end training sooner.

  Returns:
    The fitted ``Sequential`` model. When early stopping ends training, the
    weights of the best monitored epoch are restored before returning.
  """
  # Widths of the hidden layers, in order (expanding then contracting).
  hidden_units = (16, 64, 256, 1024, 2048, 1024, 256, 64, 16)

  # Number of feature columns determines the input shape of the first layer.
  n_cols = X_train.shape[1]

  model = Sequential()
  for position, units in enumerate(hidden_units):
    if position == 0:
      # Only the first layer needs to be told the input shape.
      model.add(Dense(units, activation='relu', input_shape=(n_cols,)))
    else:
      model.add(Dense(units, activation='relu'))
    model.add(BatchNormalization())

  # Single linear unit: the network outputs one unbounded regression value.
  model.add(Dense(1, activation='linear'))

  # Compile for regression: MSE loss, with RMSE and R^2 tracked as metrics.
  model.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError(), tfa.metrics.RSquare()])

  # Stop once the monitored validation loss has not improved for 3 epochs and
  # roll back to the best epoch's weights; without restore_best_weights the
  # returned model would keep the weights of the last (worse) epoch.
  early_stopping_monitor = EarlyStopping(patience=3, restore_best_weights=True)

  # validation_split holds out the last 20% of the provided rows (taken before
  # shuffling), so the row order of X_train decides what is validated on.
  model.fit(X_train, y_train, epochs=epochs, validation_split=0.2, callbacks=[early_stopping_monitor])

  return model

## Data 1 - All Players

### 1.a. Using All features

In [None]:
%%time
train_1a= train1.copy(deep=True)
test_1a= test1.copy(deep=True)

X = train_1a.drop(['Season_fifa', 'overall', 'short_name'], axis=1)
y = train_1a['overall']

X_train, y_train = X, y

model = run_model(X_train, y_train, epochs=30)
test_1a['Overall_Predicted'] = model.predict(test_1a.drop(['Season_fifa', 'overall', 'short_name'], axis=1))
test_1a['Overall_Predicted'] = test_1a['Overall_Predicted'].apply(lambda x: round(x))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
CPU times: user 3min 21s, sys: 5.65 s, total: 3min 27s
Wall time: 2min 11s


In [None]:
# Ten highest-rated test players: actual vs. predicted overall (experiment 1.a).
(
    test_1a[['short_name', 'overall', 'Overall_Predicted']]
    .sort_values(by='overall', ascending=False)
    .head(10)
)

Unnamed: 0,short_name,overall,Overall_Predicted
7431,L. Messi,93,68
7432,R. Lewandowski,92,70
7433,Cristiano Ronaldo,91,66
7434,Neymar Jr,91,71
7435,K. De Bruyne,91,71
7436,J. Oblak,91,68
7437,K. Mbappé,91,69
7438,M. Neuer,90,62
7439,M. ter Stegen,90,69
7440,H. Kane,90,70


### 1.b. Physical Attributes, Game attributes, wage, value

In [None]:
%%time

cols = [0,1,5,6,7,10,11, 115]

train_1b= train1.copy(deep=True)
test_1a= test1.copy(deep=True)

X = train_1b.drop(train_1b.columns[cols], axis=1)
y = train_1b['overall']

X_train, y_train = X, y

model = run_model(X_train, y_train, epochs=30)
test_1a['Overall_Predicted']=model.predict(test_1a.drop(test_1a.columns[cols], axis=1))
test_1a['Overall_Predicted'] = test_1a['Overall_Predicted'].apply(lambda x: round(x))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
CPU times: user 6min 13s, sys: 11.2 s, total: 6min 25s
Wall time: 3min 50s


In [None]:
# Ten highest-rated test players: actual vs. predicted overall (experiment 1.b).
(
    test_1a[['short_name', 'overall', 'Overall_Predicted']]
    .sort_values(by='overall', ascending=False)
    .head(10)
)

Unnamed: 0,short_name,overall,Overall_Predicted
7431,L. Messi,93,92
7432,R. Lewandowski,92,95
7433,Cristiano Ronaldo,91,89
7434,Neymar Jr,91,96
7435,K. De Bruyne,91,96
7436,J. Oblak,91,94
7437,K. Mbappé,91,100
7438,M. Neuer,90,81
7439,M. ter Stegen,90,94
7440,H. Kane,90,96


### 1.c. Physical and Game Attributes

In [None]:
%%time

cols = [0,1,5,6,7,8,9,10,11, 115]

train_1c= train1.copy(deep=True)
test_1a= test1.copy(deep=True)

X = train_1c.drop(train_1c.columns[cols], axis=1)
y = train_1c['overall']

X_train, y_train = X, y

model = run_model(X_train, y_train, epochs=30)
test_1a['Overall_Predicted']=model.predict(test_1a.drop(test_1a.columns[cols], axis=1))
test_1a['Overall_Predicted'] = test_1a['Overall_Predicted'].apply(lambda x: round(x))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
CPU times: user 6min 35s, sys: 10.6 s, total: 6min 46s
Wall time: 4min 32s


In [None]:
# Ten highest-rated test players: actual vs. predicted overall (experiment 1.c).
(
    test_1a[['short_name', 'overall', 'Overall_Predicted']]
    .sort_values(by='overall', ascending=False)
    .head(10)
)

Unnamed: 0,short_name,overall,Overall_Predicted
7431,L. Messi,93,74
7432,R. Lewandowski,92,73
7433,Cristiano Ronaldo,91,73
7434,Neymar Jr,91,73
7435,K. De Bruyne,91,73
7436,J. Oblak,91,72
7437,K. Mbappé,91,72
7438,M. Neuer,90,73
7439,M. ter Stegen,90,72
7440,H. Kane,90,72


## Data 2 - Players who have played more than 10 games

In [None]:
# Keep only the rows whose GP value is strictly greater than 10.
df2 = data_cleaned.loc[data_cleaned['GP'] > 10]

In [None]:
# (rows, columns) remaining after the GP > 10 filter.
df2.shape

(7020, 116)

In [None]:
# Same season-based split as for the full data, applied to the filtered frame.
test2 = df2.loc[df2['Season_fifa'] == '2020/21']
train2 = df2.loc[df2['Season_fifa'] != '2020/21']

In [None]:
# Sanity check: (rows, columns) of the test split followed by the training split.
print(f"{test2.shape} {train2.shape}")

(1236, 116) (5784, 116)


### 2.a. Using All features

In [None]:
%%time

train_2a= train2.copy(deep=True)
test_2a= test2.copy(deep=True)

X = train_2a.drop(['Season_fifa', 'overall', 'short_name'], axis=1)
y = train_2a['overall']

X_train, y_train = X, y

model = run_model(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
CPU times: user 6min 3s, sys: 10 s, total: 6min 13s
Wall time: 3min 44s


In [None]:
# Score the held-out season with the current model, then turn each prediction into an integer
# with Python's round().
test_2a['Overall_Predicted'] = model.predict(test_2a.drop(columns=['Season_fifa', 'overall', 'short_name']))
test_2a['Overall_Predicted'] = test_2a['Overall_Predicted'].apply(round)

# Ten highest-rated test players: actual vs. predicted overall.
(
    test_2a[['short_name', 'overall', 'Overall_Predicted']]
    .sort_values(by='overall', ascending=False)
    .head(10)
)

Unnamed: 0,short_name,overall,Overall_Predicted
7431,L. Messi,93,78
7432,R. Lewandowski,92,71
7433,Cristiano Ronaldo,91,79
7434,Neymar Jr,91,69
7435,K. De Bruyne,91,72
7436,J. Oblak,91,65
7437,K. Mbappé,91,54
7438,M. Neuer,90,72
7439,M. ter Stegen,90,72
7440,H. Kane,90,68


### 2.b Physical Attributes, Game Attributes, Values and Wages

In [None]:
%%time

cols = [0,1,5,6,7,10,11, 115]

train_2b= train2.copy(deep=True)
test_2a= test2.copy(deep=True)

X = train_2b.drop(train_2b.columns[cols], axis=1)
y = train_2b['overall']

X_train, y_train = X, y

model = run_model(X_train, y_train, epochs=30)
test_2a['Overall_Predicted']=model.predict(test_2a.drop(test_2a.columns[cols], axis=1))
test_2a['Overall_Predicted'] = test_2a['Overall_Predicted'].apply(lambda x: round(x))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
CPU times: user 8min 42s, sys: 15.9 s, total: 8min 58s
Wall time: 5min 18s


In [None]:
# Ten highest-rated test players: actual vs. predicted overall (experiment 2.b).
(
    test_2a[['short_name', 'overall', 'Overall_Predicted']]
    .sort_values(by='overall', ascending=False)
    .head(10)
)

Unnamed: 0,short_name,overall,Overall_Predicted
7431,L. Messi,93,91
7432,R. Lewandowski,92,95
7433,Cristiano Ronaldo,91,87
7434,Neymar Jr,91,96
7435,K. De Bruyne,91,96
7436,J. Oblak,91,94
7437,K. Mbappé,91,101
7438,M. Neuer,90,78
7439,M. ter Stegen,90,93
7440,H. Kane,90,96


### 2.c Physical and Game Attributes

In [None]:
%%time

cols = [0,1,5,6,7,8,9,10,11, 115]

train_2c= train2.copy(deep=True)
test_2a= test2.copy(deep=True)

X = train_2c.drop(train_2c.columns[cols], axis=1)
y = train_2c['overall']

X_train, y_train = X, y

model = run_model(X_train, y_train, epochs=30)
test_2a['Overall_Predicted']=model.predict(test_2a.drop(test_2a.columns[cols], axis=1))
test_2a['Overall_Predicted'] = test_2a['Overall_Predicted'].apply(lambda x: round(x))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
CPU times: user 5min 42s, sys: 9.17 s, total: 5min 51s
Wall time: 3min 31s


In [None]:
# Ten highest-rated test players: actual vs. predicted overall (experiment 2.c).
(
    test_2a[['short_name', 'overall', 'Overall_Predicted']]
    .sort_values(by='overall', ascending=False)
    .head(10)
)

Unnamed: 0,short_name,overall,Overall_Predicted
7431,L. Messi,93,100
7432,R. Lewandowski,92,88
7433,Cristiano Ronaldo,91,92
7434,Neymar Jr,91,90
7435,K. De Bruyne,91,89
7436,J. Oblak,91,93
7437,K. Mbappé,91,92
7438,M. Neuer,90,91
7439,M. ter Stegen,90,87
7440,H. Kane,90,90
