In [1]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split


from sklearn.neural_network import MLPClassifier
import tensorflow as tf

In [53]:
data = pd.read_csv('../input/fifa-2018-match-statistics/FIFA 2018 Statistics.csv')

In [54]:
data

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,14-06-2018,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,Yes,12.0,Group Stage,No,0,,
1,14-06-2018,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,No,,Group Stage,No,0,,
2,15-06-2018,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,No,,Group Stage,No,0,,
3,15-06-2018,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,Yes,89.0,Group Stage,No,0,,
4,15-06-2018,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,No,,Group Stage,No,0,1.0,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,11-07-2018,England,Croatia,1,46,11,1,6,4,4,...,1,0,0,No,5.0,Semi- Finals,No,0,,
124,14-07-2018,Belgium,England,2,43,12,4,3,5,4,...,1,0,0,Yes,4.0,3rd Place,No,0,,
125,14-07-2018,England,Belgium,0,57,15,5,7,3,5,...,2,0,0,No,,3rd Place,No,0,,
126,15-07-2018,France,Croatia,4,39,8,6,1,1,2,...,2,0,0,Yes,18.0,Final,No,0,1.0,18.0


In [55]:
# preprocessing the data

In [56]:
# there is no use of date column in prediction 
data.drop('Date' , axis = 1 , inplace = True)

In [57]:
# checking for missing values
data.isnull().any()

Team                      False
Opponent                  False
Goal Scored               False
Ball Possession %         False
Attempts                  False
On-Target                 False
Off-Target                False
Blocked                   False
Corners                   False
Offsides                  False
Free Kicks                False
Saves                     False
Pass Accuracy %           False
Passes                    False
Distance Covered (Kms)    False
Fouls Committed           False
Yellow Card               False
Yellow & Red              False
Red                       False
Man of the Match          False
1st Goal                   True
Round                     False
PSO                       False
Goals in PSO              False
Own goals                  True
Own goal Time              True
dtype: bool

In [58]:
data.isnull().sum()

Team                        0
Opponent                    0
Goal Scored                 0
Ball Possession %           0
Attempts                    0
On-Target                   0
Off-Target                  0
Blocked                     0
Corners                     0
Offsides                    0
Free Kicks                  0
Saves                       0
Pass Accuracy %             0
Passes                      0
Distance Covered (Kms)      0
Fouls Committed             0
Yellow Card                 0
Yellow & Red                0
Red                         0
Man of the Match            0
1st Goal                   34
Round                       0
PSO                         0
Goals in PSO                0
Own goals                 116
Own goal Time             116
dtype: int64

In [59]:
# Own goals and Own goal Time has alot of missing values, so there is no use of it
data.drop(['Own goals' , "Own goal Time"] , axis = 1 , inplace = True)

In [60]:
# replacing NANwith  mean values for 1st goal 
data['1st Goal'] = data['1st Goal'].fillna(data['1st Goal'].mean())

In [61]:
# no missing values
data.isnull().any()

Team                      False
Opponent                  False
Goal Scored               False
Ball Possession %         False
Attempts                  False
On-Target                 False
Off-Target                False
Blocked                   False
Corners                   False
Offsides                  False
Free Kicks                False
Saves                     False
Pass Accuracy %           False
Passes                    False
Distance Covered (Kms)    False
Fouls Committed           False
Yellow Card               False
Yellow & Red              False
Red                       False
Man of the Match          False
1st Goal                  False
Round                     False
PSO                       False
Goals in PSO              False
dtype: bool

In [62]:
# Encoding the data

In [63]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Team                    128 non-null    object 
 1   Opponent                128 non-null    object 
 2   Goal Scored             128 non-null    int64  
 3   Ball Possession %       128 non-null    int64  
 4   Attempts                128 non-null    int64  
 5   On-Target               128 non-null    int64  
 6   Off-Target              128 non-null    int64  
 7   Blocked                 128 non-null    int64  
 8   Corners                 128 non-null    int64  
 9   Offsides                128 non-null    int64  
 10  Free Kicks              128 non-null    int64  
 11  Saves                   128 non-null    int64  
 12  Pass Accuracy %         128 non-null    int64  
 13  Passes                  128 non-null    int64  
 14  Distance Covered (Kms)  128 non-null    in

In [64]:
data['Team'].unique()

array(['Russia', 'Saudi Arabia', 'Egypt', 'Uruguay', 'Morocco', 'Iran',
       'Portugal', 'Spain', 'France', 'Australia', 'Argentina', 'Iceland',
       'Peru', 'Denmark', 'Croatia', 'Nigeria', 'Costa Rica', 'Serbia',
       'Germany', 'Mexico', 'Brazil', 'Switzerland', 'Sweden',
       'Korea Republic', 'Belgium', 'Panama', 'Tunisia', 'England',
       'Colombia', 'Japan', 'Poland', 'Senegal'], dtype=object)

In [65]:
data['Opponent'].unique()

array(['Saudi Arabia', 'Russia', 'Uruguay', 'Egypt', 'Iran', 'Morocco',
       'Spain', 'Portugal', 'Australia', 'France', 'Iceland', 'Argentina',
       'Denmark', 'Peru', 'Nigeria', 'Croatia', 'Serbia', 'Costa Rica',
       'Mexico', 'Germany', 'Switzerland', 'Brazil', 'Korea Republic',
       'Sweden', 'Panama', 'Belgium', 'England', 'Tunisia', 'Japan',
       'Colombia', 'Senegal', 'Poland'], dtype=object)

In [66]:
data['Man of the Match'].unique()

array(['Yes', 'No'], dtype=object)

In [67]:
data['Round'].unique()

array(['Group Stage', 'Round of 16', 'Quarter Finals', 'Semi- Finals',
       '3rd Place', 'Final'], dtype=object)

In [68]:
data['PSO'].unique()

array(['No', 'Yes'], dtype=object)

In [69]:
label_encoder = LabelEncoder()

# label_encoder.fit_transform(data['Man of the Match'])
# man_mapping = {index : label for index , label in enumerate(label_encoder.classes_)}

In [70]:
data['Man of the Match'] = label_encoder.fit_transform(data['Man of the Match'])
man_mapping = {index : label for index , label in enumerate(label_encoder.classes_)}

data['PSO'] = label_encoder.fit_transform(data['PSO'])
pso_mapping = {index : label for index , label in enumerate(label_encoder.classes_)}

In [71]:
man_mapping 

{0: 'No', 1: 'Yes'}

In [72]:
pso_mapping

{0: 'No', 1: 'Yes'}

In [73]:
data.head(3)

Unnamed: 0,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,...,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO
0,Russia,Saudi Arabia,5,40,13,7,3,3,6,3,...,118,22,0,0,0,1,12.0,Group Stage,0,0
1,Saudi Arabia,Russia,0,60,6,0,3,3,2,1,...,105,10,0,0,0,0,39.457447,Group Stage,0,0
2,Egypt,Uruguay,0,43,8,3,3,2,0,1,...,112,12,2,0,0,0,39.457447,Group Stage,0,0


In [74]:
# label_encoder.fit_transform(data['Round'])
# round_mapping = {index : label for index , label in enumerate(label_encoder.classes_)}
# round_mapping
# {0: '3rd Place',
#  1: 'Final',
#  2: 'Group Stage',
#  3: 'Quarter Finals',
#  4: 'Round of 16',
#  5: 'Semi- Finals'}

In [75]:
# we can see labelencoder is not encding in proper order whcih we want it to do
round_values = list(data['Round'].unique())
round_values

['Group Stage',
 'Round of 16',
 'Quarter Finals',
 'Semi- Finals',
 '3rd Place',
 'Final']

In [76]:
round_mapping = {label : index for index ,label in enumerate(round_values)}
round_mapping

{'Group Stage': 0,
 'Round of 16': 1,
 'Quarter Finals': 2,
 'Semi- Finals': 3,
 '3rd Place': 4,
 'Final': 5}

In [77]:
data['Round'].apply(lambda x : round_mapping[x])

0      0
1      0
2      0
3      0
4      0
      ..
123    3
124    4
125    4
126    5
127    5
Name: Round, Length: 128, dtype: int64

In [78]:
data['Round'] = data['Round'].apply(lambda x : round_mapping[x])

In [79]:
data.head(1)

Unnamed: 0,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,...,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO
0,Russia,Saudi Arabia,5,40,13,7,3,3,6,3,...,118,22,0,0,0,1,12.0,0,0,0


In [80]:
# using one hot encoding for data['Teams'] and data['Opponent']

team_df = pd.get_dummies(data['Team'])

In [81]:
data['Opponent'] = data['Opponent'].apply(lambda x : "opp_" + x)

In [82]:
opponent_df = pd.get_dummies(data['Opponent'])

In [85]:
team_df.drop('Uruguay' , axis = 1 , inplace = True)
team_df 

Unnamed: 0,Argentina,Australia,Belgium,Brazil,Colombia,Costa Rica,Croatia,Denmark,Egypt,England,...,Poland,Portugal,Russia,Saudi Arabia,Senegal,Serbia,Spain,Sweden,Switzerland,Tunisia
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
124,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
126,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
opponent_df.drop('opp_Uruguay' , axis = 1 , inplace = True)
opponent_df

Unnamed: 0,opp_Argentina,opp_Australia,opp_Belgium,opp_Brazil,opp_Colombia,opp_Costa Rica,opp_Croatia,opp_Denmark,opp_Egypt,opp_England,...,opp_Poland,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
125,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
126,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
data_concat = pd.concat([data , team_df , opponent_df] , axis = 1)
data_concat.drop(['Team' , 'Opponent'] , axis = 1 , inplace = True)

In [88]:
data_concat

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,opp_Poland,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia
0,5,40,13,7,3,3,6,3,11,0,...,0,0,0,1,0,0,0,0,0,0
1,0,60,6,0,3,3,2,1,25,2,...,0,0,1,0,0,0,0,0,0,0
2,0,43,8,3,3,2,0,1,7,3,...,0,0,0,0,0,0,0,0,0,0
3,1,57,14,4,6,4,5,1,13,3,...,0,0,0,0,0,0,0,0,0,0
4,0,64,13,3,6,4,5,0,14,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,1,46,11,1,6,4,4,3,24,5,...,0,0,0,0,0,0,0,0,0,0
124,2,43,12,4,3,5,4,1,5,5,...,0,0,0,0,0,0,0,0,0,0
125,0,57,15,5,7,3,5,0,12,2,...,0,0,0,0,0,0,0,0,0,0
126,4,39,8,6,1,1,2,1,14,1,...,0,0,0,0,0,0,0,0,0,0


In [89]:
np.sum(data_concat.dtypes == 'object')

0

In [91]:
#scaling the data
y = data_concat['Man of the Match']
X = data_concat.drop('Man of the Match', axis=1)

In [92]:
scaler = RobustScaler()
X = pd.DataFrame(scaler.fit_transform(X) , columns = X.columns)

In [93]:
X

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,opp_Poland,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia
0,2.0,-0.6250,0.166667,1.166667,-0.666667,0.000000,0.333333,1.0,-0.571429,-0.666667,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.5,0.6250,-1.000000,-1.166667,-0.666667,0.000000,-1.000000,0.0,1.428571,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.5,-0.4375,-0.666667,-0.166667,-0.666667,-0.444444,-1.666667,0.0,-1.142857,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.4375,0.333333,0.166667,0.333333,0.444444,0.000000,0.0,-0.285714,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.5,0.8750,0.166667,-0.166667,0.333333,0.444444,0.000000,-0.5,-0.142857,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0.0,-0.2500,-0.166667,-0.833333,0.333333,0.444444,-0.333333,1.0,1.285714,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124,0.5,-0.4375,0.000000,0.166667,-0.666667,0.888889,-0.333333,0.0,-1.428571,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,-0.5,0.4375,0.500000,0.500000,0.666667,0.000000,0.000000,-0.5,-0.428571,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,1.5,-0.6875,-0.666667,0.833333,-1.333333,-0.888889,-1.000000,0.0,-0.142857,-0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
# splitting the data 
X_train , X_test , y_train , y_test = train_test_split(X , y ,train_size = 0.7)

In [97]:
# Training the model 
# MLPClassifier
sk_model = MLPClassifier(hidden_layer_sizes = (18 , 18))
sk_model.fit(X_train , y_train)



MLPClassifier(hidden_layer_sizes=(18, 18))

In [115]:
# Tensorflow
inputs = tf.keras.Input(shape = (83 , ))
x = tf.keras.layers.Dense(128 , activation = tf.keras.activations.relu)(inputs)
x = tf.keras.layers.Dense(128 , activation = tf.keras.activations.relu)(x)
x = tf.keras.layers.Dense(64 , activation = tf.keras.activations.relu)(x)
x = tf.keras.layers.Dense(64 , activation = tf.keras.activations.relu)(x)
x = tf.keras.layers.Dense(10 , activation = tf.keras.activations.relu)(x)
x = tf.keras.layers.Dense(5 , activation = tf.keras.activations.relu)(x)
x = tf.keras.layers.Dense(2 , activation = tf.keras.activations.relu)(x)
outputs = tf.keras.layers.Dense(2 , activation = tf.keras.activations.softmax)(x)

In [116]:
tf_model = tf.keras.Model(inputs = inputs , outputs = outputs)

tf_model.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy() ,
                optimizer = tf.keras.optimizers.Adam() ,
                metrics = ['accuracy'])


In [117]:
tf_model.fit(X_train , y_train , epochs = 40 , validation_split = 0.2 , batch_size = 32)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f6165045650>

In [118]:
sk_score = sk_model.score(X_test, y_test)
tf_score = tf_model.evaluate(X_test, y_test, verbose=False)

In [119]:
print(f"sklearn Model: {sk_score}")
print(f"TensorFlow Model: {tf_score[1]}")

sklearn Model: 0.7435897435897436
TensorFlow Model: 0.7179487347602844
