## Predicting FIFA Man of the Match 

Data source: https://www.kaggle.com/datasets/mathan/fifa-2018-match-statistics

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
import tensorflow as tf

2025-03-23 08:14:41.328053: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data = pd.read_csv('FIFA 2018 Statistics.csv')
data

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,06-14-18,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,Yes,12.0,Group Stage,No,0,,
1,06-14-18,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,No,,Group Stage,No,0,,
2,06-15-18,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,No,,Group Stage,No,0,,
3,06-15-18,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,Yes,89.0,Group Stage,No,0,,
4,06-15-18,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,No,,Group Stage,No,0,1.0,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,11-07-18,England,Croatia,1,46,11,1,6,4,4,...,1,0,0,No,5.0,Semi- Finals,No,0,,
124,07-14-18,Belgium,England,2,43,12,4,3,5,4,...,1,0,0,Yes,4.0,3rd Place,No,0,,
125,07-14-18,England,Belgium,0,57,15,5,7,3,5,...,2,0,0,No,,3rd Place,No,0,,
126,07-15-18,France,Croatia,4,39,8,6,1,1,2,...,2,0,0,Yes,18.0,Final,No,0,1.0,18.0


### Preprocessing

In [3]:
df = data.copy()
df.drop('Date', axis=1, inplace=True)

#### Dealing with Missing Values

In [5]:
df.isnull().mean()

Team                      0.000000
Opponent                  0.000000
Goal Scored               0.000000
Ball Possession %         0.000000
Attempts                  0.000000
On-Target                 0.000000
Off-Target                0.000000
Blocked                   0.000000
Corners                   0.000000
Offsides                  0.000000
Free Kicks                0.000000
Saves                     0.000000
Pass Accuracy %           0.000000
Passes                    0.000000
Distance Covered (Kms)    0.000000
Fouls Committed           0.000000
Yellow Card               0.000000
Yellow & Red              0.000000
Red                       0.000000
Man of the Match          0.000000
1st Goal                  0.265625
Round                     0.000000
PSO                       0.000000
Goals in PSO              0.000000
Own goals                 0.906250
Own goal Time             0.906250
dtype: float64

In [6]:
df.drop(['Own goal Time', 'Own goals'], axis=1, inplace=True)

In [7]:
df['1st Goal'] = df['1st Goal'].fillna(df['1st Goal'].mean())

In [8]:
df.isnull().sum().sum()

0

#### Encoding

In [9]:
df.dtypes

Team                       object
Opponent                   object
Goal Scored                 int64
Ball Possession %           int64
Attempts                    int64
On-Target                   int64
Off-Target                  int64
Blocked                     int64
Corners                     int64
Offsides                    int64
Free Kicks                  int64
Saves                       int64
Pass Accuracy %             int64
Passes                      int64
Distance Covered (Kms)      int64
Fouls Committed             int64
Yellow Card                 int64
Yellow & Red                int64
Red                         int64
Man of the Match           object
1st Goal                  float64
Round                      object
PSO                        object
Goals in PSO                int64
dtype: object

In [12]:
{column: df[column].unique() for column in df.select_dtypes('object').columns}

{'Team': array(['Russia', 'Saudi Arabia', 'Egypt', 'Uruguay', 'Morocco', 'Iran',
        'Portugal', 'Spain', 'France', 'Australia', 'Argentina', 'Iceland',
        'Peru', 'Denmark', 'Croatia', 'Nigeria', 'Costa Rica', 'Serbia',
        'Germany', 'Mexico', 'Brazil', 'Switzerland', 'Sweden',
        'Korea Republic', 'Belgium', 'Panama', 'Tunisia', 'England',
        'Colombia', 'Japan', 'Poland', 'Senegal'], dtype=object),
 'Opponent': array(['Saudi Arabia', 'Russia', 'Uruguay', 'Egypt', 'Iran', 'Morocco',
        'Spain', 'Portugal', 'Australia', 'France', 'Iceland', 'Argentina',
        'Denmark', 'Peru', 'Nigeria', 'Croatia', 'Serbia', 'Costa Rica',
        'Mexico', 'Germany', 'Switzerland', 'Brazil', 'Korea Republic',
        'Sweden', 'Panama', 'Belgium', 'England', 'Tunisia', 'Japan',
        'Colombia', 'Senegal', 'Poland'], dtype=object),
 'Man of the Match': array(['Yes', 'No'], dtype=object),
 'Round': array(['Group Stage', 'Round of 16', 'Quarter Finals', 'Semi- Finals',


In [13]:
label_encoder = LabelEncoder()
label_encoder.fit_transform(df['Man of the Match'])

array([1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0])

In [15]:
man_mappings = {index: label for index, label in enumerate(label_encoder.classes_)}
man_mappings

{0: 'No', 1: 'Yes'}

In [16]:
df['Man of the Match'] = label_encoder.fit_transform(df['Man of the Match'])

In [17]:
df['PSO'] = label_encoder.fit_transform(df['PSO'])
pso_mappings = {index: label for index, label in enumerate(label_encoder.classes_)}
pso_mappings

{0: 'No', 1: 'Yes'}

In [18]:
df

Unnamed: 0,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,...,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO
0,Russia,Saudi Arabia,5,40,13,7,3,3,6,3,...,118,22,0,0,0,1,12.000000,Group Stage,0,0
1,Saudi Arabia,Russia,0,60,6,0,3,3,2,1,...,105,10,0,0,0,0,39.457447,Group Stage,0,0
2,Egypt,Uruguay,0,43,8,3,3,2,0,1,...,112,12,2,0,0,0,39.457447,Group Stage,0,0
3,Uruguay,Egypt,1,57,14,4,6,4,5,1,...,111,6,0,0,0,1,89.000000,Group Stage,0,0
4,Morocco,Iran,0,64,13,3,6,4,5,0,...,101,22,1,0,0,0,39.457447,Group Stage,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,England,Croatia,1,46,11,1,6,4,4,3,...,148,14,1,0,0,0,5.000000,Semi- Finals,0,0
124,Belgium,England,2,43,12,4,3,5,4,1,...,108,11,1,0,0,1,4.000000,3rd Place,0,0
125,England,Belgium,0,57,15,5,7,3,5,0,...,110,5,2,0,0,0,39.457447,3rd Place,0,0
126,France,Croatia,4,39,8,6,1,1,2,1,...,99,14,2,0,0,1,18.000000,Final,0,0


In [19]:
label_encoder.fit_transform(df['Round'])
round_mappings = {index: label for index, label in enumerate(label_encoder.classes_)}
round_mappings

{0: '3rd Place',
 1: 'Final',
 2: 'Group Stage',
 3: 'Quarter Finals',
 4: 'Round of 16',
 5: 'Semi- Finals'}

In [20]:
round_values = list(df['Round'].unique())
round_values

['Group Stage',
 'Round of 16',
 'Quarter Finals',
 'Semi- Finals',
 '3rd Place',
 'Final']

In [21]:
round_mappings = {label: index for index, label in enumerate(round_values)}
round_mappings

{'Group Stage': 0,
 'Round of 16': 1,
 'Quarter Finals': 2,
 'Semi- Finals': 3,
 '3rd Place': 4,
 'Final': 5}

In [22]:
df['Round'] = df['Round'].apply(lambda x: round_mappings[x])
df['Round']

0      0
1      0
2      0
3      0
4      0
      ..
123    3
124    4
125    4
126    5
127    5
Name: Round, Length: 128, dtype: int64

In [24]:
{column: df[column].unique() for column in df.select_dtypes('object').columns}

{'Team': array(['Russia', 'Saudi Arabia', 'Egypt', 'Uruguay', 'Morocco', 'Iran',
        'Portugal', 'Spain', 'France', 'Australia', 'Argentina', 'Iceland',
        'Peru', 'Denmark', 'Croatia', 'Nigeria', 'Costa Rica', 'Serbia',
        'Germany', 'Mexico', 'Brazil', 'Switzerland', 'Sweden',
        'Korea Republic', 'Belgium', 'Panama', 'Tunisia', 'England',
        'Colombia', 'Japan', 'Poland', 'Senegal'], dtype=object),
 'Opponent': array(['Saudi Arabia', 'Russia', 'Uruguay', 'Egypt', 'Iran', 'Morocco',
        'Spain', 'Portugal', 'Australia', 'France', 'Iceland', 'Argentina',
        'Denmark', 'Peru', 'Nigeria', 'Croatia', 'Serbia', 'Costa Rica',
        'Mexico', 'Germany', 'Switzerland', 'Brazil', 'Korea Republic',
        'Sweden', 'Panama', 'Belgium', 'England', 'Tunisia', 'Japan',
        'Colombia', 'Senegal', 'Poland'], dtype=object)}

In [27]:
df['Opponent'] = df['Opponent'].apply(lambda x: 'opp_' + x)

In [28]:
df_concat = pd.concat([df, pd.get_dummies(df['Team']), pd.get_dummies(df['Opponent'])], axis=1)

In [29]:
df_concat

Unnamed: 0,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,...,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia,opp_Uruguay
0,Russia,opp_Saudi Arabia,5,40,13,7,3,3,6,3,...,False,False,True,False,False,False,False,False,False,False
1,Saudi Arabia,opp_Russia,0,60,6,0,3,3,2,1,...,False,True,False,False,False,False,False,False,False,False
2,Egypt,opp_Uruguay,0,43,8,3,3,2,0,1,...,False,False,False,False,False,False,False,False,False,True
3,Uruguay,opp_Egypt,1,57,14,4,6,4,5,1,...,False,False,False,False,False,False,False,False,False,False
4,Morocco,opp_Iran,0,64,13,3,6,4,5,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,England,opp_Croatia,1,46,11,1,6,4,4,3,...,False,False,False,False,False,False,False,False,False,False
124,Belgium,opp_England,2,43,12,4,3,5,4,1,...,False,False,False,False,False,False,False,False,False,False
125,England,opp_Belgium,0,57,15,5,7,3,5,0,...,False,False,False,False,False,False,False,False,False,False
126,France,opp_Croatia,4,39,8,6,1,1,2,1,...,False,False,False,False,False,False,False,False,False,False


In [30]:
df_concat.drop(['Team', 'Opponent'], axis=1, inplace=True)

In [34]:
np.sum(df_concat.dtypes == 'object')

0

### Scaling

In [35]:
y = df_concat['Man of the Match']
X = df_concat.drop('Man of the Match', axis=1)

In [36]:
y

0      1
1      0
2      0
3      1
4      0
      ..
123    0
124    1
125    0
126    1
127    0
Name: Man of the Match, Length: 128, dtype: int64

In [37]:
X

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia,opp_Uruguay
0,5,40,13,7,3,3,6,3,11,0,...,False,False,True,False,False,False,False,False,False,False
1,0,60,6,0,3,3,2,1,25,2,...,False,True,False,False,False,False,False,False,False,False
2,0,43,8,3,3,2,0,1,7,3,...,False,False,False,False,False,False,False,False,False,True
3,1,57,14,4,6,4,5,1,13,3,...,False,False,False,False,False,False,False,False,False,False
4,0,64,13,3,6,4,5,0,14,2,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,1,46,11,1,6,4,4,3,24,5,...,False,False,False,False,False,False,False,False,False,False
124,2,43,12,4,3,5,4,1,5,5,...,False,False,False,False,False,False,False,False,False,False
125,0,57,15,5,7,3,5,0,12,2,...,False,False,False,False,False,False,False,False,False,False
126,4,39,8,6,1,1,2,1,14,1,...,False,False,False,False,False,False,False,False,False,False


In [38]:
scaler = RobustScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [39]:
X

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,...,opp_Portugal,opp_Russia,opp_Saudi Arabia,opp_Senegal,opp_Serbia,opp_Spain,opp_Sweden,opp_Switzerland,opp_Tunisia,opp_Uruguay
0,2.0,-0.6250,0.166667,1.166667,-0.666667,0.000000,0.333333,1.0,-0.571429,-0.666667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.5,0.6250,-1.000000,-1.166667,-0.666667,0.000000,-1.000000,0.0,1.428571,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.5,-0.4375,-0.666667,-0.166667,-0.666667,-0.444444,-1.666667,0.0,-1.142857,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.4375,0.333333,0.166667,0.333333,0.444444,0.000000,0.0,-0.285714,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.5,0.8750,0.166667,-0.166667,0.333333,0.444444,0.000000,-0.5,-0.142857,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0.0,-0.2500,-0.166667,-0.833333,0.333333,0.444444,-0.333333,1.0,1.285714,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124,0.5,-0.4375,0.000000,0.166667,-0.666667,0.888889,-0.333333,0.0,-1.428571,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,-0.5,0.4375,0.500000,0.500000,0.666667,0.000000,0.000000,-0.5,-0.428571,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
126,1.5,-0.6875,-0.666667,0.833333,-1.333333,-0.888889,-1.000000,0.0,-0.142857,-0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Splitting the data

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

### Training

In [41]:
sk_model = MLPClassifier(hidden_layer_sizes=(32,32))
sk_model.fit(X_train, y_train)



In [42]:
inputs = tf.keras.Input(shape=(85,))
x = tf.keras.layers.Dense(32, activation='relu')(inputs)
x = tf.keras.layers.Dense(32, activation='relu')(x)
outputs = tf.keras.layers.Dense(2, activation=tf.nn.softmax)(x)

tf_model = tf.keras.Model(inputs=inputs, outputs=outputs)

2025-03-23 09:37:15.525689: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [43]:
tf_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [47]:
tf_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x74fa9c239900>

### Results

In [48]:
sk_score = sk_model.score(X_train, y_train)
tf_score = tf_model.evaluate(X_train, y_train, verbose=False)

In [51]:
print(f"sklearn Model: {sk_score}")
print(f"Tensorflow Model: {tf_score[1]}")

sklearn Model: 1.0
Tensorflow Model: 1.0


In [52]:
X_test.shape

(39, 85)