# **FIFA 19 Dataset: Predict Players Positions**

**Import the libraries we will use:**

In [458]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

In [459]:
# Load the FIFA 19 player table from a CSV file given by a relative path.
data = pd.read_csv('Fifa19.csv')

In [460]:
# (number of rows, number of columns) of the loaded table.
data.shape

(3517, 89)

In [461]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,Club,Playing_in_League,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,Real Madrid,Spain Primera Division (1),177003,L. Modrić,32,https://cdn.sofifa.org/players/4/19/177003.png,Croatia,https://cdn.sofifa.org/flags/10.png,91.0,91.0,...,84.0,60.0,76.0,73.0,13.0,9.0,7.0,14.0,9.0,€137.4M
1,Real Madrid,Spain Primera Division (1),155862,Sergio Ramos,32,https://cdn.sofifa.org/players/4/19/155862.png,Spain,https://cdn.sofifa.org/flags/45.png,91.0,91.0,...,82.0,87.0,92.0,91.0,11.0,8.0,9.0,7.0,11.0,€104.6M
2,Real Madrid,Spain Primera Division (1),182521,T. Kroos,28,https://cdn.sofifa.org/players/4/19/182521.png,Germany,https://cdn.sofifa.org/flags/21.png,90.0,90.0,...,85.0,72.0,79.0,69.0,10.0,11.0,13.0,7.0,10.0,€156.8M
3,Real Madrid,Spain Primera Division (1),192119,T. Courtois,26,https://cdn.sofifa.org/players/4/19/192119.png,Belgium,https://cdn.sofifa.org/flags/7.png,89.0,90.0,...,66.0,20.0,18.0,16.0,85.0,91.0,72.0,86.0,88.0,€113.7M
4,Real Madrid,Spain Primera Division (1),200145,Casemiro,26,https://cdn.sofifa.org/players/4/19/200145.png,Brazil,https://cdn.sofifa.org/flags/54.png,88.0,90.0,...,84.0,88.0,90.0,87.0,13.0,14.0,16.0,12.0,12.0,€126.4M


Loop over the columns to list each attribute together with its positional index:

In [462]:
# Print every column name next to its zero-based position, so that columns
# can later be selected by index.
for index, column_name in enumerate(data.columns):
    print(index, column_name)

0 Club
1 Playing_in_League
2 ID
3 Name
4 Age
5 Photo
6 Nationality
7 Flag
8 Overall
9 Potential
10 Club Logo
11 Value
12 Wage
13 Special
14 Preferred Foot
15 International Reputation
16 Weak Foot
17 Skill Moves
18 Work Rate
19 Body Type
20 Real Face
21 Position
22 Jersey Number
23 Joined
24 Loaned From
25 Contract Valid Until
26 Height
27 Weight
28 LS
29 ST
30 RS
31 LW
32 LF
33 CF
34 RF
35 RW
36 LAM
37 CAM
38 RAM
39 LM
40 LCM
41 CM
42 RCM
43 RM
44 LWB
45 LDM
46 CDM
47 RDM
48 RWB
49 LB
50 LCB
51 CB
52 RCB
53 RB
54 Crossing
55 Finishing
56 HeadingAccuracy
57 ShortPassing
58 Volleys
59 Dribbling
60 Curve
61 FKAccuracy
62 LongPassing
63 BallControl
64 Acceleration
65 SprintSpeed
66 Agility
67 Reactions
68 Balance
69 ShotPower
70 Jumping
71 Stamina
72 Strength
73 LongShots
74 Aggression
75 Interceptions
76 Positioning
77 Vision
78 Penalties
79 Composure
80 Marking
81 StandingTackle
82 SlidingTackle
83 GKDiving
84 GKHandling
85 GKKicking
86 GKPositioning
87 GKReflexes
88 Release Clause


Column 21 is the player's Position. We keep only Position (21), Height (26), Weight (27) and the technical attributes in columns 54 through 82 (Crossing to SlidingTackle):

In [463]:
# Positional indexes of the columns to keep: Position (21), Height (26),
# Weight (27) and the technical attributes at positions 54..82 inclusive.
cols = [21, 26, 27, *range(54, 83)]

In [464]:
# Keep only the columns whose positions are listed in `cols`.
# `data` is overwritten, so re-running this cell on its own would index into
# the already-reduced frame rather than the original table.
data = data.iloc[:, cols]

In [465]:
# Number of missing values in each column.
data.isna().sum()

Position           5
Height             5
Weight             5
Crossing           5
Finishing          5
HeadingAccuracy    5
ShortPassing       5
Volleys            5
Dribbling          5
Curve              5
FKAccuracy         5
LongPassing        5
BallControl        5
Acceleration       5
SprintSpeed        5
Agility            5
Reactions          5
Balance            5
ShotPower          5
Jumping            5
Stamina            5
Strength           5
LongShots          5
Aggression         5
Interceptions      5
Positioning        5
Vision             5
Penalties          5
Composure          5
Marking            5
StandingTackle     5
SlidingTackle      5
dtype: int64

We can see that the 5 records with no Position are also missing all of the other attributes, so we will delete these 5 records with missing data:

In [466]:
# Column dtypes and non-null counts before the incomplete rows are dropped.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3517 entries, 0 to 3516
Data columns (total 32 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Position         3512 non-null   object 
 1   Height           3512 non-null   object 
 2   Weight           3512 non-null   object 
 3   Crossing         3512 non-null   float64
 4   Finishing        3512 non-null   float64
 5   HeadingAccuracy  3512 non-null   float64
 6   ShortPassing     3512 non-null   float64
 7   Volleys          3512 non-null   float64
 8   Dribbling        3512 non-null   float64
 9   Curve            3512 non-null   float64
 10  FKAccuracy       3512 non-null   float64
 11  LongPassing      3512 non-null   float64
 12  BallControl      3512 non-null   float64
 13  Acceleration     3512 non-null   float64
 14  SprintSpeed      3512 non-null   float64
 15  Agility          3512 non-null   float64
 16  Reactions        3512 non-null   float64
 17  Balance       

In [467]:
# Drop every row that contains at least one missing value.
data = data.dropna()

In [468]:
# Re-check dtypes and non-null counts after dropping the incomplete rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3512 entries, 0 to 3515
Data columns (total 32 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Position         3512 non-null   object 
 1   Height           3512 non-null   object 
 2   Weight           3512 non-null   object 
 3   Crossing         3512 non-null   float64
 4   Finishing        3512 non-null   float64
 5   HeadingAccuracy  3512 non-null   float64
 6   ShortPassing     3512 non-null   float64
 7   Volleys          3512 non-null   float64
 8   Dribbling        3512 non-null   float64
 9   Curve            3512 non-null   float64
 10  FKAccuracy       3512 non-null   float64
 11  LongPassing      3512 non-null   float64
 12  BallControl      3512 non-null   float64
 13  Acceleration     3512 non-null   float64
 14  SprintSpeed      3512 non-null   float64
 15  Agility          3512 non-null   float64
 16  Reactions        3512 non-null   float64
 17  Balance       

In [469]:
# Confirm that no column has missing values any more.
data.isna().sum()

Position           0
Height             0
Weight             0
Crossing           0
Finishing          0
HeadingAccuracy    0
ShortPassing       0
Volleys            0
Dribbling          0
Curve              0
FKAccuracy         0
LongPassing        0
BallControl        0
Acceleration       0
SprintSpeed        0
Agility            0
Reactions          0
Balance            0
ShotPower          0
Jumping            0
Stamina            0
Strength           0
LongShots          0
Aggression         0
Interceptions      0
Positioning        0
Vision             0
Penalties          0
Composure          0
Marking            0
StandingTackle     0
SlidingTackle      0
dtype: int64

We combine related positions to reach just 11 different positions:

In [470]:
# We go for a 4-3-3 formation: collapse the detailed positions into a
# coarser set of labels. Each key is the label that replaces every position
# listed in its value; groups are applied in the order written.
# NOTE(review): RW/LW (presumably wingers) and CB (presumably centre-backs)
# are folded into RB/LB here - confirm that this grouping is intended.
position_groups = {
    # Forwards
    'CF': ['ST'],                   # strikers -> central forward
    'RF': ['RS'],                   # right strikers -> right forward
    'LF': ['LS'],                   # left strikers -> left forward
    # Midfield
    'RM': ['RCM', 'RAM', 'RDM'],    # right-sided midfield variants -> right midfielder
    'LM': ['LCM', 'LAM', 'LDM'],    # left-sided midfield variants -> left midfielder
    'CM': ['CAM', 'CDM'],           # attacking/defensive central midfielders -> central midfielder
    # Defense
    'RB': ['RW', 'RWB'],            # right wing / wing back -> right back
    'LB': ['LW', 'LWB', 'CB'],      # left wing / wing back and CB -> left back
}
for merged_label, original_labels in position_groups.items():
    data.loc[data['Position'].isin(original_labels), 'Position'] = merged_label

#  **Studying player height**

In [471]:
# Look at the raw height strings before converting them.
data['Height'].head()

0    5'8
1    6'0
2    6'0
3    6'6
4    6'1
Name: Height, dtype: object

I will convert the height to centimeters:

In [472]:
# Heights are strings such as 5'8 (feet'inches). Split on the apostrophe and
# convert to centimetres: 30.48 cm per foot plus 2.54 cm per inch.
height_parts = data['Height'].str.split("'")
feet = height_parts.str[0].astype(int)
inches = height_parts.str[1].astype(int)
data['Height'] = 30.48 * feet + 2.54 * inches

# **Studying the weight of the players**

In [473]:
# Look at the raw weight strings before converting them.
data['Weight'].head()

0    146lbs
1    181lbs
2    168lbs
3    212lbs
4    185lbs
Name: Weight, dtype: object

Let's convert the weight to kilograms:

In [474]:
# Weights are strings such as 146lbs. Keep the digits in front of the first
# 'l' and convert pounds to kilograms with the rounded factor 0.453.
pounds = data['Weight'].str.split('l').str[0].astype(int)
data['Weight'] = pounds * 0.453

# **Preparing the database:**

In [475]:
# Keep the target labels in their own array before the column is removed
# from the feature table, then show how many players fall in each class.
position = np.array(data['Position'])
labels, counts = np.unique(position, return_counts=True)
(labels, counts)

(array(['CF', 'CM', 'GK', 'LB', 'LCB', 'LF', 'LM', 'RB', 'RCB', 'RF', 'RM'],
       dtype=object),
 array([391, 674, 390, 725, 114,  46, 323, 357, 125,  42, 325]))

Now we can delete the Position column from the database

In [476]:
# Remove the target column so that `data` holds only the input features,
# then preview the result.
data = data.drop(columns=['Position'])
data.head()

Unnamed: 0,Height,Weight,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,...,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle
0,172.72,66.138,86.0,72.0,55.0,93.0,76.0,90.0,85.0,78.0,...,82.0,62.0,83.0,79.0,92.0,82.0,84.0,60.0,76.0,73.0
1,182.88,81.993,66.0,60.0,91.0,78.0,66.0,63.0,74.0,72.0,...,59.0,88.0,90.0,60.0,63.0,75.0,82.0,87.0,92.0,91.0
2,182.88,76.104,88.0,76.0,54.0,92.0,82.0,81.0,86.0,84.0,...,92.0,60.0,82.0,79.0,86.0,73.0,85.0,72.0,79.0,69.0
3,198.12,96.036,14.0,14.0,13.0,33.0,12.0,13.0,19.0,20.0,...,17.0,23.0,15.0,13.0,44.0,27.0,66.0,20.0,18.0,16.0
4,185.42,83.805,52.0,59.0,76.0,85.0,53.0,69.0,59.0,74.0,...,79.0,87.0,87.0,69.0,77.0,66.0,84.0,88.0,90.0,87.0


We will now scale the values so that they lie between 0 and 1. Height, for example, takes much larger values than Heading Accuracy, which can hurt scale-sensitive methods such as KMeans; note that the models trained below (XGBoost, decision tree, random forest) are tree-based classifiers rather than KMeans. Scaling puts all features on a comparable range.

In [477]:
# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in the next cell, so the test rows contribute to the learned min/max.
scaler = MinMaxScaler()
train = scaler.fit(data).transform(data)

In [478]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    position,
    test_size=0.2,
    random_state=35,
)

# **XGBoost**

In [479]:
from sklearn.preprocessing import LabelEncoder

# Encode the position strings as integer class ids. The encoder learns its
# classes from the training labels, and the same mapping is then applied to
# the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [480]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier with 40 boosting rounds and a fixed seed.
# No `eval_metric` is passed to fit(): `f1_score` is only imported in a later
# cell (so referencing it here fails on a fresh kernel), a metric has nothing
# to be evaluated on without an `eval_set`, and `eval_metric` as a fit()
# argument is deprecated since XGBoost 1.6 and rejected by later releases.
# Model quality is measured with macro F1 on the held-out set in the next cell.
xgb = XGBClassifier(n_estimators=40, random_state=20)
xgb.fit(X_train, y_train)

In [481]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the XGBoost model on the held-out test set.
pred = xgb.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=pred, average='macro')
print(f1)

0.4173446663577131


# **Decision Tree**

In [512]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with a fixed seed and otherwise default settings,
# fitted on the training split.
md = DecisionTreeClassifier(random_state=37)
md.fit(X_train, y_train)

In [513]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the decision tree on the held-out test set.
pred = md.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=pred, average='macro')
print(f1)

0.36575961693784914


# **Random Forest**

In [536]:
# Random forest with max_features='log2' and a fixed seed, fitted on the
# training split.
rf = RandomForestClassifier(max_features='log2', random_state=35)
rf.fit(X_train, y_train)

In [537]:
from sklearn.metrics import f1_score

# Macro-averaged F1 of the random forest on the held-out test set.
pred = rf.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=pred, average='macro')
print(f1)

0.3851997416002281
