So far, we have gathered two types of data. Now, it's time to merge all the collected data and begin the cleaning process.

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import requests
from bs4 import BeautifulSoup as bs
import re

In [41]:
df1 = pd.read_csv('transfer_market.csv')
df2 = pd.read_csv('stats.csv')

In [42]:
dataF = df1.merge(df2, on = "name")
dataF.head()

Unnamed: 0.1,Unnamed: 0,name,position,age,market_value,country_from,league_from,club_from,country_to,league_to,club_to,fee,loan,ATT,SKI,MOV,POW,MEN,DEF,GK
0,0,Erling Haaland,Centre-Forward,21,150.0,Germany,Bundesliga,Borussia Dortmund,England,Premier League,Manchester City,60.0,False,76.0,70.0,82.0,86.0,75.0,42.0,10.0
1,0,Erling Haaland,Centre-Forward,21,150.0,Germany,Bundesliga,Borussia Dortmund,England,Premier League,Manchester City,60.0,False,60.0,57.0,73.0,66.0,63.0,86.0,9.0
2,25,Erling Haaland,Centre-Forward,21,150.0,Germany,Bundesliga,Borussia Dortmund,England,Premier League,Manchester City,60.0,False,76.0,70.0,82.0,86.0,75.0,42.0,10.0
3,25,Erling Haaland,Centre-Forward,21,150.0,Germany,Bundesliga,Borussia Dortmund,England,Premier League,Manchester City,60.0,False,60.0,57.0,73.0,66.0,63.0,86.0,9.0
4,1,Antony,Right Winger,22,35.0,Netherlands,Eredivisie,Ajax Amsterdam,England,Premier League,Manchester United,95.0,False,54.0,63.0,66.0,51.0,49.0,42.0,11.0


In [43]:
# to handle the string values in 'fee' column like 750k and not 750000

def handle_k_values(value):
  try:
    return float(value)
  except ValueError:
    if value[-1] == 'k':
      return float(value[:-1]) * 1000
    else:
      return np.nan

In [44]:
dataF = dataF.drop(["Unnamed: 0", "market_value", "country_from", "league_from", "club_from", "country_to", "league_to", "club_to", "loan"], axis=1)

In [45]:
dataF['fee'] = dataF['fee'].apply(handle_k_values)
dataF['age'] = dataF['age'].astype(int)

In [46]:
dataF.isnull().sum()

name          0
position      0
age           0
fee           0
ATT         734
SKI         734
MOV         734
POW         734
MEN         734
DEF         734
GK          734
dtype: int64

In [47]:
dataF = dataF.dropna()

In [48]:
dataF = dataF.drop_duplicates()

In [49]:
dataF = dataF.reset_index(drop=True)
dataF.shape

(678, 11)

In [50]:
dataF.isnull().sum()

name        0
position    0
age         0
fee         0
ATT         0
SKI         0
MOV         0
POW         0
MEN         0
DEF         0
GK          0
dtype: int64

In [51]:
df_encoded = pd.get_dummies(dataF, columns = ['position'], dtype=int)

In [52]:
df_encoded.shape

(678, 23)

In [53]:
# Are there two different players with the same name?
# If not, then we can drop duplicates based on the 'name' column
duplicate_rows = df_encoded[df_encoded.duplicated()]
duplicate_rows

Unnamed: 0,name,age,fee,ATT,SKI,MOV,POW,MEN,DEF,GK,...,position_Centre-Forward,position_Defensive Midfield,position_Goalkeeper,position_Left Midfield,position_Left Winger,position_Left-Back,position_Right Midfield,position_Right Winger,position_Right-Back,position_Second Striker


In [54]:
df_encoded.drop_duplicates(subset='name', inplace=True)
df_encoded.dropna(inplace=True)

In [55]:
df_encoded = df_encoded.reset_index(drop=True)

In [56]:
df_encoded.head()

Unnamed: 0,name,age,fee,ATT,SKI,MOV,POW,MEN,DEF,GK,...,position_Centre-Forward,position_Defensive Midfield,position_Goalkeeper,position_Left Midfield,position_Left Winger,position_Left-Back,position_Right Midfield,position_Right Winger,position_Right-Back,position_Second Striker
0,Erling Haaland,21,60.0,76.0,70.0,82.0,86.0,75.0,42.0,10.0,...,1,0,0,0,0,0,0,0,0,0
1,Antony,22,95.0,54.0,63.0,66.0,51.0,49.0,42.0,11.0,...,0,0,0,0,0,0,0,1,0,0
2,Wesley Fofana,21,80.4,50.0,54.0,69.0,62.0,62.0,80.0,11.0,...,0,0,0,0,0,0,0,0,0,0
3,Alexander Isak,22,70.0,71.0,78.0,92.0,72.0,68.0,49.0,13.0,...,1,0,0,0,0,0,0,0,0,0
4,Matthijs de Ligt,22,67.0,81.0,75.0,71.0,80.0,72.0,34.0,11.0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   name                         613 non-null    object 
 1   age                          613 non-null    int64  
 2   fee                          613 non-null    float64
 3   ATT                          613 non-null    float64
 4   SKI                          613 non-null    float64
 5   MOV                          613 non-null    float64
 6   POW                          613 non-null    float64
 7   MEN                          613 non-null    float64
 8   DEF                          613 non-null    float64
 9   GK                           613 non-null    float64
 10  position_Attacking Midfield  613 non-null    int64  
 11  position_Central Midfield    613 non-null    int64  
 12  position_Centre-Back         613 non-null    int64  
 13  position_Centre-Forw

In [58]:
df_encoded.describe()

Unnamed: 0,age,fee,ATT,SKI,MOV,POW,MEN,DEF,GK,position_Attacking Midfield,...,position_Centre-Forward,position_Defensive Midfield,position_Goalkeeper,position_Left Midfield,position_Left Winger,position_Left-Back,position_Right Midfield,position_Right Winger,position_Right-Back,position_Second Striker
count,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0,...,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0,613.0
mean,24.760196,15088.775628,58.154976,59.942904,68.525285,65.657423,59.226754,50.858075,13.67863,0.065253,...,0.190865,0.078303,0.057096,0.006525,0.096248,0.060359,0.008157,0.070147,0.055465,0.008157
std,3.65992,85296.806604,12.722932,13.156611,9.738313,8.751663,10.017915,21.385073,14.192289,0.247173,...,0.393304,0.268868,0.232216,0.080581,0.295172,0.238345,0.090018,0.255603,0.229072,0.090018
min,18.0,0.0,10.0,13.0,29.0,32.0,17.0,9.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,54.0,55.0,64.0,61.0,55.0,30.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,24.0,3.0,61.0,62.0,70.0,67.0,61.0,58.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,27.0,7.1,66.0,69.0,75.0,72.0,66.0,70.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,35.0,890000.0,86.0,88.0,92.0,86.0,81.0,86.0,80.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [59]:
df_encoded.to_csv('model_training_data.csv')