In [27]:
import pandas as pd

In [28]:
# Load the raw FIFA 21 player table from the local data directory.
df = pd.read_csv('data/fifa-21-player-datasets.csv')

# Quick sanity check of what was loaded: first rows, column labels, (rows, cols).
for summary in (df.head(), df.columns, df.shape):
    print(summary)

                name  age  height_cm  weight_kg nationality  \
0           L. Messi   33        170         72   Argentina   
1  Cristiano Ronaldo   35        187         83    Portugal   
2     R. Lewandowski   31        184         80      Poland   
3          Neymar Jr   28        175         68      Brazil   
4       K. De Bruyne   29        181         70     Belgium   

             club_name             league_name  overall  potential  value_eur  \
0         FC Barcelona  Spain Primera Division       93         93   67500000   
1             Juventus         Italian Serie A       92         92   46000000   
2    FC Bayern München    German 1. Bundesliga       91         91   80000000   
3  Paris Saint-Germain          French Ligue 1       91         91   90000000   
4      Manchester City  English Premier League       91         91   87000000   

   ...  preferred_foot weak_foot skill_moves    work_rate  pace shooting  \
0  ...            Left         4           4   Medium/Low 

In [29]:
# Normalise column labels: trim surrounding whitespace, lower-case, and use
# underscores instead of spaces so every column is addressable as snake_case.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)

In [30]:
# Display the frame's current column labels.
df.columns

Index(['name', 'age', 'height_cm', 'weight_kg', 'nationality', 'club_name',
       'league_name', 'overall', 'potential', 'value_eur', 'wage_eur',
       'player_positions', 'preferred_foot', 'weak_foot', 'skill_moves',
       'work_rate', 'pace', 'shooting', 'passing', 'dribbling', 'defending',
       'physic'],
      dtype='object')

In [31]:
# Count missing (NaN) entries in each column.
df.isna().sum()

name                0
age                 0
height_cm           0
weight_kg           0
nationality         0
club_name           0
league_name         0
overall             0
potential           0
value_eur           0
wage_eur            0
player_positions    0
preferred_foot      0
weak_foot           0
skill_moves         0
work_rate           0
pace                0
shooting            0
passing             0
dribbling           0
defending           0
physic              0
dtype: int64

In [32]:
# Remove fully duplicated rows (first occurrence is kept, original index labels
# are preserved). Rebinding `df` instead of using `inplace=True` avoids mutating
# the object that earlier cells displayed and keeps the step chainable.
df = df.drop_duplicates()

In [33]:
# Binary target: 1 when a player's market value is strictly greater than the
# median of `value_eur` over the current frame, otherwise 0.
median_value = df['value_eur'].median()
df['high_value'] = df['value_eur'].gt(median_value).astype(int)

In [34]:
# Bucket `age` into labelled bands.
# - The last edge is open-ended so the '36+' label really covers every age
#   above 35; with a finite top edge of 45, older ages fell outside all bins
#   and became NaN (which one-hot encoding then turns into an all-zero row).
# - `include_lowest=True` closes the first interval on the left so age 15 is
#   part of '15-20' rather than NaN.
AGE_BIN_EDGES = [15, 20, 25, 30, 35, float('inf')]
AGE_BIN_LABELS = ['15-20', '21-25', '26-30', '31-35', '36+']
df['age_group'] = pd.cut(
    df['age'],
    bins=AGE_BIN_EDGES,
    labels=AGE_BIN_LABELS,
    include_lowest=True,
)

In [35]:
# Derived feature: height (cm) divided by weight (kg), computed row-wise.
df['height_weight_ratio'] = df['height_cm'].div(df['weight_kg'])

In [36]:
from sklearn.preprocessing import LabelEncoder

# `player_positions` is a comma-separated string; keep the text before the
# first comma as the player's primary position.
df['primary_position'] = df['player_positions'].map(
    lambda positions: positions.split(',')[0]
)

# Integer-encode the primary position. The fitted encoder stays available as
# `le` so the codes can be mapped back to position labels.
le = LabelEncoder()
df['primary_position_enc'] = le.fit_transform(df['primary_position'])

In [37]:
# One-hot encode the categorical columns. `drop_first=True` omits the first
# level of each column, so that level is represented by all-zero indicators.
categorical_columns = ['preferred_foot', 'work_rate', 'age_group']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [38]:
# Tukey-style outlier filter on `value_eur`: keep only rows whose value lies
# within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both ends inclusive.
Q1, Q3 = df['value_eur'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

within_fences = df['value_eur'].between(lower_bound, upper_bound)
df = df[within_fences]

In [39]:
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,name,age,height_cm,weight_kg,nationality,club_name,league_name,overall,potential,value_eur,...,work_rate_Low/High,work_rate_Low/Low,work_rate_Low/Medium,work_rate_Medium/High,work_rate_Medium/Low,work_rate_Medium/Medium,age_group_21-25,age_group_26-30,age_group_31-35,age_group_36+
265,Pepe,37,188,81,Portugal,FC Porto,Portuguese Liga ZON SAGRES,81,81,3600000,...,False,False,False,True,False,False,False,False,False,True
377,B. Ivanović,36,185,91,Serbia,West Bromwich Albion,English Premier League,80,80,3000000,...,False,False,False,False,False,False,False,False,False,True
500,Quaresma,36,175,67,Portugal,Vitória Guimarães,Portuguese Liga ZON SAGRES,79,79,4200000,...,False,False,False,False,False,False,False,False,False,True
501,J. Mascherano,36,174,73,Argentina,Estudiantes de La Plata,Argentina Primera División,79,79,2700000,...,True,False,False,False,False,False,False,False,False,True
521,M. Parolo,35,184,75,Italy,Lazio,Italian Serie A,79,79,4000000,...,False,False,False,False,False,False,False,False,True,False


In [40]:
# Persist the processed frame as CSV; the index is not written out.
output_path = "data/fifa_players_preprocessed.csv"
df.to_csv(output_path, index=False)