# Data Preprocessing

In [19]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### Player Valuations dataset

In [21]:
player_valuations = pd.read_csv('../../data/preprocessed_valuations.csv')
player_valuations.drop(columns='Unnamed: 0', inplace=True)
player_valuations.head()

Unnamed: 0,name,last_season,average_value
0,A.J. Soares,2016.0,218750.0
1,AJ Leitch-Smith,2017.0,121428.6
2,Aaron Appindangoyé,2023.0,833333.3
3,Aaron Bastiaans,2020.0,165625.0
4,Aaron Boupendza,2020.0,4266667.0


In [22]:
# Global historical inflation rate. Min year is 2012
inflation_rates = {
    2012: 0.0373,
    2013: 0.0262,
    2014: 0.0235,
    2015: 0.0143,
    2016: 0.0155,
    2017: 0.0219,
    2018: 0.0244,
    2019: 0.0221,
    2020: 0.0193,
    2021: 0.0348,
    2022: 0.0827,
}

def adjust_for_inflation(value, year):
    adjusted_value = value
    for y in range(int(year) + 1, max(inflation_rates.keys()) + 1):
        adjusted_value *= (1 + inflation_rates[y])
    return adjusted_value

if 'adjusted_value' not in player_valuations.columns:
    player_valuations['adjusted_value'] = player_valuations.apply(lambda row: adjust_for_inflation(row['average_value'], row['last_season']), axis=1)
    player_valuations.drop(columns = 'average_value', inplace=True)
    


In [23]:
player_valuations

Unnamed: 0,name,last_season,adjusted_value
0,A.J. Soares,2016.0,2.672920e+05
1,AJ Leitch-Smith,2017.0,1.451946e+05
2,Aaron Appindangoyé,2023.0,8.333333e+05
3,Aaron Bastiaans,2020.0,1.855626e+05
4,Aaron Boupendza,2020.0,4.780279e+06
...,...,...,...
28686,Ümit Kurt,2016.0,1.043508e+06
28687,Ümit Türker,2016.0,6.109532e+04
28688,Ümran Zambak,2019.0,5.710006e+04
28689,Üstün Bilgi,2013.0,2.064856e+05


In [25]:
num_columns = ['last_season', 'adjusted_value']
num_data = player_valuations[num_columns]
scaler = StandardScaler().fit(num_data)
scaled_valuations = scaler.transform(num_data)

# Replace original data with scaled data 
player_valuations[num_columns] = scaled_valuations   
player_valuations

Unnamed: 0,name,last_season,adjusted_value
0,A.J. Soares,-0.752289,-0.333430
1,AJ Leitch-Smith,-0.479310,-0.362391
2,Aaron Appindangoyé,1.158566,-0.199166
3,Aaron Bastiaans,0.339628,-0.352816
4,Aaron Boupendza,0.339628,0.737044
...,...,...,...
28686,Ümit Kurt,-0.752289,-0.149313
28687,Ümit Türker,-0.752289,-0.382339
28688,Ümran Zambak,0.066649,-0.383287
28689,Üstün Bilgi,-1.571227,-0.347853


### Player skill dataset

In [26]:
player_ratings = pd.read_csv('../../FUT_player_data.csv')
player_ratings.head()

Unnamed: 0,id,futbin_id,name,height,weight,age,club,league,nation,rarity,...,phsyicality_jumping,physicality_stamina,physicality_strength,physicality_aggression,goalkeeper_diving,goalkeeper_handling,goalkeeper_kicking,goalkeeper_positioning,goalkeeper_reflexes,goalkeeper_speed
0,18949,54231,Kylian Mbappé,182,73,24,73,16,18,16,...,88.0,99.0,87.0,73.0,,,,,,99
1,18981,54251,Karim Benzema,185,81,35,607,350,18,164,...,99.0,99.0,99.0,90.0,,,,,,97
2,18982,54249,Zinedine Zidane,185,77,51,112658,2118,18,171,...,87.0,94.0,92.0,83.0,,,,,,92
3,18730,54005,Pelé,173,70,82,112658,2118,54,153,...,90.0,91.0,78.0,61.0,,,,,,96
4,19001,54277,Robert Lewandowski,185,81,35,241,53,37,164,...,99.0,97.0,99.0,99.0,,,,,,97


In [34]:
pd.set_option('display.max_columns', None)

ratings_num_data = player_ratings.copy()
ratings_num_data.drop(columns=['name', 'futbin_id', 'id'], inplace=True)
ratings_num_columns = ratings_num_data.columns

ratings_num_columns

Index(['height', 'weight', 'age', 'club', 'league', 'nation', 'rarity',
       'position', 'foot', 'attackWorkRate', 'defenseWorkRate', 'cardColor',
       'overallRating', 'pace', 'shooting', 'passing', 'dribbling',
       'defending', 'physicality', 'pace_acceleration', 'pace_sprintSpeed',
       'shooting_positioning', 'shooting_finishing', 'shooting_shotPower',
       'shooting_longShots', 'shooting_volleys', 'shooting_penalties',
       'passing_vision', 'passing_crossing', 'passing_freeKickAccuracy',
       'passing_shortPassing', 'passing_longPassing', 'passing_curve',
       'dribbling_agility', 'dribbling_balance', 'dribbling_reactions',
       'dribbling_ballControl', 'dribbling_dribbling', 'dribbling_composure',
       'defending_interceptions', 'defending_headingAccuracy',
       'defending_standingTackle', 'defending_slidingTackle',
       'defending_defenseAwareness', 'phsyicality_jumping',
       'physicality_stamina', 'physicality_strength', 'physicality_aggression',
  

In [38]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


assert set(ratings_num_columns).isdisjoint(set(categorical_features)), "Overlapping columns detected"

# Ensure all columns are in the dataframe
assert set(ratings_num_columns).issubset(player_ratings.columns), "Numeric columns missing from dataframe"
assert set(categorical_features).issubset(player_ratings.columns), "Categorical columns missing from dataframe"

# Fill or drop NaN values
player_ratings.dropna(subset=ratings_num_columns, inplace=True)
# Create transformers
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Columns to be one-hot encoded
categorical_features = ['position', 'name', 'foot']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, ratings_num_columns),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Apply the column transformation
data_transformed = preprocessor.fit_transform(player_ratings)

# Convert one-hot encoded data back to a DataFrame
ohe_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(input_features=categorical_features)
all_columns = ratings_num_columns + list(ohe_columns)

data_transformed_df = pd.DataFrame(data_transformed, columns=all_columns)

# If you want to replace original dataframe:
player_ratings_transformed = data_transformed_df
player_ratings_transformed

AssertionError: Overlapping columns detected

In [None]:

ratings_scaler = StandardScaler().fit(ratings_num_data)
raitings_num_scaled = ratings_scaler.transform(ratings_num_data)
player_ratings[rating_num_columns] = ratings_num_scaled
player_ratings