In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [95]:
# read in the data
df = pd.read_csv('./players_22.csv')
# divide to train and test using train_test_split (80% train / 20% test)
# random_state pins the shuffle so that the split, and every number derived
# from it further down, is identical after a kernel restart and re-run
train, test = train_test_split(df, test_size=0.2, random_state=42)

  df = pd.read_csv('./players_22.csv')


In [96]:
# number of rows that ended up in the training split
print(train.shape[0])

15391


In [97]:
# fraction of missing values in each column of train, largest first
# (the mean of the boolean null mask is exactly "nulls / rows")
missing_values_percentage = train.isnull().mean().sort_values(ascending=False)
# columns where more than half of the values are missing
missing_columns = list(missing_values_percentage[missing_values_percentage > 0.5].index)
print(len(missing_columns))
# drop those columns from both splits; the list is derived from train only,
# and the very same columns are removed from test so the two stay aligned
train = train.drop(columns=missing_columns)

test = test.drop(columns=missing_columns)

8


In [98]:
# share of missing entries per remaining column, highest first
missing_values_percentage = train.isna().sum().div(len(train)).sort_values(ascending=False)
print(missing_values_percentage)
# how many columns still contain at least one missing entry
print(int((missing_values_percentage > 0).sum()))
# keep only fully populated rows in train and report what is left
train = train.dropna()
print(len(train))

# same treatment for test
test = test.dropna()
print(len(test))

pace                 0.109869
dribbling            0.109869
shooting             0.109869
passing              0.109869
physic               0.109869
                       ...   
skill_fk_accuracy    0.000000
skill_curve          0.000000
skill_dribbling      0.000000
attacking_volleys    0.000000
nation_flag_url      0.000000
Length: 102, dtype: float64
19
12839
3181


In [99]:
# show every string-typed (object) column still present in train, for reference
print(list(train.select_dtypes(include=['object']).columns))
# Categorical columns kept from here on. The list is chosen by hand: the
# 2-3 letter columns, the *url* columns, and long name, club name, league name,
# club position, club joined, nationality name and real face are all left out.
# (Earlier filtering steps that computed intermediate lists were removed: their
# result was overwritten by this assignment and never used.)
categorical_columns = ['short_name','preferred_foot','work_rate','body_type']
print(categorical_columns)

['player_url', 'short_name', 'long_name', 'player_positions', 'dob', 'club_name', 'league_name', 'club_position', 'club_joined', 'nationality_name', 'preferred_foot', 'work_rate', 'body_type', 'real_face', 'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url', 'club_logo_url', 'club_flag_url', 'nation_flag_url']
['short_name', 'preferred_foot', 'work_rate', 'body_type']


In [100]:
# the sofifa_id column is removed from both splits
train = train.drop(['sofifa_id'], axis=1)
test = test.drop(['sofifa_id'], axis=1)

# every object-typed column that is not listed in categorical_columns is
# collected from train and then removed from train and test alike
categorical_columns_to_remove = [
    name
    for name in train.select_dtypes(include=['object']).columns
    if name not in categorical_columns
]
train = train.drop(categorical_columns_to_remove, axis=1)
test = test.drop(categorical_columns_to_remove, axis=1)

In [101]:
# row counts of train and test after cleaning, one per line
print(len(train), len(test), sep='\n')

12839
3181


In [102]:
# print the columns that are of type string
print(train.select_dtypes(include=['object']).columns)
# short_name stays in the frames but must not be encoded, so take it out of
# the list. A filter is used instead of list.remove() so that running this
# cell a second time is a no-op rather than a ValueError.
categorical_columns = [col for col in categorical_columns if col != 'short_name']

Index(['short_name', 'preferred_foot', 'work_rate', 'body_type'], dtype='object')


In [103]:
# use label encoding to convert categorical columns to numerical
from sklearn.preprocessing import LabelEncoder
# One encoder per column, fitted on train only and reused for test, so that a
# given category is mapped to the same integer in both splits. Refitting on
# test would assign codes from test's own sorted categories, which can differ.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    # LabelEncoder.transform raises ValueError if test holds a category that
    # never occurred in train; that is surfaced rather than silently mis-coded
    test[col] = le.transform(test[col])
    # keep the fitted encoder so codes can be mapped back with inverse_transform
    label_encoders[col] = le

In [104]:
# random five-row peek at the encoded categorical columns
train.loc[:, categorical_columns].sample(n=5)

Unnamed: 0,preferred_foot,work_rate,body_type
14644,1,8,5
9772,1,8,1
10489,1,6,1
17647,1,8,3
15495,0,8,1


In [105]:
# use standard scaler to scale the data
scaler = StandardScaler()
numeric_columns = list(train.select_dtypes(include=['float64','int64']).columns)
# learn the per-column mean and standard deviation from train only ...
train[numeric_columns] = scaler.fit_transform(train[numeric_columns])
# ... and apply those same statistics to test. Calling fit_transform here
# would re-estimate them from test, putting the two splits on different scales
# before the train-fitted PCA is applied to both.
test[numeric_columns] = scaler.transform(test[numeric_columns])


In [106]:
# random five-row peek at train after scaling
train.sample(n=5)

Unnamed: 0,short_name,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,...,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
3348,João Schmidt,0.90136,0.15764,-0.107441,-0.000936,0.601637,0.402889,1.329759,1.119686,-0.468682,...,1.658369,1.072965,1.215093,0.839243,1.084167,0.21512,0.196758,-1.444647,1.529762,-0.123003
9562,L. Leroy,0.016912,0.656436,-0.133601,-0.153095,-0.902297,1.012642,1.180607,-0.932021,-0.468682,...,0.0485,0.490524,0.235385,0.294691,0.371992,-0.445628,-0.137702,0.527197,-0.461446,0.207036
4724,E. Larsson,0.606544,-0.17489,-0.199003,-0.203814,0.816485,-0.66418,-0.310907,-0.927427,-0.468682,...,-0.112486,0.296377,0.811684,0.675877,0.974602,1.866989,-0.806622,-1.444647,-0.461446,0.207036
13040,D. Krezic,-0.425312,-0.83995,-0.300376,-0.411764,-0.042906,0.860204,-0.460059,1.156674,-0.468682,...,-0.756434,-0.286064,0.408275,0.076871,-0.175835,1.206242,0.196758,1.184479,-1.45705,-0.78308
12692,E. Sylisufaj,-0.425312,0.490171,-0.238244,-0.409228,-0.902297,-0.206864,0.882304,1.151438,-0.468682,...,-0.273473,0.781745,-1.839291,-1.556783,-1.874098,-0.445628,1.200139,-0.787366,-0.461446,0.537074


In [107]:
# feature matrix for PCA: train without the short_name text column
df_nameless = train.drop('short_name', axis=1)
df_nameless.sample(n=5)

Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_team_id,league_level,club_jersey_number,...,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
9891,-0.130496,-1.006215,-0.323266,-0.355972,1.675876,0.402889,-0.758362,0.918925,-0.468682,-0.733462,...,0.128994,0.781745,0.408275,0.621422,0.591123,-1.106376,-0.137702,0.527197,-1.125182,1.857228
5013,0.606544,0.323905,-0.146682,0.3541,0.171942,1.469957,1.329759,-0.749028,-0.468682,-0.335021,...,-0.19298,0.975892,0.638794,1.111518,0.919819,-0.115254,-0.137702,1.84176,-1.125182,0.537074
17774,-1.457168,-0.50742,-0.339617,-0.406692,-1.331992,-0.054426,-0.310907,-0.931635,0.885674,1.144902,...,-1.400381,-1.936314,0.235385,0.512512,0.755471,-0.776002,-0.472162,-1.116007,0.20229,-0.123003
6612,0.311728,-0.50742,-0.225164,-0.051656,1.031333,0.402889,0.584001,-0.928603,-0.468682,1.031061,...,0.370474,0.296377,0.523535,0.458057,0.810254,-0.115254,0.865679,0.855838,1.86163,-0.123003
14460,-0.720128,-1.671276,-0.339617,-0.254533,0.816485,-0.66418,-1.056665,1.135158,-0.468682,0.632621,...,1.014421,-0.091917,-0.974842,-1.883514,-1.819315,-1.106376,-0.472162,-1.116007,0.534158,-0.78308


In [108]:
# fit a PCA on the name-free training features; a float n_components keeps as
# many components as are needed to reach 95% of the variance
pca = PCA(n_components=0.95).fit(df_nameless)
# print the fitted attributes one after another, in a fixed order
for fitted_attribute in (
    'n_components_',
    'explained_variance_ratio_',
    'singular_values_',
    'components_',
    'mean_',
    'noise_variance_',
):
    print(getattr(pca, fitted_attribute))
# finally the constructor parameters the estimator was created with
print(pca.get_params())



30
[0.26288555 0.1465354  0.10930058 0.07313701 0.04463683 0.03416411
 0.02924647 0.02404916 0.01921219 0.01689889 0.01428846 0.0141608
 0.01376236 0.01368886 0.01350914 0.01322398 0.01295512 0.01167796
 0.01031606 0.00966608 0.00898381 0.00818101 0.00744541 0.00730461
 0.00623499 0.00596139 0.00567343 0.00542075 0.0045894  0.00421709]
[480.1750632  358.49864203 309.61891433 253.27064535 197.86223807
 173.10167505 160.15960458 145.23328763 129.80897143 121.74337036
 111.94608151 111.44488866 109.86586287 109.57205559 108.85041617
 107.69544841 106.595035   101.20447139  95.1203141   92.07492359
  88.76594913  84.70706494  80.80913394  80.04140889  73.94933891
  72.30862486  70.54061009  68.95187704  63.44452829  60.81665015]
[[ 0.17260555  0.11681082  0.11790042 ...  0.01221247  0.00860521
   0.00951506]
 [ 0.17209064  0.07585473  0.09913386 ...  0.01745513  0.01309341
   0.01537934]
 [ 0.03567856  0.03873212  0.03763987 ...  0.00191985 -0.00028013
  -0.00477451]
 ...
 [ 0.08047383  0.

In [150]:
# project the training features onto the fitted principal components
train_pca = pca.transform(X=df_nameless)

In [151]:
# (rows, components) of the projected training data
tuple(train_pca.shape)

(12839, 30)

In [152]:
# convert the pca array to a dataframe
# pca.transform returns a bare array in the row order of df_nameless; reuse
# df_nameless's index so each projected row keeps its original row label.
# With the default 0..n-1 index, a later pd.concat(axis=1) with
# train['short_name'] cannot line rows up by label and fills names with NaN.
train_pca = pd.DataFrame(train_pca, index=df_nameless.index)
train_pca.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
9023,0.031515,-1.548779,3.19808,-1.113661,-2.317861,0.490217,-0.063,0.284818,-0.258021,-0.879687,...,-0.268442,-0.746722,-0.676899,0.156676,0.849707,1.441876,0.038005,-0.69174,-0.002894,0.170751
171,1.624277,-3.266645,-2.703879,2.440591,0.873319,0.182451,0.080161,-0.877491,0.119227,0.987787,...,-0.432708,-0.672486,0.728326,-0.646874,-0.206027,-0.269213,0.390381,0.62996,-0.041353,0.256458
606,-1.464122,3.698597,-0.377039,-0.208589,0.010966,-1.128559,-0.043608,0.38317,-1.027047,2.785101,...,1.636541,0.790464,0.407434,-0.228693,-0.482963,0.779123,0.131339,-0.137016,-0.506485,-0.139653
5500,1.187419,3.764529,0.958845,-2.059514,-3.086503,1.339733,-1.592737,0.676501,1.098082,0.392648,...,0.31482,-0.265639,-0.502237,-0.951986,-0.871214,0.342072,-0.177463,1.054406,-0.71613,-1.244288
4078,-0.067549,1.575593,2.685465,-2.082032,-1.303614,0.641639,-1.883178,0.191451,-0.47654,1.860474,...,-0.040949,0.554211,-0.515384,-1.167449,1.305067,0.062174,0.671154,0.660347,-0.130084,-0.264242


In [148]:
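## concatenate the pca dataframe with the short_name column
## NOTE: pd.concat(axis=1) lines rows up by index label. If train_pca carries a
## fresh 0..n-1 index while train still has its original (shuffled, filtered)
## row labels, most labels do not match and short_name comes out as NaN.
## Give both frames the same index before concatenating.
# train_pca = pd.concat([train_pca, train['short_name']], axis=1)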

In [153]:
# project test (minus the short_name text column) with the train-fitted PCA
test_pca = pca.transform(test.drop('short_name', axis=1))


In [154]:
# save the pca dataframe to a csv file
train_pca.to_csv('./players_22_train_pca.csv', index=True)

# test_pca was computed from test in test's row order, so label its rows with
# test's index; the index written to the CSV then identifies the original
# player row instead of being an arbitrary 0..n-1 counter
test_pca = pd.DataFrame(test_pca, index=test.index)
test_pca.to_csv('./players_22_test_pca.csv', index=True)


In [28]:
# Rebuild the cleaned feature table, this time from the whole (unsplit) file.
df = pd.read_csv('./../../players_22.csv')

# columns that are more than half empty go first, then any incomplete row
missing_values_percentage = df.isna().sum().div(len(df)).sort_values(ascending=False)
missing_columns = missing_values_percentage.loc[lambda share: share > 0.5].index.tolist()
df = df.drop(columns=missing_columns).dropna()

# of the object-typed columns only the ones listed here are retained
categorical_columns = ['short_name','preferred_foot','work_rate','body_type']
categorical_columns_to_remove = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name not in categorical_columns
]
df = df.drop(columns=categorical_columns_to_remove)

# the id and the player name are removed from the frame as well
df = df.drop(columns=['sofifa_id'])
df = df.drop(columns=['short_name'])

# short_name is no longer a column, so it leaves the list of columns to encode
categorical_columns.remove('short_name')

  df = pd.read_csv('./../../players_22.csv')


In [29]:
# columns remaining after cleaning
df.columns

Index(['overall', 'potential', 'value_eur', 'wage_eur', 'age', 'height_cm',
       'weight_kg', 'club_team_id', 'league_level', 'club_jersey_number',
       'club_contract_valid_until', 'nationality_id', 'preferred_foot',
       'weak_foot', 'skill_moves', 'international_reputation', 'work_rate',
       'body_type', 'release_clause_eur', 'pace', 'shooting', 'passing',
       'dribbling', 'defending', 'physic', 'attacking_crossing',
       'attacking_finishing', 'attacking_heading_accuracy',
       'attacking_short_passing', 'attacking_volleys', 'skill_dribbling',
       'skill_curve', 'skill_fk_accuracy', 'skill_long_passing',
       'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed',
       'movement_agility', 'movement_reactions', 'movement_balance',
       'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
       'power_long_shots', 'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentali

In [30]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# integer-encode each remaining categorical column; the single encoder object
# is refitted for every column in turn
le = LabelEncoder()
for column_name in categorical_columns:
    df[column_name] = le.fit_transform(df[column_name])

# standardise every float64/int64 column of the full dataset
scaler = StandardScaler()
numeric_columns = df.select_dtypes(include=['float64','int64']).columns.tolist()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# PCA keeping as many components as needed for 95% of the variance, then
# project the same data and wrap the resulting array in a DataFrame
pca = PCA(n_components=0.95)
pca.fit(df)
df_pca = pd.DataFrame(pca.transform(df))


In [31]:
# random five-row peek at the projected full dataset
df_pca.sample(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
15114,-5.34064,-1.636278,1.092029,-2.253073,0.784604,2.174958,-0.074415,-0.997131,-0.057418,0.434744,...,-0.407995,-0.86905,-0.273544,-0.523642,0.381732,-0.335676,-0.011904,0.004103,0.362587,0.166641
10997,0.339706,-2.747153,-4.183483,4.929261,-0.984822,1.279098,-0.065314,0.81078,-0.482651,-0.403598,...,-0.399511,0.475691,-0.03582,-1.930057,-0.277877,0.660204,-0.398834,-0.541929,-0.069745,-0.108741
273,-2.331264,8.991601,1.3365,0.057934,3.544622,2.70658,0.26656,2.586116,1.022293,0.368612,...,0.417465,-0.22978,-0.178939,-0.247087,0.154309,0.512205,0.037044,-0.116822,-0.599175,-0.430902
2977,1.943768,-0.729656,3.238504,-0.120191,2.113176,-2.599903,-0.243521,-0.300121,-1.804469,0.058433,...,0.779525,0.759983,1.459865,1.115376,-0.983564,-0.673855,-1.207145,-1.525459,-0.413797,-0.120364
6773,-4.431956,2.747937,-1.498545,0.245185,-0.303889,-0.789247,-0.724879,1.412167,0.829571,1.169687,...,-1.077634,-0.331809,0.196172,0.982255,-0.11237,0.673946,-0.070463,-0.094776,-0.603322,-0.32879


In [32]:
# write the projected full dataset to disk; to_csv includes the row index
# by default
df_pca.to_csv('./players_22_pca.csv')

In [2]:
# Cleaned copy of the full dataset that still carries short_name.
df = pd.read_csv('./../../players_22.csv')

# discard columns that are more than half empty, then rows with any gap
missing_values_percentage = df.isna().sum().div(len(df)).sort_values(ascending=False)
missing_columns = missing_values_percentage.loc[lambda share: share > 0.5].index.tolist()
df = df.drop(columns=missing_columns).dropna()

# keep only the listed object-typed columns; the rest are removed
categorical_columns = ['short_name','preferred_foot','work_rate','body_type']
categorical_columns_to_remove = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name not in categorical_columns
]
df = df.drop(columns=categorical_columns_to_remove)

# the id column is removed last
df = df.drop(columns=['sofifa_id'])

  df = pd.read_csv('./../../players_22.csv')


In [6]:
# persist the cleaned table; the row index is left out of the file
df.to_csv('./players_22_yousef_anwar.csv', index=False)
# per-column tally of missing values
df.isna().sum()