## Data Cleaning & Manipulations

In [1]:
import numpy as np
import pandas as pd
import tempfile
from math import sqrt
import scipy.stats
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
# Load the raw player table.
# NOTE(review): the recorded run emitted a warning on this line — presumably
# pandas' mixed-dtype DtypeWarning; confirm. low_memory=False makes pandas
# infer each column's dtype from the whole file in a single pass.
df = pd.read_csv('players_22.csv', low_memory=False)

  df = pd.read_csv('players_22.csv')


In [44]:
# Show (rows, columns) of df before the trailing columns are trimmed.
df.shape

(19239, 110)

In [45]:
# Keep the first 77 columns, i.e. the 110 loaded columns minus the trailing 33.
# A fixed stop index (instead of :-33) keeps this cell idempotent: re-running it
# no longer strips a further 33 columns from the already-trimmed frame.
df = df.iloc[:, :77]

In [46]:
# Summarise column names, non-null counts and dtypes of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 77 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    19239 non-null  int64  
 1   player_url                   19239 non-null  object 
 2   short_name                   19239 non-null  object 
 3   long_name                    19239 non-null  object 
 4   player_positions             19239 non-null  object 
 5   overall                      19239 non-null  int64  
 6   potential                    19239 non-null  int64  
 7   value_eur                    19165 non-null  float64
 8   wage_eur                     19178 non-null  float64
 9   age                          19239 non-null  int64  
 10  dob                          19239 non-null  object 
 11  height_cm                    19239 non-null  int64  
 12  weight_kg                    19239 non-null  int64  
 13  club_team_id    

In [48]:
# Fill the gaps in the two monetary columns with that column's own mean.
num_imputer = SimpleImputer(strategy='mean')  # or strategy='median'
for money_col in ('value_eur', 'wage_eur'):
    # The imputer is re-fitted for each column, one column at a time.
    df[money_col] = num_imputer.fit_transform(df[[money_col]])

In [49]:
# Columns that are removed from df in a later cell.
to_drop = [
    'club_team_id', 'club_name', 'league_name', 'league_level',
    'club_position', 'club_jersey_number', 'club_loaned_from',
    'club_joined', 'club_contract_valid_until',
    'nation_team_id', 'nation_position', 'nation_jersey_number',
    'release_clause_eur',
    'player_tags', 'player_traits',
    'player_positions',
]

In [50]:
# Impute the six summary attribute columns using the shared num_imputer,
# re-fitting it on each column in turn.
for attr in ('pace', 'shooting', 'passing', 'defending', 'dribbling', 'physic'):
    df[attr] = num_imputer.fit_transform(df[[attr]])

### Handling missing values

In [51]:
# Remove the columns listed in to_drop.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone. Trade-off: a misspelt
# name in to_drop is skipped silently, so check df.info() afterwards.
df = df.drop(columns=to_drop, errors='ignore')

In [52]:
# Inspect column names, non-null counts and dtypes again after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 61 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sofifa_id                    19239 non-null  int64  
 1   player_url                   19239 non-null  object 
 2   short_name                   19239 non-null  object 
 3   long_name                    19239 non-null  object 
 4   overall                      19239 non-null  int64  
 5   potential                    19239 non-null  int64  
 6   value_eur                    19239 non-null  float64
 7   wage_eur                     19239 non-null  float64
 8   age                          19239 non-null  int64  
 9   dob                          19239 non-null  object 
 10  height_cm                    19239 non-null  int64  
 11  weight_kg                    19239 non-null  int64  
 12  nationality_id               19239 non-null  int64  
 13  nationality_name

###### Encoding Categorical Variables:
If your dataset contains categorical variables, they need to be converted into a numerical format. One common approach is one-hot encoding, where each category is transformed into a new binary column.

###### Checking for Multicollinearity: 
Multicollinearity occurs when two or more independent variables are highly correlated with each other. This can be problematic in linear regression models. You can use correlation matrices or Variance Inflation Factor (VIF) to detect multicollinearity.

###### Feature Scaling: 
Standardizing or normalizing the features so they're on the same scale. This is especially important if you're using regularization techniques in linear regression.

In [59]:
# List the distinct labels that occur in the body_type column.
df['body_type'].unique()

array(['Unique', 'Normal (170-185)', 'Lean (170-185)', 'Normal (185+)',
       'Lean (185+)', 'Normal (170-)', 'Stocky (185+)', 'Lean (170-)',
       'Stocky (170-185)', 'Stocky (170-)'], dtype=object)

In [63]:
# Restrict df to the columns used from here on; the order of the list below
# is the column order of the resulting frame.
data = df[[
    # rating, market figures and basic profile
    'overall', 'value_eur', 'wage_eur', 'age', 'height_cm', 'weight_kg',
    'nationality_id', 'preferred_foot', 'skill_moves',
    'international_reputation', 'body_type',
    # summary attributes
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    # attacking
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    # skill
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
    'skill_long_passing', 'skill_ball_control',
    # movement
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    # power
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
    'power_long_shots',
    # mentality
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_penalties', 'mentality_composure',
    # defending
    'defending_marking_awareness', 'defending_standing_tackle',
    'defending_sliding_tackle',
    # goalkeeping
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
    'goalkeeping_positioning', 'goalkeeping_reflexes',
]]

In [64]:
# Partition the selected columns by dtype: object/category columns on one side,
# int64/float64 columns on the other. Each result is a plain list of names.
categorical_cols = list(data.select_dtypes(include=['object', 'category']).columns)
numerical_cols = list(data.select_dtypes(include=['int64', 'float64']).columns)

In [65]:
# Display the column names that were detected as categorical.
categorical_cols

['preferred_foot', 'body_type']

In [67]:
# Keep 'overall' out of the list of columns to be scaled.
# The list is rebuilt with a filter instead of list.remove(), because remove()
# raises ValueError when the cell is run a second time and the name is gone.
numerical_cols = [col for col in numerical_cols if col != 'overall']

In [69]:
# Handling categorical variables: One-Hot Encoding
# fit_transform returns a sparse matrix here; it is densified below via toarray().
one_hot_encoder = OneHotEncoder()
categorical_data = one_hot_encoder.fit_transform(data[categorical_cols])

# Handling numerical variables: Scaling
scaler = StandardScaler()
numerical_data = scaler.fit_transform(data[numerical_cols])

# Combine processed numerical and categorical data.
# Both frames reuse data.index so every output row keeps the label of the row
# it was computed from. Without it the frames get a fresh 0..n-1 index, and any
# later index-aligned assignment from `data` would silently misalign (NaN) as
# soon as `data` no longer has a default index.
processed_data = pd.concat(
    [
        pd.DataFrame(numerical_data, columns=numerical_cols, index=data.index),
        pd.DataFrame(
            categorical_data.toarray(),
            columns=one_hot_encoder.get_feature_names_out(categorical_cols),
            index=data.index,
        ),
    ],
    axis=1,
)


In [72]:
# Append the unscaled 'overall' column as the last column of processed_data.
# pandas aligns this assignment on index labels, so it relies on processed_data
# and data carrying the same index.
processed_data['overall'] = data['overall']

In [74]:
# Write the processed frame to disk; the row index is not written to the file.
processed_data.to_csv('cleaned_data.csv', index=False)

In [2]:
# Load the exported cleaned_data.csv back into a new frame.
reading = pd.read_csv('cleaned_data.csv')

In [3]:
# Preview the first rows of the re-loaded frame.
reading.head()

Unnamed: 0,value_eur,wage_eur,age,height_cm,weight_kg,nationality_id,skill_moves,international_reputation,pace,shooting,...,body_type_Lean (170-185),body_type_Lean (185+),body_type_Normal (170-),body_type_Normal (170-185),body_type_Normal (185+),body_type_Stocky (170-),body_type_Stocky (170-185),body_type_Stocky (185+),body_type_Unique,overall
0,9.889601,15.998022,1.851089,-1.646467,-0.416315,-0.131273,2.146241,10.525295,1.628331,2.992852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,93
1,15.350958,13.425844,1.429869,0.539166,0.856805,-0.4295,2.146241,10.525295,0.949332,2.992852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,92
2,5.546836,13.425844,2.272309,0.830584,1.13972,-0.409618,3.448937,10.525295,1.822331,3.143798,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,91
3,16.601147,13.425844,0.798039,-0.917923,-0.982145,-0.09151,3.448937,10.525295,2.210331,2.313597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,91
4,16.140551,17.541329,1.008649,-0.043669,-0.69923,-1.025953,2.146241,7.83052,0.755332,2.540015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,91
