# Feature Reduction - FIFA 2019 Data

In [137]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import math
import re
warnings.filterwarnings('ignore')
%matplotlib inline

In [138]:
# Load the prepared sample and discard the ID column in one pass
sample_data = pd.read_csv('fifa19_ready_data.csv', encoding='utf-8').drop(columns='ID')
# Split the frame: the international reputation rating is the target (y),
# every remaining column is a candidate feature (X)
y = sample_data['International Reputation']
X = sample_data.drop(columns='International Reputation')
print('X: ', X.shape)

X:  (3563, 56)


# Step 1. Remove Features with low variance

In [139]:
from sklearn.feature_selection import VarianceThreshold

# Drop every feature whose variance is below 0.8 * (1 - 0.8) = 0.16.
sel = VarianceThreshold(threshold = (0.8 * (1 - 0.8)))
sel.fit(X)

# The selector never modifies X in place: its transform returns a new array.
# The previous version discarded that result, so X (and the printed shape)
# could never change. Apply the fitted support mask explicitly instead; this
# also keeps X a DataFrame with its column names, which later cells rely on.
keep_mask = sel.get_support()
print('Removed low-variance features: ', list(X.columns[~keep_mask]))
X = X.loc[:, keep_mask]
print('X: ', X.shape)

X:  (3563, 56)


In [140]:
# No low-variance features were found, so no features were removed.

# Step 2. Remove Features with high correlation

In [141]:
# Compute correlation and show
def compute_correlation(df, show=False):
    """Return the Pearson correlation matrix of ``df`` rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated pairwise.
    show : bool, default False
        When True, also print the resulting matrix.

    Returns
    -------
    pandas.DataFrame
        Rounded correlation matrix, indexed and labelled by ``df``'s columns.
    """
    labels = list(df.columns)
    rounded = df.corr(method='pearson').round(decimals=2)
    # Label both axes with the full column list of the input frame
    corm_df = pd.DataFrame(rounded, index=labels, columns=labels)

    if not show:
        return corm_df

    print('Correlation Matrix: ')
    print(corm_df, '\n')
    print()
    return corm_df

In [142]:
def show_high_correlation(df, threshold=0.9, max_cols=0, chunk_size=5):
    """Print the entries of a correlation matrix whose magnitude exceeds ``threshold``.

    Entries that are not > ``threshold`` or < -``threshold`` are blanked to NaN,
    the diagonal (self-correlation) is blanked as well, and rows/columns that
    end up entirely NaN are dropped. What remains is printed ``chunk_size``
    columns at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Correlation matrix (e.g. the output of ``compute_correlation``).
    threshold : float, default 0.9
        Magnitude above which a correlation is reported.
    max_cols : int, default 0
        Maximum number of remaining columns to print; 0 (or negative) means all.
    chunk_size : int, default 5
        Number of columns printed per chunk. Must be at least 1.

    Returns
    -------
    None
        The function only prints.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1')

    # Keep only the strong correlations; everything else becomes NaN
    high_corr_df = df[ (df > threshold) | (df < -threshold) ]
    print('high_corr_df: ', high_corr_df.shape)

    # Blank the diagonal: a feature always correlates perfectly with itself.
    # min(shape) keeps the positional indexing in bounds for any matrix shape.
    for i in range(min(high_corr_df.shape)):
        high_corr_df.iloc[i, i] = np.nan

    high_corr_df = high_corr_df.dropna(how='all', axis=0)
    high_corr_df = high_corr_df.dropna(how='all', axis=1)

    # Count columns AFTER dropping the empty ones. The earlier version used the
    # pre-drop count and a "shape - 1" clamp, which printed nothing for
    # matrices of five or fewer columns and could skip the last column.
    remaining_cols = high_corr_df.shape[1]
    total_cols = min(max_cols, remaining_cols) if max_cols > 0 else remaining_cols

    for start_i in range(0, total_cols, chunk_size):
        end_i = min(start_i + chunk_size, total_cols)
        print(high_corr_df.iloc[:, start_i : end_i], '\n')

    print()

In [143]:
# Compute the rounded Pearson correlation matrix of the current features in X
corr_df = compute_correlation(X)

In [144]:
# Show feature pairs whose correlation is > 0.9 or < -0.9
show_high_correlation(corr_df, threshold=0.9, max_cols=0)

high_corr_df:  (56, 56)
                Special    ST    RS    RF    RW
Special             NaN   NaN   NaN   NaN   NaN
ST                  NaN   NaN  1.00  0.97  0.93
RS                  NaN  1.00   NaN  0.97  0.93
RF                  NaN  0.97  0.97   NaN  0.99
RW                  NaN  0.93  0.93  0.99   NaN
RAM                 NaN  0.93  0.93  0.99  0.98
RCM                0.94   NaN   NaN   NaN   NaN
RM                  NaN   NaN   NaN  0.97  0.99
RDM                 NaN   NaN   NaN   NaN   NaN
RWB                 NaN   NaN   NaN   NaN   NaN
RCB                 NaN   NaN   NaN   NaN   NaN
RB                  NaN   NaN   NaN   NaN   NaN
ShortPassing        NaN   NaN   NaN   NaN   NaN
Dribbling           NaN   NaN   NaN  0.91  0.94
BallControl         NaN   NaN   NaN   NaN   NaN
Interceptions       NaN   NaN   NaN   NaN   NaN
StandingTackle      NaN   NaN   NaN   NaN   NaN
SlidingTackle       NaN   NaN   NaN   NaN   NaN 

                 RAM   RCM    RM   RDM   RWB
Special          

In [145]:
# Highly correlated pairs (|r| > 0.9) read from the table above.
# From each group one feature stays in X and the others are dropped:
#   ['Special', 'RCM']
#   ['ST', 'RS'] ['ST', 'RF'] ['ST', 'RW'] ['ST', 'RAM']
#   ['RS', 'ST'] ['RS', 'RF'] ['RS', 'RW'] ['RS', 'RAM']
#   ['RM', 'Dribbling'] ['RM', 'BallControl']
#   ['RDM', 'RWB'] ['RDM', 'RCB'] ['RDM', 'RB']
#   ['Interceptions', 'RCB'] ['Interceptions', 'StandingTackle'] ['Interceptions', 'SlidingTackle']
drop_features = [
    'RCM',
    'RS', 'RF', 'RW', 'RAM',
    'Dribbling', 'BallControl',
    'RWB', 'RCB', 'RB',
    'StandingTackle', 'SlidingTackle',
]

X = X.drop(columns=drop_features)
print('X: ', X.shape)

X:  (3563, 44)


In [146]:
# Recompute the correlation matrix now that the first batch of features is gone
corr_df = compute_correlation(X)

In [147]:
# Show feature pairs whose correlation is > 0.9 or < -0.9
show_high_correlation(corr_df, threshold=0.9, max_cols=0)

high_corr_df:  (44, 44)



In [148]:
# Show feature pairs whose correlation is > 0.8 or < -0.8
show_high_correlation(corr_df, threshold=0.8, max_cols=0)

high_corr_df:  (44, 44)
               Overall  Value  Wage  Special    ST
Overall            NaN    NaN   NaN      NaN   NaN
Value              NaN    NaN  0.86      NaN   NaN
Wage               NaN   0.86   NaN      NaN   NaN
Special            NaN    NaN   NaN      NaN   NaN
ST                 NaN    NaN   NaN      NaN   NaN
RM                 NaN    NaN   NaN     0.88  0.90
RDM                NaN    NaN   NaN      NaN   NaN
Crossing           NaN    NaN   NaN      NaN   NaN
Finishing          NaN    NaN   NaN      NaN  0.89
ShortPassing       NaN    NaN   NaN     0.81   NaN
Volleys            NaN    NaN   NaN      NaN  0.83
LongPassing        NaN    NaN   NaN      NaN   NaN
Acceleration       NaN    NaN   NaN      NaN   NaN
SprintSpeed        NaN    NaN   NaN      NaN   NaN
Reactions         0.85    NaN   NaN      NaN   NaN
ShotPower          NaN    NaN   NaN      NaN  0.83
LongShots          NaN    NaN   NaN      NaN  0.86
Interceptions      NaN    NaN   NaN      NaN   NaN
Positio

In [149]:
# Highly correlated pairs (|r| > 0.8) read from the table above.
# From each group one feature stays in X and the others are dropped:
#   ['Overall', 'Reactions']
#   ['Value', 'Wage']
#   ['Special', 'RM'] ['Special', 'ShortPassing']
#   ['ST', 'RM'] ['ST', 'Finishing'] ['ST', 'Volleys']
#   ['ST', 'ShotPower'] ['ST', 'LongShots'] ['ST', 'Positioning']
#   ['RDM', 'Interceptions'] ['RDM', 'Marking']
#   ['Acceleration', 'SprintSpeed']
drop_features = [
    'Reactions',
    'Value',
    'RM', 'ShortPassing',
    'Finishing', 'Volleys', 'ShotPower', 'LongShots', 'Positioning',
    'Interceptions', 'Marking',
    'SprintSpeed',
]

X = X.drop(columns=drop_features)
print('X: ', X.shape)

X:  (3563, 32)


In [150]:
# Recompute the correlation matrix now that the second batch of features is gone
corr_df = compute_correlation(X)

In [151]:
# Show feature pairs whose correlation is > 0.8 or < -0.8
show_high_correlation(corr_df, threshold=0.8, max_cols=0)

high_corr_df:  (32, 32)



In [152]:
# No remaining features are highly correlated, so we stop here and save the data

# Step 3. Export features reduced data

In [153]:
# Re-attach the target column (y) to the reduced feature matrix (X)
sample_data = pd.concat([X, y], axis = 1)

out_file = 'fifa19_features_reduced_data.csv'
print('Save features reduced sample data {} to file: {}'.format(sample_data.shape, out_file))
# When given a path, to_csv writes the file and returns None, so the old
# "print the return value if it is not None" branch could never run.
# index=False states explicitly that the row index is not written.
sample_data.to_csv(out_file, index=False, header=True)

Save features reduced sample data (3563, 33) to file: fifa19_features_reduced_data.csv
