# Classification - Determine International Rating 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import math
import re
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Read original data
sample_data = pd.read_csv('fifa19_data.csv', encoding='utf-8')
print('sample_data: ', sample_data.shape)
#print(sample_data.head(n=1))

sample_data:  (3000, 89)


In [3]:
# 1. Pre-processing data
# 1.1 Remove empty value data

# Remove data without 'International Reputation'
sample_data.drop(sample_data[sample_data['International Reputation'] != sample_data['International Reputation']].index, 
                 inplace=True)
print('sample_data: ', sample_data.shape)

sample_data:  (2991, 89)


In [4]:
# 1.2 Remove unused features

# Remove index columns
cols = [0, 1]
sample_data = sample_data.drop(sample_data.columns[cols], axis=1)
print('after removing index columns, sample_data: ', sample_data.shape)
#print(sample_data.head(n=1))

# Remove unsed features
removed_features = [ 'Photo', 'Flag', 'Club Logo', 'Real Face', 'Jersey Number', 
                    'Joined', 'Loaned From', 'Contract Valid Until' ]
sample_data = sample_data.drop(removed_features, axis = 1)
print('after removing unused features, sample data: ', sample_data.shape)
#print(sample_data.head(n=1))

after removing index columns, sample_data:  (2991, 87)
after removing unused features, sample data:  (2991, 79)


In [5]:
# Define function to count values with descending order
def count_values_descending(data_series):
    return data_series.value_counts(sort=True, ascending=False)

# Define function to retrieve number of empty empty value 
def count_nan_values(count_values_data):
    if 'nan' in count_values_data:
        return count_values_data['nan']
            
    return 0

# Show count values
def show_count_values(data_series, rows=None):
    count_data = count_values_descending(data_series)
    if rows is None:
        print(count_data)
    else:
        print(count_data.head(rows))

# Define function to fill nan value with maximum number of values
# or using specified value to fill; Also check threshold to avoid
# too many nan values
def fill_nan_with_max_freq_value(data_series, threshold=0.2, default_value=None, 
                                 show_count_values=False, count_values_rows=3, in_place=True):
    data_size = len(data_series)
    #print("data_size: ", data_size)
    
    # Replace inf and -inf to nan
    data_series.replace([math.inf, -math.inf], math.nan)
    
    count_data = count_values_descending(data_series)
    if show_count_values:
        print(count_data.head(count_values_rows))
    
    nan_count = count_nan_values(count_data)
    print("NaN count: ", nan_count)

    if nan_count > 0:
        nan_freq = nan_count / data_size
        if nan_freq >= threshold:
            print("Warning: Not filling Nan since NaN freq: {}% >= max threshold: {}%", 
                  nan_freq * 100, threshold * 100)
        else:     
            fill_value = None
            if default_value is not None:
                fill_value = default_value
            else:
                fill_value = count_data.index[0]
                # Avoid to fill in 'nan' value if maximum number of values are nan values
                if fill_value == 'nan':
                    fill_value = count_data.index[1]

            print("fill '{}' to nan".format(fill_value))
            data_series.fillna(fill_value, inplace=in_place)
        
    print("   ")
    
    return data_series

In [6]:
# 1.3 Convert feature values

from sklearn.preprocessing import LabelEncoder
cat_encoder = LabelEncoder()

# Convert 'Nationality' to category code
fill_nan_with_max_freq_value(sample_data['Nationality'], default_value='NA')
sample_data['Nationality_code'] = cat_encoder.fit_transform(sample_data['Nationality'])
print(sample_data[['Nationality', 'Nationality_code']].head(3))
print("---------")

# Convert 'Club' to club code
fill_nan_with_max_freq_value(sample_data['Club'], default_value='NA')
sample_data['Club'] = sample_data['Club'].astype(str)
sample_data['Club_code'] = cat_encoder.fit_transform(sample_data['Club'])
print(sample_data[['Club', 'Club_code']].head(3))
print("---------")

# Convert 'Preferred Foot' to 'Foot_code' code
fill_nan_with_max_freq_value(sample_data['Preferred Foot'])
sample_data['Preferred Foot'] = sample_data['Preferred Foot'].astype(str)
sample_data['Foot_code'] = cat_encoder.fit_transform(sample_data['Preferred Foot'])
print(sample_data[['Preferred Foot', 'Foot_code']].head(3))
print("---------")

# Convert 'Body Type' to 'Body_type_code' code
fill_nan_with_max_freq_value(sample_data['Body Type'])
sample_data['Body Type'] = sample_data['Body Type'].astype(str)
sample_data['Body_type_code'] = cat_encoder.fit_transform(sample_data['Body Type'])
print(sample_data[['Body Type', 'Body_type_code']].head(3))
print("---------")

# Convert 'Position' to 'Position_code' code
fill_nan_with_max_freq_value(sample_data['Position'])
sample_data['Position'] = sample_data['Position'].astype(str)
sample_data['Position_code'] = cat_encoder.fit_transform(sample_data['Position'])
print(sample_data[['Position', 'Position_code']].head(3))
print("---------")

NaN count:  0
   
  Nationality  Nationality_code
0   Argentina                 5
1    Slovenia               101
2     England                38
---------
NaN count:  0
   
                Club  Club_code
0       FC Barcelona        209
1    Atlético Madrid         61
2  Tottenham Hotspur        576
---------
NaN count:  0
   
  Preferred Foot  Foot_code
0           Left          0
1          Right          1
2          Right          1
---------
NaN count:  0
   
  Body Type  Body_type_code
0     Messi               1
1    Normal               2
2    Normal               2
---------
NaN count:  0
   
  Position  Position_code
0       RF             21
1       GK              5
2       ST             26
---------


In [7]:
# Remove features which have been represented by other featers
#nationality_data = sample_data['Nationality', 'Nationality_code']
#club_data = sample_data['Club', 'Club_code']
#foot_data = sample_data['Preferred Foot', 'Foot_code']
#body_data = sample_data['Body Type', 'Body_type_code']
#pos_data = sample_data['Position', 'Position_code']

converted_features = ['Nationality', 'Club', 'Preferred Foot', 'Body Type', 'Position']
sample_data = sample_data.drop(converted_features, axis = 1, errors='ignore')
print("After change, data shape: ", sample_data.shape)

After change, data shape:  (2991, 79)


In [8]:
# Extract 'Work Rate' level value
def get_attack_work_rate(value):
    return get_work_rate('attack work rate', value)

def get_defence_work_rate(value):
    return get_work_rate('defence work rate', value)

def get_work_rate(rate_type, value):
    if not isinstance(value, str):
        #print('Invalid value: {}'.format(str(value)))
        return 0
    
    if value == '' or value.strip() == '':
        return 0
    
    level_values = value.split("/")
    level_label = ''
    if len(level_values) == 2:
        if rate_type == 'attack work rate':
            level_label = level_values[0].strip().lower()
        elif rate_type == 'defence work rate':
            level_label = level_values[1].strip().lower()
        
        if level_label == 'high':
            return 3
        if level_label == 'medium':
            return 2
        if level_label == 'low':
            return 1

    return 0

# Convert 'Work Rate' to 'Attack_work_rate' and 'Defence_work_rate'
sample_data['Attack_work_rate'] = sample_data['Work Rate'].map(get_attack_work_rate)
attack_work_rate_mean = int(sample_data['Attack_work_rate'].mean())
print('attack_work_rate_mean: ', attack_work_rate_mean)
sample_data['Attack_work_rate'].replace(to_replace=0, value=attack_work_rate_mean, inplace=True)
show_count_values(sample_data['Attack_work_rate'])
print(" ")

sample_data['Defence_work_rate'] = sample_data['Work Rate'].map(get_defence_work_rate)
defence_work_rate_mean = int(sample_data['Defence_work_rate'].mean())
print('Defence_work_rate_mean: ', defence_work_rate_mean)
sample_data['Defence_work_rate'].replace(to_replace=0, value=attack_work_rate_mean, inplace=True)
show_count_values(sample_data['Defence_work_rate'])
print(" ")

print(sample_data[['Attack_work_rate', 'Defence_work_rate', 'Work Rate']].head(n=3))

attack_work_rate_mean:  2
2    2041
3     814
1     136
Name: Attack_work_rate, dtype: int64
 
Defence_work_rate_mean:  2
2    2210
3     524
1     257
Name: Defence_work_rate, dtype: int64
 
   Attack_work_rate  Defence_work_rate       Work Rate
0                 2                  2  Medium/ Medium
1                 2                  2  Medium/ Medium
2                 3                  3      High/ High


In [9]:
# Remove features which have been represented by other featers
converted_features = ['Work Rate']
sample_data = sample_data.drop(converted_features, axis = 1, errors='ignore')
print("After change, data shape: ", sample_data.shape)

After change, data shape:  (2991, 80)


In [10]:
# Convert money string to float number
def convert_money_value(value):
    if isinstance(value, str):
        if value == '' or value.strip() == '':
            return 0
        
        if value[0] == '€':
            value = value[1:]
            
        if value[-1] == 'K':
            return int(float(value[:-1]) * 1000)
        
        if value[-1] == 'M':
            return int(float(value[:-1]) * 1000000)
        
        return int(value)
    else:
        return int(value)

# Convert 'Value' to market value number
sample_data['Market_value'] = sample_data['Value'].map(convert_money_value)
print(sample_data[['Value', 'Market_value']].head(n=3))
print(" ")

# Convert 'Wage' to market value number
sample_data['Wage_value'] = sample_data['Wage'].map(convert_money_value)
print(sample_data[['Wage', 'Wage_value']].head(n=3))

     Value  Market_value
0  €110.5M     110500000
1     €68M      68000000
2   €83.5M      83500000
 
    Wage  Wage_value
0  €565K      565000
1   €94K       94000
2  €205K      205000


In [11]:
# Remove features which have been represented by other featers
converted_features = ['Value', 'Wage']
sample_data = sample_data.drop(converted_features, axis = 1, errors='ignore')
print("After change, data shape: ", sample_data.shape)

After change, data shape:  (2991, 80)


In [12]:
# Convert height
def convert_height(value):
    if not isinstance(value, str):
        #print('Invalid value: {}'.format(str(value)))
        return 0
    
    if value == '' or value.strip() == '':
        return 0
    
    height_values = value.split("'")
    h_ft = 0
    h_inch = 0
    if len(height_values) == 2:
        h_ft = int(height_values[0])
        h_inch = int(height_values[1])
    elif len(height_values) == 1:
        h_ft = int(height_values[0])
    
    h_total = h_ft * 12 + h_inch
    
    return h_total

# Convert 'Height' to Height value number
sample_data['Height_value'] = sample_data['Height'].map(convert_height)
height_mean = int(sample_data['Height_value'].mean())
print('height_mean: ', height_mean)
print(" ")

sample_data['Height_value'].replace(to_replace=0, value=height_mean, inplace=True)
print(sample_data[['Height_value', 'Height']].head(n=3))
print(" ")

height_mean:  71
 
   Height_value Height
0            67    5'7
1            74    6'2
2            74    6'2
 


In [14]:
# Remove features which have been represented by other featers
converted_features = ['Height']
sample_data = sample_data.drop(converted_features, axis = 1, errors='ignore')
print("After change, data shape: ", sample_data.shape)

After change, data shape:  (2991, 80)


In [15]:
# Convert weight
def convert_weight(value):
    if not isinstance(value, str):
        #print('Invalid value: {}'.format(str(value)))
        return 0
    
    if value == '' or value.strip() == '':
        return 0
    
    weight_value = int(value.replace('lbs', ''))
        
    return weight_value

# Convert 'Weight' to Weight value number
sample_data['Weight_value'] = sample_data['Weight'].map(convert_weight)
weight_mean = int(sample_data['Weight_value'].mean())
print('weight_mean: ', weight_mean)
print(" ")

sample_data['Weight_value'].replace(to_replace=0, value=weight_mean, inplace=True)
print(sample_data[['Weight_value', 'Weight']].head(n=3))
print(" ")

weight_mean:  165
 
   Weight_value  Weight
0           159  159lbs
1           192  192lbs
2           196  196lbs
 


In [16]:
# Remove features which have been represented by other featers
converted_features = ['Weight']
sample_data = sample_data.drop(converted_features, axis = 1, errors='ignore')
print("After change, data shape: ", sample_data.shape)

After change, data shape:  (2991, 80)


In [12]:
# Convert position value to remove "+<n>" and convert to integer
extra_pos_pattern = re.compile('\\+\\d+')

def convert_position_value(position_value_str):
    if not isinstance(position_value_str, str):
        #print('Invalid value: {}'.format(str(value)))
        return 0
    
    if position_value_str == '' or position_value_str.strip() == '':
        return 0
    
    position_value = int(extra_pos_pattern.sub('', position_value_str))
        
    return position_value


# Convert all value at position features
position_feature_names = [ "CAM", "CB", "CDM", "CF", "CM", "LAM", "LB", "LCB", 
                           "LCM", "LDM", "LF", "LM", "LS", "LW", "LWB", "RAM", 
                           "RB", "RCB", "RCM", "RDM", "RF", "RM", "RS", "RW", "RWB", "ST"]
for pos_name in position_feature_names:
    print("Position: ", pos_name)
    sample_data[pos_name] = sample_data[pos_name].map(convert_position_value)
    pos_mean = int(sample_data[pos_name].mean())
    print('mean: ', pos_mean)
    sample_data[pos_name].replace(to_replace=0, value=pos_mean, inplace=True)
    print(sample_data[pos_name].head(n=3))
    print(" ")

Position:  CAM
mean:  52
0    93
1    52
2    82
Name: CAM, dtype: int64
 
Position:  CB
mean:  49
0    47
1    49
2    60
Name: CB, dtype: int64
 
Position:  CDM
mean:  50
0    61
1    50
2    66
Name: CDM, dtype: int64
 
Position:  CF
mean:  52
0    93
1    52
2    84
Name: CF, dtype: int64
 
Position:  CM
mean:  52
0    84
1    52
2    79
Name: CM, dtype: int64
 
Position:  LAM
mean:  52
0    93
1    52
2    82
Name: LAM, dtype: int64
 
Position:  LB
mean:  50
0    59
1    50
2    62
Name: LB, dtype: int64
 
Position:  LCB
mean:  49
0    47
1    49
2    60
Name: LCB, dtype: int64
 
Position:  LCM
mean:  52
0    84
1    52
2    79
Name: LCM, dtype: int64
 
Position:  LDM
mean:  50
0    61
1    50
2    66
Name: LDM, dtype: int64
 
Position:  LF
mean:  52
0    93
1    52
2    84
Name: LF, dtype: int64
 
Position:  LM
mean:  53
0    91
1    53
2    81
Name: LM, dtype: int64
 
Position:  LS
mean:  51
0    88
1    51
2    86
Name: LS, dtype: int64
 
Position:  LW
mean:  52
0    92
1    52

In [18]:
# 1.4 Fill in missing data with average value

# International Reputation, Weak Foot, Skill Moves
sample_data['International Reputation'].fillna(int(sample_data['International Reputation'].mean()), inplace=True)
sample_data['Weak Foot'].fillna(int(sample_data['Weak Foot'].mean()), inplace=True)
sample_data['Skill Moves'].fillna(int(sample_data['Skill Moves'].mean()), inplace=True)