In [65]:
# Pre-processing FIFA 2019 Data

In [66]:
import random

# Define function to count total number of records
def count_lines_in_file(data_input_file):
    """Return the number of lines in a UTF-8 text file.

    Every physical line is counted, including the header line and any
    blank lines; an empty file yields 0.

    Args:
        data_input_file: Path of the text file to read.

    Returns:
        int: Total number of lines found in the file.
    """
    with open(data_input_file, 'r', encoding='utf-8') as infile:
        # Iterating a text file yields exactly the lines readline() would return.
        return sum(1 for _ in infile)

# Define function to extract records based on number of samples
def filtered_by_sample_size(num_of_samples=3500, num_of_input_file=0, 
                            data_input_file="data.csv",
                            data_output_file="fifa_data.csv",
                            seed=None):
    """Copy the header line plus a random sample of data lines to a new file.

    The input is treated as a line-oriented text file whose first line is
    the title (header) row. Lines are numbered from 1, so the data lines are
    2..num_of_input_file inclusive.

    Args:
        num_of_samples: Number of data lines to select at random.
        num_of_input_file: Total number of lines in the input file (header
            included). When 0, the file is counted with count_lines_in_file().
        data_input_file: Path of the source file.
        data_output_file: Path of the file to (over)write.
        seed: Optional seed. When given, a private random.Random(seed) is
            used so the same lines are selected on every run; when None the
            module-level random generator is used, as before.

    Raises:
        ValueError: Propagated from random.sample() when num_of_samples is
            larger than the number of data lines.
    """
    if num_of_input_file == 0:
        num_of_input_file = count_lines_in_file(data_input_file)
        print("Total records: ", num_of_input_file)

    rng = random if seed is None else random.Random(seed)

    # Randomly choose the line numbers to keep. The upper bound is
    # num_of_input_file + 1 so that the last line of the file is eligible too;
    # a set keeps the per-line membership test O(1).
    sample_line_numbers = set(
        rng.sample(range(2, num_of_input_file + 1), num_of_samples)
    )

    # Read and save data by sample index
    line_num = 0
    out_line_num = 0
    with open(data_output_file, mode='w', encoding='utf-8') as outfile, \
            open(data_input_file, 'r', encoding='utf-8') as infile:
        for line_num, line in enumerate(infile, start=1):
            # Show progress
            if line_num % 10000 == 0:
                print("Processing line number: ", line_num)

            # Write title line (first line) or selected sample lines
            if line_num == 1 or line_num in sample_line_numbers:
                out_line_num += 1
                outfile.write(line)

    # Completed
    print("Total input lines: ", line_num, ", total output lines: ", out_line_num)

In [67]:
# Step 1. Randomly choose 4000 records for processing
#         and save them to a CSV file
# Seed the module-level generator first so that re-running the notebook
# (Restart & Run All) selects the same records every time.
random.seed(2019)
sample_size = 4000
filtered_by_sample_size(num_of_samples=sample_size, data_output_file='fifa19_data.csv')

Total records:  18208
Processing line number:  10000
Total input lines:  18208 , total output lines:  4001


In [68]:
# Step 2. Pre-processing data

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import math
import numbers
import re
warnings.filterwarnings('ignore')
%matplotlib inline

In [70]:
# Read the sampled data written in Step 1 ('fifa19_data.csv': the title line
# plus the randomly selected records) into a DataFrame
sample_data = pd.read_csv('fifa19_data.csv', encoding='utf-8')
print('sample_data: ', sample_data.shape)
#print(sample_data.head(n=1))

sample_data:  (4000, 89)


In [71]:
# 2.1 Remove invalid data

def is_nan_or_empty_value(value):
    """Tell whether a cell value should be treated as missing.

    A value is missing when it is NaN (the only value that compares unequal
    to itself) or when it is a string that is empty or whitespace-only.

    Returns:
        bool: True for missing values, False otherwise.
    """
    if value != value:
        return True
    return isinstance(value, str) and value.strip() == ''

# 2.1.1 'International Reputation' is the classification result, so only rows
#       with a valid (non-NaN, non-blank) value for it are kept
drop_cond = sample_data['International Reputation'].apply(is_nan_or_empty_value)
# drop_cond is a boolean Series; the index labels of its True rows are dropped in place
sample_data.drop(sample_data[drop_cond].index, inplace=True)
print('sample_data: ', sample_data.shape)

# 2.1.2 Remove rows without root position scores
root_position_features = [ 'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM', 'RS', 'RW', 'RWB', 'ST' ]
for pos_name in root_position_features:
    # Rows are removed cumulatively: each pass works on what the previous pass left
    drop_cond = sample_data[pos_name].apply(is_nan_or_empty_value)
    sample_data.drop(sample_data[drop_cond].index, inplace=True)
    print('clear invalid data at position score: ', pos_name, ', sample_data: ', sample_data.shape)

sample_data:  (3990, 89)
clear invalid data at position score:  RAM , sample_data:  (3526, 89)
clear invalid data at position score:  RB , sample_data:  (3526, 89)
clear invalid data at position score:  RCB , sample_data:  (3526, 89)
clear invalid data at position score:  RCM , sample_data:  (3526, 89)
clear invalid data at position score:  RDM , sample_data:  (3526, 89)
clear invalid data at position score:  RF , sample_data:  (3526, 89)
clear invalid data at position score:  RM , sample_data:  (3526, 89)
clear invalid data at position score:  RS , sample_data:  (3526, 89)
clear invalid data at position score:  RW , sample_data:  (3526, 89)
clear invalid data at position score:  RWB , sample_data:  (3526, 89)
clear invalid data at position score:  ST , sample_data:  (3526, 89)


In [72]:
# 2.2 Remove features
# Every drop below passes errors='ignore', so a column that is already absent
# is skipped instead of raising.

# 2.2.1 Remove row number column

# Rename the unnamed column ('Unnamed: 0') to 'Row_number', then drop it
row_number_col = 'Row_number'
sample_data.rename( columns={'Unnamed: 0': row_number_col}, inplace=True)
sample_data = sample_data.drop([row_number_col], axis=1, errors='ignore')
print('after removing row number column, sample_data: ', sample_data.shape)
#print(sample_data.head(n=1))

# Set index to ID column
# NOTE(review): set_index() returns a new DataFrame by default and the result
# is discarded here, so this statement has no effect: sample_data keeps its
# existing index and 'ID' stays a regular column. Assign the result (or remove
# the call) once the intended behaviour is confirmed - assigning it would
# change the column count and what gets exported in Step 3.
sample_data.set_index('ID')

# 2.2.2 Remove unused features
removed_features = [ 'Name', 'Photo', 'Flag', 'Club Logo', 'Real Face', 'Jersey Number', 
                    'Joined', 'Loaned From', 'Contract Valid Until', 'Release Clause' ]
sample_data = sample_data.drop(removed_features, axis = 1, errors='ignore')
print('after removing unused features, sample data: ', sample_data.shape)
#print(sample_data.head(n=1))

# 2.2.3 Remove irrelevant features
irrelevant_features = [ 'Nationality', 'Club', 'Preferred Foot', 'Body Type', 'Position', 'Weak Foot' ]
sample_data = sample_data.drop(irrelevant_features, axis = 1, errors='ignore')
print('after removing irrelevant features, sample data: ', sample_data.shape)

# 2.2.4 Remove features represented by other features

# 'Work Rate', 'Overall', 'Potential', 'Special' are represented by the position scores
redundant_features = [ 'Work Rate', 'Overall', 'Potential', 'Special' ]
sample_data = sample_data.drop(redundant_features, axis = 1, errors='ignore')

# Since the left, center and right variants of a position are considered to have
# the same score (e.g. LAM, CAM and RAM are all the same), the position scores are
# reduced to the root positions only:
# 'ST', 'RS', 'RW', 'RF', 'RAM', 'RM', 'RCM', 'RWB', 'RDM', 'RB', 'RCB';
# the rest of the position scores are removed
redudant_position_features = [ 'CAM', 'CB', 'CDM', 'CF', 'CM', 'LAM', 'LB', 'LCB', 'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB' ]
sample_data = sample_data.drop(redudant_position_features, axis = 1, errors='ignore')
print('after removing redundant features, sample data: ', sample_data.shape)

after removing row number column, sample_data:  (3526, 88)
after removing unused features, sample data:  (3526, 78)
after removing irrelevant features, sample data:  (3526, 72)
after removing redundant features, sample data:  (3526, 53)


In [73]:
# 2.3 Convert features to valid number


# 2.3.1 Convert money value

# Convert money string to float number
def convert_money_value(value):
    """Convert a money string such as '€110.5M' or '€565K' to a whole number.

    Args:
        value: A cell value. Strings may carry a leading '€', surrounding
            whitespace and a trailing 'K' (thousands) or 'M' (millions).

    Returns:
        0 for NaN, empty or whitespace-only input (and for a bare '€');
        the value unchanged when it is already a number;
        otherwise the amount as an int.

    Raises:
        ValueError: If the numeric part of a string cannot be parsed.
    """
    # NaN is the only value that is not equal to itself
    if value != value:
        return 0

    if isinstance(value, numbers.Number):
        return value

    if not isinstance(value, str):
        return int(value)

    text = value.strip()
    if text.startswith('€'):
        text = text[1:].strip()
    if not text:
        return 0

    multipliers = {'K': 1000, 'M': 1000000}
    suffix = text[-1]
    if suffix in multipliers:
        # round() rather than plain int(): the float product can land just
        # below the intended whole number and int() would truncate it.
        return int(round(float(text[:-1]) * multipliers[suffix]))

    # Going through float() also accepts plain decimal strings such as '1200.0'
    return int(round(float(text)))

# Convert 'Value' to market value number
sample_data['Value'] = sample_data['Value'].map(convert_money_value)
market_value_mean = int(sample_data['Value'].mean())
print('market_value_mean: ', market_value_mean)
# Replace the 0 placeholders with the mean and assign the result back.
# Calling replace(..., inplace=True) on the column selection is not
# guaranteed to update sample_data (it does not under pandas Copy-on-Write).
sample_data['Value'] = sample_data['Value'].replace(to_replace=0, value=market_value_mean)
print(sample_data['Value'].head(3))
print(" ")


# Convert 'Wage' to wage value number
sample_data['Wage'] = sample_data['Wage'].map(convert_money_value)
wage_value_mean = int(sample_data['Wage'].mean())
print('wage_value_mean: ', wage_value_mean)
# Same assignment form as above, for the same reason
sample_data['Wage'] = sample_data['Wage'].replace(to_replace=0, value=wage_value_mean)
print(sample_data['Wage'].head(3))

market_value_mean:  2549085
0    77000000
1    93000000
2    83500000
Name: Value, dtype: int64
 
wage_value_mean:  10537
0    405000
1    340000
2    205000
Name: Wage, dtype: int64


In [74]:
# 2.3.2 Convert height value

# Convert height
def convert_height(value):
    """Convert a feet'inches height string (e.g. "5'7") to total inches.

    Returns:
        0 for NaN, non-string non-numeric values, blank strings and strings
        containing more than one apostrophe;
        the value unchanged when it is already a number;
        otherwise feet * 12 + inches (inches default to 0 when the string
        has no apostrophe).
    """
    if value != value:
        return 0

    if isinstance(value, numbers.Number):
        return value

    if not isinstance(value, str) or not value.strip():
        return 0

    parts = value.split("'")
    if len(parts) > 2:
        # Unexpected format: neither "feet" nor "feet'inches"
        return 0

    feet = int(parts[0])
    inches = int(parts[1]) if len(parts) == 2 else 0
    return feet * 12 + inches

# Convert 'Height' to a height number (total inches, see convert_height)
sample_data['Height'] = sample_data['Height'].map(convert_height)
height_mean = int(sample_data['Height'].mean())
print('height_mean: ', height_mean)

# Replace the 0 placeholders with the mean and assign the result back.
# Calling replace(..., inplace=True) on the column selection is not
# guaranteed to update sample_data (it does not under pandas Copy-on-Write).
sample_data['Height'] = sample_data['Height'].replace(to_replace=0, value=height_mean)
print(sample_data['Height'].head(3))
print(" ")

height_mean:  71
0    74
1    68
2    74
Name: Height, dtype: int64
 


In [75]:
# 2.3.3 Convert weight value

# Convert weight
def convert_weight(value):
    """Convert a weight string such as '159lbs' to an integer.

    Returns:
        0 for NaN, non-string non-numeric values and blank strings;
        the value unchanged when it is already a number;
        otherwise the integer left after removing the 'lbs' unit.
    """
    if value != value:
        return 0

    if isinstance(value, numbers.Number):
        return value

    blank_or_not_text = not isinstance(value, str) or not value.strip()
    return 0 if blank_or_not_text else int(value.replace('lbs', ''))

# Convert 'Weight' to a weight number (the 'lbs' unit is stripped by convert_weight)
sample_data['Weight'] = sample_data['Weight'].map(convert_weight)
weight_mean = int(sample_data['Weight'].mean())
print('weight_mean: ', weight_mean)

# Replace the 0 placeholders with the mean and assign the result back.
# Calling replace(..., inplace=True) on the column selection is not
# guaranteed to update sample_data (it does not under pandas Copy-on-Write).
sample_data['Weight'] = sample_data['Weight'].replace(to_replace=0, value=weight_mean)
print(sample_data['Weight'].head(3))
print(" ")

weight_mean:  164
0    183
1    163
2    196
Name: Weight, dtype: int64
 


In [76]:
# 2.3.4 Convert position value

# Convert position value to remove "+<n>" and convert to integer
# Matches a "+<digits>" modifier, e.g. the "+2" in "88+2"
extra_pos_pattern = re.compile('\\+\\d+')

def convert_position_value(position_value):
    """Convert a position score such as '88+2' to its integer base score.

    Every "+<n>" modifier is removed before the remaining text is parsed.

    Returns:
        0 for NaN, non-string non-numeric values and blank strings;
        the value unchanged when it is already a number;
        otherwise the integer left after removing the modifier.
    """
    if position_value != position_value:
        return 0

    if isinstance(position_value, numbers.Number):
        return position_value

    if isinstance(position_value, str) and position_value.strip():
        return int(extra_pos_pattern.sub('', position_value))

    return 0


# Convert all values in the position features
root_position_features = [ 'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM', 'RS', 'RW', 'RWB', 'ST' ]
for pos_name in root_position_features:
    print("Position: ", pos_name)
    sample_data[pos_name] = sample_data[pos_name].map(convert_position_value)
    pos_mean = int(sample_data[pos_name].mean())
    print('mean: ', pos_mean)
    # Replace the 0 placeholders with the mean and assign the result back.
    # Calling replace(..., inplace=True) on the column selection is not
    # guaranteed to update sample_data (it does not under pandas Copy-on-Write).
    sample_data[pos_name] = sample_data[pos_name].replace(to_replace=0, value=pos_mean).astype(int)
    print(sample_data[pos_name].head(3))
    print(" ")

Position:  RAM
mean:  58
0    88
1    89
2    82
Name: RAM, dtype: int32
 
Position:  RB
mean:  56
0    61
1    60
2    62
Name: RB, dtype: int32
 
Position:  RCB
mean:  55
0    53
1    49
2    60
Name: RCB, dtype: int32
 
Position:  RCM
mean:  58
0    81
1    82
2    79
Name: RCM, dtype: int32
 
Position:  RDM
mean:  56
0    61
1    63
2    66
Name: RDM, dtype: int32
 
Position:  RF
mean:  58
0    90
1    88
2    84
Name: RF, dtype: int32
 
Position:  RM
mean:  59
0    88
1    89
2    81
Name: RM, dtype: int32
 
Position:  RS
mean:  57
0    91
1    83
2    86
Name: RS, dtype: int32
 
Position:  RW
mean:  58
0    89
1    89
2    82
Name: RW, dtype: int32
 
Position:  RWB
mean:  57
0    65
1    66
2    65
Name: RWB, dtype: int32
 
Position:  ST
mean:  57
0    91
1    83
2    86
Name: ST, dtype: int32
 


In [77]:
# 2.3.5 Convert other score values

def convert_score_value(score_value):
    """Convert a score cell to a number.

    Returns:
        0 for NaN, non-string non-numeric values and blank strings;
        the value unchanged when it is already a number;
        otherwise int() of the string.
    """
    if score_value != score_value:
        return 0

    if isinstance(score_value, numbers.Number):
        return score_value

    has_text = isinstance(score_value, str) and score_value.strip() != ''
    return int(score_value) if has_text else 0

# Remaining score-like columns, each converted with convert_score_value
other_scores_features = [ 'Age', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 
                         'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 
                         'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance', 
                         'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression', 
                         'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure', 
                         'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling', 
                         'GKKicking', 'GKPositioning', 'GKReflexes' ]

for score_col in other_scores_features:
    print("score column: ", score_col)
    # Convert every cell to a number; 0 marks a missing value
    numeric_scores = sample_data[score_col].map(convert_score_value)
    col_mean = int(numeric_scores.mean())
    print('mean: ', col_mean)
    # Fill the 0 placeholders with the column mean and store the column as int
    sample_data[score_col] = numeric_scores.replace(to_replace=0, value=col_mean).astype(int)
    print(sample_data[score_col].head(1))
    print(" ")

score column:  Age
mean:  25
0    33
Name: Age, dtype: int32
 
score column:  Crossing
mean:  54
0    84
Name: Crossing, dtype: int32
 
score column:  Finishing
mean:  49
0    94
Name: Finishing, dtype: int32
 
score column:  HeadingAccuracy
mean:  57
0    89
Name: HeadingAccuracy, dtype: int32
 
score column:  ShortPassing
mean:  62
0    81
Name: ShortPassing, dtype: int32
 
score column:  Volleys
mean:  46
0    87
Name: Volleys, dtype: int32
 
score column:  Dribbling
mean:  60
0    88
Name: Dribbling, dtype: int32
 
score column:  Curve
mean:  51
0    81
Name: Curve, dtype: int32
 
score column:  FKAccuracy
mean:  46
0    76
Name: FKAccuracy, dtype: int32
 
score column:  LongPassing
mean:  56
0    77
Name: LongPassing, dtype: int32
 
score column:  BallControl
mean:  63
0    94
Name: BallControl, dtype: int32
 
score column:  Acceleration
mean:  67
0    89
Name: Acceleration, dtype: int32
 
score column:  SprintSpeed
mean:  67
0    91
Name: SprintSpeed, dtype: int32
 
score column:

In [78]:
# Step 3. Export pre-processed data

out_file = 'fifa19_ready_data.csv'
print('Save pre-processed sample data {} to file: {}'.format(sample_data.shape, out_file))
# to_csv() returns None when it is given a path, so there is nothing to print
# afterwards. index=False states explicitly that the row index is not written.
sample_data.to_csv(out_file, index=False, header=True)

Save pre-processed sample data (3526, 53) to file: fifa19_ready_data.csv
