In [13]:
def top_lists (data, number=20):
    
    top_clubs = data['Club'].value_counts()[:number].index.tolist()
    top_nationalities = data['Nationality'].value_counts()[:number].index.tolist()
    
    return top_clubs, top_nationalities


def data_cleaning(data, top_clubs, top_nationalities):
    import pandas as pd
    import datetime as dt
    import numpy as np
    
    data.drop_duplicates()
    data = data.drop(['ID', 'Name', 'Team & Contract', 'Contract', 'Position', 'Loan Date End'], axis=1)
    data.columns = [colname.lower().replace(' ','_') for colname in data.columns]
    data = data.drop(['total_stats','crossing','finishing','heading_accuracy','short_passing','volleys','dribbling','curve',
                      'fk_accuracy','long_passing','ball_control','acceleration','sprint_speed','agility','reactions',
                      'balance','shot_power','jumping','stamina','strength','long_shots','aggression','interceptions',
                      'marking', 'standing_tackle', 'sliding_tackle', 'goalkeeping', 'gk_diving', 'gk_handling',
                      'gk_kicking', 'gk_positioning', 'gk_reflexes', 
                      'positioning','vision','penalties','composure', 'ls', 'rs', 'lf', 'rf', 'lam', 'ram', 'lcm', 'rcm',
                      'lm', 'rm', 'lwb', 'rwb', 'ldm', 'rdm', 'lb', 'lcb', 'rcb', 'rb'], axis=1)
    for col in ['st','lw','cf', 'rw', 'cam', 'cm', 'cdm', 'cb', 'gk']:
        extra_col = col + '_evol'
        data[[col,extra_col]] = data[col].str.split('+',expand=True).astype(float)
        data = data.drop([extra_col], axis=1)
    data['weight'] = pd.to_numeric(data['weight'].map(lambda x: x.rstrip('lbs')))
    for d in ['value', 'wage', 'release_clause']:
        data[d] = data[d].str.lstrip("€")
        data[d] = data[d].apply(lambda x: x.rstrip('K') if x.strip()[-1] == "K" else float(x.rstrip('M')) * 1000)
    work_rate_mapper = { 'Low':1, 'Medium':2, 'High':3 }
    data['a/w'] = data['a/w'].replace(work_rate_mapper)
    data['d/w'] = data['d/w'].replace(work_rate_mapper)
    
    for c in ['ir', 'w/f', 'sm']:
        data[c] = data[c].str[:1].astype(int)
    
    for i in range(data['height'].size):
        h = data['height'][i].rstrip('"').split("'")
        data['height'][i] = round(((float(h[0])*12) + float(h[1])) * 2.54, 1)
        if data['club'][i] in(top_clubs):
            continue
        else:
            data['club'][i] = "Other"
            
    for j in range(data['nationality'].size):
        if data['nationality'][j] in(top_nationalities):
            data['nationality'][j] = data['nationality'][j]
        else:
            data['nationality'][j] = "Other"
            
    data = data.dropna()
    data = data.reset_index(drop=True)
    data['joined'] = pd.to_datetime(data['joined'], errors='coerce')
    data['joined']=data['joined'].map(dt.datetime.toordinal)
    for i in range(data['hits'].size):
        if data['hits'][i].find('K') > 0:
            data['hits'][i] = float(data['hits'][i].rstrip('K'))* 1000
        else:
            data['hits'][i] = float(data['hits'][i])
    data['hits'] = pd.to_numeric(data['hits'])
    data['wage'] = pd.to_numeric(data['wage'])
    data['value'] = pd.to_numeric(data['value'])
    data['release_clause'] = pd.to_numeric(data['release_clause'])
    data['height'] = pd.to_numeric(data['height'])    

    return data

In [22]:
import pandas as pd
train_data = pd.read_csv('fifa21_train.csv')

top_clubs, top_nationalities = top_lists(train_data, 20)

cleaned_data = data_cleaning(train_data, top_clubs, top_nationalities)
cleaned_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['height'][i] = round(((float(h[0])*12) + float(h[1])) * 2.54, 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['club'][i] = "Other"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['nationality'][j] = "Other"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['nationality'][j] = data['national

Unnamed: 0,age,nationality,club,bp,height,weight,foot,growth,joined,value,...,st,lw,cf,rw,cam,cm,cdm,cb,gk,ova
0,26,Other,Other,CM,175.3,161,Right,1,735780,525.0,...,58.0,61.0,62.0,61.0,63.0,63.0,59.0,54.0,15.0,64
1,30,Other,Other,ST,182.9,159,Right,0,735614,8500.0,...,77.0,77.0,77.0,77.0,76.0,68.0,53.0,48.0,18.0,77
2,33,Italy,Other,CAM,162.6,134,Right,0,737090,9000.0,...,73.0,80.0,79.0,80.0,80.0,74.0,56.0,41.0,12.0,80
3,22,Other,Other,CDM,177.8,152,Right,13,736146,275.0,...,50.0,51.0,51.0,51.0,53.0,56.0,58.0,58.0,14.0,59
4,23,France,Other,CDM,180.3,150,Right,8,736876,725.0,...,56.0,59.0,59.0,59.0,61.0,63.0,64.0,61.0,15.0,65


In [23]:
import numpy as np
data_num = cleaned_data._get_numeric_data()
data_cat = cleaned_data.select_dtypes(['object'])
    
# Create correlation matrix
corr_matrix = data_num.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features 
data_num = data_num.drop(to_drop, axis=1)
cleaned_data = cleaned_data.drop(to_drop, axis=1)

# DATA ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(data_cat)
cols = encoder.get_feature_names(input_features = data_cat.columns)
data_cat_encoded = pd.DataFrame(encoder.transform(data_cat).toarray(),columns=cols)    

# DATA SCALING
from sklearn.preprocessing import MinMaxScaler
y = cleaned_data['ova']
X_num = data_num.drop(['ova'], axis = 1)
X = pd.concat([X_num, data_cat_encoded], axis=1)

min_max_scaler = MinMaxScaler().fit(X)
x_min_max = min_max_scaler.transform(X)
pd.DataFrame(x_min_max, columns=X.columns)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


Unnamed: 0,age,height,weight,growth,joined,value,wage,attacking,skill,movement,...,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST,foot_Right
0,0.370370,0.401575,0.349206,0.038462,0.823896,0.005833,0.000004,0.546835,0.569087,0.663818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.518519,0.551181,0.333333,0.000000,0.808663,0.094444,0.000024,0.817722,0.777518,0.829060,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.629630,0.151575,0.134921,0.000000,0.944113,0.100000,0.000052,0.744304,0.892272,0.886040,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.222222,0.450787,0.277778,0.500000,0.857484,0.003056,0.000004,0.506329,0.505855,0.481481,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.259259,0.500000,0.261905,0.307692,0.924475,0.008056,0.000002,0.524051,0.555035,0.601140,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11637,0.481481,0.301181,0.293651,0.000000,0.957970,0.006111,0.526316,0.627848,0.562061,0.726496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
11638,0.333333,0.600394,0.404762,0.192308,0.911352,0.006944,0.000003,0.460759,0.480094,0.626781,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11639,0.407407,0.600394,0.452381,0.153846,0.957970,0.017778,0.000004,0.106329,0.124122,0.353276,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11640,0.222222,0.250000,0.206349,0.307692,0.943379,0.012222,0.000002,0.617722,0.580796,0.783476,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [24]:
from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(x_min_max, y, test_size=0.2, random_state=42)

from sklearn import linear_model
lm_1 = linear_model.LinearRegression()
lm_1.fit(X_train_1,y_train_1)


LinearRegression()

In [25]:
from sklearn.metrics import r2_score
predictions_1 = lm_1.predict(X_test_1)
R2_1 = r2_score(y_test_1, predictions_1)
print(R2_1)

0.8960195647458918


In [26]:
data_validate = pd.read_csv('fifa21_validate.csv')

In [27]:
cleaned_data_2 = data_cleaning(data_validate, top_clubs, top_nationalities)
cleaned_data_2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['height'][i] = round(((float(h[0])*12) + float(h[1])) * 2.54, 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['club'][i] = "Other"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['nationality'][j] = data['nationality'][j]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['nationality'][j] =

Unnamed: 0,age,nationality,club,bp,height,weight,foot,growth,joined,value,...,st,lw,cf,rw,cam,cm,cdm,cb,gk,ova
0,23,United States,Other,CB,188.0,194,Right,7,736733,975.0,...,48.0,48.0,47.0,48.0,48.0,51.0,61.0,67.0,16.0,67
1,22,Other,Other,CAM,172.7,154,Right,5,737444,1200.0,...,64.0,68.0,68.0,68.0,68.0,61.0,47.0,38.0,17.0,68
2,19,United States,Other,GK,188.0,176,Right,17,737026,120.0,...,18.0,15.0,17.0,15.0,17.0,18.0,18.0,18.0,53.0,54
3,16,England,Other,CDM,190.5,170,Right,23,737434,160.0,...,46.0,47.0,46.0,47.0,47.0,49.0,54.0,54.0,11.0,55
4,24,Other,Other,CDM,188.0,170,Right,5,737241,2300.0,...,63.0,66.0,66.0,66.0,68.0,70.0,72.0,68.0,18.0,70


In [28]:
data_num_2 = cleaned_data_2._get_numeric_data()
data_cat_2 = cleaned_data_2.select_dtypes(['object'])
    
data_num_2 = data_num_2.drop(to_drop, axis=1)
cleaned_data_2 = cleaned_data_2.drop(to_drop, axis=1)

# DATA ENCODING
cols_2 = encoder.get_feature_names(input_features = data_cat_2.columns)
data_cat_encoded_2 = pd.DataFrame(encoder.transform(data_cat_2).toarray(),columns=cols)    

# DATA SCALING
y_2 = cleaned_data_2['ova']
X_num_2 = data_num_2.drop(['ova'], axis = 1)
X_2 = pd.concat([X_num_2, data_cat_encoded_2], axis=1)

x_min_max_2 = min_max_scaler.transform(X_2)
pd.DataFrame(x_min_max_2, columns=X_2.columns)


Unnamed: 0,age,height,weight,growth,joined,value,wage,attacking,skill,movement,...,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST,foot_Right
0,0.259259,0.651575,0.611111,0.269231,0.911352,0.010833,0.000005,0.475949,0.433255,0.541311,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.222222,0.350394,0.293651,0.192308,0.976599,0.013333,0.000003,0.648101,0.672131,0.746439,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.111111,0.651575,0.468254,0.653846,0.938240,0.001333,0.526316,0.015190,0.021077,0.148148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.000000,0.700787,0.420635,0.884615,0.975681,0.001778,0.526316,0.437975,0.423888,0.461538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.296296,0.651575,0.420635,0.192308,0.957970,0.025556,0.000014,0.640506,0.665105,0.589744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1986,0.185185,0.350394,0.206349,0.423077,0.995136,0.003611,0.000001,0.392405,0.384075,0.658120,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1987,0.185185,0.801181,0.468254,0.346154,0.874369,0.002111,0.736842,0.088608,0.063232,0.225071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1988,0.407407,0.301181,0.230159,0.000000,0.945306,0.088889,0.000009,0.734177,0.772834,0.678063,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1989,0.666667,0.500000,0.468254,0.000000,0.995503,0.001556,0.000004,0.572152,0.510539,0.498575,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
predictions_2 = lm_1.predict(x_min_max_2)
R2_2 = r2_score(y_2, predictions_2)
print(R2_2)

0.8850216119547526
