In [207]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import pickle
import pandas as pd
import numpy as np
import statistics as stats
import re
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
isTrain=True

In [208]:
fifa21_df=pd.read_csv('fifa21_df_cleaned.csv')
fifa21_df = fifa21_df.drop(['contract_start','contract_end','nationality','club'], axis=1)

In [209]:
def preprocess_model_input(df,output_col=None,is_train_test_split=False):
    # 1. Split the numerical, categorical and the output column:
    X_num=df.select_dtypes(include=np.number)
    X_cat=df.select_dtypes(include=object)
    if output_col is not None:
        y = X_num[output_col]
        X_num = X_num.drop([output_col], axis=1)
    
    # 2. Normalizing data:
    transformer = MinMaxScaler().fit(X_num)
    X_normalized = transformer.transform(X_num)
    X_normalized = pd.DataFrame(X_normalized,columns=X_num.columns)
    
    # 3. Encoding categorical columns:
    encoder = OneHotEncoder().fit(X_cat)
    encoded = encoder.transform(X_cat).toarray()
    cols = encoder.get_feature_names_out(input_features=X_cat.columns)
    onehot_encoded = pd.DataFrame(encoded, columns=cols)
    
    #######################################################
    #X_normalized=X_normalized[['age','height','weight','growth','value','wage','release_clause','attacking','crossing','finishing','heading_accuracy','short_passing','volleys','skill','dribbling','curve','fk_accuracy','long_passing','ball_control','movement','acceleration','sprint_speed','agility','reactions','balance','power','shot_power','jumping','stamina','strength','long_shots','mentality','aggression','interceptions','positioning','vision','penalties','composure','defending','marking','standing_tackle','sliding_tackle','goalkeeping']]
    # defending,standing_tackle,sliding_tackle is the culprit!
    X_normalized=X_normalized[['age','height','weight','value','wage','release_clause','attacking','crossing','finishing','heading_accuracy','short_passing','volleys','skill','dribbling','curve','fk_accuracy','long_passing','ball_control','movement','acceleration','sprint_speed','agility','reactions','balance','power','shot_power','jumping','stamina','strength','long_shots','mentality','aggression','interceptions','positioning','vision','penalties','composure','marking','goalkeeping']]
    #######################################################
    
    # 4. Concatenating normalized numeric columns and encoded categorical columns
    
    #######################################################
    #X = pd.concat([X_normalized, onehot_encoded], axis=1)
    X = pd.concat([X_normalized], axis=1)
    #######################################################
    if is_train_test_split:
        # 5. Creating a Train-Test Split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

        return X_train, X_test, y_train, y_test
    else:
        if output_col is not None:
            return X,y
        else:
            return X
    
def produce_metrics_lm(lm,X_test,y_test,isPrint=False):
    predictions_test = lm.predict(X_test)
    r2=r2_score(y_test, predictions_test),
    mas=mean_absolute_error(y_test, predictions_test)
    mse=mean_squared_error(y_test,predictions_test)
    rmse=np.sqrt(mean_squared_error(y_test,predictions_test))
    if isPrint:
        print("r2_score: ",r2)
        print("mean absolute error :",mas)
        print("mean square error :",mse)
        print("root mean square error :",rmse)
    return r2,mas,mse,rmse,predictions_test

def preprocess(df,filename):
    # 1. Standardize header names.
    df=standardize_headers(df)
    
    # 2. Drop position of the following columns
    df.drop(['id','name','position','team_&_contract','loan_date_end'], axis=1,inplace=True)
    
    # 3. Converting Height to cm
    df=convert_ftin_to_cm(df,'height')
    
    # 4. Convert weight in lbs to kgs
    df=convert_lbs_to_kg(df,'weight')
    
    # 5. Convert joined col to Datetime
    df=replace_datecol_with_yearcol(df,'joined')
    
    # 6. Convert value, wage and release_clause to float
    df=convert_currency_col_to_int(df,'value')
    df=convert_currency_col_to_int(df,'wage')
    df=convert_currency_col_to_int(df,'release_clause')
    
    # 7. Add contract start and end date
    df=split_year_range_to_cols(df,'contract')
    
    # 8. Add column is_on_loan
    df['is_on_loan']=df['contract_start'].apply(lambda x : 0 if x.find('On Loan')==-1 else 1)
    
    # 9. Eliminate the ★ and convert to int
    df['w/f']=df['w/f'].apply(lambda x : int(x.replace("★","")))
    df['sm']=df['sm'].apply(lambda x : int(x.replace("★","")))
    df['ir']=df['ir'].apply(lambda x : int(x.replace("★","")))
    
    # 10. Extract contract start date from contract_start column containing other strings
    df=extract_year_from_str_col(df,'contract_start')
    
    # 11. Convert columns contract_start and contract_end to int
    df['contract_start']=pd.to_numeric(df['contract_start'],errors='coerce')
    df['contract_end']=pd.to_numeric(df['contract_end'],errors='coerce')
    # Drop column contract
    df.drop(['contract'], axis=1,inplace=True)
    
    # 12. Convert other string columns containing only ints
    df.select_dtypes(include=object)
    df['hits']=pd.to_numeric(df['hits'],errors='coerce')
    
    # 13. Convert skill columns to numeric after adding the bonus to a single integer value
    skill_cols=['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm','lwb','ldm','cdm','rdm','rwb','lb','lcb','cb','rcb','rb','gk']
    for col in skill_cols:
        df=sumup_operands(df,col)
        
    # 14. Check for NaNs
    nan_rep_df=identify_cols_nan(df)
    
    # 15. Handling of NaNs for below columns:
    
    # We choose median value for Nan values in composure
    df['composure'].value_counts()
    df['composure'] = df['composure'].fillna(stats.median(df['composure']))

    # We choose mode value for Nan values in a/w
    df['a/w'].value_counts()
    df['a/w'] = df['a/w'].fillna(stats.mode(df['a/w']))
    
    # We choose mode value for Nan values in d/w
    df['d/w'].value_counts()
    df['d/w'] = df['d/w'].fillna(stats.mode(df['d/w']))
    
    # We choose mode value for Nan values in hits
    df['hits'].value_counts()
    df['hits'] = df['hits'].fillna(stats.mode(df['hits']))
    
    # Remove Nans
    df = df[df['contract_start'].notna()]
    
    # Export the cleaned dataset:
    df.to_csv(filename+".csv",index=False)
    
    return df

def standardize_headers(df):
    df.columns = [col.lower().replace(' ','_') for col in df.columns]
    return df

def identify_cols_nan(df):
    df=pd.DataFrame(df.isna().sum(),columns=['count'])
    return df[df['count']>0]

def sumup_operands(df,col_name):
    df[col_name]=df[col_name].apply(lambda x : int(x.split("+")[0])+int(x.split("+")[1]))
    return df

def extract_year_from_str_col(df,col_name):
    df[col_name]=df[col_name].apply(lambda x : re.findall("\d{4}",x)[0] if len(re.findall("\d{4}",x))>0 else np.nan)
    return df

def split_year_range_to_cols(df,col_name):
    df[col_name+'_start'] = ""
    df[col_name+'_end'] = ""
    df[col_name+'_start']=df[col_name].apply(lambda x : x.split("~")[0].strip())
    df[col_name+'_end']=df[col_name].apply(lambda x : np.nan if x.find('~') == -1 else x.split("~")[1].strip())
    return df

def convert_currency_col_to_int(df,col_name):
    df[col_name]=df[col_name].apply(lambda x : int(x.replace("€","").replace("K","000").replace("M","000000").replace(".","")))
    return df

def replace_datecol_with_yearcol(df,col_name):
    df[col_name]=pd.to_datetime(df[col_name],errors='coerce')
    df[col_name+'_'+'year']=df[col_name].apply(lambda x : x.year)

    # Drop position of the joined column
    df.drop([col_name], axis=1,inplace=True)

    # Check how many are NaN
    df[col_name+'_'+'year'].isna().sum()

    # Keep only the non NaN for joined_year
    df = df[df[col_name+'_'+'year'].notna()]

    # Change joined_year to int
    df[col_name+'_'+'year']=df[col_name+'_'+'year'].apply(lambda x : int(x))
    
    return df

def convert_lbs_to_kg(df,col_name):
    df[col_name]=df[col_name].apply(lambda x : round(int(x.replace("lbs",""))*0.45359237,0))
    return df

def convert_ftin_to_cm(df,col_name):
    df[col_name]=df[col_name].apply(lambda x : int(x.split("'")[0])*30.48+int(x.split("'")[1].replace("\"",""))*2.54)
    df[col_name]=df[col_name].apply(lambda x : round(x,0))
    return df



In [210]:
X_train, X_test, y_train, y_test = preprocess_model_input(fifa21_df,'ova',True)

In [211]:
isTrain=True
# Creating and fitting a Linear Regression Model
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)
if isTrain==True:
    filename = 'fifa21_lm.sav'
    pickle.dump(lm, open(filename, 'wb'))
else:
    lm = pickle.load(open(filename, 'rb'))

In [212]:
print("Training Metrics:")
produce_metrics_lm(lm,X_train,y_train,True)
print("Test Metrics:")
produce_metrics_lm(lm,X_test,y_test,True)

Training Metrics:
r2_score:  (0.8923673787532845,)
mean absolute error : 1.7378951512541703
mean square error : 5.019027366488217
root mean square error : 2.24031858593554
Test Metrics:
r2_score:  (0.8935489519416288,)
mean absolute error : 1.7322744412364557
mean square error : 5.167513999364535
root mean square error : 2.273216663533095


((0.8935489519416288,),
 1.7322744412364557,
 5.167513999364535,
 2.273216663533095,
 array([62.40021557, 70.7072977 , 71.0724332 , ..., 75.24935656,
        68.90827255, 55.68155459]))

In [213]:
isTrain=False
fifa21_df_validation=pd.read_csv("fifa21_validate.csv")

In [214]:
fifa21_df_validation_cleaned=preprocess(fifa21_df_validation,'fifa21_df_valid_cleaned')
fifa21_df_validation_cleaned = fifa21_df_validation_cleaned.drop(['contract_start','contract_end','nationality','club'], axis=1)

In [215]:
X_valid,y_valid=preprocess_model_input(fifa21_df_validation_cleaned,'ova',False)

In [216]:
# Loading the Linear Regression Model
if isTrain==True:
    filename = 'fifa21_lm.sav'
    pickle.dump(lm, open(filename, 'wb'))
else:
    lm = pickle.load(open(filename, 'rb'))

In [217]:
print("Validation Metrics:")
produce_metrics_lm(lm,X_valid,y_valid,True)

Validation Metrics:
r2_score:  (0.8661428579667874,)
mean absolute error : 1.9179418662566536
mean square error : 6.144826729915442
root mean square error : 2.478876102171192


((0.8661428579667874,),
 1.9179418662566536,
 6.144826729915442,
 2.478876102171192,
 array([65.15958201, 65.85396632, 50.08343017, ..., 71.94452079,
        64.98371144, 62.27879893]))