## Dokumentation

data=uncleaned csv file
df=cleaned csv file

For data:

*basic_information(data)*
*basic_information2(data)*

-> both will only print information

*master_cleaner(data,drop_list,condition=None)*

-> will return df (cleaned data)

*scaling(df,target,scaler)*

-> took in cleaned dataframe from master_cleaner-function, importing necessary libaries, returning X_scaled,Y

-> Key words for scaler: "StandardScaler","MinMax","Normalizer"

### Importing libaries and csv-file

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

In [2]:
#stays as an input for all following models
data=pd.read_csv("fifa21_trainning.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Nationality,Club,BP,Position,Team & Contract,Height,...,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,OVA
0,1954,184383,A. Pasche,26,Switzerland,FC Lausanne-Sport,CM,CM CDM,FC Lausanne-Sport 2015 ~ 2020,"5'9""",...,59+1,59+1,59+1,58+1,54+1,54+1,54+1,58+1,15+1,64
1,2225,188044,Alan Carvalho,30,China PR,Beijing Sinobo Guoan FC,ST,ST LW LM,"Beijing Sinobo Guoan FC Dec 31, 2020 On Loan","6'0""",...,53+2,53+2,57+2,53+2,48+2,48+2,48+2,53+2,18+2,77
2,1959,184431,S. Giovinco,33,Italy,Al Hilal,CAM,CAM CF,Al Hilal 2019 ~ 2022,"5'4""",...,56+2,56+2,59+2,53+2,41+2,41+2,41+2,53+2,12+2,80
3,9815,233796,J. Evans,22,Wales,Swansea City,CDM,CDM CM,Swansea City 2016 ~ 2021,"5'10""",...,58+2,58+2,56+2,57+2,58+2,58+2,58+2,57+2,14+2,59
4,10074,234799,Y. Demoncy,23,France,US Orléans Loiret Football,CDM,CDM CM,US Orléans Loiret Football 2018 ~ 2021,"5'11""",...,64+2,64+2,64+2,63+2,61+2,61+2,61+2,63+2,15+2,65


### 1. Creating basic_information()-function and basic_information2()-function

In [3]:
#getting basic information
def basic_information(df):
    df_cols=list(df.columns)
    print("Basic information")
    print("Shape of DF:", df.shape)

    for col in df_cols:
        print("----------------")
        print("Name: {}".format(df[col].name))
        print("Type: {}".format(df[col].dtype))
        print("Unique values: {}".format(len(df[col].unique())))
        print("NaN values: {}".format(df[col].isna().sum()))
        print("0 values: {}".format(df[col].isin([0]).sum()))
        plt.hist(df[col],bins=20)
        plt.show()


In [4]:
#basic_information(data)
#comment out if basic_information(data) is requested

In [5]:
#getting basic information
def basic_information2(df):
    df_cols=list(df.columns)

    for col in df_cols:
        print("Name: {}".format(df[col].name))
        print("NaN values: {}".format(df[col].isna().sum()))
        print("0 values: {}".format(df[col].isin([0]).sum()))

In [6]:
basic_information2(data)

Name: Unnamed: 0
NaN values: 0
0 values: 0
Name: ID
NaN values: 0
0 values: 0
Name: Name
NaN values: 0
0 values: 0
Name: Age
NaN values: 0
0 values: 0
Name: Nationality
NaN values: 0
0 values: 0
Name: Club
NaN values: 21
0 values: 0
Name: BP
NaN values: 0
0 values: 0
Name: Position
NaN values: 343
0 values: 0
Name: Team & Contract
NaN values: 0
0 values: 0
Name: Height
NaN values: 0
0 values: 0
Name: Weight
NaN values: 0
0 values: 0
Name: foot
NaN values: 0
0 values: 0
Name: Growth
NaN values: 0
0 values: 4631
Name: Joined
NaN values: 44
0 values: 0
Name: Loan Date End
NaN values: 12961
0 values: 0
Name: Value
NaN values: 0
0 values: 0
Name: Wage
NaN values: 0
0 values: 0
Name: Release Clause
NaN values: 0
0 values: 0
Name: Contract
NaN values: 0
0 values: 0
Name: Attacking
NaN values: 0
0 values: 0
Name: Crossing
NaN values: 0
0 values: 0
Name: Finishing
NaN values: 0
0 values: 0
Name: Heading Accuracy
NaN values: 0
0 values: 0
Name: Short Passing
NaN values: 0
0 values: 0
Name: Volle

### 2. Creating master_cleaner()-function

In [7]:
#dropping rows and columns
#Level 2
def dropping(df,drop_list,condition=None):#type list_columns_drop=list, condition=(df[enter_column]==enter_condition)
    
    df.drop_duplicates() #dropping rows
    df.drop(drop_list,axis=1,inplace=True)
    
    #if (condition != None):
        #example condition = (df["BP"] != "GK")
        #list_to_drop = list(df[condition].index)
        #df.drop(list_to_drop, axis= 0, inplace = True)
        #return df
    
    #else:
    return df

In [8]:
#condition = (test["BP"] != "GK")
#list_to_drop = list(test[condition].index)
#test.drop(list_to_drop , axis= 0, inplace = True)
#test.shape

In [9]:
#cleaning columns weight and height
#Level 3
def weight(i):
    return int(str(i)[:-3])

def parse_ht(ht):
    ht_ = ht.split("'")
    ft_ = float(ht_[0])
    in_ = float(ht_[1].replace("\"",""))
    return (12*ft_) + in_

In [10]:
#cleaning columns weight and height
#Level 2
def clean_weight(df):
    df2 = df.copy()
    df2['Weight'] = df2['Weight'].map(lambda x : weight(x))
    return df2

def clean_height(df):
    df2 = df.copy()
    df2['Height'] = df2['Height'].apply(lambda x:parse_ht(x))
    return df2

In [11]:
#cleaning money
#Level 3
def transform(s):
    string = str(s)[1:]
    if ("K") in string:
        string = str(string)[:-1]
        string = float(string)
        string=string*1000
    elif ("M") in string:
        string = str(string)[:-1]
        string = float(string)
        string=string*1000000   
    return float(string)

In [12]:
#cleaning money
#Level 2
def transform_money(df):
    df2 = df.copy()
    list_col=['Value','Wage']
    for col in list_col:
        df2[col]=df2[col].apply(lambda x:transform(x))
    return df2

In [13]:
#cleaning money
#Level 2
def clean_stars(df):
    df2 = df.copy()
    df2['W/F'] = df2['W/F'].map(lambda x: str(x)[:-1])
    df2['SM'] = df2['SM'].map(lambda x: str(x)[:-1])
    df2['IR'] = df2['IR'].map(lambda x: str(x)[:-1])
    df2['W/F'] = df2['W/F'].astype(int)
    df2['SM'] = df2['SM'].astype(int)
    df2['IR'] = df2['IR'].astype(int)
    return df2

In [14]:
#splitting skills
#Level 3
def trim_plus(s): #one cell of one list
    a_list=str(s).split("+")
    a=int(a_list[0])
    return a

def trim_plus2(s): #one cell of one list
    a_list=str(s).split("+")
    a=int(a_list[1])
    return a

In [15]:
#splitting skills
#Level 2

def split_skills(df):
    df2 = df.copy()
    skill_cols=['LS', 'ST', 'RS', 'LW', 'LF', 'CF','RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB','LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']
    #skill_cols is a list
    
    for col in skill_cols:
        df2[col+"1"]=df2[col].apply(lambda x: trim_plus(x))
        df2[col+"2"]=df2[col].apply(lambda x: trim_plus2(x))
        df2.drop(columns=col, inplace = True)
        
    return df2

In [16]:
#cleaning hits
#level 3

def transform_hits(s):
    if ("K") in s:
        string=str(s)[:-1]
        string=float(string)
        string=string*1000
    else:
        string=float(s)
    return string

In [17]:
#cleaning hits
#level 2

def clean_hits(df):
    df2 = df.copy()
    df2["Hits"]=df2["Hits"].apply(lambda x: transform_hits(x))
    return df2

In [18]:
#replace nas
#level 2
def replace_nas(df):
    df['D/W'] = df['D/W'].replace(np.nan, "Medium")
    df['A/W'] = df['A/W'].replace(np.nan, "Medium")
    
    list_nas=["Growth", "Volleys", "Curve", "Agility", "Balance", "Jumping", "Interceptions", "Positioning","Vision", "Composure", "Sliding Tackle"]
    for col in list_nas:
        df[col]=df[col].fillna(df[col].mean())
    
    return df

In [19]:
#making ordinals
#level 2

def make_ordinals(df):
    df2=df.copy()
    df2_ordi=df2[["BP",'foot','A/W','D/W']]
    
    one_hot_data = pd.get_dummies(df2_ordi,drop_first = True)
    df2.drop(df2_ordi, axis=1, inplace = True)
    df2=pd.concat([df2,one_hot_data],axis=1)
    
    return df2

### 2.1 Finished master_cleaner()-function

In [20]:
#Top level function for cleaning
#Level 1
def master_cleaner(df,drop_list,condition=None): 
    
    dropping(df,drop_list,condition=None)
    df = clean_weight(df)
    df = clean_height(df)
    df = transform_money(df)
    df = clean_stars(df)
    df = split_skills(df)
    df = clean_hits(df)
    df = replace_nas(df)
    df = make_ordinals(df)
    
    return df

### 2.2 Applying master_cleaner() and basic_information2()

In [21]:
drop_list=['Unnamed: 0', 'ID',"Position",'Name','Nationality','Club','Team & Contract','Joined','Loan Date End','Release Clause', 'Contract',]

In [22]:
#df=cleaned data
df_clean=master_cleaner(data,drop_list) #no condition requested for dropping rows
df_clean.head()

Unnamed: 0,Age,Height,Weight,Growth,Value,Wage,Attacking,Crossing,Finishing,Heading Accuracy,...,BP_RB,BP_RM,BP_RW,BP_RWB,BP_ST,foot_Right,A/W_Low,A/W_Medium,D/W_Low,D/W_Medium
0,26,69.0,161,1,525000.0,4000.0,258,54,47,43,...,0,0,0,0,0,1,0,0,0,1
1,30,72.0,159,0,8500000.0,23000.0,365,66,79,76,...,0,0,0,0,1,1,0,0,1,0
2,33,64.0,134,0,9000000.0,49000.0,336,73,76,34,...,0,0,0,0,0,1,0,0,0,1
3,22,70.0,152,13,275000.0,4000.0,242,44,42,58,...,0,0,0,0,0,1,0,1,0,1
4,23,71.0,150,8,725000.0,2000.0,249,49,37,61,...,0,0,0,0,0,1,1,0,0,1


### 3. Function for scaling and x-y-split

In [23]:
def scaling(df,target,scaler):
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
    #x-y-split
    Y=df[target]
    X=df.drop([target],axis=1)
    
    #chosing scaler
    #StandardScaler
    if scaler=="StandardScaler":
        scaler_x = StandardScaler().fit(X)
        X_scaled = pd.DataFrame(scaler_x.transform(X),columns=X.columns)
        
        return X_scaled,Y
    #MinMaxScaler()
    elif scaler=="MinMax":
        X_scaled = MinMaxScaler().fit_transform(X.values)
        X_scaled = pd.DataFrame(X_scaled,columns=X.columns)
        
        return X_scaled,Y
    #Normalizer
    elif scaler=="Normalizer":
        transformer = Normalizer().fit(X)
        X_scaled = pd.DataFrame(transformer.transform(X),columns=X.columns)
        
        return X_scaled,Y

### 3.1 Applying scaling()

In [24]:
X_scaled,Y=scaling(df_clean,"OVA","StandardScaler")

### 4. Function for linear_reg()

In [25]:
def linear_reg(X_scaled,Y):
    #applying linear regression
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.3, random_state=100)
    lm = linear_model.LinearRegression()
    model = lm.fit(X_train,y_train)
    predictions  = lm.predict(X_test)
    predictions_df = pd.DataFrame(predictions,columns=["OVA"])
    
    return model, predictions_df, predictions, y_test, X_test

### 4.1 Applying linear_reg()

In [26]:
model,predictions_df, predictions, y_test, X_test=linear_reg(X_scaled,Y)

In [27]:
predictions_df.head()

Unnamed: 0,OVA
0,70.840671
1,73.483494
2,84.280216
3,72.176353
4,64.10226


### 5. Funtion for saving_model()
#https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

In [28]:
def saving_model(df,model,filename):
    from sklearn import model_selection
    import pickle
    
    pickle.dump(df, open(filename, 'wb'))
    
    print("Model safed")

### 5.1 Applying saving_model()

In [29]:
saving_model(df_clean,model,"model1")

Model safed


### 6. Funtion for error_matrix()

In [30]:
def error_matrix(model,predictions,y_test, X_test):
    from sklearn.metrics import mean_squared_error, r2_score
    import math
    
    mse = mean_squared_error(y_test, predictions)
    print("The MSE value is: ",round(mse,4))
    
    rmse = math.sqrt(mse)
    print("The RMSE value is: ",round(rmse,4))
    
    r2 = r2_score(y_test, predictions)
    print("The R2 value is: ",round(r2,4))
    
    N = len(X_test)
    p = X_test.shape[1]
    adj_r2 = 1-((1-r2)*(N-1)/(N-p-1))
    print("The R_adj^{2} value  is: ",round(adj_r2,4))
    
    results_error_matrix=[mse,rmse,r2,adj_r2]
    
    return results_error_matrix

### 6.1 Applying error_matrix()

In [31]:
results_model1=error_matrix(model,predictions,y_test, X_test)

The MSE value is:  3.4994
The RMSE value is:  1.8707
The R2 value is:  0.9253
The R_adj^{2} value  is:  0.9228


# Model with MinMax

In [32]:
data2=pd.read_csv("fifa21_trainning.csv")

In [33]:
drop_list=['Unnamed: 0', 'ID',"Position",'Name','Nationality','Club','Team & Contract','Joined','Loan Date End','Release Clause', 'Contract',]
condition=(data2["BP"] != "GK")

In [34]:
df_2=master_cleaner(data2,drop_list,condition)

In [35]:
df_2.shape

(13700, 133)

In [37]:
X_scaled2,Y2=scaling(df_2,"OVA","MinMax")
model2, predictions2_df, predictions2, y_test2, X_test2=linear_reg(X_scaled2,Y2)
saving_model(df_2,model2,"model2")

Model safed


In [38]:
results_model2=error_matrix(model2,predictions2,y_test2, X_test2)

The MSE value is:  3.5016
The RMSE value is:  1.8712
The R2 value is:  0.9252
The R_adj^{2} value  is:  0.9228


In [40]:
X_scaled3,Y3=scaling(df_2,"OVA","Normalizer")
model3, predictions3_df, predictions3, y_test3, X_test3=linear_reg(X_scaled3,Y3)
saving_model(df_2,model3,"model3")

Model safed


In [41]:
results_model3=error_matrix(model3,predictions3,y_test3, X_test3)

The MSE value is:  40.1277
The RMSE value is:  6.3346
The R2 value is:  0.1433
The R_adj^{2} value  is:  0.1149
