In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
!pip install scikit-learn
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [2]:
data = pd.read_csv("C:\\Users\\Santiago\\Desktop\\Ironhack Files\\fifa21_train.csv")
data.shape

(11701, 101)

In [3]:
def clean_data(data):
    # NULL VALUES
    # Loan Date End
    data = data.drop(columns=['Loan Date End'], axis=1)

    # Joined, Volleys, Curve, Agility, Balance, Jumping, and Vision
    data = data.dropna(subset=['Joined']).copy()

    # A/W and D/W
    data = data.dropna(subset=['A/W']).copy()

    # Club
    data['Club'].fillna('Unknown/Retire', inplace=True)

    # Position
    data['Position'].fillna(data['BP'], inplace=True)

    # Composure
    data["Composure"] = data.groupby("BP")["Composure"].transform(lambda x: x.fillna(x.mean()))

    # NUMERICAL
    # Positions
    columns = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']
    for column in columns:
        data[column] = data[column].astype(str).str.extract(r'(\d+)\+').astype(float)

    # Height
    def height_to_cm(height_str):
        feet, inches = height_str.split("'")
        feet = int(feet)
        inches = int(inches.replace("\"", ""))
        total_inches = (feet * 12) + inches
        return total_inches * 2.54

    data['Height'] = data['Height'].apply(height_to_cm)
    data['Height'] = data['Height'].astype('float')

    # Weight
    def lbs_to_kg(weight):
        weight_in_lbs = int(weight.replace('lbs', ''))
        weight_in_kg = round(weight_in_lbs * 0.45359237, 2)
        return weight_in_kg

    data['Weight'] = data['Weight'].apply(lbs_to_kg)
    data['Weight'] = data['Weight'].astype('float')

    # Value
    money_columns = ["Value", "Wage", "Release Clause"]

    def money_in_numbers(value_str):
        value_str = value_str.replace("€", "")
        if "K" in value_str:
            return float(value_str.replace("K", "")) * 1e3
        elif "M" in value_str:
            return float(value_str.replace("M", "")) * 1e6
        else:
            return float(value_str)

    for col in money_columns:
        data[col] = data[col].apply(money_in_numbers)

    # Hits
    data['Hits'] = data['Hits'].apply(lambda x: float(x.replace('K', '')) * 1e3 if 'K' in str(x) else x)
    data['Hits'] = data['Hits'].astype(float)

    # BP_value
    def get_corresponding_value(row):
        column_name = row['BP']
        return row[column_name]

    data['BP_Value'] = data.apply(get_corresponding_value, axis=1)

    new_position = 6
    column_to_move = data.pop('BP_Value')
    data.insert(new_position, 'BP_Value', column_to_move)

    # ID
    data['ID'] = data['ID'].astype('object')

    # OVA
    data['OVA'] = data['OVA'].astype('float')

    return data

data = clean_data(data)

## FIFA | Modeling

### FIFA | Modelin | Linear Regression

In [4]:
# separate the features from the labels
y = data['OVA']
columns_to_drop=['OVA','ID','Name','Team & Contract','Contract','Nationality','Club','Joined', 'Position','BP_Value']
X = data.drop(columns_to_drop,axis=1)

In [5]:
#categorical features and numerical ones are going to be treated differently
X_num = X.select_dtypes(include = np.number)
X_cat = X.select_dtypes(include = object)

### FIFA | Modeling | numerical variables

In [6]:
X_num.columns

Index(['Age', 'Height', 'Weight', 'Growth', 'Value', 'Wage', 'Release Clause',
       'Attacking', 'Crossing', 'Finishing', 'Heading Accuracy',
       'Short Passing', 'Volleys', 'Skill', 'Dribbling', 'Curve',
       'FK Accuracy', 'Long Passing', 'Ball Control', 'Movement',
       'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
       'Power', 'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
       'Mentality', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
       'Penalties', 'Composure', 'Defending', 'Marking', 'Standing Tackle',
       'Sliding Tackle', 'Goalkeeping', 'GK Diving', 'GK Handling',
       'GK Kicking', 'GK Positioning', 'GK Reflexes', 'Total Stats',
       'Base Stats', 'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY', 'Hits', 'LS',
       'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM',
       'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB',
       'LCB', 'CB', 'RCB', 'RB', 'GK'],
      dtype='obj

In [7]:
X_num.head()

Unnamed: 0,Age,Height,Weight,Growth,Value,Wage,Release Clause,Attacking,Crossing,Finishing,...,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK
0,26,175.26,73.03,1,525000.0,4000.0,801000.0,258,54,47,...,59.0,59.0,59.0,59.0,58.0,54.0,54.0,54.0,58.0,15.0
1,30,182.88,72.12,0,8500000.0,23000.0,0.0,365,66,79,...,53.0,53.0,53.0,57.0,53.0,48.0,48.0,48.0,53.0,18.0
2,33,162.56,60.78,0,9000000.0,49000.0,15300000.0,336,73,76,...,56.0,56.0,56.0,59.0,53.0,41.0,41.0,41.0,53.0,12.0
3,22,177.8,68.95,13,275000.0,4000.0,694000.0,242,44,42,...,58.0,58.0,58.0,56.0,57.0,58.0,58.0,58.0,57.0,14.0
4,23,180.34,68.04,8,725000.0,2000.0,1400000.0,249,49,37,...,64.0,64.0,64.0,64.0,63.0,61.0,61.0,61.0,63.0,15.0


In [8]:
MinMaxtransformer = MinMaxScaler().fit(X_num)
X_normalized = MinMaxtransformer.transform(X_num)
print(type(X_normalized))
X_normalized = pd.DataFrame(X_normalized,columns=X_num.columns)
display(X_normalized.head())
print(type(X_normalized))

<class 'numpy.ndarray'>


Unnamed: 0,Age,Height,Weight,Growth,Value,Wage,Release Clause,Attacking,Crossing,Finishing,...,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK
0,0.37037,0.4,0.349256,0.038462,0.005833,0.007143,0.004811,0.546835,0.545455,0.478261,...,0.614286,0.614286,0.614286,0.642857,0.637681,0.527778,0.527778,0.527778,0.637681,0.075949
1,0.518519,0.55,0.333333,0.0,0.094444,0.041071,0.0,0.817722,0.681818,0.826087,...,0.528571,0.528571,0.528571,0.614286,0.565217,0.444444,0.444444,0.444444,0.565217,0.113924
2,0.62963,0.15,0.134908,0.0,0.1,0.0875,0.091892,0.744304,0.761364,0.793478,...,0.571429,0.571429,0.571429,0.642857,0.565217,0.347222,0.347222,0.347222,0.565217,0.037975
3,0.222222,0.45,0.277865,0.5,0.003056,0.007143,0.004168,0.506329,0.431818,0.423913,...,0.6,0.6,0.6,0.6,0.623188,0.583333,0.583333,0.583333,0.623188,0.063291
4,0.259259,0.5,0.261942,0.307692,0.008056,0.003571,0.008408,0.524051,0.488636,0.369565,...,0.685714,0.685714,0.685714,0.714286,0.710145,0.625,0.625,0.625,0.710145,0.075949


<class 'pandas.core.frame.DataFrame'>


### FIFA | Modeling | categorical variables

In [9]:
X_cat.columns

Index(['BP', 'foot', 'W/F', 'SM', 'A/W', 'D/W', 'IR'], dtype='object')

In [10]:
X_cat.head()

Unnamed: 0,BP,foot,W/F,SM,A/W,D/W,IR
0,CM,Right,4 ★,2★,High,Medium,1 ★
1,ST,Right,3 ★,4★,High,Low,2 ★
2,CAM,Right,4 ★,4★,High,Medium,2 ★
3,CDM,Right,2 ★,2★,Medium,Medium,1 ★
4,CDM,Right,2 ★,3★,Low,Medium,1 ★


In [11]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(X_cat)  # Dropping the first varaible of each column
encoded = encoder.transform(X_cat).toarray()
column_names = encoder.get_feature_names_out(input_features=X_cat.columns) # assigning column names
onehot_encoded = pd.DataFrame(encoded, columns=column_names)
onehot_encoded.head()

Unnamed: 0,BP_CB,BP_CDM,BP_CF,BP_CM,BP_GK,BP_LB,BP_LM,BP_LW,BP_LWB,BP_RB,...,SM_4★,SM_5★,A/W_Low,A/W_Medium,D/W_Low,D/W_Medium,IR_2 ★,IR_3 ★,IR_4 ★,IR_5 ★
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### FIFA | Modeling | Concatenate

In [12]:
# let's merge all this information together into a single dataset with all features, now numerical
X = pd.concat([X_normalized, onehot_encoded], axis=1)  # np.concatenate()
X.head()

Unnamed: 0,Age,Height,Weight,Growth,Value,Wage,Release Clause,Attacking,Crossing,Finishing,...,SM_4★,SM_5★,A/W_Low,A/W_Medium,D/W_Low,D/W_Medium,IR_2 ★,IR_3 ★,IR_4 ★,IR_5 ★
0,0.37037,0.4,0.349256,0.038462,0.005833,0.007143,0.004811,0.546835,0.545455,0.478261,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.518519,0.55,0.333333,0.0,0.094444,0.041071,0.0,0.817722,0.681818,0.826087,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.62963,0.15,0.134908,0.0,0.1,0.0875,0.091892,0.744304,0.761364,0.793478,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.222222,0.45,0.277865,0.5,0.003056,0.007143,0.004168,0.506329,0.431818,0.423913,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.259259,0.5,0.261942,0.307692,0.008056,0.003571,0.008408,0.524051,0.488636,0.369565,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [13]:
print(X.shape)
print(y.shape)

(11642, 115)
(11642,)


### FIFA | Modeling | Train-test

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train = X_train.dropna()
y_train = y_train.dropna()

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9313, 115)
(2329, 115)
(9313,)
(2329,)


In [17]:
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

In [18]:
#4.1. R_square
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

0.9154687935951792

In [19]:
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test) # R-squared(Coefficient of determination) good value > 0.95

0.9188390031878713

In [20]:
#4.2 MSE
mse=mean_squared_error(y_test,predictions_test)
mse # MSE(Mean Squared Error) Lower (close to 0) the MSE, the closer is forecast to actual

3.836176995506303

In [21]:
#4.3 RMSE
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
rmse # RMSE(Root Mean Squared Error) values between 0.2 and 0.5

1.958616091914468

In [22]:
predictions_test = lm.predict(X_test) # start here for the new data 
predictions_test[:5]

array([71.53515625, 65.94921875, 70.6796875 , 69.07421875, 70.08203125])

In [23]:
y_test[:5]

5163    72.0
855     66.0
9665    72.0
4819    71.0
9453    73.0
Name: OVA, dtype: float64

### FIFA | Predicting

### FIFA | Predicting | Cleaning data

In [24]:
data_for_p = pd.read_csv("C:\\Users\\Santiago\\Desktop\\Ironhack Files\\fifa21_validate.csv")
data_for_p.shape

(1999, 101)

In [25]:
def clean_data(data):
    # NULL VALUES
    # Loan Date End
    data = data.drop(columns=['Loan Date End'], axis=1)

    # Joined, Volleys, Curve, Agility, Balance, Jumping, and Vision
    data = data.dropna(subset=['Joined']).copy()

    # A/W and D/W
    data = data.dropna(subset=['A/W']).copy()

    # Club
    data['Club'].fillna('Unknown/Retire', inplace=True)

    # Position
    data['Position'].fillna(data['BP'], inplace=True)

    # Composure
    data["Composure"] = data.groupby("BP")["Composure"].transform(lambda x: x.fillna(x.mean()))

    # NUMERICAL
    # Positions
    columns = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']
    for column in columns:
        data[column] = data[column].astype(str).str.extract(r'(\d+)\+').astype(float)

    # Height
    def height_to_cm(height_str):
        feet, inches = height_str.split("'")
        feet = int(feet)
        inches = int(inches.replace("\"", ""))
        total_inches = (feet * 12) + inches
        return total_inches * 2.54

    data['Height'] = data['Height'].apply(height_to_cm)
    data['Height'] = data['Height'].astype('float')

    # Weight
    def lbs_to_kg(weight):
        weight_in_lbs = int(weight.replace('lbs', ''))
        weight_in_kg = round(weight_in_lbs * 0.45359237, 2)
        return weight_in_kg

    data['Weight'] = data['Weight'].apply(lbs_to_kg)
    data['Weight'] = data['Weight'].astype('float')

    # Value
    money_columns = ["Value", "Wage", "Release Clause"]

    def money_in_numbers(value_str):
        value_str = value_str.replace("€", "")
        if "K" in value_str:
            return float(value_str.replace("K", "")) * 1e3
        elif "M" in value_str:
            return float(value_str.replace("M", "")) * 1e6
        else:
            return float(value_str)

    for col in money_columns:
        data[col] = data[col].apply(money_in_numbers)

    # Hits
    data['Hits'] = data['Hits'].apply(lambda x: float(x.replace('K', '')) * 1e3 if 'K' in str(x) else x)
    data['Hits'] = data['Hits'].astype(float)

    # BP_value
    def get_corresponding_value(row):
        column_name = row['BP']
        return row[column_name]

    data['BP_Value'] = data.apply(get_corresponding_value, axis=1)

    new_position = 6
    column_to_move = data.pop('BP_Value')
    data.insert(new_position, 'BP_Value', column_to_move)

    # ID
    data['ID'] = data['ID'].astype('object')

    # OVA
    data['OVA'] = data['OVA'].astype('float')

    return data

data_for_p = clean_data(data_for_p)

### FIFA | Predicting | Droping columns

In [26]:
#making the new data look like the transformed one may not be the most immediate exercise...
#we make the same separation into numerical and categorical

data_for_test = data_for_p['OVA'].head()
data_for_p = data_for_p.drop(columns_to_drop,axis=1)
X_for_p_num = data_for_p.select_dtypes(include = np.number)
X_for_p_cat = data_for_p.select_dtypes(include = object)

### FIFA | Predicting | Categorical variables

In [27]:
X_for_p_cat.head()

Unnamed: 0,BP,foot,W/F,SM,A/W,D/W,IR
0,CB,Right,2 ★,2★,Low,High,1 ★
1,CAM,Right,4 ★,3★,High,Low,1 ★
2,GK,Right,2 ★,1★,Medium,Medium,1 ★
3,CDM,Right,3 ★,2★,Medium,Medium,1 ★
4,CDM,Right,4 ★,2★,Medium,Medium,1 ★


In [28]:
encoded_for_p = encoder.transform(X_for_p_cat).toarray()
encoded_for_p
encoder.categories_
onehot_encoded_for_p = pd.DataFrame(encoded_for_p)
onehot_encoded_for_p.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


### FIFA | Predicting | numerical variables

In [29]:
X_for_p_normalized = MinMaxtransformer.transform(X_for_p_num)
X_for_p_normalized = pd.DataFrame(X_for_p_normalized,columns=X_for_p_num.columns)

#merge back all of our labels
X_for_p = pd.concat([X_for_p_normalized, onehot_encoded_for_p], axis=1)

X_for_p.head()

Unnamed: 0,Age,Height,Weight,Growth,Value,Wage,Release Clause,Attacking,Crossing,Finishing,...,21,22,23,24,25,26,27,28,29,30
0,0.259259,0.65,0.611199,0.269231,0.010833,0.008929,0.0,0.475949,0.465909,0.195652,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.222222,0.35,0.293613,0.192308,0.013333,0.005357,0.013213,0.648101,0.636364,0.619565,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.111111,0.65,0.468241,0.653846,0.001333,0.000893,0.001495,0.01519,0.011364,0.021739,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.7,0.420647,0.884615,0.001778,0.000893,0.002787,0.437975,0.363636,0.304348,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.296296,0.65,0.420647,0.192308,0.025556,0.023214,0.025826,0.640506,0.579545,0.608696,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [30]:
#predict and inspect results
results_for_p = lm.predict(X_for_p)
pd.concat([data_for_p,pd.Series(results_for_p, name='estimate')],axis=1).head()



Unnamed: 0,Age,BP,Height,Weight,foot,Growth,Value,Wage,Release Clause,Attacking,...,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,GK,estimate
0,23.0,CB,187.96,88.0,Right,7.0,975000.0,5000.0,0.0,230.0,...,61.0,61.0,59.0,61.0,67.0,67.0,67.0,61.0,16.0,66.449219
1,22.0,CAM,172.72,69.85,Right,5.0,1200000.0,3000.0,2200000.0,298.0,...,47.0,47.0,49.0,45.0,38.0,38.0,38.0,45.0,17.0,66.140625
2,19.0,GK,187.96,79.83,Right,17.0,120000.0,500.0,249000.0,48.0,...,18.0,18.0,16.0,16.0,18.0,18.0,18.0,16.0,53.0,53.574219
3,16.0,CDM,190.5,77.11,Right,23.0,160000.0,500.0,464000.0,215.0,...,54.0,54.0,53.0,53.0,54.0,54.0,54.0,53.0,11.0,56.714844
4,24.0,CDM,187.96,77.11,Right,5.0,2300000.0,13000.0,4300000.0,295.0,...,72.0,72.0,70.0,69.0,68.0,68.0,68.0,69.0,18.0,69.441406


In [31]:
print(data_for_test)

0    67.0
1    68.0
2    54.0
3    55.0
4    70.0
Name: OVA, dtype: float64
