In [None]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model

# Read the FIFA 21 training data and preview its first rows.
file = pd.read_csv(filepath_or_buffer='fifa21_train.csv')
file.head()

In [None]:
# One column name per line, so the full list is visible without truncation.
print(*file.columns, sep="\n")


In [None]:
# Keep only the columns used for modelling; "OVA" is the target.
# .copy() makes file_filtered an independent frame, so the column assignments
# done in later cells neither raise SettingWithCopyWarning nor depend on `file`.
file_filtered = file[[
    "Age", "Weight", "Height", "BP", "Attacking", "Skill", "Movement", "Power", "Mentality",
    "Defending", "Goalkeeping",
    "LS", "ST", "RS", "LW", "LF", "CF", "RF", "RW", "LAM", "CAM", "RAM", "LM", "LCM", "CM", "RCM",
    "RM", "LWB", "LDM", "CDM", "RDM", "RWB", "LB", "LCB", "CB", "RCB", "RB", "GK",
    "OVA",
]].copy()

In [None]:
# Transposed preview of the first rows: one line per selected column.
file_filtered.head().T

# Check and Remove Duplicates

In [None]:
# Discard rows that repeat an earlier row in every column (the first occurrence
# is the one kept), then show the resulting frame.
file_filtered = file_filtered.drop_duplicates(keep="first")
display(file_filtered)

# Checking Missing Values and Datatypes

In [None]:
# Number of missing values in each column.
file_filtered.isna().sum()

In [None]:
# dtype pandas assigned to every column; `object` columns still hold text.
column_types = file_filtered.dtypes
print(column_types)

In [None]:
# Frequency of each value in "BP"; dropna=False also counts missing entries.
display(file_filtered["BP"].value_counts(dropna = False))

# Converting Height to centimetres and Weight to kilograms

In [None]:
def convert_height(height):
    """Convert a feet-and-inches string (for example 5'10") to centimetres.

    The text before the apostrophe is read as whole feet; the text after it,
    with any double-quote characters removed, is read as whole inches.

    Parameters
    ----------
    height : str
        Height text containing exactly one apostrophe.

    Returns
    -------
    float
        Height in centimetres (30.48 cm per foot, 2.54 cm per inch).

    Raises
    ------
    ValueError
        If the text does not split into exactly two parts on the apostrophe,
        or if either part is not an integer.
    """
    whole_feet, remainder = height.split("'")
    whole_inches = remainder.replace("\"", "")
    return int(whole_feet) * 30.48 + int(whole_inches) * 2.54

# Derive the height in centimetres from the raw "Height" strings.
file_filtered["Height_in_cms"] = file_filtered['Height'].apply(convert_height)

In [None]:
# The raw text column is no longer needed once Height_in_cms exists.
file_filtered = file_filtered.drop(columns=["Height"])

In [None]:
def convert_weight(weight):
    """Convert a weight string in pounds (for example "159lbs") to kilograms.

    Parameters
    ----------
    weight : str
        Weight text; the substring "lbs" is removed before the number is parsed.

    Returns
    -------
    float
        Weight in kilograms.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # One avoirdupois pound is exactly 0.45359237 kg; the truncated factor 0.45
    # under-reports every weight by about 0.8 %.
    kg_per_lb = 0.45359237
    pounds = float(weight.replace("lbs", ""))
    return pounds * kg_per_lb

In [None]:
# Derive the weight in kilograms from the raw "Weight" strings.
file_filtered["Weight_in_kg"] = file_filtered['Weight'].apply(convert_weight)

In [None]:
# The raw text column is no longer needed once Weight_in_kg exists.
file_filtered = file_filtered.drop(columns=["Weight"])

In [None]:
# Transposed preview after the unit conversions.
file_filtered.head().T

In [None]:
# Move Height_in_cms and Weight_in_kg to the beginning of the DataFrame,
# directly after the first column (positions 1 and 2).
for target_position, moved_name in ((1, 'Height_in_cms'), (2, 'Weight_in_kg')):
    # pop() removes the column and hands it back; insert() places it at the new position.
    file_filtered.insert(target_position, moved_name, file_filtered.pop(moved_name))

# Show the updated dataframe
file_filtered.head().T

In [None]:
# Position-rating columns that are converted to integers by convert_stats.
columns_to_convert = [
    "LS", "ST", "RS",
    "LW", "LF", "CF", "RF", "RW",
    "LAM", "CAM", "RAM",
    "LM", "LCM", "CM", "RCM", "RM",
    "LWB", "LDM", "CDM", "RDM", "RWB",
    "LB", "LCB", "CB", "RCB", "RB",
    "GK",
]

def convert_stats(x):
    """Return the base rating of a position value as an integer.

    Parameters
    ----------
    x : int, numpy integer or str
        Either a value that is already an integer, or text in which the base
        rating is followed by "+" and a bonus (for example "65+2"). Text
        without a "+" (for example "65") is parsed as a whole.

    Returns
    -------
    int
        Integers are returned unchanged; for text, the part before the first
        "+" converted with int().

    Raises
    ------
    ValueError
        If the text before "+" is not an integer.
    AttributeError
        If x is neither an integer nor a string (for example a float NaN).
    """
    # NumPy integer scalars are not instances of the built-in int, so they are
    # checked for explicitly; otherwise they would be treated as text.
    if isinstance(x, (int, np.integer)):
        return x
    # partition() returns the whole text when there is no "+", where
    # x.index("+") would raise ValueError.
    base_rating, _, _ = x.partition("+")
    return int(base_rating)


# Replace every position column with its integer base rating, one column at a time.
for column in columns_to_convert:
    file_filtered[column] = file_filtered[column].apply(convert_stats)



In [None]:
# Transposed preview after converting the position columns.
file_filtered.head().T

In [None]:
# Based on the link below, we decided to keep only these positions.
#link = https://gaming.stackexchange.com/questions/167318/what-do-fifa-14-position-acronyms-mean


In [None]:
#Check correlation by Heatmap
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# "BP" is a text column. Restricting the correlation to numeric columns keeps
# this cell working on pandas >= 2.0, where .corr() no longer silently drops
# non-numeric columns and raises instead.
correlation_matrix = file_filtered.corr(numeric_only=True)

plt.figure(figsize=(20, 15), dpi=80)  # large canvas so the annotations stay readable
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()


In [None]:
# The correlation matrix shows that several position columns are almost
# perfectly correlated and carry the same information (e.g. LCB, CB and RCB),
# so only a subset of them is kept here and the redundant ones are left out.
file_filtered_new= file_filtered [['Age','Height_in_cms',"Weight_in_kg",'BP', 'Attacking', 'Skill', 'Movement', 
                      'Power', 'Mentality', 'Defending','Goalkeeping', 'ST', 'LW', 
                      'CF', 'RW', 'CAM', 'LM', 'CM', 'RM', 'LWB', 'CDM', 'RWB', 'LB', 'CB', 'RB', 'GK','OVA']]
file_filtered_new

In [None]:
# Persist the cleaned training data; index=True also writes the row index as the first CSV column.
file_filtered_new.to_csv('fifa21_train_clean.csv', index=True)

In [None]:
# Correlation of the reduced feature set (numeric columns only).
corr_mat = file_filtered_new.corr(numeric_only = True)

# Draw the matrix: previously a figure was opened but nothing was plotted on it,
# and corr_mat was never used.
plt.figure(figsize=(15, 10), dpi=80)
sns.heatmap(corr_mat, annot=True, cmap='coolwarm')
plt.show()

# Checking categorical and numerical data

In [None]:
# Split the columns by dtype. The built-in `object` is used for the text
# columns: the alias `np.object` was removed from NumPy and raises AttributeError.
numerical = file_filtered_new.select_dtypes(include=np.number)
categoricals = file_filtered_new.select_dtypes(include=object)

In [None]:
# Numeric columns of the reduced frame.
display(numerical)

In [None]:
# Text (object-dtype) columns of the reduced frame.
display(categoricals)

In [None]:
# Insight: the features are highly correlated with each other and only weakly
# correlated with the target, so some of the position columns are removed.
columns_to_drop = ["LM", "LW", "LWB","LB", "GK"]
file_filtered_new = file_filtered_new.drop(columns=columns_to_drop)

# Show the frame without the dropped columns
display(file_filtered_new)

In [None]:
# Transposed preview of the reduced frame.
file_filtered_new.head().T

# Creating feature and target columns

In [None]:
# Target: the overall rating.
y = file_filtered_new["OVA"]

# Features: every other column, split by dtype so numeric and text columns can
# be preprocessed separately. drop() returns a new frame, so file_filtered_new
# itself is left untouched.
X = file_filtered_new.drop(columns=["OVA"])
X_num = X.select_dtypes(include=[np.number])
X_cat = X.select_dtypes(include=[object])

In [None]:
# First rows of the full feature frame, its numeric part, its text part and the target.
display(X.head(), X_num.head(), X_cat.head(), y.head())

In [None]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# numerical_normalized = scaler.fit_transform(numerical)
# numerical_normalized = pd.DataFrame(numerical_normalized, columns=numerical.columns)
# numerical_normalized.head()

# Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder

# Normalizing X's numerical columns

def minmaxscaler(X_num, scaler=None):
    """Scale numeric feature columns and return them as a DataFrame.

    Parameters
    ----------
    X_num : pandas.DataFrame
        Numeric feature columns.
    scaler : object, optional
        An already fitted transformer exposing ``transform``. When omitted, a
        new ``MinMaxScaler`` is fitted on ``X_num`` itself. Pass the scaler
        that was fitted on the training features when scaling validation or
        test data, so that both are scaled with the same minimum and maximum.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the column labels of ``X_num`` and a default
        0..n-1 row index (the index of ``X_num`` is not carried over).
    """
    if scaler is None:
        scaler = MinMaxScaler().fit(X_num)
    scaled_values = scaler.transform(X_num)
    return pd.DataFrame(scaled_values, columns = X_num.columns)

In [None]:
# Scale the numeric features with minmaxscaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so the test rows influence the scaling — confirm this is intended.

X_num_scaled = minmaxscaler(X_num)

print(X_num_scaled.shape)
X_num_scaled.head().T

# Encoding categorical columns

In [None]:
# Function to encode categorical columns.

def onehotencoder(X_cat, encoder=None):
    """One-hot encode categorical columns and return them as a DataFrame.

    Parameters
    ----------
    X_cat : pandas.DataFrame
        Categorical feature columns.
    encoder : object, optional
        An already fitted encoder exposing ``get_feature_names_out`` and
        ``transform``. When omitted, a new ``OneHotEncoder(drop='first')`` is
        fitted on ``X_cat`` itself. Pass the encoder that was fitted on the
        training data when encoding validation or test data, so that the
        dummy columns are the same ones and in the same order.

    Returns
    -------
    pandas.DataFrame
        Dense dummy columns named by the encoder, with a default 0..n-1 row
        index (the index of ``X_cat`` is not carried over).
    """
    if encoder is None:
        encoder = OneHotEncoder(drop='first').fit(X_cat)
    cols = encoder.get_feature_names_out(input_features=X_cat.columns)
    # transform() yields a sparse matrix; toarray() makes it dense for the DataFrame.
    return pd.DataFrame(encoder.transform(X_cat).toarray(), columns=cols)


In [None]:
# One-hot encode the categorical training features.
X_cat_encode = onehotencoder(X_cat)

print(X_cat_encode.shape)
# Only the last expression of a cell is rendered, so the preview is wrapped in
# display(); as a bare mid-cell expression its value was discarded.
display(X_cat_encode.head().T)
X_cat_encode.columns

# Concatenating X_num and X_cat dataframes into X

In [None]:
# Rebuild X from the preprocessed parts: dummy columns first, scaled numeric columns after.
X = pd.concat([X_cat_encode, X_num_scaled], axis = 1)
display(X.shape)
display(X.head().T)

display(X.columns)
display(X_num_scaled)

# Splitting data into train and test dataset.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Shapes of the four parts, in the order X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)


# Linear Regression

In [None]:
from sklearn import linear_model
lm = linear_model.LinearRegression()

# Fit the model on the training split so it learns the relationship between
# the features (X_train) and the target (y_train).

lm.fit(X_train,y_train)

# Model Predictions and Validations - r2 score

In [None]:
from sklearn.metrics import r2_score


# Predict on the rows the model was fitted on (X_train).
predictions = lm.predict(X_train)

# R² on the training split: how well the fitted model reproduces y_train.
display(r2_score(y_train, predictions))

In [None]:
# Predict on the held-out test split.
predictions_test = lm.predict(X_test)

# R² on the test split, followed by a side-by-side look at the first
# predictions and the first true values.
display(r2_score(y_test, predictions_test))
display(predictions_test.shape)
display(predictions_test[:5])
display(y_test[:5])

# Mean Squared Error

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test split.
mse = mean_squared_error(y_test,predictions_test)
mse

# Root mean square error

In [None]:
# Root of the test MSE, expressed in the same units as OVA.
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
rmse

# Mean Absolute Error

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the test split.
mae = mean_absolute_error(y_test, predictions_test)
mae


In [None]:
# Mean OVA of the test split next to the mean OVA of the whole cleaned frame,
# as a reference scale for the error metrics.
display(y_test.mean())
display(file_filtered_new["OVA"].mean())

# Results after looking at the r2 score

In [None]:
# Read the validation data, then report its size and preview the first rows.
file_to_predict = pd.read_csv(filepath_or_buffer="fifa21_validate.csv")

display(file_to_predict.shape)
file_to_predict.head().T

# Dropping duplicates

In [None]:
# dtype of every column in the validation data.
file_to_predict.dtypes

In [None]:
# Discard rows that repeat an earlier row in every column (first occurrence kept).
file_to_predict = file_to_predict.drop_duplicates(keep="first")

file_to_predict.shape

# Extracting necessary columns from main dataset

In [None]:
# Raw counterparts of the columns the model was trained on, plus the target "OVA".
columns_to_keep = ['Age', 'Height', 'Weight', 'BP', 'Attacking', 'Skill',
                   'Movement', 'Power', 'Mentality', 'Defending', 'Goalkeeping', 'ST',
                   'CF', 'RW', 'CAM', 'CM', 'RM', 'CDM', 'RWB', 'CB', 'RB', 'OVA']

# .copy() gives an independent frame: new columns are assigned to it in the
# following cells, which on a plain slice raises SettingWithCopyWarning.
file_to_predict_new = file_to_predict[columns_to_keep].copy()
display(file_to_predict_new.shape)
file_to_predict_new.head().T

# Converting height into cms and weight into kgs

In [None]:
# Apply the same unit conversions that were used for the training data.
# NOTE(review): the training frame names the weight column "Weight_in_kg"
# (no trailing "s"); confirm whether the different name here is intended.
file_to_predict_new["Height_in_cms"] = file_to_predict_new["Height"].apply(convert_height)
file_to_predict_new["Weight_in_kgs"] = file_to_predict_new["Weight"].apply(convert_weight)

In [None]:
# The raw text columns are no longer needed once the converted ones exist.
file_to_predict_new = file_to_predict_new.drop(columns=["Height", "Weight"])

In [None]:
# Transposed preview after the unit conversions.
file_to_predict_new.head().T

# Converting columns into int datatype

In [None]:
# Position-rating columns of the validation frame that are converted with convert_stats.
columns = ['ST', 'CF', 'RW', 'CAM', 'CM', 'RM', 'CDM', 'RWB', 'CB', 'RB']

In [None]:
# Replace every listed position column with its integer base rating.
for column in columns:
    file_to_predict_new[column] = file_to_predict_new[column].apply(convert_stats)
    
# Count the missing values per column after the conversion.
file_to_predict_new.isna().sum()

In [None]:
# Move the converted height and weight columns to positions 2 and 3.
for target_position, moved_name in ((2, 'Height_in_cms'), (3, 'Weight_in_kgs')):
    # pop() removes the column and hands it back; insert() places it at the new position.
    file_to_predict_new.insert(target_position, moved_name, file_to_predict_new.pop(moved_name))

# Creating features and target columns

In [None]:
# Features of the validation data. The weight column is renamed to the name it
# has in the training features ("Weight_in_kg"): scikit-learn estimators fitted
# on a DataFrame check that prediction data carries the same feature names.
X_predict = (
    file_to_predict_new
    .drop(["OVA"], axis = 1)
    .rename(columns={"Weight_in_kgs": "Weight_in_kg"})
)
X_num_predict = X_predict.select_dtypes(include = np.number)
X_cat_predict = X_predict.select_dtypes(include = object)

# Validation target. Stored under its own name so the training target `y` is
# not overwritten (re-running the train/test split cell would otherwise pair
# the training features with the validation target).
y_predict = file_to_predict_new["OVA"]

# Normalization

In [None]:
# Encode the validation categoricals with an encoder fitted on the TRAINING
# categoricals (X_cat). Fitting a new encoder on the validation data can yield
# different dummy columns (or drop a different first category) than the ones
# the model was trained on. A category unseen in training raises an error here
# instead of silently shifting columns.
train_encoder = OneHotEncoder(drop='first').fit(X_cat)
X_cat_encoded_predict = pd.DataFrame(
    train_encoder.transform(X_cat_predict).toarray(),
    columns=train_encoder.get_feature_names_out(input_features=X_cat.columns),
)
display(X_cat_encoded_predict.shape)
X_cat_encoded_predict.head().T

In [None]:
# Scale the validation features with the minimum and maximum of the TRAINING
# features (X_num). Re-fitting the scaler on the validation data would map the
# same raw value to a different scaled value than the model saw during training.
train_scaler = MinMaxScaler().fit(X_num)

# Use the training column names and order; the validation frame may still call
# the weight column "Weight_in_kgs".
X_num_predict_aligned = X_num_predict.rename(columns={"Weight_in_kgs": "Weight_in_kg"})[X_num.columns]

X_num_scaled_predict = pd.DataFrame(
    train_scaler.transform(X_num_predict_aligned),
    columns=X_num.columns,
)
display(X_num_scaled_predict.shape)
X_num_scaled_predict.head().T

# Concatenating transformed data into X_predict

In [None]:
# Same layout as the training features: dummy columns first, scaled numeric columns after.
X_predict = pd.concat([X_cat_encoded_predict, X_num_scaled_predict], axis = 1)
display(X_predict.shape)
display(X_predict.head().T)

In [None]:
# Predict OVA for the validation rows with the fitted linear model.
results = lm.predict(X_predict)

In [None]:
# Predicted values for the validation rows.
results

In [None]:
# Number of predictions produced.
len(results)

In [None]:
# Attach the predictions to the validation rows. The Series is given the
# frame's own index: file_to_predict_new keeps the row labels left after
# drop_duplicates, whereas a bare pd.Series(results) would be labelled
# 0..n-1, so concat would pair predictions with the wrong rows and add NaN rows.
predicted_df = pd.concat(
    [
        file_to_predict_new,
        pd.Series(results, index=file_to_predict_new.index, name="predicted_OVA"),
    ],
    axis=1,
)

In [None]:
# Transposed preview of the validation rows with their predicted OVA.
predicted_df.head().T

# Metrics

In [None]:
# R² of the predictions against the true OVA of the validation data.
r2_predict = r2_score(file_to_predict_new["OVA"], results)
r2_predict

In [None]:
# Mean absolute error on the validation data (reuses the name `mae` from the test-split metrics).
mae = mean_absolute_error(file_to_predict_new["OVA"], results)
mae

In [None]:
# Mean squared error on the validation data (reuses the name `mse` from the test-split metrics).
mse = mean_squared_error(file_to_predict_new["OVA"], results)
mse

In [None]:
# Root of the validation MSE, expressed in the same units as OVA.
rmse = np.sqrt(mse)
rmse