In [58]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

%matplotlib inline

# Preprocess the Data

## 1. Simple EDA + Data Quality checking

In [59]:
# Load the player dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("./Data/final_data.csv")

In [60]:
# Sanity-check how many rows and columns were loaded.
df.shape

(7136, 17)

In [61]:
# Preview the first rows to see what each column contains.
df.head()

Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value
0,Manchester United,Defender Centre-Back,175.0,25,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,0.0,6408,175,22,9,50000000
1,Manchester United,Defender Centre-Back,194.0,30,68,0.037799,0.0,0.302394,0.0189,0.0,0.0,0.0,4762,148,27,1,25000000
2,Manchester United,Defender Centre-Back,187.0,28,70,0.0,0.032901,0.115153,0.0,0.0,0.0,0.0,5471,95,19,10,15000000
3,Manchester United,Defender Centre-Back,186.0,21,34,0.0,0.0,0.130529,0.0,0.0,0.0,0.0,2758,47,13,0,2000000
4,Manchester United,Defender Left-Back,169.0,23,89,0.012619,0.063096,0.227145,0.0,0.0,0.0,0.0,7132,182,15,3,22000000


In [62]:
# Inspect column dtypes to tell the categorical (object) columns from the numeric ones.
df.dtypes

team                    object
position                object
height                 float64
age                      int64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
dtype: object

In [63]:
# Accuracy - outliers
#
# Column groups used by the rest of the notebook. Nothing is transformed in
# this cell; the categorical columns are one-hot encoded in a later cell.
categorical_features = [
    'team',
    'position',
]

# NOTE: this list also contains `current_value`, which is used as the
# regression target further down.
numeric_features = [
    'height',
    'age',
    'appearance',
    'goals',
    'assists',
    'yellow cards',
    'second yellow cards',
    'red cards',
    'goals conceded',
    'clean sheets',
    'minutes played',
    'days_injured',
    'games_injured',
    'award',
    'current_value',
]


In [64]:
# Show how many rows fall into each level of every categorical column.
for column_name in categorical_features:
    level_counts = df[column_name].value_counts()
    print(level_counts)

team
Royal AM FC             36
Daegu FC                34
Al-Batin FC             34
Swallows FC             33
Maritzburg United FC    33
                        ..
FC Barcelona             5
Gaziantep FK             4
Manchester City          4
Bayern Munich            2
Real Madrid              1
Name: count, Length: 374, dtype: int64
position
Defender Centre-Back          1387
midfield-CentralMidfield       893
Attack Centre-Forward          774
Defender Right-Back            685
midfield-DefensiveMidfield     680
Defender Left-Back             643
midfield-AttackingMidfield     563
Attack-RightWinger             529
Attack-LeftWinger              490
Goalkeeper                     188
midfield-RightMidfield          79
midfield-LeftMidfield           77
midfield                        51
Attack-SecondStriker            41
Attack                          32
Defender                        24
Name: count, dtype: int64


In [65]:
# Count how many rows each team has in the dataset.
team_counts = df['team'].value_counts()

# Keep only the rows whose team occurs at least twice.
teams_to_keep = team_counts.loc[team_counts.ge(2)].index
df = df.loc[df['team'].isin(teams_to_keep)]

In [66]:
# Check how many rows survived the team-frequency filter.
df.shape

(7135, 17)

## 2. Feature engineering

1. Feature scaling
2. Aggregation
3. One-hot encoding

In [67]:
# Columns that are passed to pd.get_dummies for one-hot encoding.
categorical_features

['team', 'position']

In [68]:
# Replace each categorical column with one indicator column per level
# (columns are named "<column>_<level>", e.g. "position_Goalkeeper").
df = pd.get_dummies(df, columns=categorical_features)

In [69]:
# Confirm how many columns the frame has after one-hot encoding.
df.shape

(7135, 404)

In [70]:
# Preview two rows of the encoded frame to verify the new indicator columns.
df.head(2)

Unnamed: 0,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,...,position_Defender Centre-Back,position_Defender Left-Back,position_Defender Right-Back,position_Goalkeeper,position_midfield,position_midfield-AttackingMidfield,position_midfield-CentralMidfield,position_midfield-DefensiveMidfield,position_midfield-LeftMidfield,position_midfield-RightMidfield
0,175.0,25,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
1,194.0,30,68,0.037799,0.0,0.302394,0.0189,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False


## 3. Feature selection

In [71]:
# Pairwise correlations between every column of the encoded frame.
correlation = df.corr()

# Rank all columns by their correlation with the target, highest first.
target_correlation = correlation['current_value']
print(target_correlation.sort_values(ascending=False))

current_value          1.000000
minutes played         0.452337
appearance             0.447049
award                  0.226473
games_injured          0.201774
                         ...   
position_Attack       -0.032377
team_Royal AM FC      -0.033083
position_midfield     -0.040892
age                   -0.065643
position_Goalkeeper   -0.074296
Name: current_value, Length: 404, dtype: float64


In [72]:
# Minimum absolute correlation with the target a column needs to be kept.
threshold = 0.21  # adjust to make the selection stricter or looser

# Compare on absolute values so strong negative correlations qualify as well
# as strong positive ones.
is_strongly_correlated = correlation['current_value'].abs() > threshold
selected_features = correlation.loc[is_strongly_correlated, 'current_value'].index
selected_features

Index(['appearance', 'minutes played', 'award', 'current_value'], dtype='object')

In [73]:
# Use the columns chosen by the correlation filter (as a plain list) instead of
# a hand-typed copy, so that changing `threshold` really changes which columns
# reach the model. With threshold = 0.21 this evaluates to
# ['appearance', 'minutes played', 'award', 'current_value'].
selected_features = list(selected_features)

# The target column has to stay in the frame because it is split off as `y`
# when the train/test data is prepared.
if 'current_value' not in selected_features:
    selected_features.append('current_value')

In [74]:
# Restrict the frame to the selected columns (predictors plus target) and preview it.
df = df.loc[:, selected_features]
df.head()

Unnamed: 0,appearance,minutes played,award,current_value
0,82,6408,9,50000000
1,68,4762,1,25000000
2,70,5471,10,15000000
3,34,2758,0,2000000
4,89,7132,3,22000000


## 4. Prepare train and test data

In [75]:
# Separate the target from the predictor columns.
y = df['current_value']
X = df.drop(columns=['current_value'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Scale the data: the scaler is fitted on the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [76]:
# Dimensions of the predictor matrix (rows, number of features).
X.shape


(7135, 3)

# Building the Model

In [77]:
# Linear regression estimator created with scikit-learn's default settings.
model = LinearRegression()

# Train the Model

In [78]:
# Fit the model on the standardized training features and the training targets
model.fit(X_train_scaled, y_train)

# Test the Model

In [79]:
# Predict `current_value` for the held-out test rows
y_pred = model.predict(X_test_scaled)

# Evaluate the predictions against the true test targets.
# MAE and RMSE are expressed in the units of `current_value`; R^2 is unitless.
test_mae = mean_absolute_error(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

print(f"MAE : {test_mae:,.2f}")
print(f"RMSE: {test_rmse:,.2f}")
print(f"R^2 : {test_r2:.4f}")

# Interpretation of the Model


In [80]:
# Fitted coefficients, one per column of X. The model was trained on
# standardized features, so each value is the change in the prediction per
# one (training-set) standard deviation of that feature.
model.coef_

array([ 707308.93786033, 1802004.5983971 ,  894976.10054515])

In [81]:
# Label each fitted coefficient with the feature it belongs to.
coeff_df = pd.DataFrame(
    data=model.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

Unnamed: 0,Coefficient
appearance,707308.9
minutes played,1802005.0
award,894976.1


In [84]:
# Intercept of the fitted model: the prediction when every standardized
# feature is 0, i.e. when each feature equals its training-set mean.
print(model.intercept_) 

2879939.558514366
