This notebook is dedicated to building and evaluating machine learning models for predicting the market value of football players. It leverages various regression algorithms to analyze player performance data and estimate their market value based on a number of features.

#### Import Libraries and connect to database

In [6]:
import pandas as pd
import numpy as np
import time 
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
# Silence every Python warning for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')
# Show floats with two fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)
# Never truncate displayed frames: show all rows and all columns
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [7]:
# Database connection details are read from environment variables
# (load_dotenv() also picks them up from a local .env file when present).
from urllib.parse import quote_plus

load_dotenv()

db_config = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    # Fall back to PostgreSQL's default port instead of crashing on int(None)
    'port': int(os.getenv('DB_PORT') or 5432),
}

# Fail fast with a readable message instead of building a URL that contains
# the literal text "None" and only failing later at connection time.
missing_settings = [key for key, value in db_config.items() if value is None]
if missing_settings:
    raise RuntimeError(
        f"Missing database settings: {missing_settings}. "
        "Set DB_NAME, DB_USER, DB_PASSWORD and DB_HOST (DB_PORT is optional)."
    )

# Create a connection string for SQLAlchemy.
# User and password are URL-escaped so characters such as '@', ':' or '/'
# cannot corrupt the URL structure.
connection_string = (
    "postgresql+psycopg2://"
    f"{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['dbname']}"
)
engine = create_engine(connection_string)

In [8]:
# Read the complete `prediction` table into a DataFrame through the engine
df = pd.read_sql(sql='SELECT * FROM prediction', con=engine)

In [6]:
# Columns excluded from the modelling frame
columns_to_drop = [
    'mp', 'starts', 'min', 'nineties', 'squad', 'birth_date_dt', 'matches',
    'rec', 'mis', 'born_date', 'comp', 'live', 'a_minus_xag', 'date_of_birth',
    'g_plus_a', 'g_minus_pk', 'pkatt', 'crdy', 'crdr', 'g_plus_a_minus_pk',
    'xg_plus_xag', 'fk', 'npxg_plus_xag', 'tkld', 'tklw', 'birth_date',
    'npxg_per_sh', 'np:g_minus_xg', 'contract_expiry', 'tkl', 'def_3rd',
    'mid_3rd', 'att_3rd', 'att',
]
# errors='ignore' keeps this cell idempotent: because `df` is reassigned,
# re-running the cell would otherwise raise KeyError on the already-removed
# columns. (`columns=` already implies the column axis, so no `axis` argument.)
df = df.drop(columns=columns_to_drop, errors='ignore')
df.columns

Index(['unique_id', 'gls', 'ast', 'pk', 'xg', 'npxg', 'xag', 'prgc', 'prgp',
       'prgr', 'player_x', 'tkl_pcnt', 'lost', 'blocks', 'pass', 'int',
       'tkl_plus_int', 'clr', 'err', 'sh', 'sot', 'sot_pcnt', 'sh_per_90',
       'sot_per_90', 'g_per_sh', 'g_per_sot', 'dist', 'g_minus_xg', 'cmp',
       'cmp_pcnt', 'totdist', 'prgdist', 'xa', 'kp', 'pass_into_final_third',
       'ppa', 'crspa', 'touches', 'def_pen', 'att_pen', 'succ', 'succ_pcnt',
       'tkld_pcnt', 'carries', 'carries_into_final_third', 'cpa', 'dis',
       'player_id', 'player_y', 'main_position', 'value', 'height',
       'current_club', 'league', 'days_until_expiry', 'age'],
      dtype='object')

##### Identifying important columns

 columns_to_drop = ['mp', 'starts', 'min', 'nineties','squad','birth_date_dt','matches','rec','mis','born_date','comp','live','a_minus_xag','date_of_birth', 'g_plus_a', 'g_minus_pk', 'pkatt', 'crdy', 'crdr', 'g_plus_a_minus_pk', 'xg_plus_xag', 'fk', 'npxg_plus_xag','tkld','tklw','birth_date', 'npxg_per_sh','np:g_minus_xg','contract_expiry','tkl', 'def_3rd', 'mid_3rd', 'att_3rd', 'att']


##### Data Pre-processing

In order to prepare the combined dataset for modeling, categorical features need to be encoded and numerical features need to be scaled.
One-hot encoding is applied to the categorical variables because they are nominal: their values have no inherent order.

A ColumnTransformer is used to preprocess the data. This involves:

One-hot encoding the categorical columns.

Standard scaling the numerical columns.

In [84]:
# Names of all columns whose dtype is neither int64 nor float64
non_numeric_names = df.select_dtypes(exclude=['int64', 'float64']).columns
# Sub-frame restricted to those columns; its column index is the cell output
str_colums = df[non_numeric_names]
str_colums.columns


Index(['unique_id', 'player_x', 'player_id', 'player_y', 'main_position',
       'current_club', 'league'],
      dtype='object')

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Identify columns to be encoded
categoric_columns = ['league', 'main_position', 'current_club']

# Separate features and target; identifier and player-name columns are left
# out of the feature matrix.
X = df.drop(columns=['value', 'unique_id', 'player_x', 'player_id', 'player_y'])
y = df['value']

numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Split BEFORE fitting any transformer: fitting the scaler/encoder on the full
# dataset would leak test-set statistics and categories into training and
# make the evaluation below optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Preprocess the data
# OneHotEncode the object columns and StandardScale the numerical columns.
# handle_unknown='ignore' is required now that the encoder only sees the
# training rows: a category that appears only in the test rows is encoded
# as all zeros instead of raising an error.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categoric_columns)
    ])

# Fit on the training rows only, then apply the fitted transform elsewhere
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full transformed matrix, kept so any later cell that expects `X_processed`
# keeps working (transformed with the train-fitted preprocessor).
X_processed = preprocessor.transform(X)

# Initialize models; the tree-based estimators are seeded so that the scores
# are reproducible between runs.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=0),
    'Decision Tree': DecisionTreeRegressor(random_state=0),
    'XGBoost': XGBRegressor()
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the held-out rows
    y_pred = model.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store the results
    results[model_name] = {'MAE': mae, 'R²': r2}

# Print the results
for model_name, metrics in results.items():
    print(f"{model_name} - MAE: {metrics['MAE']:.2f}, R²: {metrics['R²']:.2f}")

Linear Regression - MAE: 7685526.66, R²: 0.54
Random Forest - MAE: 6849079.18, R²: 0.54
Decision Tree - MAE: 8758067.73, R²: 0.24
XGBoost - MAE: 6522798.63, R²: 0.58


In [11]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 4, 5, 6, 7],
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
)

# Base estimator plus a 50-draw, 3-fold randomized search over the grid above
xgb = XGBRegressor()
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Report the winning parameter combination
best_params = random_search.best_params_
print("Best parameters found: ", best_params)

# Fit a fresh model with those parameters on the training split
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

# Score the held-out split
y_pred = final_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Final Model - MAE: {mae:.2f}, R²: {r2:.2f}')

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=5, n_estimators=500, subsample=0.8; total time=   3.5s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=5, n_estimators=500, subsample=0.8; total time=   3.5s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=5, n_estimators=500, subsample=0.8; total time=   3.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=300, subsample=1.0; total time=   5.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=300, subsample=1.0; total time=   2.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=300, subsample=1.0; total time=   2.6s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=300, subsample=1.0; total time=   5.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=300, subsample=1.0; total time=   5.2s
[CV] END cols

##### Save the final model

In [12]:
# Persist the tuned XGBoost model to disk
import joblib

joblib.dump(value=final_model, filename='xgboost_model.pkl')

['xgboost_model.pkl']

In [13]:
# Persist the fitted preprocessor next to the model
joblib.dump(value=preprocessor, filename='preprocessor.pkl')

['preprocessor.pkl']

In [8]:
# Read the previously saved XGBoost model back from disk
import joblib
loaded_model = joblib.load(filename='xgboost_model.pkl')

In [14]:
# Look up Harry Kane's row(s) via the name held in the `player_y` column
kane_row = df.loc[df['player_y'] == 'Harry Kane']

# Remove the target column, then apply the already-fitted preprocessor
kane_features = kane_row.drop(columns='value')
kane_processed = preprocessor.transform(kane_features)

# Predict with the model reloaded from disk and report the first prediction
kane_value = loaded_model.predict(kane_processed)

print(f'Predicted market value for Harry Kane: {kane_value[0]}')

Predicted market value for Harry Kane: 85441744.0


In [15]:
# Select Mohamed Salah's row(s) by the name stored in `player_x`
is_salah = df['player_x'] == 'Mohamed Salah'
salah_row = df[is_salah]

# Strip the target, run the fitted preprocessor, then score with final_model
salah_features = salah_row.drop(columns=['value'])
salah_processed = preprocessor.transform(salah_features)
salah_value = final_model.predict(salah_processed)

print(f'Predicted market value for Mohamed Salah: {salah_value[0]}')

Predicted market value for Mohamed Salah: 117132624.0


In [16]:
# Row(s) whose `player_x` name is Bruno Fernandes
bruno_row = df[df['player_x'].eq('Bruno Fernandes')]

# Feature frame = the selected row(s) without the target column
bruno_features = bruno_row.drop(labels='value', axis=1)

# Transform with the fitted preprocessor and predict with the tuned model
bruno_processed = preprocessor.transform(bruno_features)
bruno_value = final_model.predict(bruno_processed)

print(f'Predicted market value for Bruno Fernandes: {bruno_value[0]}')

Predicted market value for Bruno Fernandes: 54369352.0


In [19]:
# Sub-frame of the columns that are neither int64 nor float64; the cell
# displays their names
str_colums = df.loc[:, df.select_dtypes(exclude=['int64', 'float64']).columns]
str_colums.columns

Index(['unique_id', 'player_x', 'player_id', 'player_y', 'main_position',
       'current_club', 'league'],
      dtype='object')

In [9]:
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

# Reload the full prediction table from the database
df = pd.read_sql('SELECT * FROM prediction', engine)

# Columns kept for modelling: the features plus the `value` target
columns = [
    'gls', 'ast', 'pk', 'xg', 'npxg', 'xag', 'prgc', 'prgp', 'prgr',
    'tkl_pcnt', 'lost', 'blocks', 'pass', 'int', 'tkl_plus_int', 'clr',
    'err', 'sh', 'sot', 'sot_pcnt', 'sh_per_90', 'sot_per_90', 'g_per_sh',
    'g_per_sot', 'dist', 'g_minus_xg', 'cmp', 'cmp_pcnt', 'totdist',
    'prgdist', 'xa', 'kp', 'pass_into_final_third', 'ppa', 'crspa',
    'touches', 'def_pen', 'att_pen', 'succ', 'succ_pcnt', 'tkld_pcnt',
    'carries', 'carries_into_final_third', 'cpa', 'dis',
    'main_position', 'value', 'height',
    'current_club', 'league', 'days_until_expiry', 'age',
]

# Restrict to those columns and drop every row that has a missing value
# (rows are removed here, not imputed)
df = df[columns].dropna()

# Target vector and feature frame
y = df['value']
X = df.drop(columns='value')

# Categorical columns
categorical_cols = ['main_position', 'current_club', 'league']

# Numeric columns: every selected column that is neither categorical nor the
# target, in the same order as in `columns`
numeric_cols = [
    col for col in columns
    if col != 'value' and col not in categorical_cols
]

# Numeric branch: median imputation followed by standard scaling
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: constant 'missing' fill, then one-hot encoding that
# ignores categories unseen during fitting
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Fit the preprocessor on all rows of X and transform them
X_preprocessed = preprocessor.fit_transform(X)

# Persist the fitted preprocessor (the whole ColumnTransformer, not only the
# scaler, despite the file name)
scaler_filename = "scaler.pkl"
joblib.dump(preprocessor, scaler_filename)

['scaler.pkl']

In [10]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Relies on `X_preprocessed` and `y` produced by the preprocessing cell

# Split the data into train and test sets (75 % / 25 %, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.25, random_state=0)

# Initialize models. The tree-based estimators are seeded: left unseeded,
# their scores change from run to run on the very same split, which makes
# the comparison below unreliable.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=0),
    'Random Forest': RandomForestRegressor(random_state=0),
    'Xgboost': XGBRegressor()
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE is derived from MSE so the error is in the same unit as the target
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}")

Linear Regression RMSE: 11213021.23, MAE: 7685480.89, R²: 0.54
Decision Tree RMSE: 15806785.64, MAE: 9062350.60, R²: 0.09
Random Forest RMSE: 11175390.60, MAE: 6812892.43, R²: 0.55
Xgboost RMSE: 10749617.88, MAE: 6522798.63, R²: 0.58


#### Hyperparameter Tuning

In [27]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized hyperparameter search
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
}

# Default XGBoost regressor used as the search's base estimator
xgb = XGBRegressor()

# 50 sampled candidates, each evaluated with 3-fold cross-validation
search_settings = dict(n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    **search_settings,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Best parameter combination found by the search
best_params = random_search.best_params_
print("Best parameters found: ", best_params)

# Fit the final model with those parameters
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

# Predict the held-out split and compute the metrics
y_pred = final_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Final Model - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}')

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=5, n_estimators=500, subsample=0.8; total time=   2.7s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=5, n_estimators=500, subsample=0.8; total time=   2.7s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=5, n_estimators=500, subsample=0.8; total time=   2.8s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=300, subsample=1.0; total time=   3.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=300, subsample=1.0; total time=   1.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=300, subsample=1.0; total time=   1.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=7, n_estimators=300, subsample=1.0; total time=   3.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=300, subsample=1.0; total time=   1.9s
[CV] END cols

In [28]:
# Write the tuned model to disk
import joblib

joblib.dump(value=final_model, filename='best_xgboost.pkl')

['best_xgboost.pkl']

In [11]:
# Restore the tuned model that was written to 'best_xgboost.pkl'
import joblib
loaded_model = joblib.load(filename='best_xgboost.pkl')