### LGBM Regressor

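We train a separate LGBM regressor for each position (GK, DEF, MID, FWD) to predict the number of points each player will score. Only appearances of at least 60 minutes are used for tuning and training. We use `optuna` to find the best values for the model hyperparameters using Bayesian optimisation.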

In [1]:
import pandas as pd

import os

# Location of the historical dataset. The original machine-specific path is
# kept as the default so existing runs behave the same; set the
# FPL_DATASET_PATH environment variable to run the notebook elsewhere.
DATASET_PATH = os.environ.get(
    "FPL_DATASET_PATH",
    r"C:\Users\harve\OneDrive\Documents\Python\VScodeprojects\FPLbot\previous_seasons_dataset.csv",
)

df = pd.read_csv(DATASET_PATH)

# add 'over_60_minutes' column: 1 when the player played at least 60 minutes
# in that row's match, otherwise 0. Every model below filters on this flag.
df['over_60_minutes'] = (df['minutes'] >= 60).astype(int)

In [2]:
import optuna
from lightgbm import LGBMRegressor
import logging
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Suppress every Python warning and restrict Optuna's logger to WARNING and
# above so the tuning cell's output stays readable.
# NOTE(review): the blanket "ignore" filter hides all warning categories,
# including ones that could point at real problems — consider narrowing it.
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Columns used as model inputs during hyperparameter tuning.
FEATURE_COLUMNS = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points', 'mins_last_game',
    'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5', 'mean_mins_last_5', 'mean_points_last_10',
    'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
    'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded',
    'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
    'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
    'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season',
]


def objective(trial, position, data_frame=None):
    """Optuna objective: hold-out RMSE of an LGBMRegressor for one position.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    position : value matched against the 'position' column (e.g. 'GK').
    data_frame : optional DataFrame to tune on. When omitted, the
        module-level ``df`` is used, which keeps the original two-argument
        call ``objective(trial, position)`` working.

    Returns
    -------
    RMSE of the fitted model on a 20% hold-out taken with a fixed split
    (``random_state=42``), so every trial is scored on the same rows.
    """
    source = df if data_frame is None else data_frame

    # Only rows for this position where the player reached 60 minutes.
    data = source[(source['position'] == position) & (source['over_60_minutes'] == 1)]

    x = data[FEATURE_COLUMNS]
    y = data['points']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Search space. The dict literal is evaluated top to bottom, so the
    # parameters are suggested in the same order as before.
    # NOTE(review): LightGBM documents that bagging (`subsample`) is only
    # applied when `subsample_freq` is non-zero, and it is left at its default
    # here — confirm whether tuning `subsample` currently has any effect.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 700),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
    }

    model = LGBMRegressor(**params, random_state=42, verbose=-1)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return rmse


def run_optimization(df, positions, n_trials=500, seed=42):
    """Run one Optuna study per position and collect the best parameters.

    Parameters
    ----------
    df : DataFrame handed to ``objective`` for every trial (previously this
        argument was accepted but ignored in favour of the global ``df``).
    positions : iterable of position values to tune, one study each.
    n_trials : number of trials per study (default 500, the former
        hard-coded value).
    seed : seed for the TPE sampler so a re-run reproduces the same search;
        pass ``None`` for an unseeded sampler.

    Returns
    -------
    dict mapping each position to its study's ``best_params`` dict.
    """
    best_params = {}
    for position in positions:
        # A fresh, seeded sampler per study keeps each position's search
        # independent of the order in which positions are tuned.
        sampler = optuna.samplers.TPESampler(seed=seed)
        study_lgbm = optuna.create_study(direction='minimize', sampler=sampler)
        study_lgbm.optimize(
            lambda trial, position=position: objective(trial, position, df),
            n_trials=n_trials,
        )
        best_params[position] = study_lgbm.best_params
        print(f"Best LGBM params for {position}: {study_lgbm.best_params}, RMSE: {study_lgbm.best_value}")

    return best_params

# Tune one study per position. The resulting dict is keyed by position and is
# read by the per-position training cells below (e.g. best_hyperparameters['GK']).
positions = ['GK', 'DEF', 'MID', 'FWD']
best_hyperparameters = run_optimization(df, positions)

  from .autonotebook import tqdm as notebook_tqdm


Best LGBM params for GK: {'n_estimators': 51, 'learning_rate': 0.012291234169711234, 'num_leaves': 123, 'max_depth': 3, 'min_child_samples': 82, 'subsample': 0.5766104770230416, 'colsample_bytree': 0.31966982235901564}, RMSE: 2.8004195535984526
Best LGBM params for DEF: {'n_estimators': 533, 'learning_rate': 0.014113481150412267, 'num_leaves': 191, 'max_depth': 26, 'min_child_samples': 14, 'subsample': 0.7375579848649274, 'colsample_bytree': 0.4835634875278364}, RMSE: 2.254656868443535
Best LGBM params for MID: {'n_estimators': 166, 'learning_rate': 0.019389049124094237, 'num_leaves': 117, 'max_depth': 46, 'min_child_samples': 83, 'subsample': 0.816178334679264, 'colsample_bytree': 0.6007820089745433}, RMSE: 3.0285287436964134
Best LGBM params for FWD: {'n_estimators': 253, 'learning_rate': 0.01954599037711251, 'num_leaves': 97, 'max_depth': 4, 'min_child_samples': 69, 'subsample': 0.5801920947265934, 'colsample_bytree': 0.7886548880165467}, RMSE: 3.5826761256463797


Goalkeepers:

In [3]:
# Goalkeeper rows where the player reached at least 60 minutes.
GK_data = df[(df['position'] == 'GK') & (df['over_60_minutes'] == 1)]
GK_points_target = GK_data['points']

GK_feature_columns = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points', 'mins_last_game',
    'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5', 'mean_mins_last_5', 'mean_points_last_10',
    'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
    'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded',
    'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
    'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
    'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season',
]
GK_points_features = GK_data[GK_feature_columns]

# Fixed seed so the split, and therefore every metric printed below, is
# reproducible on re-run. The seed deliberately differs from the one used in
# `objective` so the hold-out is not the exact split the hyperparameters were
# selected on.
# NOTE(review): the hold-out rows were still part of the data seen during
# tuning, so the reported RMSE is likely optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    GK_points_features, GK_points_target, train_size=0.8, test_size=0.2, random_state=0
)

best_GK_lgbm_params = best_hyperparameters['GK']

# The tuned dict holds exactly the keyword arguments that were previously
# copied out one by one; verbose=-1 matches the estimator used during tuning.
GK_lgbm = LGBMRegressor(**best_GK_lgbm_params, random_state=42, verbose=-1)

# 5-fold CV on the training portion only; scores are negated RMSE values.
cv_scores = cross_val_score(GK_lgbm, x_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

GK_lgbm.fit(x_train, y_train)
y_pred = GK_lgbm.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

# Ten most important features of the fitted model.
importances = GK_lgbm.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': GK_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))
print(100*'-')

print(f'Cross validation scores: {np.abs(cv_scores)}')
print(f'Mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'RMSE: {RMSE}, R-squared: {R_squared}')

                              Feature  Importance
1               opponent_market_value          28
37    total_team_conceded_last_season          24
22          mean_team_conceded_last_5          23
27        mean_opponent_points_last_5          19
38  total_opponent_points_last_season          18
0                   team_market_value          17
2                               value          17
35             total_mins_last_season          17
31      mean_opponent_conceded_last_3          16
26        mean_opponent_points_last_3          14
----------------------------------------------------------------------------------------------------
Cross validation scores: [2.79737849 2.7689592  2.74543702 2.80727721 2.80488888]
Mean cross validation score: 2.784788160130526
RMSE: 2.9845839527518065, R-squared: 0.0059507754630847565


Defenders:

In [4]:
# Defender rows where the player reached at least 60 minutes.
DEF_data = df[(df['position'] == 'DEF') & (df['over_60_minutes'] == 1)]
DEF_points_target = DEF_data['points']

DEF_feature_columns = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points', 'mins_last_game',
    'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5', 'mean_mins_last_5', 'mean_points_last_10',
    'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
    'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded',
    'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
    'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
    'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season',
]
DEF_points_features = DEF_data[DEF_feature_columns]

# Fixed seed so the split, and therefore every metric printed below, is
# reproducible on re-run. The seed deliberately differs from the one used in
# `objective` so the hold-out is not the exact split the hyperparameters were
# selected on.
# NOTE(review): the hold-out rows were still part of the data seen during
# tuning, so the reported RMSE is likely optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    DEF_points_features, DEF_points_target, train_size=0.8, test_size=0.2, random_state=0
)

best_DEF_lgbm_params = best_hyperparameters['DEF']

# The tuned dict holds exactly the keyword arguments that were previously
# copied out one by one; verbose=-1 matches the estimator used during tuning.
DEF_lgbm = LGBMRegressor(**best_DEF_lgbm_params, random_state=42, verbose=-1)

# 5-fold CV on the training portion only; scores are negated RMSE values.
cv_scores = cross_val_score(DEF_lgbm, x_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

DEF_lgbm.fit(x_train, y_train)
y_pred = DEF_lgbm.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

# Ten most important features of the fitted model.
importances = DEF_lgbm.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': DEF_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))
print(100*'-')

print(f'Cross validation scores: {np.abs(cv_scores)}')
print(f'Mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'RMSE: {RMSE}, R-squared: {R_squared}')

                           Feature  Importance
35          total_mins_last_season        3576
34        total_points_last_season        3222
27     mean_opponent_points_last_5        3092
17         mean_team_points_last_5        3053
7                       total_mins        3050
33  mean_opponent_conceded_last_10        3037
16         mean_team_points_last_3        2969
32   mean_opponent_conceded_last_5        2965
26     mean_opponent_points_last_3        2952
18        mean_team_points_last_10        2903
----------------------------------------------------------------------------------------------------
Cross validation scores: [2.40219472 2.51356976 2.46910297 2.46736517 2.32548457]
Mean cross validation score: 2.435543434331759
RMSE: 2.480400845063206, R-squared: 0.39804778620829584


Midfielders:

In [5]:
# Midfielder rows where the player reached at least 60 minutes.
MID_data = df[(df['position'] == 'MID') & (df['over_60_minutes'] == 1)]
MID_points_target = MID_data['points']

MID_feature_columns = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points', 'mins_last_game',
    'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5', 'mean_mins_last_5', 'mean_points_last_10',
    'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
    'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded',
    'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
    'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
    'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season',
]
MID_points_features = MID_data[MID_feature_columns]

# Fixed seed so the split, and therefore every metric printed below, is
# reproducible on re-run. The seed deliberately differs from the one used in
# `objective` so the hold-out is not the exact split the hyperparameters were
# selected on.
# NOTE(review): the hold-out rows were still part of the data seen during
# tuning, so the reported RMSE is likely optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    MID_points_features, MID_points_target, train_size=0.8, test_size=0.2, random_state=0
)

best_MID_lgbm_params = best_hyperparameters['MID']

# The tuned dict holds exactly the keyword arguments that were previously
# copied out one by one; verbose=-1 matches the estimator used during tuning.
MID_lgbm = LGBMRegressor(**best_MID_lgbm_params, random_state=42, verbose=-1)

# 5-fold CV on the training portion only; scores are negated RMSE values.
cv_scores = cross_val_score(MID_lgbm, x_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

MID_lgbm.fit(x_train, y_train)
y_pred = MID_lgbm.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

# Ten most important features of the fitted model.
importances = MID_lgbm.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': MID_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))
print(100*'-')

print(f'Cross validation scores: {np.abs(cv_scores)}')
print(f'Mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'RMSE: {RMSE}, R-squared: {R_squared}')

                                Feature  Importance
1                 opponent_market_value         610
34             total_points_last_season         550
2                                 value         541
5                          total_points         529
35               total_mins_last_season         511
39  total_opponent_conceded_last_season         497
32        mean_opponent_conceded_last_5         493
33       mean_opponent_conceded_last_10         487
28         mean_opponent_points_last_10         486
16              mean_team_points_last_3         469
----------------------------------------------------------------------------------------------------
Cross validation scores: [2.786425   2.91896008 3.05479219 3.11931927 3.02875766]
Mean cross validation score: 2.9816508416361316
RMSE: 2.953347961187858, R-squared: 0.11050489459233215


Forwards:

In [6]:
# Forward rows where the player reached at least 60 minutes.
FWD_data = df[(df['position'] == 'FWD') & (df['over_60_minutes'] == 1)]
FWD_points_target = FWD_data['points']

FWD_feature_columns = [
    'team_market_value', 'opponent_market_value', 'value', 'was_home', 'points_last_game', 'total_points', 'mins_last_game',
    'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5', 'mean_mins_last_5', 'mean_points_last_10',
    'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
    'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded',
    'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
    'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
    'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
    'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
    'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season',
]
FWD_points_features = FWD_data[FWD_feature_columns]

# Fixed seed so the split, and therefore every metric printed below, is
# reproducible on re-run. The seed deliberately differs from the one used in
# `objective` so the hold-out is not the exact split the hyperparameters were
# selected on.
# NOTE(review): the hold-out rows were still part of the data seen during
# tuning, so the reported RMSE is likely optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    FWD_points_features, FWD_points_target, train_size=0.8, test_size=0.2, random_state=0
)

best_FWD_lgbm_params = best_hyperparameters['FWD']

# The tuned dict holds exactly the keyword arguments that were previously
# copied out one by one; verbose=-1 matches the estimator used during tuning.
FWD_lgbm = LGBMRegressor(**best_FWD_lgbm_params, random_state=42, verbose=-1)

# 5-fold CV on the training portion only; scores are negated RMSE values.
cv_scores = cross_val_score(FWD_lgbm, x_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

FWD_lgbm.fit(x_train, y_train)
y_pred = FWD_lgbm.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

# Ten most important features of the fitted model.
importances = FWD_lgbm.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': FWD_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))
print(100*'-')

print(f'Cross validation scores: {np.abs(cv_scores)}')
print(f'Mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'RMSE: {RMSE}, R-squared: {R_squared}')

                            Feature  Importance
1             opponent_market_value         110
24            total_opponent_points          98
17          mean_team_points_last_5          94
29          total_opponent_conceded          80
37  total_team_conceded_last_season          71
18         mean_team_points_last_10          67
2                             value          67
0                 team_market_value          66
25        opponent_points_last_game          65
8                mean_points_last_3          58
----------------------------------------------------------------------------------------------------
Cross validation scores: [3.41305425 3.80269376 3.50297521 3.48183161 3.57050875]
Mean cross validation score: 3.5542127150163845
RMSE: 3.4834586384965682, R-squared: 0.06233598722605149


In [8]:
import joblib

# Write each fitted model to its own pickle file in the current working
# directory, in the same GK -> DEF -> MID -> FWD order as before.
fitted_models = {
    'GK_lgbm.pkl': GK_lgbm,
    'DEF_lgbm.pkl': DEF_lgbm,
    'MID_lgbm.pkl': MID_lgbm,
    'FWD_lgbm.pkl': FWD_lgbm,
}
saved_files = [joblib.dump(model, filename) for filename, model in fitted_models.items()]

# Keep the result of the final dump as the cell's displayed value.
saved_files[-1]

['FWD_lgbm.pkl']