### Regressors

We now train a Random Forest Regressor and a Gradient Boosting Regressor for each position to predict the number of points a player will score in the next game (assuming they play). Only players with 'minutes_over_60' = 1 are included. We use `optuna` to find the best values for model hyperparameters using Bayesian Optimisation.

In [1]:
import pandas as pd

# add 'over_60_minutes' column
df = pd.read_csv('data/previous_seasons_dataset.csv')
df['over_60_minutes'] = (df['minutes'] >= 60).astype(int)

In [2]:
import optuna
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import logging

optuna.logging.set_verbosity(optuna.logging.WARNING)


def objective(trial, model_type, position):
    
    
    data = df[(df['position'] == position) & (df['over_60_minutes'] == 1)]
        
    x = data[['team_market_value', 'opponent_market_value', 'value', 'was_home','points_last_game', 'total_points', 'mins_last_game',
                        'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5','mean_mins_last_5', 'mean_points_last_10', 
                        'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
                        'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded', 
                        'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
                        'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
                        'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
                        'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
                        'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season']] 
    y = data['points']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    
    # Define hyperparameters for RandomForest and GradientBoosting
    if model_type == 'random_forest':
        n_estimators = trial.suggest_int('n_estimators', 50, 1000)
        max_depth = trial.suggest_int('max_depth', 3, 50)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
        max_features = trial.suggest_categorical('max_features', [None, 'sqrt', 'log2'])
        
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                      max_features=max_features, random_state=42)
    
    elif model_type == 'gradient_boosting':
        n_estimators = trial.suggest_int('n_estimators', 50, 1000)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
        subsample = trial.suggest_float('subsample', 0.5, 1.0)
        max_features = trial.suggest_categorical('max_features', [None, 'sqrt', 'log2'])

        model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, min_samples_split=min_samples_split,
                                          min_samples_leaf=min_samples_leaf, subsample=subsample, max_features=max_features, random_state=42)

    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return rmse

# Function to run optimization for both models across positions
def run_optimization(df, positions):
    best_params = {}
    for position in positions:
        # RandomForest Optimization
        study_rf = optuna.create_study(direction='minimize')
        study_rf.optimize(lambda trial: objective(trial, 'random_forest', position), n_trials=50)
        best_params[(position, 'random_forest')] = study_rf.best_params
        print(f"Best Random Forest params for {position}: {study_rf.best_params}, best RMSE: {study_rf.best_value}")

        # GradientBoosting Optimization
        study_gb = optuna.create_study(direction='minimize')
        study_gb.optimize(lambda trial: objective(trial, 'gradient_boosting', position), n_trials=50)
        best_params[(position, 'gradient_boosting')] = study_gb.best_params
        print(f"Best Gradient Boosting params for {position}: {study_gb.best_params}, RMSE: {study_gb.best_value}")
        
    return best_params

positions = ['GK', 'DEF', 'MID', 'FWD']
best_hyperparameters = run_optimization(df, positions)

  from .autonotebook import tqdm as notebook_tqdm


Best Random Forest params for GK: {'n_estimators': 854, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 19, 'max_features': 'sqrt'}, best RMSE: 2.8093781253125756
Best Gradient Boosting params for GK: {'n_estimators': 53, 'learning_rate': 0.017328746691892976, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 17, 'subsample': 0.9178808813486018, 'max_features': 'sqrt'}, RMSE: 2.803767540384599
Best Random Forest params for DEF: {'n_estimators': 696, 'max_depth': 48, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}, best RMSE: 2.207797608039847
Best Gradient Boosting params for DEF: {'n_estimators': 531, 'learning_rate': 0.025797377857124538, 'max_depth': 10, 'min_samples_split': 12, 'min_samples_leaf': 9, 'subsample': 0.70172644461833, 'max_features': None}, RMSE: 2.2735697402845108
Best Random Forest params for MID: {'n_estimators': 944, 'max_depth': 47, 'min_samples_split': 14, 'min_samples_leaf': 8, 'max_features': 'sqrt'}, best RMSE: 3.0292

Using the best values for the hyperparameters we then train the models for each position.

In [3]:
GK_data = df[(df['position'] == 'GK') & (df['over_60_minutes'] == 1)]
GK_points_target = GK_data['points']
GK_points_features = GK_data[['team_market_value', 'opponent_market_value', 'value', 'was_home','points_last_game', 'total_points', 'mins_last_game',
                        'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5','mean_mins_last_5', 'mean_points_last_10', 
                        'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
                        'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded', 
                        'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
                        'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
                        'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
                        'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
                        'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season']]

x_train, x_test, y_train, y_test = train_test_split(GK_points_features, GK_points_target, train_size=0.8, test_size=0.2)

best_GK_rf_params = best_hyperparameters[('GK', 'random_forest')]

GK_rf_reg = RandomForestRegressor(n_estimators=best_GK_rf_params['n_estimators'], min_samples_split=best_GK_rf_params['min_samples_split'],
                                max_depth=best_GK_rf_params['max_depth'], min_samples_leaf=best_GK_rf_params['min_samples_leaf'],
                                max_features=best_GK_rf_params['max_features'], random_state=42)

cv_scores = cross_val_score(GK_rf_reg, x_train, y_train, cv=5, scoring= 'neg_root_mean_squared_error', n_jobs=-1)
GK_rf_reg.fit(x_train, y_train)
y_pred = GK_rf_reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

importances = GK_rf_reg.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': GK_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Random Forest cross validation scores: {np.abs(cv_scores)}')
print(f'Random Forest mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'Random Forest RMSE: {RMSE}, Random Forest R-squared: {R_squared}')
print('-'*100)

best_GK_gb_params = best_hyperparameters[('GK', 'gradient_boosting')]

GK_gb_reg = GradientBoostingRegressor(n_estimators=best_GK_gb_params['n_estimators'], learning_rate=best_GK_gb_params['learning_rate'],
                                    max_depth=best_GK_gb_params['max_depth'], max_features=best_GK_gb_params['max_features'],
                                    min_samples_leaf=best_GK_gb_params['min_samples_leaf'], min_samples_split=best_GK_gb_params['min_samples_split'],
                                    subsample=best_GK_gb_params['subsample'] )

cv_scores = cross_val_score(GK_gb_reg, x_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
GK_gb_reg.fit(x_train, y_train)
y_pred = GK_gb_reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

importances = GK_gb_reg.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': GK_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Gradient Boosting cross validation scores: {np.abs(cv_scores)}')
print(f'Gradient Boosting mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'Gradient Boosting RMSE: {RMSE}, Gradient Boosting R-squared: {R_squared}')

                              Feature  Importance
1               opponent_market_value    0.089642
0                   team_market_value    0.058764
32      mean_opponent_conceded_last_5    0.051126
38  total_opponent_points_last_season    0.050022
27        mean_opponent_points_last_5    0.047345
33     mean_opponent_conceded_last_10    0.046836
37    total_team_conceded_last_season    0.037073
28       mean_opponent_points_last_10    0.036456
22          mean_team_conceded_last_5    0.035887
2                               value    0.035727
Random Forest cross validation scores: [2.78647332 2.68838421 2.76159661 2.81634589 2.95919813]
Random Forest mean cross validation score: 2.8023996331068077
Random Forest RMSE: 2.8891013508595607, Random Forest R-squared: 0.01394269441933682
----------------------------------------------------------------------------------------------------
                                Feature  Importance
1                 opponent_market_value    0.126331
32

Defenders:

In [4]:
DEF_data = df[(df['position'] == 'DEF') & (df['over_60_minutes'] == 1)]
DEF_points_target = DEF_data['points']
DEF_points_features = DEF_data[['team_market_value', 'opponent_market_value', 'value', 'was_home','points_last_game', 'total_points', 'mins_last_game',
                        'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5','mean_mins_last_5', 'mean_points_last_10', 
                        'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
                        'mean_team_points_last_10','team_conceded_last_game', 'total_team_conceded', 
                        'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
                        'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
                        'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
                        'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
                        'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season']]

x_train, x_test, y_train, y_test = train_test_split(DEF_points_features, DEF_points_target, train_size=0.8, test_size=0.2)

# Random Forest Regressor

best_DEF_rf_params = best_hyperparameters[('DEF', 'random_forest')]

DEF_rf_reg = RandomForestRegressor(n_estimators=best_DEF_rf_params['n_estimators'], min_samples_split=best_DEF_rf_params['min_samples_split'],
                                max_depth=best_DEF_rf_params['max_depth'], min_samples_leaf=best_DEF_rf_params['min_samples_leaf'],
                                max_features=best_DEF_rf_params['max_features'], random_state=42)

cv_scores = cross_val_score(DEF_rf_reg, x_train, y_train, cv=5, scoring= 'neg_root_mean_squared_error', n_jobs=-1)
DEF_rf_reg.fit(x_train, y_train)
y_pred = DEF_rf_reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

importances = DEF_rf_reg.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': DEF_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Random Forest cross validation scores: {np.abs(cv_scores)}')
print(f'Random Forest mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'Random Forest RMSE: {RMSE}, Random Forest R-squared: {R_squared}')
print('-'*100)

# Gradient Boosting Regressor
best_DEF_gb_params = best_hyperparameters[('DEF', 'gradient_boosting')]

DEF_gb_reg = GradientBoostingRegressor(n_estimators=best_DEF_gb_params['n_estimators'], learning_rate=best_DEF_gb_params['learning_rate'],
                                    max_depth=best_DEF_gb_params['max_depth'], max_features=best_DEF_gb_params['max_features'],
                                    min_samples_leaf=best_DEF_gb_params['min_samples_leaf'], min_samples_split=best_DEF_gb_params['min_samples_split'],
                                    subsample=best_DEF_gb_params['subsample'] )

cv_scores = cross_val_score(DEF_gb_reg, x_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
DEF_gb_reg.fit(x_train, y_train)
y_pred = DEF_gb_reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

importances = DEF_gb_reg.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': DEF_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Gradient Boosting cross validation scores: {np.abs(cv_scores)}')
print(f'Gradient Boosting mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'Gradient Boosting RMSE: {RMSE}, Gradient Boosting R-squared: {R_squared}')

                           Feature  Importance
1            opponent_market_value    0.068970
0                team_market_value    0.040416
36   total_team_points_last_season    0.036203
16         mean_team_points_last_3    0.031906
30     opponent_conceded_last_game    0.030623
31   mean_opponent_conceded_last_3    0.030482
17         mean_team_points_last_5    0.030314
33  mean_opponent_conceded_last_10    0.028582
34        total_points_last_season    0.028066
25       opponent_points_last_game    0.027723
Random Forest cross validation scores: [2.3631486  2.44346323 2.42743723 2.35388403 2.51638494]
Random Forest mean cross validation score: 2.420863605022493
Random Forest RMSE: 2.4015336777097342, Random Forest R-squared: 0.44392968530162336
----------------------------------------------------------------------------------------------------
                           Feature  Importance
1            opponent_market_value    0.059648
0                team_market_value    0.036817

Midfielders:

In [5]:
MID_data = df[(df['position'] == 'MID') & (df['over_60_minutes'] == 1)]
MID_points_target = MID_data['points']
MID_points_features = MID_data[['team_market_value', 'opponent_market_value', 'value', 'was_home','points_last_game', 'total_points', 'mins_last_game',
                        'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5','mean_mins_last_5', 'mean_points_last_10', 
                        'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
                        'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded', 
                        'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
                        'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
                        'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
                        'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
                        'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season']]

x_train, x_test, y_train, y_test = train_test_split(MID_points_features, MID_points_target, train_size=0.8, test_size=0.2)

# Random Forest Regressor

best_MID_rf_params = best_hyperparameters[('MID', 'random_forest')]

MID_rf_reg = RandomForestRegressor(n_estimators=best_MID_rf_params['n_estimators'], min_samples_split=best_MID_rf_params['min_samples_split'],
                                max_depth=best_MID_rf_params['max_depth'], min_samples_leaf=best_MID_rf_params['min_samples_leaf'],
                                max_features=best_MID_rf_params['max_features'], random_state=42)

cv_scores = cross_val_score(MID_rf_reg, x_train, y_train, cv=5, scoring= 'neg_root_mean_squared_error', n_jobs=-1)
MID_rf_reg.fit(x_train, y_train)
y_pred = MID_rf_reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

importances = MID_rf_reg.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': MID_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Random Forest cross validation scores: {np.abs(cv_scores)}')
print(f'Random Forest mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'Random Forest RMSE: {RMSE}, Random Forest R-squared: {R_squared}')
print('-'*100)

# Gradient Boosting Regressor
best_MID_gb_params = best_hyperparameters[('MID', 'gradient_boosting')]

MID_gb_reg = GradientBoostingRegressor(n_estimators=best_MID_gb_params['n_estimators'], learning_rate=best_MID_gb_params['learning_rate'],
                                    max_depth=best_MID_gb_params['max_depth'], max_features=best_MID_gb_params['max_features'],
                                    min_samples_leaf=best_MID_gb_params['min_samples_leaf'], min_samples_split=best_MID_gb_params['min_samples_split'],
                                    subsample=best_MID_gb_params['subsample'] )

cv_scores = cross_val_score(MID_gb_reg, x_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
MID_gb_reg.fit(x_train, y_train)
y_pred = MID_gb_reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

importances = MID_gb_reg.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': MID_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Gradient Boosting cross validation scores: {np.abs(cv_scores)}')
print(f'Gradient Boosting mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'Gradient Boosting RMSE: {RMSE}, Gradient Boosting R-squared: {R_squared}')

                           Feature  Importance
2                            value    0.092332
34        total_points_last_season    0.058295
12             mean_points_last_10    0.038739
5                     total_points    0.035688
1            opponent_market_value    0.031944
0                team_market_value    0.029916
33  mean_opponent_conceded_last_10    0.028263
10              mean_points_last_5    0.027652
35          total_mins_last_season    0.026352
23      mean_team_conceded_last_10    0.026067
Random Forest cross validation scores: [2.99573401 2.87338547 3.08238239 2.89811569 2.96372005]
Random Forest mean cross validation score: 2.962667522350677
Random Forest RMSE: 3.0366997898333996, Random Forest R-squared: 0.09696379801611266
----------------------------------------------------------------------------------------------------
                                Feature  Importance
2                                 value    0.306735
34             total_points_last_sea

Forwards:

In [6]:
FWD_data = df[(df['position'] == 'FWD') & (df['over_60_minutes'] == 1)]
FWD_points_target = FWD_data['points']
FWD_points_features = FWD_data[['team_market_value', 'opponent_market_value', 'value', 'was_home','points_last_game', 'total_points', 'mins_last_game',
                        'total_mins', 'mean_points_last_3', 'mean_mins_last_3', 'mean_points_last_5','mean_mins_last_5', 'mean_points_last_10', 
                        'mean_mins_last_10', 'team_points_last_game', 'total_team_points', 'mean_team_points_last_3', 'mean_team_points_last_5',
                        'mean_team_points_last_10', 'team_conceded_last_game', 'total_team_conceded', 
                        'mean_team_conceded_last_3', 'mean_team_conceded_last_5', 'mean_team_conceded_last_10', 'total_opponent_points',
                        'opponent_points_last_game', 'mean_opponent_points_last_3', 'mean_opponent_points_last_5', 'mean_opponent_points_last_10',
                        'total_opponent_conceded', 'opponent_conceded_last_game', 'mean_opponent_conceded_last_3', 'mean_opponent_conceded_last_5',
                        'mean_opponent_conceded_last_10', 'total_points_last_season', 'total_mins_last_season', 'total_team_points_last_season',
                        'total_team_conceded_last_season', 'total_opponent_points_last_season', 'total_opponent_conceded_last_season']]

x_train, x_test, y_train, y_test = train_test_split(FWD_points_features, FWD_points_target, train_size=0.8, test_size=0.2)

# Random Forest Regressor

best_FWD_rf_params = best_hyperparameters[('FWD', 'random_forest')]

FWD_rf_reg = RandomForestRegressor(n_estimators=best_FWD_rf_params['n_estimators'], min_samples_split=best_FWD_rf_params['min_samples_split'],
                                max_depth=best_FWD_rf_params['max_depth'], min_samples_leaf=best_FWD_rf_params['min_samples_leaf'],
                                max_features=best_FWD_rf_params['max_features'], random_state=42)

cv_scores = cross_val_score(FWD_rf_reg, x_train, y_train, cv=5, scoring= 'neg_root_mean_squared_error', n_jobs=-1)
FWD_rf_reg.fit(x_train, y_train)
y_pred = FWD_rf_reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

importances = FWD_rf_reg.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': FWD_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Random Forest cross validation scores: {np.abs(cv_scores)}')
print(f'Random Forest mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'Random Forest RMSE: {RMSE}, Random Forest R-squared: {R_squared}')
print('-'*100)

# Gradient Boosting Regressor

best_FWD_gb_params = best_hyperparameters[('FWD', 'gradient_boosting')]

FWD_gb_reg = GradientBoostingRegressor(n_estimators=best_FWD_gb_params['n_estimators'], learning_rate=best_FWD_gb_params['learning_rate'],
                                    max_depth=best_FWD_gb_params['max_depth'], max_features=best_FWD_gb_params['max_features'],
                                    min_samples_leaf=best_FWD_gb_params['min_samples_leaf'], min_samples_split=best_FWD_gb_params['min_samples_split'],
                                    subsample=best_FWD_gb_params['subsample'] )

cv_scores = cross_val_score(FWD_gb_reg, x_train, y_train, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
FWD_gb_reg.fit(x_train, y_train)
y_pred = FWD_gb_reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
R_squared = r2_score(y_test, y_pred)

importances = FWD_gb_reg.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': FWD_points_features.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False).head(10))

print(f'Gradient Boosting cross validation scores: {np.abs(cv_scores)}')
print(f'Gradient Boosting mean cross validation score: {np.abs(np.mean(cv_scores))}')
print(f'Gradient Boosting RMSE: {RMSE}, Gradient Boosting R-squared: {R_squared}')

                           Feature  Importance
2                            value    0.113531
1            opponent_market_value    0.060205
27     mean_opponent_points_last_5    0.038850
34        total_points_last_season    0.035713
32   mean_opponent_conceded_last_5    0.034843
28    mean_opponent_points_last_10    0.031920
29         total_opponent_conceded    0.029989
26     mean_opponent_points_last_3    0.028987
17         mean_team_points_last_5    0.028565
33  mean_opponent_conceded_last_10    0.027984
Random Forest cross validation scores: [3.70022361 3.69283678 3.52039764 3.41846852 3.20461175]
Random Forest mean cross validation score: 3.507307662816094
Random Forest RMSE: 3.6367529092092683, Random Forest R-squared: 0.018745894305854605
----------------------------------------------------------------------------------------------------
                            Feature  Importance
2                             value    0.093372
1             opponent_market_value    0.06

In [7]:
import joblib
joblib.dump(DEF_gb_reg, 'models/DEF_gb_reg.pkl')
joblib.dump(GK_gb_reg, 'models/GK_gb_reg.pkl')
joblib.dump(MID_gb_reg, 'models/MID_gb_reg.pkl')
joblib.dump(FWD_gb_reg, 'models/FWD_gb_reg.pkl')

joblib.dump(DEF_rf_reg, 'models/DEF_rf_reg.pkl')
joblib.dump(GK_rf_reg, 'models/GK_rf_reg.pkl')
joblib.dump(MID_rf_reg, 'models/MID_rf_reg.pkl')
joblib.dump(FWD_rf_reg, 'models/FWD_rf_reg.pkl')


['FWD_rf_reg.pkl']