In [6]:
# Install the third-party packages this notebook imports. The explicit %pip
# magic targets the running kernel's environment and does not rely on
# IPython's automagic being enabled (a bare `pip ...` line does).
%pip install pandas scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

In [16]:
# Load the dataset from CSV.
df = pd.read_csv('train_data/forward.csv')

# Preview the first rows exactly as loaded (this runs before any filling).
print(df.head())

# Replace every missing value with 0. Reassigning the result (rather than
# using inplace=True) keeps the step explicit and avoids in-place mutation.
# NOTE(review): this zero-fills ALL columns, not only the model features --
# confirm that 0 is a sensible default for each of them.
df = df.fillna(0)

          name       team   opponent_team        date  was_home position  \
0  Jamie Vardy  Leicester       Brentford  2022-08-07      True       FW   
1  Jamie Vardy  Leicester     Bournemouth  2022-10-08     False      Sub   
2  Jamie Vardy  Leicester  Crystal Palace  2022-10-15      True      Sub   
3  Jamie Vardy  Leicester  Leicester City  2022-10-20      True       FW   
4  Jamie Vardy  Leicester   Wolverhampton  2022-10-23     False      Sub   

   minutes  goals    xG  assists  ...  last_season_goals  last_season_assists  \
0       90    0.0  0.00      1.0  ...               15.0                  2.0   
1       23    0.0  0.03      0.0  ...               15.0                  2.0   
2       25    0.0  0.00      0.0  ...               15.0                  2.0   
3       63    0.0  0.00      0.0  ...               15.0                  2.0   
4       29    1.0  0.52      1.0  ...               15.0                  2.0   

   last_season_xG  last_season_xA  last_season_points_pe

In [18]:
# Predictor columns handed to the models, one per line for easy diffing.
features = [
    'minutes',
    'goals',
    'xG',
    'assists',
    'xA',
    'total_points',
    'shots',
    'key_passes',
    'ict_index',
    'bonus',
    'starts',
    'form',
    'xG&A_form',
    'minutes_per_game',
    'last_season_goals',
    'last_season_assists',
    'last_season_xG',
    'last_season_xA',
    'last_season_points_per_minute',
    'next_week_fixture_difficulty',
]

# Column to predict: points scored in the next gameweek.
target = 'next_week_points'

# Separate the predictors (X) from the target (y).
X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Build a 100-tree random forest (seeded for repeatability) and fit it on the
# training split; fit() returns the estimator itself, so the call can be chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Score the predictions: mean squared error and mean absolute error.
rf_mse, rf_mae = (
    mean_squared_error(y_test, rf_predictions),
    mean_absolute_error(y_test, rf_predictions),
)

# Report both metrics, one per line.
print(
    f"Random Forest MSE: {rf_mse}",
    f"Random Forest MAE: {rf_mae}",
    sep="\n",
)

Random Forest MSE: 5.764142181203323
Random Forest MAE: 1.318298196184799


In [20]:
# Configure the gradient-boosted regressor: 100 boosting rounds, a 0.1
# learning rate, trees up to depth 5, and a fixed seed for repeatability.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)

# Fit on the training split, then predict the held-out rows.
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# Score the predictions with the same two error metrics.
xgb_mse, xgb_mae = (
    mean_squared_error(y_test, xgb_predictions),
    mean_absolute_error(y_test, xgb_predictions),
)

print(f"XGBoost MSE: {xgb_mse}")
print(f"XGBoost MAE: {xgb_mae}")

XGBoost MSE: 5.4641052832574
XGBoost MAE: 1.2593726721069536


In [21]:
def _rank_by_importance(feature_names, scores):
    """Return a feature/importance table sorted from highest to lowest score.

    feature_names: labels for the features, aligned with ``scores``.
    scores: one importance value per feature.
    """
    table = pd.DataFrame({'feature': feature_names, 'importance': scores})
    return table.sort_values(by='importance', ascending=False)


# Random Forest: read the fitted model's importances and rank them.
rf_feature_importance = rf_model.feature_importances_
rf_importance_df = _rank_by_importance(X_train.columns, rf_feature_importance)

print(rf_importance_df)

# XGBoost: same ranking for the boosted model.
xgb_feature_importance = xgb_model.feature_importances_
xgb_importance_df = _rank_by_importance(X_train.columns, xgb_feature_importance)

print(xgb_importance_df)

                          feature  importance
0                         minutes    0.250147
19   next_week_fixture_difficulty    0.165014
13               minutes_per_game    0.124898
12                      xG&A_form    0.085137
11                           form    0.079142
8                       ict_index    0.058110
2                              xG    0.041778
17                 last_season_xA    0.027489
4                              xA    0.026501
16                 last_season_xG    0.024752
18  last_season_points_per_minute    0.019161
5                    total_points    0.019096
6                           shots    0.017803
14              last_season_goals    0.017454
15            last_season_assists    0.014259
7                      key_passes    0.010056
1                           goals    0.005578
9                           bonus    0.005502
10                         starts    0.005305
3                         assists    0.002819
                          feature 