In [6]:
pip install pandas scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

In [11]:
# Read the training CSV (path is relative to the notebook's working directory).
df = pd.read_csv('train_data/forward.csv')

# Sanity check: show the first rows exactly as loaded, before any filling.
print(df.head())

# Replace every missing value in every column with 0, rebinding `df` to the
# filled frame instead of mutating it in place.
# NOTE(review): this zero-fills non-numeric columns and any missing target
# values as well -- confirm that is intended.
df = df.fillna(0)

          name       team   opponent_team        date  was_home position  \
0  Jamie Vardy  Leicester       Brentford  2022-08-07      True       FW   
1  Jamie Vardy  Leicester     Bournemouth  2022-10-08     False      Sub   
2  Jamie Vardy  Leicester  Crystal Palace  2022-10-15      True      Sub   
3  Jamie Vardy  Leicester  Leicester City  2022-10-20      True       FW   
4  Jamie Vardy  Leicester   Wolverhampton  2022-10-23     False      Sub   

   minutes  goals    xG  assists  ...  starts  form  xG&A_form  \
0       90      0  0.00        1  ...   False   0.0       0.00   
1       23      0  0.03        0  ...   False   5.0       1.00   
2       25      0  0.00        0  ...   False   3.0       0.52   
3       63      0  0.00        0  ...   False   2.0       0.34   
4       29      1  0.52        1  ...   False   2.0       0.26   

   minutes_per_game  last_season_goals  last_season_assists  last_season_xG  \
0             90.00                 15                    2        

In [12]:
# Predictor columns passed to the models (one per line so changes diff cleanly).
features = [
    'minutes',
    'goals',
    'xG',
    'assists',
    'xA',
    'total_points',
    'shots',
    'key_passes',
    'ict_index',
    'bonus',
    'starts',
    'form',
    'xG&A_form',
    'minutes_per_game',
    'last_season_goals',
    'last_season_assists',
    'last_season_xG',
    'last_season_xA',
    'last_season_points_per_minute',
]

# Column to predict (points scored in the next gameweek).
target = 'next_week_points'

# Feature matrix (X) and target vector (y) taken from the loaded frame.
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
# NOTE(review): this is a shuffled row-level split, while the data carries a
# `date` column -- later matches can end up in training while earlier ones are
# tested. Confirm this is acceptable for a next-gameweek forecast.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Build a 100-tree random forest (seeded for repeatability) and fit it on the
# training split; `fit` returns the fitted estimator itself.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict the target for the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Score the predictions against the true held-out targets.
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_predictions)
rf_mae = mean_absolute_error(y_true=y_test, y_pred=rf_predictions)

# Report both error metrics.
print(f"Random Forest MSE: {rf_mse}")
print(f"Random Forest MAE: {rf_mae}")

Random Forest MSE: 8.772310227503555
Random Forest MAE: 2.2404692260816574


In [14]:
# Configure the gradient-boosted regressor: 100 boosting rounds, learning rate
# 0.1, trees limited to depth 5, seeded for repeatability.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
# Fit on the training split.
xgb_model.fit(X_train, y_train)

# Predict the target for the held-out rows.
xgb_predictions = xgb_model.predict(X_test)

# Score the predictions against the true held-out targets.
xgb_mse = mean_squared_error(y_true=y_test, y_pred=xgb_predictions)
xgb_mae = mean_absolute_error(y_true=y_test, y_pred=xgb_predictions)

# Report both error metrics.
print(f"XGBoost MSE: {xgb_mse}")
print(f"XGBoost MAE: {xgb_mae}")

XGBoost MSE: 8.925540240830209
XGBoost MAE: 2.168908930640715


In [15]:
# Random Forest: pair each training column with the fitted model's importance
# score and list the columns from most to least important.
rf_feature_importance = rf_model.feature_importances_
rf_importance_df = (
    pd.DataFrame(
        {
            'feature': X_train.columns,
            'importance': rf_feature_importance,
        }
    )
    .sort_values('importance', ascending=False)
)

print(rf_importance_df)

# XGBoost: same table, built from the boosted model's importance scores.
xgb_feature_importance = xgb_model.feature_importances_
xgb_importance_df = (
    pd.DataFrame(
        {
            'feature': X_train.columns,
            'importance': xgb_feature_importance,
        }
    )
    .sort_values('importance', ascending=False)
)

print(xgb_importance_df)

                          feature  importance
13               minutes_per_game    0.211403
12                      xG&A_form    0.115751
11                           form    0.101573
0                         minutes    0.100249
8                       ict_index    0.096158
2                              xG    0.079787
4                              xA    0.051640
6                           shots    0.034455
5                    total_points    0.032633
16                 last_season_xG    0.028836
17                 last_season_xA    0.026840
14              last_season_goals    0.026149
18  last_season_points_per_minute    0.022729
15            last_season_assists    0.021606
7                      key_passes    0.016118
9                           bonus    0.011629
10                         starts    0.009554
1                           goals    0.008863
3                         assists    0.004026
                          feature  importance
13               minutes_per_game 