In [6]:
pip install pandas scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

In [13]:
# Load the training data (path is relative to the notebook's working directory)
df = pd.read_csv('../train_data/forward.csv')

# Preview the data to ensure everything looks right
print(df.head())

# Report how many cells are missing before imputing, so the blanket fill below is not silent
missing_total = int(df.isna().sum().sum())
print(f"Missing values filled with 0: {missing_total}")

# Replace every missing value with 0. Reassigning (instead of `inplace=True`) keeps the
# statement chainable and avoids mutating the frame that was just previewed.
# NOTE(review): this also zero-fills the `next_week_points` column and any text columns —
# confirm that a missing target should really be treated as 0 points.
df = df.fillna(0)

          name       team   opponent_team        date  was_home position  \
0  Jamie Vardy  Leicester       Brentford  2022-08-07      True       FW   
1  Jamie Vardy  Leicester     Bournemouth  2022-10-08     False      Sub   
2  Jamie Vardy  Leicester  Crystal Palace  2022-10-15      True      Sub   
3  Jamie Vardy  Leicester  Leicester City  2022-10-20      True       FW   
4  Jamie Vardy  Leicester   Wolverhampton  2022-10-23     False      Sub   

   minutes  goals    xG  assists  ...  last_season_assists  last_season_xG  \
0       90    0.0  0.00      1.0  ...                  2.0            9.99   
1       23    0.0  0.03      0.0  ...                  2.0            9.99   
2       25    0.0  0.00      0.0  ...                  2.0            9.99   
3       63    0.0  0.00      0.0  ...                  2.0            9.99   
4       29    1.0  0.52      1.0  ...                  2.0            9.99   

   last_season_xA  last_season_points_per_minute  next_week_points  next_t

In [14]:
# Predictor columns passed to the models, grouped by name prefix for readability.
features = [
    'minutes', 'goals', 'xG', 'assists', 'xA', 'total_points',
    'shots', 'key_passes', 'ict_index', 'bonus', 'starts',
    'form', 'xG&A_form', 'minutes_per_game',
    # last_season_* columns
    'last_season_goals', 'last_season_assists', 'last_season_xG',
    'last_season_xA', 'last_season_points_per_minute',
    # next_week_* fixture-difficulty columns
    'next_week_specific_fixture_difficulty',
    'next_week_holistic_fixture_difficulty',
]

# Column the models learn to predict
target = 'next_week_points'

# Feature matrix and label vector
X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is a random shuffle, while the rows appear to be dated
# per-player matches — confirm that mixing periods across train/test is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split
# (`fit` returns the estimator itself, so construction and training can be chained).
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows
rf_predictions = rf_model.predict(X_test)

# Error metrics on the test split
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)

print(f"Random Forest MSE: {rf_mse}")
print(f"Random Forest MAE: {rf_mae}")

Random Forest MSE: 6.024970397931842
Random Forest MAE: 1.3706850351548454


In [16]:
# Hyper-parameters for the gradient-boosted model, kept together so they are easy to tune
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)

# Build the regressor and fit it on the training split
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Score the held-out rows
xgb_predictions = xgb_model.predict(X_test)

# Error metrics on the test split
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, xgb_predictions)

print(f"XGBoost MSE: {xgb_mse}")
print(f"XGBoost MAE: {xgb_mae}")

XGBoost MSE: 5.798981699816808
XGBoost MAE: 1.3101764836982517


In [17]:
def _ranked_importances(feature_names, scores):
    """Pair each feature name positionally with its score and sort by descending importance.

    Returns a DataFrame with the columns `feature` and `importance`; the original
    positional index is kept, so each row still shows the feature's position in the input.
    """
    table = pd.DataFrame({
        'feature': feature_names,
        'importance': scores,
    })
    return table.sort_values(by='importance', ascending=False)


# Random Forest: importances as exposed by the fitted model
rf_feature_importance = rf_model.feature_importances_
rf_importance_df = _ranked_importances(X_train.columns, rf_feature_importance)

print(rf_importance_df)

# XGBoost: same table, built from the boosted model's importances
xgb_feature_importance = xgb_model.feature_importances_
xgb_importance_df = _ranked_importances(X_train.columns, xgb_feature_importance)

print(xgb_importance_df)

                                  feature  importance
0                                 minutes    0.250762
13                       minutes_per_game    0.124199
19  next_week_specific_fixture_difficulty    0.090715
12                              xG&A_form    0.087161
11                                   form    0.080639
20  next_week_holistic_fixture_difficulty    0.072818
8                               ict_index    0.058909
2                                      xG    0.040856
4                                      xA    0.028324
17                         last_season_xA    0.023464
16                         last_season_xG    0.023200
5                            total_points    0.021020
18          last_season_points_per_minute    0.018867
15                    last_season_assists    0.017364
6                                   shots    0.017119
14                      last_season_goals    0.016640
7                              key_passes    0.008681
1                           