In [1]:
pip install pandas scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

In [3]:
# Load the defender training data (path is relative to the working directory).
df = pd.read_csv('train_data/defender.csv')

# Preview the first rows exactly as loaded, i.e. before missing values are filled.
print(df.head())

# Zero-fill missing values in numeric columns only. A blanket df.fillna(0)
# would also write the integer 0 into text columns such as `date` and
# `position`, leaving them with mixed types. Reassigning instead of using
# inplace=True keeps the transformation explicit.
# NOTE(review): any numeric column later used as a model target is zero-filled
# here too -- confirm that missing targets should count as 0 rather than be dropped.
numeric_cols = df.select_dtypes(include='number').columns
df = df.fillna({col: 0 for col in numeric_cols})

         name      team opponent_team        date  was_home position  minutes  \
0  Nathan Aké  Man City      West Ham  2022-08-07     False       DC       90   
1  Nathan Aké  Man City   Southampton  2022-10-08      True       DC       90   
2  Nathan Aké  Man City     Liverpool  2022-10-16     False       DL       90   
3  Nathan Aké  Man City      Brighton         NaN      True      NaN        0   
4  Nathan Aké  Man City  Leeds United         NaN     False      NaN        0   

   goals    xG  assists  ...  clean_sheet_probability  last_season_xG  \
0    0.0  0.01      0.0  ...                     1.00             0.0   
1    0.0  0.00      0.0  ...                     1.00             0.0   
2    0.0  0.00      0.0  ...                     0.67             0.0   
3    NaN   NaN      NaN  ...                     0.67             0.0   
4    NaN   NaN      NaN  ...                     0.67             0.0   

   last_season_xA  last_season_expected_goals_conceded  \
0             0.

In [5]:
# Columns of df that are fed to the models as inputs.
features = [
    'minutes',
    'goals',
    'xG',
    'assists',
    'xA',
    'total_points',
    'shots',
    'key_passes',
    'ict_index',
    'bonus',
    'clean_sheets',
    'goals_conceded',
    'expected_goals_conceded',
    'clean_sheet_probability',
    'last_season_expected_goals_conceded',
    'last_season_clean_sheet_probability',
    'starts',
    'form',
    'xG&A_form',
    'minutes_per_game',
    'last_season_xG',
    'last_season_xA',
    'next_week_fixture_difficulty',
]

# Column the models learn to predict.
target = 'next_week_points'

# Feature matrix and target vector.
X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): this is a random row-wise split. If rows are per-player match
# records over time, a grouped or time-based split may give a more honest
# error estimate -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predict the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Score the predictions against the held-out targets.
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)

# Report both error metrics, MSE first.
print(
    f"Random Forest MSE: {rf_mse}",
    f"Random Forest MAE: {rf_mae}",
    sep="\n",
)

Random Forest MSE: 4.018280959351383
Random Forest MAE: 1.1608421921959664


In [7]:
# Fit a gradient-boosted regressor on the training split (seeded for repeatability).
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predict the held-out rows.
xgb_predictions = xgb_model.predict(X_test)

# Score the predictions against the held-out targets.
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, xgb_predictions)

# Report both error metrics, MSE first.
print(
    f"XGBoost MSE: {xgb_mse}",
    f"XGBoost MAE: {xgb_mae}",
    sep="\n",
)

XGBoost MSE: 3.833340567644946
XGBoost MAE: 1.1255560030206777


In [8]:
# Random Forest: pair each training column with its importance score and
# list them from most to least important.
rf_feature_importance = rf_model.feature_importances_
rf_importance_df = (
    pd.DataFrame({'feature': X_train.columns, 'importance': rf_feature_importance})
    .sort_values('importance', ascending=False)
)
print(rf_importance_df)

# XGBoost: same table, built from the boosted model's importance scores.
xgb_feature_importance = xgb_model.feature_importances_
xgb_importance_df = (
    pd.DataFrame({'feature': X_train.columns, 'importance': xgb_feature_importance})
    .sort_values('importance', ascending=False)
)
print(xgb_importance_df)

                                feature  importance
0                               minutes    0.191349
22         next_week_fixture_difficulty    0.176194
19                     minutes_per_game    0.126537
8                             ict_index    0.065397
17                                 form    0.065015
13              clean_sheet_probability    0.059808
12              expected_goals_conceded    0.059726
18                            xG&A_form    0.047103
5                          total_points    0.025939
4                                    xA    0.025770
2                                    xG    0.023583
21                       last_season_xA    0.021182
20                       last_season_xG    0.020776
14  last_season_expected_goals_conceded    0.020746
11                       goals_conceded    0.019044
15  last_season_clean_sheet_probability    0.018187
6                                 shots    0.009801
7                            key_passes    0.008146
9           