In [None]:
pip install pandas scikit-learn xgboost

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

In [3]:
# Load the goalkeeper training data (path is relative to the notebook's
# working directory).
df = pd.read_csv('../train_data/goalkeeper.csv')

# Preview the first rows to confirm the file parsed as expected.
print(df.head())

# Report missing values per column. Only columns containing at least one
# NaN are listed, so an empty Series means the frame is complete. This is a
# report only: nothing is dropped or imputed here.
missing_counts = df.isna().sum()
print(missing_counts[missing_counts > 0])

              name       team      opponent_team date  was_home  minutes  \
0  Martin Dubravka  Newcastle  Nottingham Forest  NaN      True        0   
1  Martin Dubravka    Man Utd            Everton  NaN     False        0   
2  Martin Dubravka    Man Utd   Newcastle United  NaN      True        0   
3  Martin Dubravka    Man Utd          Tottenham  NaN      True        0   
4  Martin Dubravka    Man Utd            Chelsea  NaN     False        0   

   goals_conceded  expected_goals_conceded  saves  penalties_saved  ...  \
0               0                      0.0      0                0  ...   
1               0                      0.0      0                0  ...   
2               0                      0.0      0                0  ...   
3               0                      0.0      0                0  ...   
4               0                      0.0      0                0  ...   

   last_season_expected_goals_conceded  last_season_clean_sheet_probability  \
0            

In [4]:
# Columns of df used as model inputs. The order is significant: it fixes the
# column order of X and therefore the row labels of any per-feature report.
features = [
    'minutes',
    'goals_conceded',
    'expected_goals_conceded',
    'saves',
    'penalties_saved',
    'total_points',
    'bonus',
    'clean_sheets',
    'xA',
    'starts',
    'form',
    'clean_sheet_probability',
    'last_season_penalties_saved',
    'last_season_expected_goals_conceded',
    'last_season_clean_sheet_probability',
    'next_week_specific_fixture_difficulty',
    'saves_per_game',
    'last_season_total_saves',
]

# Column the models learn to predict (points scored in the next gameweek).
target = 'next_week_points'

# Feature matrix and target vector.
X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Build a 100-tree random forest and train it on the training split.
# fit() returns the estimator, so construction and training are chained.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the target for the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Mean squared / mean absolute error of those predictions.
rf_mse, rf_mae = (
    mean_squared_error(y_test, rf_predictions),
    mean_absolute_error(y_test, rf_predictions),
)

print(f"Random Forest MSE: {rf_mse}")
print(f"Random Forest MAE: {rf_mae}")

Random Forest MSE: 2.831302467282847
Random Forest MAE: 0.835460693220828


In [6]:
# Gradient-boosted regressor with the same number of estimators and the same
# seed as the random forest cell.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predict the target for the held-out rows.
xgb_predictions = xgb_model.predict(X_test)

# Mean squared / mean absolute error of those predictions.
xgb_mse, xgb_mae = (
    mean_squared_error(y_test, xgb_predictions),
    mean_absolute_error(y_test, xgb_predictions),
)

print(f"XGBoost MSE: {xgb_mse}")
print(f"XGBoost MAE: {xgb_mae}")

XGBoost MSE: 2.7241608498333596
XGBoost MAE: 0.8349102705141254


In [7]:
# Random forest: pair every training column with the importance score the
# fitted model exposes, then list the highest-scoring features first.
rf_feature_importance = rf_model.feature_importances_
rf_importance_df = pd.DataFrame(
    {'feature': X_train.columns, 'importance': rf_feature_importance}
)
rf_importance_df = rf_importance_df.sort_values(by='importance', ascending=False)

print(rf_importance_df)

# XGBoost: same report, built from the boosted model's importance scores.
xgb_feature_importance = xgb_model.feature_importances_
xgb_importance_df = pd.DataFrame(
    {'feature': X_train.columns, 'importance': xgb_feature_importance}
)
xgb_importance_df = xgb_importance_df.sort_values(by='importance', ascending=False)

print(xgb_importance_df)

                                  feature  importance
0                                 minutes    0.373556
16                         saves_per_game    0.131353
10                                   form    0.098111
11                clean_sheet_probability    0.093040
2                 expected_goals_conceded    0.078324
15  next_week_specific_fixture_difficulty    0.048693
3                                   saves    0.044042
5                            total_points    0.042697
1                          goals_conceded    0.028609
13    last_season_expected_goals_conceded    0.012839
17                last_season_total_saves    0.012202
8                                      xA    0.007914
14    last_season_clean_sheet_probability    0.007898
6                                   bonus    0.007715
12            last_season_penalties_saved    0.004386
9                                  starts    0.003655
7                            clean_sheets    0.002656
4                         pe