In [1]:
pip install pandas scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

In [2]:
# Read the midfielder training data from the sibling train_data directory.
df = pd.read_csv('../train_data/midfielder.csv')

# Show the first rows (before any cleaning) as a quick sanity check.
print(df.head())

# Replace every missing value, in every column, with 0.
# NOTE(review): this also zero-fills non-numeric columns (the preview shows NaN
# in `date` and `position`) and any missing target values — confirm that a
# blanket 0 is the intended imputation.
df = df.fillna(0)

         name           team     opponent_team        date  was_home position  \
0  Ryan Yates  Nott'm Forest  Newcastle United         NaN     False      NaN   
1  Ryan Yates  Nott'm Forest       Aston Villa  2022-10-10      True       MC   
2  Ryan Yates  Nott'm Forest     Wolverhampton  2022-10-15     False       MC   
3  Ryan Yates  Nott'm Forest          Brighton  2022-10-18     False       MC   
4  Ryan Yates  Nott'm Forest         Liverpool  2022-10-22      True       MC   

   minutes  goals    xG  assists  ...  last_season_assists  last_season_xG  \
0        0    NaN   NaN      NaN  ...                  0.0             0.0   
1       90    0.0  0.00      0.0  ...                  0.0             0.0   
2       90    0.0  0.14      0.0  ...                  0.0             0.0   
3       90    0.0  0.08      0.0  ...                  0.0             0.0   
4       90    0.0  0.44      0.0  ...                  0.0             0.0   

   last_season_xA  last_season_points_per_mi

In [3]:
# Predictor columns fed to the models. The order matters: it determines the
# column order of X and therefore the row labels in the importance tables.
features = [
    'goals_conceded',
    'clean_sheets',
    'minutes',
    'goals',
    'xG',
    'assists',
    'xA',
    'total_points',
    'shots',
    'key_passes',
    'ict_index',
    'bonus',
    'starts',
    'form',
    'xG&A_form',
    'minutes_per_game',
    'last_season_goals',
    'last_season_assists',
    'last_season_xG',
    'last_season_xA',
    'last_season_points_per_minute',
    'next_week_specific_fixture_difficulty',
]

# Column the models learn to predict (points scored in the next gameweek).
target = 'next_week_points'

# Feature matrix and target vector.
X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the preview suggests one row per player per match, so a purely
# random split may place adjacent matches of the same player on both sides —
# consider a time- or player-grouped split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Fit a 100-tree random forest regressor on the training split (seed fixed at 42).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
rf_predictions = rf_model.predict(X_test)

# Mean squared error and mean absolute error of those predictions.
rf_mse, rf_mae = (
    metric(y_test, rf_predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)

print(f"Random Forest MSE: {rf_mse}")
print(f"Random Forest MAE: {rf_mae}")

Random Forest MSE: 4.636893737201241
Random Forest MAE: 1.143888275069021


In [5]:
# Fit a gradient-boosted regressor: 100 boosting rounds, learning rate 0.1,
# trees limited to depth 5, seed fixed at 42.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows.
xgb_predictions = xgb_model.predict(X_test)

# Mean squared error and mean absolute error of those predictions.
xgb_mse, xgb_mae = (
    metric(y_test, xgb_predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)

print(f"XGBoost MSE: {xgb_mse}")
print(f"XGBoost MAE: {xgb_mae}")

XGBoost MSE: 4.5027528906410685
XGBoost MAE: 1.1106957341508978


In [6]:
def _rank_importances(columns, scores):
    """Pair each feature name with its importance score, highest score first."""
    table = pd.DataFrame({'feature': columns, 'importance': scores})
    return table.sort_values(by='importance', ascending=False)


# Random Forest: importances as exposed by the fitted model.
rf_feature_importance = rf_model.feature_importances_
rf_importance_df = _rank_importances(X_train.columns, rf_feature_importance)

print(rf_importance_df)

# XGBoost: same table, built from the boosted model's importances.
xgb_feature_importance = xgb_model.feature_importances_
xgb_importance_df = _rank_importances(X_train.columns, xgb_feature_importance)

print(xgb_importance_df)

                                  feature  importance
2                                 minutes    0.277404
15                       minutes_per_game    0.125910
13                                   form    0.086073
10                              ict_index    0.074186
14                              xG&A_form    0.070225
21  next_week_specific_fixture_difficulty    0.046786
19                         last_season_xA    0.042515
4                                      xG    0.038336
6                                      xA    0.038312
18                         last_season_xG    0.034328
7                            total_points    0.026215
0                          goals_conceded    0.023024
17                    last_season_assists    0.022406
16                      last_season_goals    0.021207
20          last_season_points_per_minute    0.018195
8                                   shots    0.017870
9                              key_passes    0.013187
12                          