In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from constants import *

In [24]:
def get_predictors():
    """Return the ordered list of feature column names fed to the model.

    Each per-match statistic is expanded into a ``_rolling`` and a ``_mean``
    variant, and every resulting name (plus the three general code columns)
    is emitted once with a ``_home`` suffix and once with an ``_away`` suffix.

    Returns:
        list[str]: general home columns, general away columns, then all
        home statistic columns followed by all away statistic columns.
    """
    match_context = ("venue_code", "team_code", "day_code")

    # Statistics grouped by theme; insertion order defines the output order.
    stat_groups = {
        "attacking": ["gf", "xg", "sh", "sot", "npxg", "npxg/sh"],
        "passing": [
            "totpasscmp", "totpassatt", "totpasscmp%", "totpassdist",
            "prgpassdist", "xag", "xa", "keypasses",
        ],
        "goalkeeping": ["sota", "saves", "save%", "psxg"],
        "creation": ["sca", "gca", "scalivepass", "gcalivepass"],
        "possession": [
            "poss", "att3rdtouches", "attboxtouches", "atttakeons",
            "succtakeons", "carries", "totdistcarried", "prgdistcarried",
        ],
        "defense": ["tkl", "tklw", "tkldef3rd", "tklmid3rd", "tklatt3rd", "blocks", "int"],
        "misc": ["fouls", "foulsdrawn", "recov", "aerialwon%"],
    }
    stats = [name for group in stat_groups.values() for name in group]

    # All "_rolling" names first, then all "_mean" names.
    aggregated = [f"{stat}_{kind}" for kind in ("rolling", "mean") for stat in stats]

    # For each list, every "_home" name precedes every "_away" name.
    sides = ("home", "away")
    context_columns = [f"{column}_{side}" for side in sides for column in match_context]
    stat_columns = [f"{column}_{side}" for side in sides for column in aggregated]

    return context_columns + stat_columns

In [25]:
# Load the cleaned match data and convert the 'date' column to datetimes so
# the later cells can filter on it with the .dt accessor.
data = pd.read_csv(CLEAN_DATA).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Names of the feature columns the model is trained and queried on.
predictors = get_predictors()

# Random forest classifier; random_state is fixed so repeated runs build the
# same forest.
model = RandomForestClassifier(
    n_estimators=2400,
    min_samples_split=150,
    min_samples_leaf=10,
    random_state=42,
)


In [39]:
# Split the matches around the cutoff DATE. Only calendar dates are compared,
# so the time-of-day component of either side is ignored.
cutoff_date = pd.Timestamp(DATE).date()
match_dates = data['date'].dt.date

# Matches strictly before the cutoff form the training pool. dropna() with no
# `subset` drops every row that has a missing value in ANY column, so the
# result is NaN-free by construction and needs no second check or pass.
train_set = data[match_dates < cutoff_date].dropna()

# Matches on or after the cutoff are the games to predict; they are kept
# unfiltered (rows with missing values are not removed here).
next_games = data[match_dates >= cutoff_date]

In [40]:
# Feature matrix and target taken from the NaN-free training pool.
X, y = train_set[predictors], train_set['result_code']

# Hold out 25% of the rows for evaluation; the seed is fixed for repeatability.
# NOTE(review): this is not a chronological split — if the *_rolling / *_mean
# features are computed over time, confirm that mixing earlier and later
# matches between train and test is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [43]:
# Train on the training split and predict the held-out rows.
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Held-out features plus match identity, actual result and predicted result.
# X_test keeps the row labels of `data`, so they are used to look up the
# match details for exactly the held-out rows.
held_out_rows = X_test.index
r_df = X_test.assign(
    Date=data.loc[held_out_rows, 'date'],
    Home_Team=data.loc[held_out_rows, 'home_team'],
    Away_Team=data.loc[held_out_rows, 'away_team'],
    Actual_Result=y_test,      # Actual result of the game
    Predicted_Result=preds,    # Predicted result of the game
)

In [52]:
# Features for the games on or after the cutoff date.
X = next_games[predictors]
# Not used for prediction; retained so the notebook-level `y` ends up with the
# same value as before this cell was rewritten.
y = next_games['result_code']

preds = model.predict(X)
probs = model.predict_proba(X)  # One column per entry of model.classes_

# Create a new DataFrame with the desired columns
result_df = next_games.copy()
result_df['Date'] = next_games['date']
result_df['Home_Team'] = next_games['home_team']
result_df['Away_Team'] = next_games['away_team']
result_df['Predicted_Result'] = preds  # Predicted result of the game

# Add predicted probabilities for each class to the DataFrame.
# The columns of `probs` follow the order of model.classes_, so each result
# code is looked up there instead of assuming the classes are exactly
# [0, 1, 2]. A code the model never saw during training has no column in
# `probs`; its probability is reported as 0.0.
probability_columns = {
    'Prob_Home_Win': 2,
    'Prob_Draw': 1,
    'Prob_Away_Win': 0,
}
class_position = {label: position for position, label in enumerate(model.classes_)}
for column_name, result_code in probability_columns.items():
    if result_code in class_position:
        result_df[column_name] = probs[:, class_position[result_code]]
    else:
        result_df[column_name] = 0.0

In [53]:
# Show only the match identity, the predicted result and the class probabilities.
summary_columns = [
    'Date',
    'Home_Team',
    'Away_Team',
    'Predicted_Result',
    'Prob_Home_Win',
    'Prob_Draw',
    'Prob_Away_Win',
]
result_df[summary_columns]

Unnamed: 0,Date,Home_Team,Away_Team,Predicted_Result,Prob_Home_Win,Prob_Draw,Prob_Away_Win
840,2023-10-21,Liverpool,Everton,2,0.664054,0.161411,0.174535
841,2023-10-21,Bournemouth,Wolves,2,0.427373,0.258393,0.314233
842,2023-10-21,Brentford,Burnley,2,0.46748,0.23714,0.29538
843,2023-10-21,Manchester City,Brighton,2,0.609145,0.172249,0.218606
844,2023-10-21,Newcastle Utd,Crystal Palace,2,0.587179,0.194559,0.218262
845,2023-10-21,Nottingham Forest,Luton Town,2,0.389977,0.270433,0.339591
846,2023-10-21,Chelsea,Arsenal,2,0.455281,0.195019,0.349701
847,2023-10-21,Sheffield Utd,Manchester Utd,0,0.248021,0.222753,0.529226
848,2023-10-22,Aston Villa,West Ham,2,0.509484,0.222352,0.268164
849,2023-10-23,Tottenham,Fulham,2,0.618543,0.17517,0.206287
