In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [3]:
matches = pd.read_csv("matches.csv", index_col=0)

In [4]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,Match Report,,13.0,1.0,18.7,1.0,1,1,2022,Manchester City
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,Match Report,,19.0,7.0,17.5,0.0,0,0,2022,Manchester City
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,Match Report,,21.0,10.0,16.2,1.0,0,0,2022,Manchester City
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,Match Report,,18.0,5.0,14.1,0.0,0,0,2022,Manchester City
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,Match Report,,17.0,9.0,14.8,0.0,0,0,2022,Manchester City


In [5]:
matches["date"] = pd.to_datetime(matches["date"])
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"]=matches["date"].dt.dayofweek
matches["target"] = (matches["result"] == "W")

In [6]:
train = matches[matches["date"]< '2022-01-01']
test = matches[matches["date"]>'2022-01-01']
predictors = ["venue_code", "opp_code", "hour", "day_code"]

## Training and getting predictions for Random Forest with hyper parameter tuning

In [17]:
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': [None, 'log2'], 
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [18]:
rf = RandomForestClassifier(random_state=42)

In [19]:
# Grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

In [22]:
import warnings
warnings.filterwarnings("ignore")
print('Fitting grid search...')
grid_search.fit(train[predictors], train["target"])
print('Done!')


Fitting grid search...
Done!


In [10]:
# rf =RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [23]:
grid_search.best_params_

{'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 50}

In [29]:
best_model = grid_search.best_estimator_

In [33]:
predictions = best_model.predict(test[predictors])

In [34]:
accuracy_metric = accuracy_score(test["target"], predictions)
accuracy_metric

0.6054006968641115

In [85]:
metrics = {
    'Accuracy': accuracy_score(test["target"], predictions) * 100,
    'F1 Score': f1_score(test["target"], predictions, average='macro') * 100,
    'Recall':  recall_score(test["target"], predictions, average='macro') * 100,
    'Precision': precision_score(test["target"], predictions, average = 'macro') * 100
}

In [86]:
metrics

{'Accuracy': 60.540069686411144,
 'F1 Score': 51.793456144562896,
 'Recall': 53.78125,
 'Precision': 55.99776436848468}

# Training  and getting predictions with a larger dataset and rolling averages

In [87]:
matches = pd.read_csv('matches_6_years.csv', index_col = 0)

In [88]:
matches.shape

(4560, 27)

In [89]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed ='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [90]:
def process_data(matches):
  matches['date']=pd.to_datetime(matches['date'])
  matches['venue_code'] = matches['venue'].astype('category').cat.codes
  matches['opp_code'] = matches['opponent'].astype('category').cat.codes
  matches['hour'] = matches['time'].str.replace(':.+','',regex=True).astype('int')
  matches['day_code']= matches['date'].dt.dayofweek
  matches['target']=(matches['result'] == 'W')

In [91]:
process_data(matches)

In [92]:
grouped_matches = matches.groupby('team')
group= grouped_matches.get_group('Manchester City')
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,1.0,1,1,2022-2023,Manchester City,0,27,16,6,True
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,0.0,0,0,2022-2023,Manchester City,1,2,15,5,True
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,1.0,0,0,2022-2023,Manchester City,0,17,16,6,False
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,0.0,0,0,2022-2023,Manchester City,1,8,15,5,True
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,0.0,0,0,2022-2023,Manchester City,1,19,19,2,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2018-04-22,16:30,Premier League,Matchweek 35,Sun,Home,W,5,0,Swansea City,...,1.0,0,1,2017-2018,Manchester City,1,23,16,6,True
53,2018-04-29,14:15,Premier League,Matchweek 36,Sun,Away,W,4,1,West Ham,...,1.0,0,0,2017-2018,Manchester City,0,27,14,6,True
54,2018-05-06,13:30,Premier League,Matchweek 37,Sun,Home,D,0,0,Huddersfield,...,0.0,0,0,2017-2018,Manchester City,1,11,13,6,False
55,2018-05-09,20:00,Premier League,Matchweek 31,Wed,Home,W,3,1,Brighton,...,1.0,0,0,2017-2018,Manchester City,1,4,20,2,True


In [93]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [94]:
# Creating new features using the rolling averages
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

matches_rolling = matches_rolling.droplevel("team")
matches_rolling.index = range(matches_rolling.shape[0])

matches_rolling.index = range(matches_rolling.shape[0])

rolling_train = matches_rolling[matches_rolling["date"]< '2022-01-01']
rolling_test = matches_rolling[matches_rolling["date"]>'2022-01-01']
rolling_predictors = predictors + new_cols

In [95]:
# Retraining the best model on expanded datasdet
best_model.fit(rolling_train[rolling_predictors], rolling_train["target"])

In [96]:
predictions_rolling = best_model.predict(rolling_test[rolling_predictors])

In [97]:
accuracy_rolling = accuracy_score(rolling_test["target"], predictions_rolling)
accuracy_rolling

0.6314410480349345

In [98]:
precison_rolling = precision_score(rolling_test["target"], predictions_rolling, average = 'macro')
precison_rolling

0.6028302759721483

In [99]:
f1_rolling = f1_score(rolling_test["target"], predictions_rolling, average='macro')
f1_rolling


0.5839074243749871

In [100]:
recall_rolling = recall_score(rolling_test["target"], predictions_rolling, average='macro')
recall_rolling

0.5854839330012884

In [101]:
metrics_rolling = {
    'Accuracy': accuracy_score(rolling_test["target"], predictions_rolling) * 100,
    'F1 Score': f1_score(rolling_test["target"], predictions_rolling, average='macro') * 100,
    'Recall':  recall_score(rolling_test["target"], predictions_rolling, average='macro') * 100,
    'Precision': precision_score(rolling_test["target"], predictions_rolling, average = 'macro')* 100
}

In [102]:
metrics_rolling

{'Accuracy': 63.14410480349345,
 'F1 Score': 58.39074243749871,
 'Recall': 58.54839330012884,
 'Precision': 60.28302759721483}

## Comparing Results

In [118]:
df_metrics = pd.DataFrame({
    'Metrics without Rolling Averages': metrics,
    'Metrics with Rolling Averages': metrics_rolling
})
df_metrics

Unnamed: 0,Metrics without Rolling Averages,Metrics with Rolling Averages
Accuracy,60.54007,63.144105
F1 Score,51.793456,58.390742
Recall,53.78125,58.548393
Precision,55.997764,60.283028


In [119]:
df_metrics = df_metrics.T
df_metrics

Unnamed: 0,Accuracy,F1 Score,Recall,Precision
Metrics without Rolling Averages,60.54007,51.793456,53.78125,55.997764
Metrics with Rolling Averages,63.144105,58.390742,58.548393,60.283028


In [121]:
df_metrics.round(2)

Unnamed: 0,Accuracy,F1 Score,Recall,Precision
Metrics without Rolling Averages,60.54,51.79,53.78,56.0
Metrics with Rolling Averages,63.14,58.39,58.55,60.28
