In [132]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import warnings
warnings.filterwarnings("ignore")

In [7]:
matches = pd.read_csv("matches.csv", index_col=0)
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,Match Report,,13.0,1.0,18.7,1.0,1,1,2022,Manchester City
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,Match Report,,19.0,7.0,17.5,0.0,0,0,2022,Manchester City
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,Match Report,,21.0,10.0,16.2,1.0,0,0,2022,Manchester City
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,Match Report,,18.0,5.0,14.1,0.0,0,0,2022,Manchester City
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,Match Report,,17.0,9.0,14.8,0.0,0,0,2022,Manchester City


In [8]:
matches["date"] = pd.to_datetime(matches["date"])
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"]=matches["date"].dt.dayofweek
matches["target"] = (matches["result"] == "W")

In [9]:
train = matches[matches["date"]< '2022-01-01']
test = matches[matches["date"]>'2022-01-01']
predictors = ["venue_code", "opp_code", "hour", "day_code"]

## Training and getting predictions using Decision tree base classifier and tuning the hyper parameters

In [16]:
param_grid_dtree = {
    'base_estimator__max_depth': [1, 2, 3],
    'base_estimator__min_samples_split': [2, 4],
    'base_estimator__min_samples_leaf': [1, 2],
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1, 1]
}

In [52]:
# Defining classifier
ada_dt = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),random_state=42)

In [40]:
# Grid search with cross-validation for Decision Tree
cv_ada_dtree = GridSearchCV(estimator=ada_dt, param_grid=param_grid_dtree, cv=5)

In [41]:
cv_ada_dtree.fit(train[predictors], train["target"])

In [42]:
cv_ada_dtree.best_params_

{'base_estimator__max_depth': 3,
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__min_samples_split': 2,
 'learning_rate': 0.01,
 'n_estimators': 100}

In [46]:
# Train using the best hyper parameters
best_params_dtree = {'n_estimators': 100, 'learning_rate': 0.01, 'base_estimator__max_depth': 3,
                     'base_estimator__min_samples_split': 2, 'base_estimator__min_samples_leaf': 1}

dtree = DecisionTreeClassifier(max_depth=best_params_dtree['base_estimator__max_depth'],
                               min_samples_split=best_params_dtree['base_estimator__min_samples_split'],
                               min_samples_leaf=best_params_dtree['base_estimator__min_samples_leaf'])

#initialize classifier with best parameter
ada_dtree = AdaBoostClassifier(base_estimator=dtree,
                               n_estimators=best_params_dtree['n_estimators'],
                               learning_rate=best_params_dtree['learning_rate'])

# Train the model
ada_dtree.fit(train[predictors], train["target"])

In [63]:
# Predict using the best hyper parameters
predictions_dt = ada_dtree.predict(test[predictors])

In [64]:
accuracy_dt = accuracy_score(test["target"], predictions_dt)

In [65]:
accuracy_dt

0.6097560975609756

## Training and getting predictions using Gausian NB base classifier and tuning the hyper parameters

In [54]:
param_grid_gnb = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1, 1]
}

In [56]:
ada_gnb = AdaBoostClassifier(base_estimator=GaussianNB())

In [57]:
cv_ada_gnb = GridSearchCV(estimator=ada_gnb, param_grid=param_grid_gnb, cv=5)
# ada_clf_nb = AdaBoostClassifier(base_estimator=GaussianNB(), random_state=42)

In [58]:
cv_ada_gnb.fit(train[predictors], train["target"])

In [59]:
cv_ada_gnb.best_params_

{'learning_rate': 0.1, 'n_estimators': 50}

In [70]:
# Train using the best hyper parameters
gnb = GaussianNB()

best_params_gnb = {'n_estimators': 50, 'learning_rate': 0.1}

#initialize classifier with best parameter
ada_gnb = AdaBoostClassifier(base_estimator=gnb,
                             n_estimators=best_params_gnb['n_estimators'],
                             learning_rate=best_params_gnb['learning_rate'],
                             random_state=42)

# Train the AdaBoost model
ada_gnb.fit(train[predictors], train["target"])

In [71]:
# Predict using AdaBoost with Naive Bayes
predictions_gnb = ada_gnb.predict(test[predictors])

In [72]:
accuracy_gnb = accuracy_score(test["target"], predictions_gnb)
print("Accuracy of AdaBoost with Naive Bayes:", accuracy_gnb)

Accuracy of AdaBoost with Naive Bayes: 0.6149825783972126


In [81]:
metrics = {
    'Accuracy': [accuracy_score(test["target"], predictions_dt) * 100, accuracy_score(test["target"], predictions_gnb ) * 100],
    'Recall': [recall_score(test["target"], predictions_dt, average='macro') * 100, recall_score(test["target"], predictions_gnb , average='macro') * 100],
    'Precision': [precision_score(test["target"], predictions_dt, average='macro') * 100, precision_score(test["target"], predictions_gnb, average='macro') * 100],
    'F1 Score': [f1_score(test["target"], predictions_dt, average='macro') * 100, f1_score(test["target"], predictions_gnb, average='macro') * 100]
}

In [82]:
# Create DataFrame
results_df = pd.DataFrame(metrics, index=['AdaBoost with Decision Tree', 'AdaBoost with Gaussian NB'])

In [127]:
# Round the DataFrame to 2 decimal places
results_df = results_df.round(2)
results_df

Unnamed: 0,Accuracy,Recall,Precision,F1 Score
AdaBoost with Decision Tree,60.98,52.25,56.08,47.1
AdaBoost with Gaussian NB,61.5,51.79,58.36,44.29


## Using Rolling Averages and a larger dataset

In [90]:
matches = pd.read_csv('matches_6_years.csv', index_col = 0)

In [91]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed ='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [92]:
def process_data(matches):
  matches['date']=pd.to_datetime(matches['date'])
  matches['venue_code'] = matches['venue'].astype('category').cat.codes
  matches['opp_code'] = matches['opponent'].astype('category').cat.codes
  matches['hour'] = matches['time'].str.replace(':.+','',regex=True).astype('int')
  matches['day_code']= matches['date'].dt.dayofweek
  matches['target']=(matches['result'] == 'W')

In [96]:
matches.shape

(4560, 32)

In [105]:
process_data(matches)

In [106]:
grouped_matches = matches.groupby('team')
group= grouped_matches.get_group('Manchester City')

In [107]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [108]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [109]:
matches_rolling = matches_rolling.droplevel("team")
matches_rolling.index = range(matches_rolling.shape[0])

In [110]:
matches_rolling.index = range(matches_rolling.shape[0])

In [111]:
# Evaluating with new predictors, expanded data set, and using the best parameters from earlier
rolling_train = matches_rolling[matches_rolling["date"]< '2022-01-01']
rolling_test = matches_rolling[matches_rolling["date"]>'2022-01-01']
rolling_predictors = predictors + new_cols

In [117]:
#Train Decision Tree Classifier using the best hyper parameters

# Train the model
ada_dtree.fit(rolling_train[rolling_predictors], rolling_train["target"])

In [119]:
# Predict using the best hyper parameters
predictions_dt_rolling = ada_dtree.predict(rolling_test[rolling_predictors])

In [121]:
# Train Gaussian NB classifier using the best hyper parameters
ada_gnb.fit(rolling_train[rolling_predictors], rolling_train["target"])

In [122]:
# Predict using AdaBoost with Naive Bayes
predictions_gnb_rolling = ada_gnb.predict(rolling_test[rolling_predictors])

In [129]:
metrics_rolling = {
    'Accuracy_Rolling': [accuracy_score(rolling_test["target"], predictions_dt_rolling) * 100, accuracy_score(rolling_test["target"], predictions_gnb_rolling ) * 100],
    'Recall_Rolling': [recall_score(rolling_test["target"], predictions_dt_rolling, average='macro') * 100, recall_score(rolling_test["target"], predictions_gnb_rolling , average='macro') * 100],
    'Precision_Rolling': [precision_score(rolling_test["target"], predictions_dt_rolling, average='macro') * 100, precision_score(rolling_test["target"], predictions_gnb_rolling, average='macro') * 100],
    'F1 Score_Rolling': [f1_score(rolling_test["target"], predictions_dt_rolling, average='macro') * 100, f1_score(rolling_test["target"], predictions_gnb_rolling, average='macro') * 100]
}

In [130]:
results_df_rolling = pd.DataFrame(metrics_rolling, index=['AdaBoost Decision Tree', 'AdaBoost Gaussian NB'])

In [131]:
results_df_rolling = results_df_rolling.round(2)
results_df_rolling

Unnamed: 0,Accuracy_Rolling,Recall_Rolling,Precision_Rolling,F1 Score_Rolling
AdaBoost Decision Tree,62.62,56.43,59.44,55.24
AdaBoost Gaussian NB,63.23,56.13,60.72,54.04
