In [1]:
# EPL Soccer Match Predictor

In [2]:
import pandas as pd
# import datetime
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Load the raw EPL match data from the project-relative datasets folder.
epl = pd.read_csv("datasets/EPL_dataset.csv")  # "Date" column is day/month/year

# Data Cleaning

In [None]:
# Preview the first rows of the frame as loaded from the CSV.
epl.head()

In [None]:
# Parse the match date, then drop the raw columns that are no longer needed.
# The source dates are day/month/year, so they must be parsed day-first: the
# default month-first guess swaps day and month on ambiguous dates such as
# 05/08/2021, which would corrupt the derived day-of-week feature.
epl["date"] = pd.to_datetime(epl["Date"], dayfirst=True)
epl = epl.drop(columns=["Unnamed: 0", "Date"])

# Derived modelling columns:
#   target    - 1 when FTR == "H" (home win), 0 for a draw or home loss
#   home_team - integer category code of the home side
#   away_team - integer category code of the away side
#   day       - day of week of the match (Monday = 0)
# NOTE(review): the two team columns are encoded independently, so a team's
# home and away codes only agree if both columns contain the same set of
# teams - confirm against the data.
epl["target"] = (epl["FTR"] == "H").astype("int")
epl["home_team"] = epl["HomeTeam"].astype("category").cat.codes
epl["away_team"] = epl["AwayTeam"].astype("category").cat.codes
epl["day"] = epl["date"].dt.day_of_week

In [None]:
# Preview the frame again to confirm the derived columns
# (date, target, home_team, away_team, day) are present.
epl.head()

# Predictor

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [8]:
# Features: encoded home team, encoded away team, day of week.
# Label: 1 for a home win, 0 for a draw or a home loss.
y = epl["target"]
X = epl.loc[:, ["home_team", "away_team", "day"]]

# Hold out 20% of the matches for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Display the label series (1 = home win, 0 = draw or home loss).
y

In [10]:
# ANN model
class EPL_ANN(nn.Module):
    """Feed-forward network with a single ReLU hidden layer.

    Args:
        input: Number of input features per sample.
        hidden: Width of the hidden layer.
        output: Number of values produced per sample.
    """

    def __init__(self, input=3, hidden=64, output=2):
        super().__init__()
        # Seed the global torch RNG before any layer is created so the initial
        # weights are identical every time the model is instantiated.
        torch.manual_seed(12345)

        self.input = nn.Linear(in_features=input, out_features=hidden)
        self.output = nn.Linear(in_features=hidden, out_features=output)

    def forward(self, x):
        """Return ``output(relu(input(x)))``; no activation is applied to the result."""
        hidden_activations = torch.relu(self.input(x))
        return self.output(hidden_activations)

In [11]:
# Replace each split with a float32 NumPy copy. The names are rebound, so the
# later cells that use X_train / y_train etc. see arrays, not pandas objects.
X_train, X_test, y_train, y_test = [
    np.array(split, dtype=np.float32)
    for split in (X_train, X_test, y_train, y_test)
]

# Torch copies of the same four splits, used by the neural network.
torch_x_train, torch_x_test, torch_y_train, torch_y_test = [
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train, X_test, y_train, y_test)
]

In [None]:
# Sanity check: one dtype per line, in the order
# train features, test features, train labels, test labels.
print(
    *(t.dtype for t in (torch_x_train, torch_x_test, torch_y_train, torch_y_test)),
    sep="\n",
)

In [16]:
model = EPL_ANN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def loss_fn(logits, target):
    """Cross-entropy loss between per-class scores and 0/1 class labels.

    The network emits one raw score per class (two columns by default) with
    no sigmoid, and the labels are a 1-D vector of 0/1 values. ``nn.BCELoss``
    is therefore the wrong criterion: it requires probabilities in [0, 1]
    with the same shape as the target and raises on these inputs.
    Cross-entropy matches the ``argmax`` decoding used at evaluation time.

    Args:
        logits: Tensor of shape ``(N, num_classes)`` holding unnormalised scores.
        target: Tensor of shape ``(N,)`` holding class labels. Float labels
            are cast to integer class indices, so the float32 label tensors
            built earlier in the notebook can be passed unchanged.

    Returns:
        Scalar tensor with the mean cross-entropy over the batch.
    """
    return F.cross_entropy(logits, target.long())

In [17]:
# train
def model_train():
    """Run one full-batch optimisation step on the training tensors.

    Uses the notebook-level ``model``, ``loss_fn``, ``optimizer``,
    ``torch_x_train`` and ``torch_y_train``.

    Returns:
        The loss tensor computed for this step, before the parameter update.
    """
    model.train()

    batch_loss = loss_fn(model(torch_x_train), torch_y_train)
    batch_loss.backward()

    # Apply the update, then clear the gradients so they do not accumulate
    # into the next call.
    optimizer.step()
    optimizer.zero_grad()

    return batch_loss


# test
def model_test():
    """Evaluate the model on the held-out tensors and return its accuracy.

    Uses the notebook-level ``model``, ``torch_x_test`` and ``torch_y_test``.
    Prints the test inputs next to the predicted class of each row.

    Returns:
        Fraction of test rows whose predicted class equals the label.
    """
    model.eval()

    # Inference only: disable autograd so no computation graph is built or
    # kept alive for the evaluation forward pass.
    with torch.no_grad():
        output = model(torch_x_test)

    # The predicted class is the index of the larger of the per-class scores.
    prediction = output.argmax(dim=1)
    print("0 is a home loss, 1 is a home win")
    print(f"Given: {torch_x_test}\t\tPrediction: {prediction}")
    correct = prediction == torch_y_test

    accuracy = int(correct.sum()) / len(torch_x_test)

    return accuracy




In [None]:
# train
epochs = 100
for epoch in range(epochs):
    train_loss = model_train()

    # Report every tenth epoch. float() extracts the scalar so the log shows
    # a plain number instead of the tensor repr with its grad_fn.
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}/{epochs}; Loss: {float(train_loss):.4f}")

# test
test_accuracy = model_test()
print(f"Model Accuracy: {test_accuracy}")

In [10]:
# # grid search
# param_grid = {
#     'max_depth': [10, 12, 15, 19, 20, 25, 30, 35],
#     'min_samples_leaf': [50, 100, 125, 200],
#     'n_estimators': [100, 125, 150, 200, 225, 250]
# }

# grid_search = GridSearchCV(estimator=match_predictor,
#                            param_grid=param_grid,
#                            cv = 4,
#                            n_jobs=-1, verbose=1, scoring="accuracy")

# grid_search.fit(X_train, y_train)
# print(f"Best Params: {grid_search.best_params_}")
# print(f"Best Score: {grid_search.best_score_}")                

In [21]:
# Random-forest baseline. A previous configuration used min_samples_leaf=50
# together with min_samples_split=10; the current one allows smaller leaves.
match_predictor = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=10,
    n_jobs=-1,  # use every available core
    random_state=42,
)

In [None]:
# Fit the forest on the training split; fit() returns the estimator, so the
# cell displays its accuracy on that same split (a training score).
match_predictor.fit(X_train, y_train).score(X_train, y_train)

In [None]:
# Mean accuracy of the fitted forest on the held-out split.
match_predictor.score(X_test, y_test)

In [None]:
# Predict the held-out matches and print the fraction classified correctly.
# `p` is reused by the classification report in the next cell.
p = match_predictor.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=p))

In [None]:
# Per-class precision, recall and F1 for the test-set predictions `p`.
print(classification_report(y_test, p))

In [None]:
# plt.figure(figsize=(10, 6))
# plt.boxplot(y_test)
# plt.show()