In [2]:
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Load the web-scraped match data; the first CSV column is used as the row index.
matches = pd.read_csv("matches.csv", index_col = 0)

# Derive the numeric features the model consumes. The keyword order matters:
# "Date" is converted first so that "Day_Code" can read it as a datetime column.
matches = matches.assign(
  # Parse the date strings into datetimes.
  Date = lambda df: pd.to_datetime(df["Date"]),
  # Integer category code for each distinct venue value.
  Venue_Code = lambda df: df["Venue"].astype("category").cat.codes,
  # Integer category code for each distinct opponent.
  Opp_Code = lambda df: df["Opponent"].astype("category").cat.codes,
  # Kick-off hour: strip everything from the first ":" onwards, then cast to int.
  Hour = lambda df: df["Time"].str.replace(":.+", "", regex = True).astype("int"),
  # Day of the week as an integer (pandas convention: Monday = 0).
  Day_Code = lambda df: df["Date"].dt.dayofweek,
  # Binary target: 1 when the result is a win ("W"), otherwise 0.
  Target = lambda df: (df["Result"] == "W").astype("int"),
)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [5]:
# Random forest shared by the training and prediction steps below.
rf = RandomForestClassifier(
  n_estimators = 50,        # number of trees in the forest
  min_samples_split = 10,   # minimum samples a node needs before it may be split
  random_state = 1,         # fixed seed so repeated runs give the same model
)

In [6]:
def rolling_averages(group, cols, new_cols):
  """Add rolling means of the three preceding rows for the given columns.

  The rows are ordered by "Date", each column in ``cols`` is converted to a
  numeric dtype (values that cannot be parsed become NaN), and the mean of the
  three rows before the current one is stored under the matching name in
  ``new_cols``. Rows where any of the new columns is NaN are removed.

  Args:
    group: DataFrame containing a "Date" column and every column in ``cols``.
    cols: Names of the columns to average.
    new_cols: Names for the rolling-average columns, in the same order as ``cols``.

  Returns:
    A new, date-sorted DataFrame with the extra columns; ``group`` itself is
    left unmodified because sorting produces a separate frame.
  """
  ordered = group.sort_values("Date")
  for name in cols:
    # Unparseable entries are coerced to NaN instead of raising.
    ordered[name] = pd.to_numeric(ordered[name], errors = "coerce")
  # closed="left" keeps the current row out of its own window, so each average
  # only uses earlier rows.
  ordered[new_cols] = ordered[cols].rolling(3, closed = "left").mean()
  return ordered.dropna(subset = new_cols)

def make_future_predictions(data, predictors, cutoff = '2024-6-29'):
  """Train the module-level random forest on earlier matches and predict later ones.

  Rows whose "Date" is before ``cutoff`` are used for training; rows on or
  after ``cutoff`` are the ones predicted.

  Args:
    data: DataFrame with a "Date" column, a "Target" column and every column
      named in ``predictors``.
    predictors: Column names used as model features. The list is used exactly
      as passed, so any rolling-average columns the caller includes take part
      in both fitting and prediction.
    cutoff: Date separating the training rows from the rows to predict.
      Defaults to the split date this notebook was written for.

  Returns:
    A ``(combined, precision)`` tuple. ``combined`` is a DataFrame with the
    columns "actual" (the "Target" values of the predicted rows) and
    "prediction" (the model output). ``precision`` is the precision score of
    the predictions against "Target".

  Note:
    This function fits the shared module-level ``rf`` estimator in place.
    For matches that have not been played yet there is no real result, so the
    "actual" column and the precision value are not meaningful; only the
    "prediction" column should be used. For the match predictor model, where
    the test matches have results, all three are used.
  """
  # Separate train and test based on date.
  train = data[data["Date"] < cutoff]
  test = data[data["Date"] >= cutoff]
  # Use the caller-supplied predictors. Previously they were overwritten here
  # with the four categorical codes, which silently discarded the rolling
  # averages passed in by the caller.
  rf.fit(train[predictors], train["Target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual = test["Target"], prediction = preds))
  precision = precision_score(test["Target"], preds)
  return combined, precision

In [8]:
# Categorical predictors (already encoded as integer codes).
predictors = ["Venue_Code", "Opp_Code", "Hour", "Day_Code"]

# Quantitative predictors, which enter the model as rolling averages.
cols = ["Poss", "GF", "GA", "Sh", "SoT", "PK", "PKatt"]
# Each rolling-average column takes the source name plus a suffix, e.g. Poss_Rolling.
new_cols = [f"{stat}_Rolling" for stat in cols]

# Compute the rolling averages per team from that team's past matches. The
# grouped result is indexed by (Team, original row label); the Team level is
# dropped so there is one flat DataFrame, and the rows are then renumbered
# 0..n-1 so that every row has a unique index.
matches_rolling = (
  matches
  .groupby("Team")
  .apply(rolling_averages, cols = cols, new_cols = new_cols)
  .droplevel("Team")
  .reset_index(drop = True)
)

# Train and predict with the categorical and quantitative predictors together.
combined, precision = make_future_predictions(matches_rolling, predictors + new_cols)
# Attach identifying match details so the output holds more than just the
# 'actual' and 'prediction' columns.
combined = pd.merge(
  combined,
  matches_rolling[["Date", "Team", "Opponent", "Result"]],
  left_index = True,
  right_index = True,
)

# Write the predictions to a CSV file.
combined.to_csv("future_predictions.csv")