In [5]:
# Get all matches from 2022-2023 season

import requests

# Get API token from .env file
with open(".env") as f:
  for line in f:
    if "FOOTBALL-DATA-API-KEY" in line:
      token = line.split("=")[1].strip()
      break

uri = "https://api.football-data.org/v4/competitions/2021/matches?season=2022"
headers = { "X-Auth-Token": token }

response = requests.get(uri, headers=headers)

data = response.json()

In [21]:
import pandas as pd
import re

df = pd.DataFrame(data["matches"])

# Drop columns we don"t need
df = df[["utcDate", "matchday", "homeTeam", "awayTeam", "score"]]

# Parse the homeTeam and awayTeam and get the names or ids from the column
df["home"] = df["homeTeam"].apply(lambda x: x["name"])
df["away"] = df["awayTeam"].apply(lambda x: x["name"])
df = df.drop(columns=["homeTeam", "awayTeam"])

# Get home team and away team scores from the dataframe, drop score column
df["homeScore"] = df["score"].apply(lambda x: x["fullTime"]["home"])
df["awayScore"] = df["score"].apply(lambda x: x["fullTime"]["away"])
df = df.drop(columns=["score"])

# Function to convert camel case to title case
def camel_to_title(camel_str):
    title_str = re.sub("([A-Z])", r" \1", camel_str)
    return title_str.title()

# Apply the function to each column name
df.columns = [camel_to_title(col) for col in df.columns]

## Setting up Training Data

In [23]:
# Initialize elo ratings for each team
elo = {team: 1500 for team in df["Home"].unique()}

def update_elo(winner_elo, loser_elo, k=40):
    expected_win = 1.0 / (1 + 10**((loser_elo - winner_elo) / 400))
    change = k * (1 - expected_win)
    return winner_elo + change, loser_elo - change

# Process matches and update ELO ratings
for index, row in df.iterrows():
    home_team, away_team = row["Home"], row["Away"]
    
    if row["Home Score"] > row["Away Score"]:  # Home team won
        elo[home_team], elo[away_team] = update_elo(elo[home_team], elo[away_team])
    elif row["Away Score"] > row["Home Score"]:  # Away team won
        elo[away_team], elo[home_team] = update_elo(elo[away_team], elo[home_team])
    # For draws, no ELO adjustment
    
    df.at[index, "Home Elo"] = elo[home_team]
    df.at[index, "Away Elo"] = elo[away_team]

# Determine outcomes: 3 for win, 1 for draw, 0 for loss
df["Home Outcome"] = 1
df["Away Outcome"] = 1
df.loc[df["Home Score"] > df["Away Score"], "Home Outcome"] = 3
df.loc[df["Home Score"] > df["Away Score"], "Away Outcome"] = 0
df.loc[df["Away Score"] > df["Home Score"], "Away Outcome"] = 3
df.loc[df["Away Score"] > df["Home Score"], "Home Outcome"] = 0

## Training

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For simplicity, let"s predict the home outcome based on Elo ratings
x = df[["Home Elo", "Away Elo"]]
y = df["Home Outcome"]

# Split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

# Standardize the features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# Train the model
model = RandomForestClassifier(n_estimators=100)
model.fit(x_train, y_train)

# Validate the model
val_score = model.score(x_val, y_val)
print(f"Validation accuracy: {val_score * 100:.2f}%")

Validation accuracy: 60.53%
