In [1]:
%pip install torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np

import sys
import os
sys.path.append(os.path.abspath('..'))

from shared.baseline_shared import get_shared_components
from shared.baseline_train import train_model, evaluate_model


In [3]:
class AttentionLayer(nn.Module):
    """Soft-attention pooling that collapses dimension 1 of its input.

    A single linear unit scores every position along dimension 1; the scores
    are normalised with a softmax over that dimension and used as weights for
    a weighted sum of the input. In this notebook it is applied to the
    batch-first GRU outputs inside ``MatchPredictor``.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Maps each feature vector of width ``input_dim`` to one scalar score.
        self.attention = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the attention-weighted sum of ``x`` over dimension 1."""
        scores = self.attention(x)
        # Normalise across dimension 1 so the weights at each batch entry sum to 1.
        alpha = scores.softmax(dim=1)
        return torch.sum(alpha * x, dim=1)

In [4]:
class MatchPredictor(nn.Module):
    """Match-outcome classifier over two team histories plus match context.

    Each flat input row is laid out as::

        [ home history | away history | match context ]

    where each history block holds ``num_matches`` consecutive team vectors of
    width ``num_players * player_feat_dim`` and the context block is whatever
    remains of ``input_dim``. Both histories go through the *same* GRU and
    attention pooling (shared weights); the context goes through a small MLP;
    the three resulting vectors are concatenated and classified.

    Args:
        input_dim: Total width of one flat input row.
        num_matches: Number of past matches per team history.
        hidden_size: GRU hidden size (also the pooled feature width per team).
        num_classes: Number of output logits.
        num_players: Players per team vector (default 11, the former constant).
        player_feat_dim: Features per player (default 2, the former constant).

    Raises:
        ValueError: If ``input_dim`` is too small to hold both history blocks.
    """

    def __init__(self, input_dim, num_matches=5, hidden_size=64, num_classes=3,
                 num_players=11, player_feat_dim=2):
        super().__init__()
        self.num_matches = num_matches
        self.hidden_size = hidden_size
        self.player_feat_dim = player_feat_dim
        self.num_players = num_players
        self.team_feat_dim = self.num_players * self.player_feat_dim

        # Whatever is left after the two history blocks is the context block.
        self.match_context_dim = input_dim - (2 * self.team_feat_dim * self.num_matches)
        if self.match_context_dim < 0:
            # Fail early with a readable message instead of an opaque
            # negative-dimension error from nn.Linear further down.
            raise ValueError(
                f"input_dim={input_dim} is too small: two histories of "
                f"{self.num_matches} matches x {self.team_feat_dim} features need "
                f"at least {2 * self.team_feat_dim * self.num_matches} columns"
            )

        # Submodules are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.gru = nn.GRU(
            input_size=self.team_feat_dim,
            hidden_size=hidden_size,
            batch_first=True
        )

        self.attention = AttentionLayer(hidden_size)

        self.context_branch = nn.Sequential(
            nn.Linear(self.match_context_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1)
        )

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2 + 64, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for flat rows ``x``."""
        batch_size = x.size(0)
        seq_dim = self.team_feat_dim * self.num_matches

        # Slice the flat row into its three blocks and restore the
        # (batch, match, feature) layout the batch-first GRU expects.
        home_seq = x[:, :seq_dim].view(batch_size, self.num_matches, self.team_feat_dim)
        away_seq = x[:, seq_dim:2 * seq_dim].view(batch_size, self.num_matches, self.team_feat_dim)
        match_context = x[:, 2 * seq_dim:]

        # The same GRU and attention weights encode both teams.
        home_out, _ = self.gru(home_seq)
        away_out, _ = self.gru(away_seq)

        home_features = self.attention(home_out)
        away_features = self.attention(away_out)

        context_features = self.context_branch(match_context)

        combined = torch.cat([home_features, away_features, context_features], dim=1)
        return self.classifier(combined)

class MatchSequenceDataset(Dataset):
    """Map-style dataset that pairs ``sequences[i]`` with ``labels[i]``."""

    def __init__(self, sequences, labels):
        # Both containers are stored as given; no copy or conversion is made.
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        """Number of items, taken from ``sequences``."""
        n_items = len(self.sequences)
        return n_items

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at ``idx``."""
        features = self.sequences[idx]
        target = self.labels[idx]
        return features, target

In [None]:
def build_match_sequences(df, num_matches=5, num_players=11):
    """Turn a match table into flat per-match feature rows and outcome labels.

    Matches are processed in ``match_date`` order. For every match the row is::

        [ home history | away history | odds context ]

    * A team's history is the player vectors it fielded in its previous
      ``num_matches`` matches (as home or away side), oldest first and
      left-padded with zero rows when fewer are available. The current match's
      own line-up is only added to the history *after* the row is built.
    * A player vector interleaves ``<prefix>_player_<i>_rating`` and
      ``<prefix>_player_<i>_potential`` for ``i = 1..num_players``; missing
      columns and missing values count as 0.
    * The context is ``avg_home_prob, avg_draw_prob, avg_away_prob``; it is all
      zeros when the columns are absent or any of the three values is missing
      or non-numeric.

    Args:
        df: Match table with ``match_date``, ``home_team_api_id``,
            ``away_team_api_id``, ``home_team_goal`` and ``away_team_goal``.
        num_matches: History length per team.
        num_players: Players per team vector (default 11).

    Returns:
        ``(X, y)`` where ``X`` is float32 with shape
        ``(len(df), 2 * num_matches * 2 * num_players + 3)`` and ``y`` is int64
        with 2 = home win, 1 = draw, 0 = anything else. Rows follow
        ``match_date`` order; ties keep their original relative order.
    """
    odds_cols = ["avg_home_prob", "avg_draw_prob", "avg_away_prob"]
    team_feat_dim = 2 * num_players
    seq_dim = num_matches * team_feat_dim
    input_dim = 2 * seq_dim + len(odds_cols)

    # An empty frame used to crash inside np.stack; return correctly shaped
    # empty tensors instead.
    if len(df) == 0:
        return (
            torch.zeros((0, input_dim), dtype=torch.float32),
            torch.zeros((0,), dtype=torch.long),
        )

    # A stable sort keeps same-day matches in their incoming order, so the
    # output rows stay aligned with any per-row data computed elsewhere from a
    # frame that is already date-ordered.
    df = df.sort_values("match_date", kind="stable").reset_index(drop=True)
    n_rows = len(df)

    def team_matrix(prefix):
        # One row per match: [rating_1, potential_1, rating_2, potential_2, ...].
        cols = [
            f"{prefix}_player_{i}_{attr}"
            for i in range(1, num_players + 1)
            for attr in ("rating", "potential")
        ]
        # reindex() yields NaN for absent columns, matching the old per-row
        # ``row.get(..., 0)`` default once NaNs are replaced by 0.
        return df.reindex(columns=cols).fillna(0).to_numpy(dtype=np.float32)

    def padded_history(history):
        # Last ``num_matches`` vectors, left-padded with zero rows, flattened.
        seq = np.zeros((num_matches, team_feat_dim), dtype=np.float32)
        recent = history[max(len(history) - num_matches, 0):]
        if recent:
            seq[num_matches - len(recent):] = np.stack(recent)
        return seq.ravel()

    home_feats = team_matrix("home")
    away_feats = team_matrix("away")

    # Odds context: explicit handling replaces the former bare ``except: pass``.
    if all(col in df.columns for col in odds_cols):
        context = (
            df[odds_cols]
            .apply(pd.to_numeric, errors="coerce")
            .to_numpy(dtype=np.float32, copy=True)
        )
        # Any missing/non-numeric value invalidates the whole triple.
        context[np.isnan(context).any(axis=1)] = 0.0
    else:
        context = np.zeros((n_rows, len(odds_cols)), dtype=np.float32)

    home_goals = df["home_team_goal"].to_numpy()
    away_goals = df["away_team_goal"].to_numpy()
    labels = np.where(home_goals > away_goals, 2, np.where(home_goals == away_goals, 1, 0))

    home_ids = df["home_team_api_id"].tolist()
    away_ids = df["away_team_api_id"].tolist()

    features = np.zeros((n_rows, input_dim), dtype=np.float32)
    features[:, 2 * seq_dim:] = context

    team_histories = {}
    for i, (home_id, away_id) in enumerate(zip(home_ids, away_ids)):
        # Read the histories first so a match never sees its own line-up.
        features[i, :seq_dim] = padded_history(team_histories.get(home_id, []))
        features[i, seq_dim:2 * seq_dim] = padded_history(team_histories.get(away_id, []))

        team_histories.setdefault(home_id, []).append(home_feats[i])
        team_histories.setdefault(away_id, []).append(away_feats[i])

    X = torch.from_numpy(features)
    y = torch.tensor(labels, dtype=torch.long)

    # Defensive scrub kept from the original; inputs are already NaN-free.
    X[torch.isnan(X)] = 0
    return X, y

In [None]:
# Use the GPU when one is available; features are moved to this device below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pd.read_csv("../final_5_league.csv")
df['match_date'] = pd.to_datetime(df['match_date'])
components = get_shared_components(df, device)

# Chronological split: everything before 2016 trains, calendar year 2016 tests.
train = df[df['match_date'].dt.year < 2016]
test = df[df['match_date'].dt.year == 2016]

# The labels returned by build_match_sequences are discarded on purpose: the
# shared components provide the targets used by every baseline.
# NOTE(review): build_match_sequences sorts its input by match_date, so the rows
# of X_* are in date order. This presumably matches the row order used for
# components['y_*'] and components['bookie_test'] — confirm against
# shared.baseline_shared.
# NOTE(review): histories for the test split are rebuilt from test rows only, so
# the first 2016 matches of each team see zero-padded history.
X_train, _ = build_match_sequences(train, num_matches=5)
X_test, _ = build_match_sequences(test, num_matches=5)

# build_match_sequences already returns float32 tensors, so moving them is
# enough; the legacy torch.FloatTensor(...) re-wrap was redundant.
X_train = X_train.to(device)
X_test = X_test.to(device)

y_train = components['y_train']
y_test = components['y_test']

# Features and labels come from two different code paths; fail fast if they
# do not even agree on the number of matches.
assert len(X_train) == len(y_train), (len(X_train), len(y_train))
assert len(X_test) == len(y_test), (len(X_test), len(y_test))

In [None]:
# Size the network from the width of the feature rows built above.
n_features = X_train.shape[1]
model = MatchPredictor(input_dim=n_features, num_matches=5)
model = model.to(device)

# Optimizer factory and loss are supplied by the shared baseline components.
optimizer = components['make_optimizer'](model)
criterion = components['criterion']

train_model(model, X_train, y_train, optimizer, criterion, epochs=50)

# Evaluation inputs: match dates of the test split and the shared bookmaker
# probabilities, converted to a NumPy array on the CPU.
test_dates = test['match_date'].values
bookie_probs_test = components['bookie_test'].cpu().numpy()

evaluate_model(
    model,
    X_test,
    y_test,
    bookie_probs=bookie_probs_test,
    match_dates=test_dates,
)