## EDA Notebook

In [27]:
import pandas as pd
import pymc3 as pm
import numpy as np
import theano.tensor.nnet as nnet

In [43]:
# Load data from results.csv
results = pd.read_csv('../data/results.csv')
teams = pd.read_csv('../data/teams.csv')
# Define the outcome variable
results['Outcome'] = results.apply(
    lambda row: 'HomeWin' if row['HomeScore'] > row['AwayScore'] 
    else ('Draw' if row['HomeScore'] == row['AwayScore'] else 'AwayWin'), axis=1
)

In [29]:
def results_points(a, b):
    """ Win, lose, draw 3, 0, 1 """
    if a > b:
        return [3, 0]
    if b > a:
        return [0, 3]
    if b == a:
        return [1, 1]
    else:
        return ValueError

In [30]:
results['Points'] = results.apply(lambda row: results_points(row['HomeScore'], row['AwayScore']), axis=1)
results[['HomePoints', 'AwayPoints']] = pd.DataFrame(results['Points'].tolist(), index=results.index)

In [31]:
season1 = results[results['SeasonID']==1]

In [32]:
def get_points_by_week(df):
    new_df = df.copy()
    home = new_df[['Gameweek', 'HomeTeamID', 'HomePoints', 'HomeScore', 'AwayScore']].copy()
    away = new_df[['Gameweek', 'AwayTeamID', 'AwayPoints', 'AwayScore', 'HomeScore']].copy()
    cols = ['Gameweek', 'ID', 'Points', 'For', 'Against']
    home.columns = cols
    away.columns = cols
    points_by_week = pd.concat([home, away])
    return points_by_week

In [33]:
points_by_week_s1 = get_points_by_week(season1)

In [34]:
# Sum points over all weeks by Team
final_table_s1 = (points_by_week_s1.groupby(['ID']).agg({'Points': 'sum', 'For': 'sum', 'Against': 'sum'})
               .reset_index()
               .sort_values('Points', ascending=False)
               )
final_table_s1['GD'] = final_table_s1['For'] - final_table_s1['Against']
final_table_s1 = final_table_s1.reset_index().drop('index', axis=1)

final_table_s1 = pd.merge(final_table_s1, teams, left_on='ID', right_on='TeamID')
final_table_s1[['TeamName', 'Points', 'For', 'Against', 'GD']].head(5)

Unnamed: 0,TeamName,Points,For,Against,GD
0,Miami,138,159,41,118
1,Cincinnati,125,130,51,79
2,Baltimore,117,136,41,95
3,New York S,113,108,52,56
4,Boston,106,130,58,72


In [50]:
outcome_mapping = {'HomeWin': 0, 'Draw': 1, 'AwayWin': 2}
# Applying the mapping
results['Outcome_encoded'] = results['Outcome'].map(outcome_mapping)

# Extracting the encoded outcomes
y = results['Outcome_encoded'].values

In [51]:
# Feature engineering (using previous season's ranks as a proxy for team strength)
team_rank = { i : x for x, i in enumerate(final_table_s1['TeamID'].tolist(), 1)}
previous_season_ranking = team_rank

# Map previous season ranks
results['HomeRank'] = results['HomeTeamID'].map(previous_season_ranking)
results['AwayRank'] = results['AwayTeamID'].map(previous_season_ranking)

# Home advantage feature
results['HomeAdvantage'] = 1  # All games are home games for the home team

# Extract features and labels
X = results[['HomeAdvantage', 'HomeRank', 'AwayRank']].values
y = results['Outcome_encoded'].values

if np.isnan(X).any() or np.isnan(y).any():
    raise ValueError("X or y contains NaN values, which are not supported.")

# Define Bayesian model using PyMC3
with pm.Model() as model:
    # Priors for the coefficients
    intercept = pm.Normal('intercept', 0, sigma=10)
    beta_home_adv = pm.Normal('beta_home_adv', 0, sigma=10)
    beta_home_rank = pm.Normal('beta_home_rank', 0, sigma=10)
    beta_away_rank = pm.Normal('beta_away_rank', 0, sigma=10)
    
    # Linear combination
    theta = (intercept + beta_home_adv * X[:, 0] + 
             beta_home_rank * X[:, 1] +
             beta_away_rank * X[:, 2])
    
    # Multinomial logistic regression
    #p = pm.math.softmax(theta)
    p = nnet.softmax(theta)  # Correct softmax usage
    outcome = pm.Categorical('outcome', p=p, observed=y)
    
    # Sampling from the posterior
    trace = pm.sample(2000, tune=1000, return_inferencedata=True)
    
# The trace contains samples from the posterior distribution, which we can analyze


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_away_rank, beta_home_rank, beta_home_adv, intercept]


Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 57 seconds.


In [None]:
trace.posterior['']

In [84]:
# Load fixtures for Season 2
fixtures = pd.read_csv('../data/fixtures.csv')

# Map previous season ranks to the new fixtures
fixtures['HomeRank'] = fixtures['HomeTeamID'].map(previous_season_ranking)
fixtures['AwayRank'] = fixtures['AwayTeamID'].map(previous_season_ranking)
fixtures['HomeAdvantage'] = 1  # All home teams

# Handle NaN values (e.g., new teams)
fixtures['HomeRank'].fillna(max(previous_season_ranking.values()) + 1, inplace=True)
fixtures['AwayRank'].fillna(max(previous_season_ranking.values()) + 1, inplace=True)

# Extract features for prediction
X_new = fixtures[['HomeAdvantage', 'HomeRank', 'AwayRank']].values
n_new = X_new.shape[0]
n_categories = 3

# Assuming `trace` contains the posterior samples from the previous season's model fitting
# And `model` is the PyMC3 model defined above
with model:
    # Extract posterior samples for each coefficient
    intercept_samples = trace.posterior['intercept']
    beta_home_adv_samples = trace.posterior['beta_home_adv']
    beta_home_rank_samples = trace.posterior['beta_home_rank']
    beta_away_rank_samples = trace.posterior['beta_away_rank']

    # Number of samples in the posterior
    n_samples = len(intercept_samples)

    # Initialize an array to store the logits for new fixtures
    theta_new = np.zeros((n_samples, n_new, n_categories))

    # Calculate the logits for each category, for each new fixture
    for i in range(n_samples):
        theta_new[i] = (intercept_samples[i] +
                        beta_home_adv_samples[i] * X_new[:, 0, np.newaxis] +
                        beta_home_rank_samples[i] * X_new[:, 1, np.newaxis] +
                        beta_away_rank_samples[i] * X_new[:, 2, np.newaxis])

    # Compute the softmax probabilities
    p_new = nnet.softmax(theta_new)

    # Calculate the mean probability over all samples for each fixture
    mean_p_new = p_new.mean(axis=0)

    # Predict the outcome categories using the highest mean probability class
    predicted_outcomes = np.argmax(mean_p_new, axis=1)

# `predicted_outcomes` will now contain the predicted categories for each fixture
# where 0 = Home Win, 1 = Draw, 2 = Away Win
print(predicted_outcomes)

ValueError: dimensions ('draw',) must have the same length as the number of data dimensions, ndim=2