## EDA Notebook

In [4]:
import pandas as pd
import pymc3 as pm
import numpy as np

In [3]:
# Load data from results.csv
results = pd.read_csv('../data/results.csv')

# Define the outcome variable
results['Outcome'] = results.apply(
    lambda row: 'HomeWin' if row['HomeScore'] > row['AwayScore'] 
    else ('Draw' if row['HomeScore'] == row['AwayScore'] else 'AwayWin'), axis=1
)

# Encode the outcome
results['Outcome'] = results['Outcome'].map({'HomeWin': 0, 'Draw': 1, 'AwayWin': 2})

# Feature engineering (using previous season's ranks as a proxy for team strength)
previous_season_ranking = {
    'TeamA': 1, 'TeamB': 2, 'TeamC': 3, 'TeamD': 4, 'TeamE': 5,
    'TeamF': 6, 'TeamG': 7, 'TeamH': 8, 'TeamI': 9, 'TeamJ': 10
}

# Map previous season ranks
results['HomeRank'] = results['HomeTeam'].map(previous_season_ranking)
results['AwayRank'] = results['AwayTeam'].map(previous_season_ranking)

# Home advantage feature
results['HomeAdvantage'] = 1  # All games are home games for the home team

# Extract features and labels
X = results[['HomeAdvantage', 'HomeRank', 'AwayRank']].values
y = results['Outcome'].values

# Define Bayesian model using PyMC3
with pm.Model() as model:
    # Priors for the coefficients
    intercept = pm.Normal('intercept', 0, sigma=10)
    beta_home_adv = pm.Normal('beta_home_adv', 0, sigma=10)
    beta_home_rank = pm.Normal('beta_home_rank', 0, sigma=10)
    beta_away_rank = pm.Normal('beta_away_rank', 0, sigma=10)
    
    # Linear combination
    theta = (intercept + beta_home_adv * X[:, 0] + 
             beta_home_rank * X[:, 1] +
             beta_away_rank * X[:, 2])
    
    # Multinomial logistic regression
    p = pm.math.softmax(theta)
    outcome = pm.Categorical('outcome', p=p, observed=y)
    
    # Sampling from the posterior
    trace = pm.sample(2000, tune=1000, return_inferencedata=True)
    
# The trace contains samples from the posterior distribution, which we can analyze


KeyError: 'HomeTeam'