## EDA Notebook

In [15]:
import pandas as pd
import pymc3 as pm
import numpy as np
import theano.tensor.nnet as nnet
import theano.tensor as tt

In [3]:
# Load data from results.csv
results = pd.read_csv('../data/results.csv')
teams = pd.read_csv('../data/teams.csv')
# Define the outcome variable
results['Outcome'] = results.apply(
    lambda row: 'HomeWin' if row['HomeScore'] > row['AwayScore'] 
    else ('Draw' if row['HomeScore'] == row['AwayScore'] else 'AwayWin'), axis=1
)

In [4]:
def results_points(a, b):
    """ Win, lose, draw 3, 0, 1 """
    if a > b:
        return [3, 0]
    if b > a:
        return [0, 3]
    if b == a:
        return [1, 1]
    else:
        return ValueError

In [5]:
results['Points'] = results.apply(lambda row: results_points(row['HomeScore'], row['AwayScore']), axis=1)
results[['HomePoints', 'AwayPoints']] = pd.DataFrame(results['Points'].tolist(), index=results.index)

In [6]:
season1 = results[results['SeasonID']==1]

In [7]:
def get_points_by_week(df):
    new_df = df.copy()
    home = new_df[['Gameweek', 'HomeTeamID', 'HomePoints', 'HomeScore', 'AwayScore']].copy()
    away = new_df[['Gameweek', 'AwayTeamID', 'AwayPoints', 'AwayScore', 'HomeScore']].copy()
    cols = ['Gameweek', 'ID', 'Points', 'For', 'Against']
    home.columns = cols
    away.columns = cols
    points_by_week = pd.concat([home, away])
    return points_by_week

In [8]:
points_by_week_s1 = get_points_by_week(season1)

In [9]:
# Sum points over all weeks by Team
final_table_s1 = (points_by_week_s1.groupby(['ID']).agg({'Points': 'sum', 'For': 'sum', 'Against': 'sum'})
               .reset_index()
               .sort_values('Points', ascending=False)
               )
final_table_s1['GD'] = final_table_s1['For'] - final_table_s1['Against']
final_table_s1 = final_table_s1.reset_index().drop('index', axis=1)

final_table_s1 = pd.merge(final_table_s1, teams, left_on='ID', right_on='TeamID')
final_table_s1[['TeamName', 'Points', 'For', 'Against', 'GD']].head(5)

Unnamed: 0,TeamName,Points,For,Against,GD
0,Miami,138,159,41,118
1,Cincinnati,125,130,51,79
2,Baltimore,117,136,41,95
3,New York S,113,108,52,56
4,Boston,106,130,58,72


In [10]:
# Build a model
# Each team has a latent strength parameter (e.g. last season's performance)
# Outcome depends on this and randomness
# Prior1 - strength prior: last season's performance
# Prior2 - randomness of results. Use variance of their scores, results, goals, shots or something to do with
# last season's odds - to start I will simply model prior2 as variation in scores.

In [19]:
import numpy as np
import pandas as pd

# Teams
teams = ['A', 'B', 'C']

# Define a mapping for outcomes: 0 - Draw, 1 - Team 1 wins, 2 - Team 2 wins
outcome_mapping = {0: 'Draw', 1: 'Team 1 wins', 2: 'Team 2 wins'}

# Sample data generation
np.random.seed(42)
num_games = 200
data = []

for _ in range(num_games):
    team1, team2 = np.random.choice(teams, 2, replace=False)
    outcome = np.random.choice([0, 1, 2], p=[0.3, 0.35, 0.35])  # Example probabilities for outcomes
    data.append((team1, team2, outcome_mapping[outcome]))

# Create a DataFrame for the data
df_games = pd.DataFrame(data, columns=['Team1', 'Team2', 'Outcome'])

In [20]:
df_games.head()

Unnamed: 0,Team1,Team2,Outcome
0,A,B,Team 2 wins
1,A,B,Team 1 wins
2,A,B,Draw
3,B,A,Team 2 wins
4,A,B,Draw


In [21]:
with pm.Model() as model:
    # Priors for team strengths
    strength = pm.Normal('strength', mu=0, sigma=1, shape=len(teams))
    
    # Prior for the variance parameter (prior2)
    prior2 = pm.HalfNormal('prior2', sigma=1)
    
    # Likelihood function for the outcome probabilities
    def outcome_prob(team1, team2):
        diff = strength[team1] - strength[team2]
        p_win = pm.math.sigmoid(diff / prior2)
        p_draw = 0.1  # Fixed small probability for draw
        return tt.stack([p_draw, p_win * (1 - p_draw), (1 - p_win) * (1 - p_draw)])
    
    # Iterate over each game and define likelihood
    outcomes = []
    for idx, game in df_games.iterrows():
        team1_idx = teams.index(game['Team1'])
        team2_idx = teams.index(game['Team2'])
        observed_outcome = list(outcome_mapping.keys())[list(outcome_mapping.values()).index(game['Outcome'])]
        outcome_probs = outcome_prob(team1_idx, team2_idx)
        outcomes.append(pm.Categorical(f'outcome_{idx}', p=outcome_probs, observed=observed_outcome))
    
    # Inference
    trace = pm.sample(2000, return_inferencedata=False)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [prior2, strength]


Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 167 seconds.
There were 68 divergences after tuning. Increase `target_accept` or reparameterize.
There were 89 divergences after tuning. Increase `target_accept` or reparameterize.
The acceptance probability does not match the target. It is 0.655540292363794, but should be close to 0.8. Try to increase the number of tuning steps.
There were 20 divergences after tuning. Increase `target_accept` or reparameterize.
There were 31 divergences after tuning. Increase `target_accept` or reparameterize.
The estimated number of effective samples is smaller than 200 for some parameters.


In [28]:
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt

# Define the teams
teams = ['A', 'B', 'C']

# Generate a new sample of 20 games
np.random.seed(42)  # For reproducibility
num_new_games = 20
new_game_data = []

for _ in range(num_new_games):
    team1, team2 = np.random.choice(teams, 2, replace=False)
    new_game_data.append((team1, team2))

# Create a DataFrame for the new games
df_new_games = pd.DataFrame(new_game_data, columns=['Team1', 'Team2'])
outcome_counts = df_games['Outcome'].value_counts(normalize=True)
p_draw_data = outcome_counts.get('Draw', 0)

# Function to compute outcome probabilities using sampled strengths
def compute_outcome_probabilities(trace, team1, team2):
    # Get posterior samples of strengths
    strength_samples = trace['strength']
    prior2_samples = trace['prior2']

    # Store predicted probabilities
    pred_probs = []

    for s, p2 in zip(strength_samples, prior2_samples):
        diff = s[team1] - s[team2]
        p_win = pm.math.sigmoid(diff / p2).eval()  # Compute win probability
        p_draw = p_draw_data  # Use the observed data draw probability
        p_lose = 1 - p_win - p_draw  # Compute lose probability

        # Ensure normalization
        total_prob = p_draw + p_win + p_lose
        pred_probs.append([p_draw / total_prob, p_win / total_prob, p_lose / total_prob])
    
    # Convert list to numpy array for easier handling
    pred_probs = np.array(pred_probs)
    return pred_probs.mean(axis=0)  # Return average predicted probabilities

# Load the model and trace from previous fitting (assuming model fitting is done)
# Here we assume that the trace is already available
# If the model and trace were not saved, you need to fit the model again

# Predictive probabilities for each new game
predicted_probs = []

for idx, game in df_new_games.iterrows():
    team1_idx = teams.index(game['Team1'])
    team2_idx = teams.index(game['Team2'])
    probs = compute_outcome_probabilities(trace, team1_idx, team2_idx)
    predicted_probs.append(probs)

# Create a DataFrame for the predicted probabilities
df_predicted_probs = pd.DataFrame(predicted_probs, columns=['P_Draw', 'P_Team1_Win', 'P_Team2_Win'])
df_results = pd.concat([df_new_games, df_predicted_probs], axis=1)

# Output the results
print(df_results)

Timeout: The file lock 'C:\Users\chris\AppData\Local\Theano\compiledir_Windows-10-10.0.22631-SP0-Intel64_Family_6_Model_186_Stepping_2_GenuineIntel-3.10.14-64\.lock' could not be acquired.