# Q10

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

In [4]:
def get_posterior(initial_point, num_samples, y, beta_inv, m_s, sigma_s, burn_in=None):
    """Approximate the posterior of two skills given one match result by Gibbs sampling.

    The sampler alternates between the two conditionals
        t | s1, s2, y  ~  N(s1 - s2, beta_inv) truncated to t > 0 (y == 1) or t < 0 (y == -1)
        s | t          ~  N(m, S)
    with S = (sigma_s^-1 + A^T A / beta_inv)^-1, m = S (sigma_s^-1 m_s + A^T t / beta_inv)
    and A = [1, -1].

    Parameters
    ----------
    initial_point : sequence of two floats
        Starting values (s1, s2) of the chain.
    num_samples : int
        Number of Gibbs iterations; must be at least 1.
    y : int
        Match outcome: 1 if the first team won, -1 if the second team won.
    beta_inv : float
        Variance of t around s1 - s2.
    m_s : array-like with two elements
        Prior mean of (s1, s2); reshaped to a column vector internally.
    sigma_s : array-like, shape (2, 2)
        Prior covariance of (s1, s2).
    burn_in : int, optional
        Number of leading draws discarded before the moments are estimated.
        Defaults to ``num_samples // 10``.

    Returns
    -------
    tuple
        ``(mu1, mu2, var1, var2, point)``: sample means and (population) sample
        variances of s1 and s2 over the retained draws, and the last drawn
        point as an array of shape (2,). With a single retained draw the
        variances are 0.

    Raises
    ------
    ValueError
        If ``y`` is not 1 or -1, ``num_samples < 1``, or ``burn_in`` is outside
        ``[0, num_samples)``.
    """
    if y not in (1, -1):
        raise ValueError(f"y must be 1 or -1, got {y!r}")
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")
    if burn_in is None:
        burn_in = num_samples // 10
    if not 0 <= burn_in < num_samples:
        raise ValueError("burn_in must satisfy 0 <= burn_in < num_samples")

    A = np.array([[1, -1]])
    m_s = np.asarray(m_s, dtype=float).reshape(2, 1)
    scale = np.sqrt(beta_inv)

    # Neither S nor the prior contribution to m depends on t, so compute them once.
    sigma_s_inv = np.linalg.inv(sigma_s)
    S = np.linalg.inv(sigma_s_inv + (A.T @ A) / beta_inv)
    prior_term = sigma_s_inv @ m_s

    samples = np.empty([num_samples + 1, 2])
    samples[0] = np.array(initial_point)

    for i in range(num_samples):
        s1_sample, s2_sample = samples[i]
        loc = s1_sample - s2_sample

        # truncnorm takes its bounds in standardised units, (bound - loc) / scale,
        # so a cut at t = 0 must be shifted and scaled accordingly.
        zero_cut = (0.0 - loc) / scale
        if y == 1:
            lower, upper = zero_cut, np.inf
        else:
            lower, upper = -np.inf, zero_cut
        t = stats.truncnorm.rvs(a=lower, b=upper, loc=loc, scale=scale)  # p(t|s1, s2, y)

        m = S @ (prior_term + A.T * t / beta_inv)
        samples[i + 1] = np.random.multivariate_normal(m.ravel(), S)  # p(s|t)

    # Moment-match a Gaussian to the retained draws (index 0 is the initial point).
    kept = samples[1 + burn_in:]
    mean = kept.mean(axis=0)
    var = kept.var(axis=0)

    return mean[0], mean[1], var[0], var[1], samples[-1].copy()

In [5]:
def prediction(mu_s, sigma_s, beta_inv):
    """Predict the winner of a match from the two teams' skill distributions.

    The difference t = s1 - s2 (plus noise of variance ``beta_inv``) is treated
    as Gaussian with mean ``A @ mu_s`` and variance ``beta_inv + A sigma_s A^T``
    for A = [1, -1]. Returns -1 when P(t < 0) exceeds 0.5 and 1 otherwise
    (including the exact tie P(t < 0) == 0.5).
    """
    diff_weights = np.array([[1, -1]])

    diff_mean = diff_weights @ mu_s
    diff_var = beta_inv + diff_weights @ sigma_s @ diff_weights.T

    # Probability mass of the difference that lies below zero.
    prob_negative = stats.norm.cdf(0, loc=diff_mean, scale=np.sqrt(diff_var))

    if prob_negative > 0.5:
        return -1
    return 1
    

In [6]:
# Read the match results from 'SerieA.csv' and preview the first rows.
serie_A_data = pd.read_csv('SerieA.csv', sep=',')
print(serie_A_data.head(n=5))

   yyyy-mm-dd  HH:MM     team1     team2  score1  score2
0  2018-08-18  18:00    Chievo  Juventus       2       3
1  2018-08-18  20:30     Lazio    Napoli       1       2
2  2018-08-19  18:00    Torino      Roma       0       1
3  2018-08-19  20:30  Sassuolo     Inter       1       0
4  2018-08-19  20:30     Parma   Udinese       2       2


In [3]:
# Read 'season-1718.csv' and keep only the two team columns and the two goal
# columns, in this order.
serie_A_data_2 = pd.read_csv('season-1718.csv', sep=',')[
    ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']
]
print(serie_A_data_2.head(n=5))

   HomeTeam  AwayTeam  FTHG  FTAG
0  Juventus  Cagliari     3     0
1    Verona    Napoli     1     3
2  Atalanta      Roma     0     1
3   Bologna    Torino     1     1
4   Crotone     Milan     0     3


In [7]:
# Give the second data set the same column names as serie_A_data so the two
# frames can be stacked; errors="raise" fails loudly if a column is missing.
data = serie_A_data_2.rename(
    columns={"HomeTeam": "team1", "AwayTeam": "team2", "FTHG": "score1", "FTAG": "score2"},
    errors="raise",
)

# DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat is
# the replacement. Row labels of both inputs are kept as they are, and columns
# missing from `data` (date and time) are filled with NaN.
new_data = pd.concat([serie_A_data, data])


  new_data = serie_A_data.append(data)


In [8]:
# Show the last five rows of the combined frame.
new_data.tail(n=5)

Unnamed: 0,yyyy-mm-dd,HH:MM,team1,team2,score1,score2
375,,,Milan,Fiorentina,5,1
376,,,Napoli,Crotone,2,1
377,,,Sassuolo,Roma,0,1
378,,,Spal,Sampdoria,3,1
379,,,Udinese,Bologna,1,0


In [None]:
# Sets initial values
beta_inv = 1         # variance of the outcome variable t around s1 - s2
num_samples = 2000   # Gibbs iterations per match
mu_0 = 25.0          # prior mean of every team's skill
sigma_0 = 40.0       # prior variance of every team's skill

# Fix the seed so the sampled skills and the prediction rate are reproducible.
np.random.seed(0)

pred_wins_team1 = []
true_wins_team1 = []

unique_teams = pd.concat([new_data['team1'], new_data['team2']]).unique()
skills_df = pd.DataFrame({'team': unique_teams,
    'mu': [mu_0] * len(unique_teams),
    'sigma': [sigma_0] * len(unique_teams),
    # One independent prior draw per team (a scalar draw would give every
    # team the same starting sample).
    's': np.random.normal(mu_0, np.sqrt(sigma_0), size=len(unique_teams))
})
skills_history = {team: [(mu_0, sigma_0)] for team in list(unique_teams)}

# skills_df has a default 0..n-1 index, so a team's row label is its position in
# unique_teams; looking it up once avoids a boolean-mask scan for every access.
team_row = {team: row for row, team in enumerate(unique_teams)}

matches = new_data[['team1', 'team2', 'score1', 'score2']].itertuples(index=False, name=None)
for team1, team2, score1, score2 in tqdm(matches, total=new_data.shape[0], desc="Processing Matches", leave=True, position=0, unit='match'):
    # Draws carry no win/loss signal for this model and are skipped.
    if score1 == score2:
        continue

    row1, row2 = team_row[team1], team_row[team2]

    mu1, sigma1, s1 = skills_df.at[row1, 'mu'], skills_df.at[row1, 'sigma'], skills_df.at[row1, 's']
    mu2, sigma2, s2 = skills_df.at[row2, 'mu'], skills_df.at[row2, 'sigma'], skills_df.at[row2, 's']

    prior_mean = np.array([[mu1], [mu2]])
    prior_cov = np.array([[sigma1, 0], [0, sigma2]])

    y = 1 if score1 > score2 else -1

    # Predict from the current skills *before* updating them with this match.
    y_pred = prediction(mu_s = prior_mean,
                        sigma_s = prior_cov,
                        beta_inv = beta_inv)

    pred_wins_team1.append(y_pred)
    true_wins_team1.append(y)

    mu1_new, mu2_new, sigma1_new, sigma2_new, point = get_posterior(initial_point = [s1, s2],
                                                            num_samples = num_samples,
                                                            y = y,
                                                            beta_inv = beta_inv,
                                                            m_s = prior_mean,
                                                            sigma_s = prior_cov)

    skills_history[team1].append((mu1_new, sigma1_new))
    skills_history[team2].append((mu2_new, sigma2_new))

    skills_df.at[row1, 'mu'] = mu1_new
    skills_df.at[row1, 'sigma'] = sigma1_new
    skills_df.at[row1, 's'] = point[0]
    skills_df.at[row2, 'mu'] = mu2_new
    skills_df.at[row2, 'sigma'] = sigma2_new
    skills_df.at[row2, 's'] = point[1]

# Share of decisive matches whose winner was predicted correctly.
if true_wins_team1:
    r = sum(1 for pred, true in zip(pred_wins_team1, true_wins_team1) if pred == true)/len(true_wins_team1)
    print(f"Prediction Rate = {round(r, 3)}")
else:
    # No decisive matches: the rate is undefined rather than a division by zero.
    r = float('nan')
    print("Prediction Rate undefined: no decisive matches in new_data")
