In [51]:
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
import os
import calendar
import datetime 
import zipfile
import re
import sklearn.model_selection as sms 
import scipy.stats
import sklearn.preprocessing 
import sklearn.impute
import boto3
import io 
import scipy.optimize as sopt
import warnings
import joblib 
import math
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn.metrics as skm
import helpers.haws as haws
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

Set the s3 credentials

In [17]:
# Initialize a session.
s3 = haws.get_service_resource(aws_profile='ck', service_name = 's3')

# Set the S3 bucket and dataset path.
s3_bucket_name = 'cryptokaizen-data-test'
s3_dataset_path = 'kaizen_ai/soccer_prediction/datasets/OSF_football/'

# Define the local directory to save the files.
local_directory = 'datasets/OSF_football'
os.makedirs(local_directory, exist_ok=True)

def download_files_from_s3(bucket_name, prefix, local_dir):
    """
    Function to download files from S3.
    """
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=prefix):
        key = obj.key
        if key.endswith('.txt'):
            local_file_path = os.path.join(local_dir, os.path.basename(key))
            print(f"Downloading {key} to {local_file_path}")
            bucket.download_file(key, local_file_path)

# Call the function to download the files
download_files_from_s3(s3_bucket_name, s3_dataset_path, local_directory)

# Load the datasets into pandas dataframes
dataframes_3 = {}
for dirname, _, filenames in os.walk(local_directory):
    for filename in filenames:
        if filename.endswith(".txt"):
            file_key = filename.split('.')[0] + '_df'
            filepath = os.path.join(dirname, filename)
            print(f"Loading {filepath}")
            df = pd.read_csv(filepath, sep="\t", encoding="UTF-8")
            print(file_key, df.shape)
            df = df.drop_duplicates()
            dataframes_3[file_key] = df

print('Data imported')

# Verify the content of dataframes_3 dictionary.
for key, df in dataframes_3.items():
    print(f"{key}: {df.shape}")

# Access the dataframes directly from the dictionary.
ISDBv1_df = dataframes_3.get('ISDBv1_df')
ISDBv2_df = dataframes_3.get('ISDBv2_df')

# Print the shapes to confirm they are loaded correctly.
print(f"ISDBv1_df shape: {ISDBv1_df.shape if ISDBv1_df is not None else 'Not found'}")
print(f"ISDBv2_df shape: {ISDBv2_df.shape if ISDBv2_df is not None else 'Not found'}")

Loading datasets/OSF_football/ISDBv1.txt
ISDBv1_df (216743, 9)
Loading datasets/OSF_football/ISDBv2.txt
ISDBv2_df (218916, 9)
Data imported
ISDBv1_df: (216743, 9)
ISDBv2_df: (218916, 9)
ISDBv1_df shape: (216743, 9)
ISDBv2_df shape: (218916, 9)


In [18]:
ISDBv2_df.head()

Unnamed: 0,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL
0,00-01,GER1,11/08/2000,Dortmund,Hansa Rostock,1,0,1,W
1,00-01,GER1,12/08/2000,Bayern Munich,Hertha Berlin,4,1,3,W
2,00-01,GER1,12/08/2000,Freiburg,VfB Stuttgart,4,0,4,W
3,00-01,GER1,12/08/2000,Hamburger SV,Munich 1860,2,2,0,D
4,00-01,GER1,12/08/2000,Kaiserslautern,Bochum,0,1,-1,L


Make the season into a single interger for simplicity

In [19]:
ISDBv2_df['season'] = ISDBv2_df['Sea'].apply(lambda x: int('20' + str(x)[:2]))
df = ISDBv2_df[ISDBv2_df['season'] >= 2009]
# Preprocess the dataset
# Preprocess the dataset
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df.sort_values(by='Date', inplace=True)
# Define the burn-in period and warm-up season removal
def remove_warmup_and_burnin(df, warmup_seasons, burnin_rounds=5):
    filtered_df = pd.DataFrame()
    leagues = df['Lge'].unique()

    for league in leagues:
        league_df = df[df['Lge'] == league]
        seasons = league_df['Sea'].unique()
        for season in seasons:
            season_df = league_df[league_df['Sea'] == season]
            if season == seasons[0] and season in warmup_seasons:
                continue
            season_df = season_df.iloc[burnin_rounds:]
            filtered_df = pd.concat([filtered_df, season_df])

    return filtered_df
warmup_seasons = {2009}
df_final = remove_warmup_and_burnin(df, warmup_seasons)
# Split the data into training, validation, and test sets
train_size = int(0.6* len(df_final))
val_size = int(0.2 * len(df_final))
train_df = df_final[:train_size]
val_df = df_final[train_size:train_size + val_size]
test_df = df_final[train_size + val_size:]

In [20]:
df_final.columns

Index(['Sea', 'Lge', 'Date', 'HT', 'AT', 'HS', 'AS', 'GD', 'WDL', 'season'], dtype='object')

# Double Precision Model (IGNORE)

In [19]:
# Check if the validation dataframe is empty
if val_df.empty:
    print("Validation dataframe is empty.")
else:
    print(f"Validation dataframe has {len(val_df)} records.")

# Check if the training dataframe is empty
if train_df.empty:
    print("Training dataframe is empty.")
else:
    print(f"Training dataframe has {len(train_df)} records.")

# Define the Double Poisson likelihood function with exponential time weighting
def double_poisson_log_likelihood(params, df, teams, alpha=0.0019):
    strength = dict(zip(teams, params[:-1]))
    home_advantage = params[-1]
    current_date = datetime.datetime.now()

    # Map team strengths to home and away teams
    home_teams = df['HT'].map(strength)
    away_teams = df['AT'].map(strength)
    home_goals = df['HS'].values
    away_goals = df['AS'].values
    t_deltas = (current_date - df['Date']).dt.days
    weights = np.exp(-alpha * t_deltas)

    # Calculate expected goals
    lambda_home = np.exp(home_teams.values - away_teams.values + home_advantage)
    lambda_away = np.exp(away_teams.values - home_teams.values)

    # Calculate log-likelihoods for home and away goals
    log_likelihood_home = home_goals * np.log(lambda_home) - lambda_home - np.log(np.vectorize(math.factorial)(home_goals))
    log_likelihood_away = away_goals * np.log(lambda_away) - lambda_away - np.log(np.vectorize(math.factorial)(away_goals))

    # Sum the weighted log-likelihoods
    total_log_likelihood = np.sum(weights * (log_likelihood_home + log_likelihood_away))

    return -total_log_likelihood  # Return negative log-likelihood to maximize it by minimizing

# Define the teams and initial parameters
teams = pd.unique(df[['HT', 'AT']].values.ravel('K'))
initial_strength = np.zeros(len(teams))
initial_home_advantage = 0.1

# Function to refit parameters after each league round
def refit_parameters(train_df, teams):
    # Initial guess for teams' strength.
    initial_strength = np.zeros(len(teams))
    # Initial guess for home advantage
    initial_home_advantage = 0.1  
    initial_params = np.concatenate([initial_strength, [initial_home_advantage]])

    print("Starting optimization with initial parameters")
    result = sopt.minimize(
        double_poisson_log_likelihood,
        x0=initial_params,
        args=(train_df, teams),
        method='L-BFGS-B',
        options={ 'disp': True}  
    )
    print("Optimization result:", result)
    if not result.success:
        print("Optimization failed.")
    return result.x

# Initial parameter fitting
params = refit_parameters(train_df, teams)

# Extract the optimized parameters
team_strengths = dict(zip(teams, params[:-1]))
home_advantage = params[-1]

print("Optimized Team Strengths:")
print(team_strengths)
print("Home Advantage:")
print(home_advantage)

# Function to predict match outcomes
def predict_match(home_team, away_team, team_strengths, home_advantage):
    lambda_home = np.exp(team_strengths[home_team] - team_strengths[away_team] + home_advantage)
    lambda_away = np.exp(team_strengths[away_team] - team_strengths[home_team])
    return lambda_home, lambda_away

# Function to calculate Ranked Probability Score (RPS)
def rps(predictions, actual):
    cumulative_preds = np.cumsum(predictions)
    cumulative_actual = np.cumsum(actual)
    return np.sum((cumulative_preds - cumulative_actual) ** 2) / (len(predictions) - 1)

# Function to evaluate the model
def evaluate_model(df, team_strengths, home_advantage):
    correct_predictions = 0
    total_predictions = 0
    total_rps = 0

    def calculate_metrics(row):
        nonlocal correct_predictions, total_predictions, total_rps
        home_team = row['HT']
        away_team = row['AT']
        home_goals = row['HS']
        away_goals = row['AS']

        # Check if team strengths are available
        if home_team not in team_strengths or away_team not in team_strengths:
            print(f"Missing strength for teams: {home_team}, {away_team}")
            return

        lambda_home, lambda_away = predict_match(home_team, away_team, team_strengths, home_advantage)
        predicted_home_goals = np.round(lambda_home)
        predicted_away_goals = np.round(lambda_away)

        if (predicted_home_goals == home_goals) and (predicted_away_goals == away_goals):
            correct_predictions += 1

        # Determine the maximum number of goals to dynamically size the result arrays
        max_goals = int(max(home_goals, away_goals, predicted_home_goals, predicted_away_goals)) + 1

        # Create actual result distribution
        actual_result = np.zeros(max_goals)
        actual_result[int(home_goals)] = 1
        actual_result[int(away_goals)] = 1

        # Create prediction distribution
        predictions = np.zeros(max_goals)
        predictions[int(predicted_home_goals)] = lambda_home
        predictions[int(predicted_away_goals)] = lambda_away

        total_rps += rps(predictions, actual_result)
        total_predictions += 1

    joblib.Parallel(n_jobs=-1)(joblib.delayed(calculate_metrics)(row) for idx, row in df.iterrows())

    # Check if total_predictions is zero to avoid ZeroDivisionError
    if total_predictions == 0:
        print("No predictions made. Total predictions is zero.")
        return 0, 0

    accuracy = correct_predictions / total_predictions
    average_rps = total_rps / total_predictions

    return accuracy, average_rps

# Evaluate the model on the validation set
val_accuracy, val_rps = evaluate_model(val_df, team_strengths, home_advantage)

print("Validation Accuracy:")
print(val_accuracy)
print("Validation Ranked Probability Score (RPS):")
print(val_rps)

# Evaluate the model on the test set
test_accuracy, test_rps = evaluate_model(test_df, team_strengths, home_advantage)

# Calculate confidence intervals for hit rate
hit_rate = test_accuracy
z = 1.96  # 95% confidence interval
hit_rate_std = np.sqrt((hit_rate * (1 - hit_rate)) / len(test_df))
confidence_interval = (hit_rate - z * hit_rate_std, hit_rate + z * hit_rate_std)

print("Test Accuracy:")
print(test_accuracy)
print("Test Ranked Probability Score (RPS):")
print(test_rps)
print("Hit Rate:")
print(hit_rate)
print("Confidence Interval:")
print(confidence_interval)

Validation dataframe has 23885 records.
Training dataframe has 71656 records.
Starting optimization with initial parameters


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1262     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.88327D+02    |proj g|=  3.54416D+01

At iterate    1    f=  2.82931D+02    |proj g|=  2.94682D+00

At iterate    2    f=  2.82796D+02    |proj g|=  8.38196D-01

At iterate    3    f=  2.82579D+02    |proj g|=  3.90514D+00

At iterate    4    f=  2.81735D+02    |proj g|=  1.14141D+01

At iterate    5    f=  2.80326D+02    |proj g|=  1.78459D+01

At iterate    6    f=  2.78255D+02    |proj g|=  1.85961D+01

At iterate    7    f=  2.76291D+02    |proj g|=  5.08267D+00

At iterate    8    f=  2.76064D+02    |proj g|=  1.14291D+00

At iterate    9    f=  2.76040D+02    |proj g|=  2.15546D+00

Optimization result:   message: STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT
  success: False
   status: 1
      fun: 275.9794089837743
        x: [-7.382e-02 -4.914e-02 ...  0.000e+00  3.339e-01]
      n

No predictions made. Total predictions is zero.
Validation Accuracy:
0
Validation Ranked Probability Score (RPS):
0
No predictions made. Total predictions is zero.
Test Accuracy:
0
Test Ranked Probability Score (RPS):
0
Hit Rate:
0
Confidence Interval:
(0.0, 0.0)


# GLM Model 

In [52]:
ISDBv2_df['season'] = ISDBv2_df['Sea'].apply(lambda x: int('20' + str(x)[:2]))
df = ISDBv2_df[ISDBv2_df['season'] >= 2009]
# Preprocess the dataset.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df.sort_values(by='Date', inplace=True)
categorical_columns = ['HT', 'AT']
for col in categorical_columns:
    df[col] = df[col].astype('category')
# Ensure reproducibility.
random_state = 42
# Step 1: Split by team to ensure each team is represented in the train split.
teams = df['HT'].unique()
train_indices = []
test_indices = []
for team in teams:
    team_df = df[df['HT'] == team]
    train_team, test_team = sms.train_test_split(
        team_df, 
        test_size=0.2, 
        random_state=random_state
    )
    train_indices.extend(train_team.index)
    test_indices.extend(test_team.index)
# Create train and test DataFrames.
train_df = df.loc[train_indices]
test_df = df.loc[test_indices]

def unravel_dataset(df) -> pd.DataFrame():
    """
    Unravel the dataset by creating one entry for each row as team-opponent pair. 
    """
    home_df = df[['Date', 'Sea', 'Lge', 'HT', 'AT', 'HS']].copy()
    home_df.rename(columns={'HT': 'team', 'AT': 'opponent', 'HS': 'goals'}, inplace=True)
    home_df['is_home'] = 1
    away_df = df[['Date', 'Sea', 'Lge', 'HT', 'AT', 'AS']].copy()
    away_df.rename(columns={'AT': 'team', 'HT': 'opponent', 'AS': 'goals'}, inplace=True)
    away_df['is_home'] = 0
    unraveled_df = pd.concat([home_df, away_df], ignore_index=True)
    return unraveled_df

# Unravel the training dataset.
unraveled_train_df = unravel_dataset(train_df)
# Unravel the test dataset.
unraveled_test_df = unravel_dataset(test_df)

In [53]:
unraveled_train_df.head()

Unnamed: 0,Date,Sea,Lge,team,opponent,goals,is_home
0,2015-01-30,14-15,CHL1,CD Huachipato,CD Cobresal,2,1
1,2015-02-07,14-15,CHL1,CD Huachipato,Universidad de Chile,2,1
2,2009-01-31,09-10,CHL1,CD Huachipato,Universidad de Concepcion,0,1
3,2009-09-12,09-10,CHL1,CD Huachipato,CSD Rangers,0,1
4,2017-03-12,16-17,CHL1,CD Huachipato,Higgins,0,1


Create a representative sample for easier handling.

In [54]:
def representative_sample(df, sample_size) -> pd.DataFrame():
    """
    Function to perform representative sampling to ensure each team 
    is represented.
    param: df: Input dataframe for sampling.
    param: sample_size: Size of the extracted sample (output dataframe).
    return: sampled_df: Sampled dataframe.
    """
    teams = df['team'].unique()
    samples_per_team = sample_size // len(teams)
    sampled_df = pd.DataFrame()
    for team in teams:
        team_df = df[df['team'] == team]
        team_sample = team_df.sample(n=min(samples_per_team, len(team_df)), random_state=1)
        sampled_df = pd.concat([sampled_df, team_sample])
    # Additional random sampling to fill the remaining sample size
    remaining_sample_size = sample_size - len(sampled_df)
    if remaining_sample_size > 0:
        additional_sample = df.drop(sampled_df.index).sample(n=remaining_sample_size, random_state=1)
        sampled_df = pd.concat([sampled_df, additional_sample])
    return sampled_df

# Sample 20% of the training data.
sample_size = int(0.2 * len(unraveled_train_df))
# Perform representative sampling on the training set.
sampled_train_df = representative_sample(unraveled_train_df, sample_size)
sampled_train_df.head()

Unnamed: 0,Date,Sea,Lge,team,opponent,goals,is_home
44,2009-07-18,09-10,CHL1,CD Huachipato,Union Espanola,1,1
173699,2012-03-03,12-13,CHL1,CD Huachipato,Union San Felipe,2,0
51,2017-02-11,16-17,CHL1,CD Huachipato,CD Palestino,3,1
185403,2014-09-14,14-15,CHL1,CD Huachipato,Antofagasta,2,0
35,2009-11-08,09-10,CHL1,CD Huachipato,Universidad de Chile,1,1


Describe the sampled DataFrame.

In [55]:
print("NaN values per column:")
print(sampled_train_df.isna().sum())

numeric_cols = sampled_train_df.select_dtypes(include=[np.number]).columns
print("\nInfinite values per numeric column:")
for col in numeric_cols:
    num_infs = np.isinf(sampled_train_df[col]).sum()
    print(f"{col}: {num_infs}")
print(sampled_train_df.dtypes)
print(sampled_train_df.describe(include='all'))

NaN values per column:
Date        0
Sea         0
Lge         0
team        0
opponent    0
goals       0
is_home     0
dtype: int64

Infinite values per numeric column:
goals: 0
is_home: 0
Date        datetime64[ns]
Sea                 object
Lge                 object
team              category
opponent          category
goals                int64
is_home              int64
dtype: object
                                 Date    Sea    Lge  \
count                           38676  38676  38676   
unique                            NaN      8     52   
top                               NaN  16-17   FRA3   
freq                              NaN   6050   1514   
mean    2013-07-11 21:55:52.032268032    NaN    NaN   
min               2009-02-01 00:00:00    NaN    NaN   
25%               2011-06-10 18:00:00    NaN    NaN   
50%               2013-08-28 00:00:00    NaN    NaN   
75%               2015-08-30 00:00:00    NaN    NaN   
max               2017-06-28 00:00:00    NaN    NaN   
s

In [60]:
def ensure_finite_weights(df) -> pd.DataFrame():
    """
    Function to ensure weights are finite.
    """
    # Adding a small constant to goals to avoid log(0).
    df['goals'] = df['goals'].apply(lambda x: x + 1e-9 if x == 0 else x)
    # Check if there are any infinite or NaN weights and handle them
    if df.isna().sum().sum() > 0:
        print("NaN values found in the data. Removing rows with NaNs.")
        df.dropna(inplace=True)
    if np.isinf(df.select_dtypes(include=[np.number])).sum().sum() > 0:
        print("Infinite values found in the data. Removing rows with Infs.")
        df = df[~np.isinf(df.select_dtypes(include=[np.number])).any(1)]
    return df

# Ensure weights are finite in the sampled training data.
sampled_train_df = ensure_finite_weights(sampled_train_df)
# Create the formula to include team offensive and opponent defensive strengths and home advantage.
formula = 'goals ~ C(team) + C(opponent) + is_home'
# Fit the Poisson regression model.
poisson_model = smf.glm(formula=formula, data=sampled_train_df, family=sm.families.Poisson()).fit(maxiter=10)
# Display the summary of the model.
print(poisson_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  goals   No. Observations:                38676
Model:                            GLM   Df Residuals:                    36188
Model Family:                 Poisson   Df Model:                         2487
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -52668.
Date:                Tue, 28 May 2024   Deviance:                       41037.
Time:                        08:29:11   Pearson chi2:                 3.53e+04
No. Iterations:                    10   Pseudo R-squ. (CS):             0.1639
Covariance Type:            nonrobust                                         
                                                       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------

Generate Predictions

In [61]:
# Predict the expected goals for home and away teams in the test set.
unraveled_test_df['predicted_goals'] = poisson_model.predict(unraveled_test_df)
unraveled_test_df

Unnamed: 0,Date,Sea,Lge,team,opponent,goals,is_home,predicted_goals
0,2012-04-28,12-13,CHL1,CD Huachipato,Universidad Catolica,2,1,1.736253
1,2014-04-13,13-14,CHL1,CD Huachipato,Universidad de Chile,5,1,1.610566
2,2010-02-14,10-11,CHL1,CD Huachipato,Union Espanola,1,1,2.274356
3,2010-11-07,10-11,CHL1,CD Huachipato,Higgins,2,1,2.207500
4,2014-01-10,13-14,CHL1,CD Huachipato,CSD Rangers,3,1,1.360213
...,...,...,...,...,...,...,...,...
49517,2016-10-30,16-17,NZL1,Waitakere United,Hamilton Wanderers,2,0,2.210899
49518,2017-02-05,16-17,NZL1,Team Wellington,Eastern Suburbs,1,0,1.180839
49519,2016-11-13,16-17,NZL1,Waitakere United,Eastern Suburbs,0,0,1.115173
49520,2017-02-19,16-17,NZL1,Eastern Suburbs,Tasman United,1,0,1.907933


In [58]:
# Split the dataframe into home and away rows.
home_df = unraveled_test_df[unraveled_test_df['is_home'] == 1].copy()
away_df = unraveled_test_df[unraveled_test_df['is_home'] == 0].copy()
# Rename columns for merging
home_df.rename(columns={'team': 'HT', 'opponent': 'AT', 'goals': 'HS', 'predicted_goals': 'Lambda_HS'}, inplace=True)
away_df.rename(columns={'team': 'AT', 'opponent': 'HT', 'goals': 'AS', 'predicted_goals': 'Lambda_AS'}, inplace=True)
# Merge the home and away dataframes
merged_df = pd.merge(home_df, away_df, on=['Date', 'Sea', 'Lge', 'HT', 'AT'], suffixes=('_home', '_away'))
# Select and reorder columns for the final dataframe
test_df = merged_df[['Date', 'Sea', 'Lge', 'HT', 'AT', 'HS', 'AS', 'Lambda_HS', 'Lambda_AS']]
# Display the resulting dataframe
print(test_df.head())

        Date    Sea   Lge             HT                    AT  HS  AS  \
0 2012-04-28  12-13  CHL1  CD Huachipato  Universidad Catolica   2   2   
1 2014-04-13  13-14  CHL1  CD Huachipato  Universidad de Chile   5   2   
2 2010-02-14  10-11  CHL1  CD Huachipato        Union Espanola   1   2   
3 2010-11-07  10-11  CHL1  CD Huachipato               Higgins   2   1   
4 2014-01-10  13-14  CHL1  CD Huachipato           CSD Rangers   3   2   

   Lambda_HS  Lambda_AS  
0   1.736253   2.076067  
1   1.610566   1.628574  
2   2.274356   1.759763  
3   2.207500   1.579188  
4   1.360213   0.995547  


Evaluate

In [59]:
# Round off the predicted goals to integers
test_df['Lambda_HS'] = test_df['Lambda_HS'].round().astype(int)
test_df['Lambda_AS'] = test_df['Lambda_AS'].round().astype(int)

# Define the 
def calculate_match_outcome_probabilities(row):
    """
    Function to calculate match outcome probabilities.
    """
    max_goals = 10  
    home_goals_probs = [np.exp(-row['Lambda_HS']) * row['Lambda_HS']**i / np.math.factorial(i) for i in range(max_goals)]
    away_goals_probs = [np.exp(-row['Lambda_AS']) * row['Lambda_AS']**i / np.math.factorial(i) for i in range(max_goals)]
    prob_home_win = 0
    prob_away_win = 0
    prob_draw = 0
    for i in range(max_goals):
        for j in range(max_goals):
            prob = home_goals_probs[i] * away_goals_probs[j]
            if i > j:
                prob_home_win += prob
            elif i < j:
                prob_away_win += prob
            else:
                prob_draw += prob
    return pd.Series({
        'prob_home_win': prob_home_win,
        'prob_away_win': prob_away_win,
        'prob_draw': prob_draw
    })
# Apply the function to the test set
probabilities = test_df.apply(calculate_match_outcome_probabilities, axis=1)
test_df = pd.concat([test_df, probabilities], axis=1)
# Display the test set with probabilities
print(test_df.head())
# Predict the outcomes based on probabilities
test_df['predicted_outcome'] = np.where(test_df['prob_home_win'] > test_df['prob_away_win'], 'home_win',
                                        np.where(test_df['prob_away_win'] > test_df['prob_home_win'], 
                                                 'away_win', 'draw'))
# Calculate actual outcomes for comparison
test_df['actual_outcome'] = np.where(test_df['HS'] > test_df['AS'], 'home_win',
                                     np.where(test_df['HS'] < test_df['AS'], 'away_win', 'draw'))
# Calculate accuracy
accuracy = skm.accuracy_score(test_df['actual_outcome'], test_df['predicted_outcome'])
print("Model Accuracy on Test Set:", accuracy)

        Date    Sea   Lge             HT                    AT  HS  AS  \
0 2012-04-28  12-13  CHL1  CD Huachipato  Universidad Catolica   2   2   
1 2014-04-13  13-14  CHL1  CD Huachipato  Universidad de Chile   5   2   
2 2010-02-14  10-11  CHL1  CD Huachipato        Union Espanola   1   2   
3 2010-11-07  10-11  CHL1  CD Huachipato               Higgins   2   1   
4 2014-01-10  13-14  CHL1  CD Huachipato           CSD Rangers   3   2   

   Lambda_HS  Lambda_AS  prob_home_win  prob_away_win  prob_draw  
0          2          2       0.396453       0.396453   0.207002  
1          2          2       0.396453       0.396453   0.207002  
2          2          2       0.396453       0.396453   0.207002  
3          2          2       0.396453       0.396453   0.207002  
4          1          1       0.345746       0.345746   0.308508  
Model Accuracy on Test Set: 0.46480352166713784
