In [1]:
#Import All Used Libraries
random_state = 42

import pandas as pd
import numpy as np
import random as python_random
import glob
import json

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVR

import tensorflow
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, RMSprop, SGD

In [2]:
#Pull in Men's Rugby Union World Rankings
#One score per international team per year 
world_rankings = pd.read_csv('data/rugby_world_rankings.csv')
world_rankings.head()

Unnamed: 0,year,team,world_rank_score
0,2023,Ireland,90.63
1,2022,South Africa,90.61
2,2021,South Africa,94.2
3,2020,South Africa,94.2
4,2019,New Zealand,92.55


In [3]:
#Pull in Men's Rugby Union International Game Data from 2004 - 2022 and retain only columns of interest/value
column_names = ['match_id', 'match_number', 'location_id', 'venue_name', 'venue_city', 'venue_country', 
                'del1', 'del2', 'match_timestamp', 'del3', 'del4', 'team_a', 'team_a_abbrev', 'del5', 
                'team_b', 'team_b_abbrev', 'score_a', 'score_b', 'del6', 'del7', 'del8', 'tournament', 
                'rugby_type', 'del9', 'del10', 'del11', 'del12', 'match_details']

files = glob.glob("data/matches/*.csv")

df = pd.DataFrame()
for f in files:
    csv = pd.read_csv(f, names = column_names)
    df = df.append(csv)
    
df = df[['venue_name', 'venue_city', 'venue_country', 'match_timestamp', 'team_a', 'team_a_abbrev', 'team_b', 
         'team_b_abbrev', 'score_a', 'score_b', 'tournament', 'rugby_type', 'match_details']]

#Limit to only Men's Rugby Union
df = df[df['rugby_type'] == 'mru']
df.head() #5,038 records

Unnamed: 0,venue_name,venue_city,venue_country,match_timestamp,team_a,team_a_abbrev,team_b,team_b_abbrev,score_a,score_b,tournament,rugby_type,match_details
65,Estadio do Pacaembu,Sao Paulo,Brazil,"Fri 3 Feb 2017, 20:15 GMT-02:00",Brazil,BRA,Chile,CHI,17,3,2017 Americas Rugby Championship,mru,"{""matchId"":23930.0,""description"":""Match 1"",""ve..."
90,Murrayfield,Edinburgh,Scotland,"Sat 4 Feb 2017, 14:25 GMT",Scotland,SCO,Ireland,IRE,27,22,2017 Six Nations,mru,"{""matchId"":22580.0,""description"":""Match 1"",""ve..."
94,Toyota Field,San Antonio,United States,"Sat 4 Feb 2017, 15:00 GMT-06:00",USA,USA,Uruguay,URU,29,23,2017 Americas Rugby Championship,mru,"{""matchId"":23931.0,""description"":""Match 3"",""ve..."
100,Twickenham,London,England,"Sat 4 Feb 2017, 16:50 GMT",England,ENG,France,FRA,19,16,2017 Six Nations,mru,"{""matchId"":22581.0,""description"":""Match 2"",""ve..."
102,Westhills Stadium,Langford,Canada,"Sat 4 Feb 2017, 17:00 GMT-08:00",Canada,CAN,Argentina XV,ARG,6,20,2017 Americas Rugby Championship,mru,"{""matchId"":23929.0,""description"":""Match 2"",""ve..."


In [4]:
#Clean Up Inconsistent Naming of Teams: Canada vs Canada A. 
def team_cleanup(x):
    if "A" == x.split()[-1] or "XV" == x.split()[-1]:
        return x.split()[0]
    elif x == "St Vincent and the Grenadines":
        return "St Vincent and Grenadines"
    elif x == "Maori All Blacks" or x == "New Zealand Maori":
        return "New Zealand"
    elif x == "South":
        return "South Africa"
    elif x == "Hong":
        return "Hong Kong"
    else:
        return x

#Clean Up Inconsistent Naming of Venues: Russian Federation vs Russia.     
def venue_cleanup(x):
    if x == "St Vincent and the Grenadines":
        return "St Vincent and Grenadines"
    elif x == "South Korea":
        return "Korea"
    elif x == "Russian Federation":
        return "Russia"
    elif x == "People's Republic of China":
        return "China"
    elif x == "Laos People's Democratic Republic":
        return "Laos"
    elif x == "Bosnia and Herzegovina":
        return "Bosnia & Herzegovina"
    else:
        return x
    
#Apply Cleaning Functions
df['team_a'] = df['team_a'].apply(team_cleanup)
df['team_b'] = df['team_b'].apply(team_cleanup)
df['venue_country'] = df['venue_country'].apply(venue_cleanup)

#Convert Timestamp to pull through the year of the match only
df['match_year'] = pd.DatetimeIndex(pd.to_datetime(df['match_timestamp'], utc = True)).year

#Merge in Rugby Union World Rankings for Teams A and B
df = df.merge(world_rankings, how='inner', left_on=['team_a', 'match_year'], right_on=['team', 'year'])
df = df.merge(world_rankings, how='inner', left_on=['team_b', 'match_year'], right_on=['team', 'year'])

#Keep only relevant columns
df= df[['tournament', 'venue_name', 'venue_city', 'venue_country', 'match_year', 
        'team_a', 'team_b', 'score_a', 'score_b', 'world_rank_score_x', 'world_rank_score_y']]

#Rename some of the feature columns
df.columns = ['tournament', 'venue_name', 'venue_city', 'venue_country', 'match_year', 
              'team_a', 'team_b', 'score_a', 'score_b', 'team_a_world_rank', 'team_b_world_rank']

#Add binary columns denoting home game for either Team A or B or neither
df['team_a_home'] = np.where(df['venue_country'] == df['team_a'], 1 ,0)
df['team_b_home'] = np.where(df['venue_country'] == df['team_b'], 1 ,0)

#df #3,841 records

In [5]:
#Create a melted dataframe from the above table that allows for moving average calculations
temp_ma = pd.melt(df, id_vars=['match_year', 'score_a', 'score_b'], value_vars=['team_a', 'team_b'], value_name='team', 
                  ignore_index = False) \
            .reset_index()

#Team A and B are now given their own row and the appropriate points for and against are carried through
temp_ma['points_for'] = np.where(temp_ma['variable'] == 'team_a', temp_ma['score_a'], temp_ma['score_b'])
temp_ma['points_against'] = np.where(temp_ma['variable'] == 'team_a', temp_ma['score_b'], temp_ma['score_a'])
#Rows are sorted by team and index (substitute for match date) to assist with the moving average calculation
temp_ma.sort_values(by=['team', 'index'], inplace = True)

In [6]:
#Create the 3-day moving average columns that do NOT consider the current row in its determination
temp_ma['points_for_moving'] = temp_ma.groupby('team')['points_for'].apply(lambda x: x.rolling(3).mean().shift())
temp_ma['points_against_moving'] = temp_ma.groupby('team')['points_against'].apply(lambda x: x.rolling(3).mean().shift())

moving_average = temp_ma[['index', 'match_year', 'variable', 'team', 
                          'points_for_moving', 'points_against_moving']].dropna()

#moving_average

In [7]:
#Create the final dataset to pull in moving averages for both teams

column_names = ['tournament', 'venue_name', 'venue_city', 'venue_country', 'match_year', 'team_a', 'team_b', 
                'score_a', 'score_b', 'world_rank_a', 'world_rank_b', 'home_a', 'home_b',
               'points_for_moving_a', 'points_against_moving_a', 'points_for_moving_b', 'points_against_moving_b']

team_a_merge = moving_average[moving_average['variable'] == 'team_a']
team_b_merge = moving_average[moving_average['variable'] == 'team_b']


df = df.merge(team_a_merge, how = 'inner', left_index = True, right_on = 'index')
df = df.merge(team_b_merge, how = 'inner', left_index = True, right_on = 'index')
df = df[['tournament', 'venue_name', 'venue_city', 'venue_country', 'match_year_x', 'team_a', 'team_b', 'score_a', 'score_b',
       'team_a_world_rank', 'team_b_world_rank', 'team_a_home', 'team_b_home', 'points_for_moving_x', 
         'points_against_moving_x', 'points_for_moving_y', 'points_against_moving_y']]

df.columns = column_names
df.head() #3,596 records

Unnamed: 0,tournament,venue_name,venue_city,venue_country,match_year,team_a,team_b,score_a,score_b,world_rank_a,world_rank_b,home_a,home_b,points_for_moving_a,points_against_moving_a,points_for_moving_b,points_against_moving_b
3885,World Rugby Americas Pacific Challenge 2017,Estadio Charrua,Montevideo,Uruguay,2017,Tonga,Samoa,31,28,71.96,71.29,0,0,22.333333,21.666667,17.666667,44.0
3889,2017 New Zealand Tour (FRA/SCO/WAL),Principality Stadium,Cardiff,Wales,2017,Wales,New Zealand,18,33,82.56,94.78,1,0,20.333333,14.333333,50.333333,8.666667
3896,2017 Australia Tour (JPN/WAL/ENG/SCO),Murrayfield,Edinburgh,Scotland,2017,Scotland,Australia,53,24,80.71,86.36,1,0,30.0,20.0,32.333333,33.0
3898,2017 Australia Tour (JPN/WAL/ENG/SCO),Principality Stadium,Cardiff,Wales,2017,Wales,Australia,21,29,82.56,86.36,1,0,20.333333,18.666667,17.666667,33.666667
3899,2017 The Rugby Championship,Estadio Malvinas Argentinas,Mendoza,Argentina,2017,Argentina,Australia,20,37,79.93,86.36,1,0,55.333333,22.333333,19.666667,34.666667


In [8]:
#Prepare X and y datasets
X = df.drop(columns=['tournament', 'venue_name', 'venue_city', 'venue_country', 
                     'match_year', 'team_a', 'team_b', 'score_a', 'score_b'])
y = df['score_a'] - df['score_b']

In [9]:
#Random Forest Regression GridSearch

#rf = RandomForestRegressor(random_state = random_state)
#
#Parameters to run through GridSearch
#param_grid = {
#    'max_depth': [None, 2, 5, 10, 15],
#    'min_samples_split': [2, 5, 10, 15],
#    'min_samples_leaf': [1, 2, 10, 20]
#             }

#Run GridSearch
#grid_search = GridSearchCV(rf, param_grid, cv=5, scoring = 'neg_mean_absolute_error')
#grid_search.fit(X, y)

#Print out best parameters and best score
#print("Best parameters: ", grid_search.best_params_)
#print("Best score: ", grid_search.best_score_)

In [10]:
#Best Random Forest Model

rf_final = RandomForestRegressor(random_state = random_state,
                                  n_estimators = 100,
                                  max_depth = 10,
                                  min_samples_split = 2,
                                  min_samples_leaf = 20)

rf_cross_val = cross_val_score(rf_final, X, y, cv=5, scoring = 'neg_mean_absolute_error')

#The Mean Negative Mean Absolute Error following 5-fold cross-validation and its standard deviation
print("Mean Negative Mean Absolute Error - Random Forest Best Model: ", np.mean(rf_cross_val))
print("St. Dev. Negative Mean Absolute Error - Random Forest Best Model: ", np.std(rf_cross_val))

Mean Negative Mean Absolute Error - Random Forest Best Model:  -15.323795794725594
St. Dev. Negative Mean Absolute Error - Random Forest Best Model:  0.6088942095489058


In [11]:
#Support Vector Regression GridSearch

#svr_model = SVR()

#Parameters to run through GridSearch
#param_grid = {
#              'C': [0.1, 1, 5, 10], 
#              'kernel': ['linear', 'rbf', 'sigmoid'],
#              'gamma': [1, 0.1, 0.01, 0.001]
#             }

#Run GridSearch
#grid_search = GridSearchCV(svr_model, param_grid, cv=5, scoring='neg_mean_absolute_error')
#grid_search.fit(X, y)

#Print out best parameters and best score
#print("Best parameters: ", grid_search.best_params_)
#print("Best score: ", grid_search.best_score_)

In [12]:
#Best SVR Model

svr_final = SVR(C = 1,
                kernel = 'linear',
                gamma = 1)

svr_cross_val = cross_val_score(svr_final, X, y, cv=5, scoring = 'neg_mean_absolute_error')

#The Mean Negative Mean Absolute Error following 5-fold cross-validation and its standard deviation
print("Mean Negative Mean Absolute Error - SVR Best Model: ", np.mean(svr_cross_val))
print("St. Dev. Negative Mean Absolute Error - SVR Best Model: ", np.std(svr_cross_val))

Mean Negative Mean Absolute Error - SVR Best Model:  -15.076681650575892
St. Dev. Negative Mean Absolute Error - SVR Best Model:  0.4609272885504068


In [13]:
#Neural Network GridSearch and Build Function

#Save GridSearch parameters and results

#param_grid = {'optimizer': ['SGD', 'RMSprop', 'Adam'],
#              'learning_rate': [.001, .005, .01, .1],
#              'activation': ['relu', 'tanh', 'sigmoid'],
#              'batch_size': [8, 16, 32, 64],
#              'epochs': [10, 20, 100]}

#Best parameters:  {'activation': 'relu', 'batch_size': 16, 'epochs': 100, 'learning_rate': 0.01, 'optimizer': 'RMSprop'}
#Best score:  -15.065056332763906

#Function to ensure repeatability of results
def reset_seeds():
   np.random.seed(random_state) 
   python_random.seed(random_state)
   tensorflow.random.set_seed(random_state)

reset_seeds() 

#Format input data appropriately
X = np.asarray(X)
y = np.asarray(y)

#To be used as the build function for KerasRegression
#Default parameter values are from the grid searches previously run
def build_ann_model(optimizer = 'RMSprop', learning_rate = 0.01, activation = 'relu'):
    
    model = Sequential()
    model.add(Dense(14, input_dim = 8, activation = activation))
    model.add(Dense(1))
    
    if optimizer == 'RMSprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    elif optimizer == 'Adam':
        optimizer = Adam(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)
    
    model.compile(loss = 'mean_absolute_error', optimizer = optimizer, metrics = 'mean_absolute_error')
    return model


#Run Grid Search with parameter grid above
#ann_model = KerasRegressor(build_fn = build_ann_model)
#grid_search = GridSearchCV(estimator = ann_model, param_grid = param_grid, cv = 5, scoring = 'neg_mean_absolute_error')
#grid_search.fit(X, y, verbose = 0)

#Print out best parameters and best score
#print("Best parameters: ", grid_search.best_params_)
#print("Best score: ", grid_search.best_score_)

In [14]:
#Best Neural Network Model

reset_seeds()
nn_final = KerasRegressor(build_fn = build_ann_model, batch_size = 16, epochs = 100, verbose = 0)
nn_cross_val = cross_val_score(nn_final, X, y, cv=5, scoring = 'neg_mean_absolute_error')

#The Mean Negative Mean Absolute Error following 5-fold cross-validation and its standard deviation
print("Mean Negative Mean Absolute Error - Neural Network Best Model: ", np.mean(nn_cross_val))
print("St. Dev. Negative Mean Absolute Error - Neural Network Best Model: ", np.std(nn_cross_val))

Mean Negative Mean Absolute Error - Neural Network Best Model:  -15.051403925852972
St. Dev. Negative Mean Absolute Error - Neural Network Best Model:  0.49319223849010035


In [15]:
#Pull through point differentials for the actual match and as predicted by SVR, Random Forest, and Neural Network
df['actual_point_diff'] = df['score_a'] - df['score_b']
df['svr_point_diff'] = svr_final.fit(X, y).predict(X)
df['rf_point_diff'] = rf_final.fit(X, y).predict(X)

reset_seeds()
history = nn_final.fit(X, y, verbose = 0)
df['nn_point_diff'] = nn_final.predict(X)

#Pull through actual winner of the match and as predicted by SVR, Random Forest, and Neural Network
df['actual_winner'] = np.where(df['actual_point_diff'] > 0, df['team_a'], df['team_b'])
df['svr_winner'] = np.where(df['svr_point_diff'] > 0, df['team_a'], df['team_b'])
df['rf_winner'] = np.where(df['rf_point_diff'] > 0, df['team_a'], df['team_b'])
df['nn_winner'] = np.where(df['nn_point_diff'] > 0, df['team_a'], df['team_b'])

#Accuracy Score of Match Prediction for SVR, Random Forest, and Neural Network following full dataset training
print("Accuracy Score - SVR Best Model: ", '{0:.1%}'.format(sum(np.where(df['actual_winner'] == df['svr_winner'], 1, 0))/len(df)))
print("Accuracy Score - RF Best Model: ", '{0:.1%}'.format(sum(np.where(df['actual_winner'] == df['rf_winner'], 1, 0))/len(df)))
print("Accuracy Score - NN Best Model: ", '{0:.1%}'.format(sum(np.where(df['actual_winner'] == df['nn_winner'], 1, 0))/len(df)))

Accuracy Score - SVR Best Model:  74.1%
Accuracy Score - RF Best Model:  77.1%
Accuracy Score - NN Best Model:  74.3%


In [16]:
#Ablation Analysis

X_ablation = df.drop(columns=['tournament', 'venue_name', 'venue_city', 'venue_country', 
                              'match_year', 'team_a', 'team_b', 'score_a', 'score_b',
                              'actual_point_diff', 'svr_point_diff', 'rf_point_diff',
                              'nn_point_diff', 'actual_winner', 'svr_winner','rf_winner',
                              'nn_winner'])

ablation_analysis = []

for i in range(len(X_ablation.columns)):
    X = X_ablation[X_ablation.columns[X_ablation.columns != X_ablation.columns[i]]]
    
    reset_seeds() 

    #Format input data appropriately
    X = np.asarray(X)
    y = np.asarray(y)

    #To be used as the build function for KerasRegression
    #Default parameter values are from the grid searches previously run
    def build_ann_model(optimizer = 'RMSprop', learning_rate = 0.01, activation = 'relu'):
    
        model = Sequential()
        model.add(Dense(14, input_dim = 7, activation = activation))
        model.add(Dense(1))
    
        if optimizer == 'RMSprop':
            optimizer = RMSprop(learning_rate=learning_rate)
        elif optimizer == 'Adam':
            optimizer = Adam(learning_rate=learning_rate)
        else:
            optimizer = SGD(learning_rate=learning_rate)
    
        model.compile(loss = 'mean_absolute_error', optimizer = optimizer, metrics = 'mean_absolute_error')
        return model
    
    ablation_final = KerasRegressor(build_fn = build_ann_model, batch_size = 16, epochs = 100, verbose = 0)
    ablation_cross_val = cross_val_score(ablation_final, X, y, cv=5, scoring = 'neg_mean_absolute_error')
    
    ablation_analysis.append((X_ablation.columns[i], np.mean(ablation_cross_val), np.std(ablation_cross_val)))
    
ablation_analysis

[('world_rank_a', -18.343375634228757, 0.7402195140560642),
 ('world_rank_b', -18.382604597612758, 0.5499097876291671),
 ('home_a', -15.089157933740264, 0.5532417192064852),
 ('home_b', -15.121760157834036, 0.4973002272072297),
 ('points_for_moving_a', -15.125933563557277, 0.4974493365103749),
 ('points_against_moving_a', -15.061413649138212, 0.36438049135664846),
 ('points_for_moving_b', -15.10466696653062, 0.49520282635716334),
 ('points_against_moving_b', -15.130769517945371, 0.4820605303303191)]

In [17]:
#Sensitivity Analyses

########### Optimizer ###############################

X_optimizer = df.drop(columns=['tournament', 'venue_name', 'venue_city', 'venue_country', 
                               'match_year', 'team_a', 'team_b', 'score_a', 'score_b',
                               'actual_point_diff', 'svr_point_diff', 'rf_point_diff',
                               'nn_point_diff', 'actual_winner', 'svr_winner','rf_winner',
                               'nn_winner'])

optimizers = ['Adam', 'RMSprop', 'SGD']
optimizer_analysis = []

for i in range(len(optimizers)):
    
    reset_seeds() 

    #Format input data appropriately
    X = np.asarray(X_optimizer)
    y = np.asarray(y)

    #To be used as the build function for KerasRegression
    #Default parameter values are from the grid searches previously run
    def build_ann_model(optimizer = optimizers[i], learning_rate = 0.01, activation = 'relu'):
    
        model = Sequential()
        model.add(Dense(14, input_dim = 8, activation = activation))
        model.add(Dense(1))
    
        if optimizer == 'RMSprop':
            optimizer = RMSprop(learning_rate=learning_rate)
        elif optimizer == 'Adam':
            optimizer = Adam(learning_rate=learning_rate)
        else:
            optimizer = SGD(learning_rate=learning_rate)
    
        model.compile(loss = 'mean_absolute_error', optimizer = optimizer, metrics = 'mean_absolute_error')
        return model
    
    optimizer_final = KerasRegressor(build_fn = build_ann_model, batch_size = 16, epochs = 100, verbose = 0)
    optimizer_cross_val = cross_val_score(optimizer_final, X, y, cv=5, scoring = 'neg_mean_absolute_error')
    
    optimizer_analysis.append((optimizers[i], np.mean(optimizer_cross_val), np.std(optimizer_cross_val)))
    
    


########### Learning Rate ###############################

X_learning = df.drop(columns=['tournament', 'venue_name', 'venue_city', 'venue_country', 
                              'match_year', 'team_a', 'team_b', 'score_a', 'score_b',
                              'actual_point_diff', 'svr_point_diff', 'rf_point_diff',
                              'nn_point_diff', 'actual_winner', 'svr_winner','rf_winner',
                              'nn_winner'])

learning = [.001, .005, .01, .1]
learning_analysis = []

for i in range(len(learning)):
    
    reset_seeds() 

    #Format input data appropriately
    X = np.asarray(X_learning)
    y = np.asarray(y)

    #To be used as the build function for KerasRegression
    #Default parameter values are from the grid searches previously run
    def build_ann_model(optimizer = 'RMSprop', learning_rate = learning[i], activation = 'relu'):
    
        model = Sequential()
        model.add(Dense(14, input_dim = 8, activation = activation))
        model.add(Dense(1))
    
        if optimizer == 'RMSprop':
            optimizer = RMSprop(learning_rate=learning_rate)
        elif optimizer == 'Adam':
            optimizer = Adam(learning_rate=learning_rate)
        else:
            optimizer = SGD(learning_rate=learning_rate)
    
        model.compile(loss = 'mean_absolute_error', optimizer = optimizer, metrics = 'mean_absolute_error')
        return model
    
    learning_final = KerasRegressor(build_fn = build_ann_model, batch_size = 16, epochs = 100, verbose = 0)
    learning_cross_val = cross_val_score(learning_final, X, y, cv=5, scoring = 'neg_mean_absolute_error')
    
    learning_analysis.append((learning[i], np.mean(learning_cross_val), np.std(learning_cross_val)))
    
    
    

    
########### Activation ###############################

X_activation = df.drop(columns=['tournament', 'venue_name', 'venue_city', 'venue_country', 
                                'match_year', 'team_a', 'team_b', 'score_a', 'score_b',
                                'actual_point_diff', 'svr_point_diff', 'rf_point_diff',
                                'nn_point_diff', 'actual_winner', 'svr_winner','rf_winner',
                                'nn_winner'])

activation = ['relu', 'tanh', 'sigmoid']
activation_analysis = []

for i in range(len(activation)):
    
    reset_seeds() 

    #Format input data appropriately
    X = np.asarray(X_activation)
    y = np.asarray(y)

    #To be used as the build function for KerasRegression
    #Default parameter values are from the grid searches previously run
    def build_ann_model(optimizer = 'RMSprop', learning_rate = .01, activation = activation[i]):
    
        model = Sequential()
        model.add(Dense(14, input_dim = 8, activation = activation))
        model.add(Dense(1))
    
        if optimizer == 'RMSprop':
            optimizer = RMSprop(learning_rate=learning_rate)
        elif optimizer == 'Adam':
            optimizer = Adam(learning_rate=learning_rate)
        else:
            optimizer = SGD(learning_rate=learning_rate)
    
        model.compile(loss = 'mean_absolute_error', optimizer = optimizer, metrics = 'mean_absolute_error')
        return model
    
    activation_final = KerasRegressor(build_fn = build_ann_model, batch_size = 16, epochs = 100, verbose = 0)
    activation_cross_val = cross_val_score(activation_final, X, y, cv=5, scoring = 'neg_mean_absolute_error')
    
    activation_analysis.append((activation[i], np.mean(activation_cross_val), np.std(activation_cross_val)))

In [18]:
#Failure Analysis

#View Predictions
#df.loc[5216] #Huge Japan Upset - Huge reliability on world rank
#df.loc[5212] #England Blows out Ireland - Resting players?
df.loc[3948] #Model not accounting for ties

tournament                 Rugby World Cup 2019 Qualifying - Americas
venue_name                                          Tim Hortons Field
venue_city                                                   Hamilton
venue_country                                                  Canada
match_year                                                       2017
team_a                                                         Canada
team_b                                                            USA
score_a                                                            28
score_b                                                            28
world_rank_a                                                    63.92
world_rank_b                                                    64.62
home_a                                                              1
home_b                                                              0
points_for_moving_a                                         13.333333
points_against_movin

In [19]:
#2023 Men's Rugby Union World Cup Prediction
#Generate Group Stage Dataset

#Pull in Men's Rugby Union World Cup Shell
world_cup = pd.read_csv('data/rugby_world_cup_gs_2023.csv')

#Merge in Rugby Union World Rankings for Teams A and B
world_cup = world_cup.merge(world_rankings, how='inner', left_on=['team_a', 'match_year'], right_on=['team', 'year'])
world_cup = world_cup.merge(world_rankings, how='inner', left_on=['team_b', 'match_year'], right_on=['team', 'year'])

#Keep only relevant columns
world_cup = world_cup[['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                       'home_a', 'home_b', 'world_rank_score_x', 'world_rank_score_y']]

#Rename some of the feature columns
world_cup.columns = ['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                     'home_a', 'home_b', 'team_a_world_rank', 'team_b_world_rank']

#Create a melted dataframe that allows for moving average calculations
wc_ma = pd.melt(df, id_vars=['match_year', 'score_a', 'score_b'], value_vars=['team_a', 'team_b'], value_name='team', 
                  ignore_index = False) \
          .reset_index()

#Team A and B are now given their own row and the appropriate points for and against are carried through
wc_ma['points_for'] = np.where(wc_ma['variable'] == 'team_a', wc_ma['score_a'], wc_ma['score_b'])
wc_ma['points_against'] = np.where(wc_ma['variable'] == 'team_a', wc_ma['score_b'], wc_ma['score_a'])

#Get historical average for points for and against
wc_ma = wc_ma.groupby(['team'])['points_for', 'points_against'].mean().reset_index()

#Merge in moving averages for points for and points against for both teams
world_cup = world_cup.merge(wc_ma, how='inner', left_on=['team_a'], right_on=['team'])
world_cup = world_cup.merge(wc_ma, how='inner', left_on=['team_b'], right_on=['team'])

#Create the final dataset

world_cup = world_cup[['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 'team_a_world_rank', 
                       'team_b_world_rank', 'home_a', 'home_b', 'points_for_x', 'points_against_x', 
                       'points_for_y', 'points_against_y']]

column_names = ['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                'world_rank_a', 'world_rank_b', 'home_a', 'home_b',
               'points_for_moving_a', 'points_against_moving_a', 'points_for_moving_b', 'points_against_moving_b']

world_cup.columns = column_names

#Predict the World Cup Group Stage

#Prepare X dataset
X_test = world_cup.drop(columns=['tournament', 'venue_country', 'match_year', 'team_a', 'team_b'])

#Format input data appropriately
X_test = np.asarray(X_test)

reset_seeds() 
world_cup['nn_point_diff'] = nn_final.predict(X_test)
world_cup['nn_winner'] = np.where(world_cup['nn_point_diff'] > 0, world_cup['team_a'], world_cup['team_b'])
world_cup.head()

Unnamed: 0,tournament,venue_country,match_year,team_a,team_b,world_rank_a,world_rank_b,home_a,home_b,points_for_moving_a,points_against_moving_a,points_for_moving_b,points_against_moving_b,nn_point_diff,nn_winner
0,2023 World Cup,France,2023,France,New Zealand,90.01,88.98,1,0,23.816038,20.443396,35.768627,16.117647,3.65244,France
1,2023 World Cup,France,2023,France,Uruguay,90.01,66.24,1,0,23.816038,20.443396,23.457831,25.909639,54.24091,France
2,2023 World Cup,France,2023,Italy,Uruguay,75.95,66.24,0,0,17.511848,28.853081,23.457831,25.909639,18.401226,Italy
3,2023 World Cup,France,2023,New Zealand,Uruguay,88.98,66.24,0,0,35.768627,16.117647,23.457831,25.909639,50.54705,New Zealand
4,2023 World Cup,France,2023,France,Namibia,90.01,61.6,1,0,23.816038,20.443396,30.363636,25.59596,66.851006,France


In [20]:
#2023 Men's Rugby Union World Cup Prediction
#Generate Quarterfinals Dataset

#Pull in Men's Rugby Union World Cup Shell
world_cup = pd.read_csv('data/rugby_world_cup_q_2023.csv')

#Merge in Rugby Union World Rankings for Teams A and B
world_cup = world_cup.merge(world_rankings, how='inner', left_on=['team_a', 'match_year'], right_on=['team', 'year'])
world_cup = world_cup.merge(world_rankings, how='inner', left_on=['team_b', 'match_year'], right_on=['team', 'year'])

#Keep only relevant columns
world_cup = world_cup[['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                       'home_a', 'home_b', 'world_rank_score_x', 'world_rank_score_y']]

#Rename some of the feature columns
world_cup.columns = ['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                     'home_a', 'home_b', 'team_a_world_rank', 'team_b_world_rank']

#Merge in moving averages for points for and points against for both teams
world_cup = world_cup.merge(wc_ma, how='inner', left_on=['team_a'], right_on=['team'])
world_cup = world_cup.merge(wc_ma, how='inner', left_on=['team_b'], right_on=['team'])

#Create the final dataset

world_cup = world_cup[['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 'team_a_world_rank', 
                       'team_b_world_rank', 'home_a', 'home_b', 'points_for_x', 'points_against_x', 
                       'points_for_y', 'points_against_y']]

column_names = ['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                'world_rank_a', 'world_rank_b', 'home_a', 'home_b',
               'points_for_moving_a', 'points_against_moving_a', 'points_for_moving_b', 'points_against_moving_b']

world_cup.columns = column_names

#Predict the World Cup Group Stage

#Prepare X dataset
X_test = world_cup.drop(columns=['tournament', 'venue_country', 'match_year', 'team_a', 'team_b'])

#Format input data appropriately
X_test = np.asarray(X_test)

reset_seeds() 
world_cup['nn_point_diff'] = nn_final.predict(X_test)
world_cup['nn_winner'] = np.where(world_cup['nn_point_diff'] > 0, world_cup['team_a'], world_cup['team_b'])
world_cup

Unnamed: 0,tournament,venue_country,match_year,team_a,team_b,world_rank_a,world_rank_b,home_a,home_b,points_for_moving_a,points_against_moving_a,points_for_moving_b,points_against_moving_b,nn_point_diff,nn_winner
0,2023 World Cup,France,2023,France,South Africa,90.01,88.97,1,0,23.816038,20.443396,27.118483,19.687204,3.578348,France
1,2023 World Cup,France,2023,New Zealand,Ireland,88.98,90.63,0,0,35.768627,16.117647,25.483721,17.962791,-0.068371,Ireland
2,2023 World Cup,France,2023,Australia,Argentina,81.8,80.72,0,0,25.834677,21.854839,30.170635,22.694444,2.585794,Australia
3,2023 World Cup,France,2023,Wales,England,78.09,83.66,0,0,24.425339,21.104072,25.432558,19.060465,-5.766216,England


In [21]:
#2023 Men's Rugby Union World Cup Prediction
#Generate Semifinals Dataset

#Pull in Men's Rugby Union World Cup Shell
world_cup = pd.read_csv('data/rugby_world_cup_s_2023.csv')

#Merge in Rugby Union World Rankings for Teams A and B
world_cup = world_cup.merge(world_rankings, how='inner', left_on=['team_a', 'match_year'], right_on=['team', 'year'])
world_cup = world_cup.merge(world_rankings, how='inner', left_on=['team_b', 'match_year'], right_on=['team', 'year'])

#Keep only relevant columns
world_cup = world_cup[['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                       'home_a', 'home_b', 'world_rank_score_x', 'world_rank_score_y']]

#Rename some of the feature columns
world_cup.columns = ['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                     'home_a', 'home_b', 'team_a_world_rank', 'team_b_world_rank']

#Merge in moving averages for points for and points against for both teams
world_cup = world_cup.merge(wc_ma, how='inner', left_on=['team_a'], right_on=['team'])
world_cup = world_cup.merge(wc_ma, how='inner', left_on=['team_b'], right_on=['team'])

#Create the final dataset

world_cup = world_cup[['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 'team_a_world_rank', 
                       'team_b_world_rank', 'home_a', 'home_b', 'points_for_x', 'points_against_x', 
                       'points_for_y', 'points_against_y']]

column_names = ['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                'world_rank_a', 'world_rank_b', 'home_a', 'home_b',
               'points_for_moving_a', 'points_against_moving_a', 'points_for_moving_b', 'points_against_moving_b']

world_cup.columns = column_names

#Predict the World Cup Group Stage

#Prepare X dataset
X_test = world_cup.drop(columns=['tournament', 'venue_country', 'match_year', 'team_a', 'team_b'])

#Format input data appropriately
X_test = np.asarray(X_test)

reset_seeds() 
world_cup['nn_point_diff'] = nn_final.predict(X_test)
world_cup['nn_winner'] = np.where(world_cup['nn_point_diff'] > 0, world_cup['team_a'], world_cup['team_b'])
world_cup

Unnamed: 0,tournament,venue_country,match_year,team_a,team_b,world_rank_a,world_rank_b,home_a,home_b,points_for_moving_a,points_against_moving_a,points_for_moving_b,points_against_moving_b,nn_point_diff,nn_winner
0,2023 World Cup,France,2023,Australia,Ireland,81.8,90.63,0,0,25.834677,21.854839,25.483721,17.962791,-9.998753,Ireland
1,2023 World Cup,France,2023,England,France,83.66,90.01,0,1,25.432558,19.060465,23.816038,20.443396,-12.418156,France


In [22]:
#2023 Men's Rugby Union World Cup Prediction
#Generate Finals Dataset

#Pull in Men's Rugby Union World Cup Shell
world_cup = pd.read_csv('data/rugby_world_cup_f_2023.csv')

#Merge in Rugby Union World Rankings for Teams A and B
world_cup = world_cup.merge(world_rankings, how='inner', left_on=['team_a', 'match_year'], right_on=['team', 'year'])
world_cup = world_cup.merge(world_rankings, how='inner', left_on=['team_b', 'match_year'], right_on=['team', 'year'])

#Keep only relevant columns
world_cup = world_cup[['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                       'home_a', 'home_b', 'world_rank_score_x', 'world_rank_score_y']]

#Rename some of the feature columns
world_cup.columns = ['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                     'home_a', 'home_b', 'team_a_world_rank', 'team_b_world_rank']

#Merge in moving averages for points for and points against for both teams
world_cup = world_cup.merge(wc_ma, how='inner', left_on=['team_a'], right_on=['team'])
world_cup = world_cup.merge(wc_ma, how='inner', left_on=['team_b'], right_on=['team'])

#Create the final dataset

world_cup = world_cup[['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 'team_a_world_rank', 
                       'team_b_world_rank', 'home_a', 'home_b', 'points_for_x', 'points_against_x', 
                       'points_for_y', 'points_against_y']]

column_names = ['tournament', 'venue_country', 'match_year', 'team_a', 'team_b', 
                'world_rank_a', 'world_rank_b', 'home_a', 'home_b',
               'points_for_moving_a', 'points_against_moving_a', 'points_for_moving_b', 'points_against_moving_b']

world_cup.columns = column_names

#Predict the World Cup Group Stage

#Prepare X dataset
X_test = world_cup.drop(columns=['tournament', 'venue_country', 'match_year', 'team_a', 'team_b'])

#Format input data appropriately
X_test = np.asarray(X_test)

reset_seeds() 
world_cup['nn_point_diff'] = nn_final.predict(X_test)
world_cup['nn_winner'] = np.where(world_cup['nn_point_diff'] > 0, world_cup['team_a'], world_cup['team_b'])
world_cup

Unnamed: 0,tournament,venue_country,match_year,team_a,team_b,world_rank_a,world_rank_b,home_a,home_b,points_for_moving_a,points_against_moving_a,points_for_moving_b,points_against_moving_b,nn_point_diff,nn_winner
0,2023 World Cup,France,2023,France,Ireland,90.01,90.63,1,0,23.816038,20.443396,25.483721,17.962791,1.454087,France
