# Comparison of Basic Models for Predicting AFL Match Outcomes

Footy tipping is the popular Australian pasttime of picking which teams will win their matches every week (like filling out a March Madness bracket, except it's done a round at a time, and everyone drops their 'r's). At the end of the season, the group of competitors will add up the number of matches that they predicted correctly, with the winner usually walking away with a pot of money.

There are a variety of heuristics that one could use to pick the winners and losers for the week. In my conversations with other tippers, I've found that the average perticipant tends to take in data such as which team is favoured, which is the home team, how well each team has been playing lately, and whom they expect fellow competitors to tip, then process the input in their gut, which produces the predicted result.

Below, I will present a few naive heuristics for predicting match results and compare their respective accuracies.

In [1]:
# Import dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [286]:
# Load data

df = pd.read_csv('../data/full_data.csv')
# Betting data only goes back to 2010, so dropping earlier years for consistency
df = df[(df['year'] >= 2010)]
df

Unnamed: 0,home_team,last_finals_reached,oppo_score,oppo_team,point_spread,round_number,score,team,venue,win,win_odds,year
1080,0.0,2.0,118.0,Fremantle,-1.5,1,62.0,Adelaide,Domain Stadium,0.0,1.85,2010
1081,1.0,2.0,118.0,Sydney,-12.5,2,75.0,Adelaide,AAMI Stadium,0.0,1.54,2010
1082,0.0,2.0,57.0,Melbourne,-13.5,3,41.0,Adelaide,MCG,0.0,1.52,2010
1083,1.0,2.0,103.0,Carlton,-2.5,4,55.0,Adelaide,AAMI Stadium,0.0,1.81,2010
1084,0.0,2.0,121.0,Western Bulldogs,40.5,5,72.0,Adelaide,Etihad Stadium,0.0,7.50,2010
1085,1.0,2.0,97.0,Port Adelaide,12.5,6,74.0,Adelaide,AAMI Stadium,0.0,2.55,2010
1086,1.0,2.0,54.0,Richmond,-29.5,7,104.0,Adelaide,AAMI Stadium,1.0,1.20,2010
1087,0.0,2.0,84.0,North Melbourne,9.5,8,75.0,Adelaide,Etihad Stadium,0.0,2.38,2010
1088,1.0,2.0,81.0,Brisbane,5.5,9,93.0,Adelaide,AAMI Stadium,1.0,2.14,2010
1089,0.0,2.0,123.0,St Kilda,26.5,10,76.0,Adelaide,Etihad Stadium,0.0,4.25,2010


In [289]:
# Always tip the home team

# Filter out bye weeks
home_df = (df[df['oppo_team'] != '0']
             .copy()
             .assign(pred_win=df['home_team'])
             .set_index(['team', 'year', 'round_number'])
             .sort_index())
home_df.loc[:, 'correct_tip'] = (home_df['win'] == home_df['pred_win']).astype('float')
home_accuracy = round((home_df['correct_tip'].sum() / len(home_df)) * 10000) / 100

print('Always Tip Home Team Accuracy: {}'.format(home_accuracy))

Always Tip Home Team Accuracy: 56.87


In [290]:
# Always tip the favourite

# Negative point spread indicates the favourite
def point_spread_prediction(value):
    if value < 0:
        return 1
    if value > 0:
        return 0
    return 0.5
    
def point_spread_predictions(df):
    return df['point_spread'].apply(point_spread_prediction)

# Filter out bye weeks
favourite_df = (df[df['oppo_team'] != '0']
                  .copy()
                  .assign(pred_win=point_spread_predictions)
                  .set_index(['team', 'year', 'round_number'])
                  .sort_index())
# A little convoluted, but this gets matches without a favourite to be 0.5 'correct'
favourite_df.loc[:, 'correct_tip'] = ((favourite_df['win'] - favourite_df['pred_win']).apply(abs) - 1).apply(abs)
favourite_accuracy = round((favourite_df['correct_tip'].sum() / len(favourite_df)) * 10000) / 100

print('Always Tip Favourite Accuracy: {}'.format(favourite_accuracy))

Always Tip Favourite Accuracy: 71.9


In [291]:
# Always tip the higher-ranked team

ladder_df = (df.copy()
               .assign(draw=((df['score'] == df['oppo_score']) & (df['score'] != 0)).astype('int'),
                       cum_oppo_score=df.groupby(['year', 'team'])['oppo_score'].cumsum().shift())
               .set_index(['team', 'year', 'round_number'])
               .sort_index()
               .fillna(0))

# Get cumulative stats by team & year, then shift by team to carry over end of last season for first round rankings
ladder_df.loc[:, 'cum_score'] = ladder_df.groupby(level=[0, 1])['score'].cumsum().groupby(level=[0]).shift().fillna(0)
ladder_df.loc[:, 'cum_oppo_score'] = ladder_df.groupby(level=[0, 1])['score'].cumsum().groupby(level=[0]).shift().fillna(0)
ladder_df.loc[:, 'cum_percent'] = ladder_df['cum_score'] / ladder_df['cum_oppo_score']
# Draws are counted as 'wins', so subtract a draw's 2 points from the win's 4 points
ladder_df.loc[:, 'cum_win_points'] = (((ladder_df.groupby(level=[0, 1])['win'].cumsum() * 4) -
                                       (ladder_df.groupby(level=[0, 1])['draw'].cumsum() * 2)).groupby(level=[0])
                                                                                              .shift()
                                                                                              .fillna(0))

# Pivot to get round-by-round match points and cumulative percent
ladder_pivot_table = (ladder_df.loc[:, ['cum_win_points', 'cum_percent']]
                               .pivot_table(index=['year', 'round_number'],
                                            values=['cum_win_points', 'cum_percent'],
                                            columns='team',
                                            aggfunc={'cum_win_points': np.sum, 'cum_percent': np.mean}))

# Sort each round by points & percent, then save rank numbers
ladder_index = []
ladder_values = []

for idx, row in ladder_pivot_table.iterrows():
    sorted_row = (row.unstack(level=0)
                     .sort_values(['cum_win_points', 'cum_percent'], ascending=False))
    ladder_ranks = np.array(range(len(sorted_row)))
    
    
    for ladder_idx, team_name in enumerate(sorted_row.index.get_values()):
        ladder_index.append(tuple([team_name, *idx]))
        ladder_values.append(ladder_idx + 1)
        
ladder_ranks = pd.Series(ladder_values,
                         index=pd.MultiIndex.from_tuples(ladder_index, names=('team', 'year', 'round_number')),
                         name='ladder_rank')

# Add ladder_rank to ladder_df
ladder_df = pd.concat([ladder_df, ladder_ranks], axis=1)

# Get opponent's ladder rank
oppo_team_ladder = (ladder_df[ladder_df['oppo_team'] != '0']
                             .loc[:, ['oppo_team', 'ladder_rank']]
                             .reset_index()
                             .drop('team', axis=1)
                             .rename(columns={'ladder_rank': 'oppo_ladder_rank', 'oppo_team': 'team'})
                             .set_index(['team', 'year', 'round_number'])
                             .sort_index())

# Add oppo_ladder_rank to ladder_df
ladder_df = pd.concat([ladder_df, oppo_team_ladder], axis=1).fillna(0)

# Filter out byes for predictions and accuracy calculations
ladder_df.loc[:, 'pred_win'] = ((ladder_df['ladder_rank'] < ladder_df['oppo_ladder_rank']) &
                                (ladder_df['oppo_team'] != '0')).astype('float')
ladder_df.loc[:, 'correct_tip'] = ((ladder_df['win'] == ladder_df['pred_win']) &
                                   (ladder_df['oppo_team'] != '0')).astype('float')
ladder_accuracy = round((ladder_df['correct_tip'].sum() / len(ladder_df[ladder_df['oppo_team'] != '0'])) * 10000) / 100

print('Always Tip Higher Ranked Accuracy: {}'.format(ladder_accuracy))

Always Tip Higher Ranked Accuracy: 64.74


In [323]:
# Combine all 3 heuristics via weighted voting

voting_df = (pd.concat([home_df['pred_win'], favourite_df['pred_win'],
                        ladder_df[ladder_df['oppo_team'] != '0']['pred_win'],
                        df[df['oppo_team'] != '0'].set_index(['team', 'year', 'round_number']).sort_index()['win']],
                       axis=1))
voting_df.columns = ['home_pred', 'favourite_pred', 'ladder_pred', 'win']
voting_df.loc[:, 'vote_pred'] = voting_df.drop('win', axis=1).mean(axis=1).round()
voting_df.loc[:, 'correct_tip'] = (voting_df['win'] == voting_df['vote_pred']).astype('float')
voting_accuracy = round((voting_df['correct_tip'].sum() / len(voting_df)) * 10000) / 100

print('Voting Accuracy: {}'.format(voting_accuracy))

# Note: adding weights based on each heuristics accuracy did not significantly improve the accuracy (just 69.61%)

Voting Accuracy: 69.58
