In [37]:
import sys
sys.path.append('C:\Anaconda3\Lib\site-packages')

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import datetime
import pandas as pd
import time
import glob

# For sorting lists of tuples
import operator

import seaborn as sns
sns.set_style('whitegrid',{'grid.linestyle': '--', 'grid.color': '0.9', 'axes.edgecolor': '0.3'})
sns.set_context('poster')

### Load .csv file with NHL win data and create a dataframe

In [3]:
# Make sure to get the most recent one if multiple exist
filename = glob.glob('./WOI_game_tables/war-on-ice-2016*')[0]
print('Opening the file: ',filename)

Opening the file:  ./WOI_game_tables\war-on-ice-2016-04-12 17-47-47.csv


In [70]:
df = pd.read_csv(filename).set_index('Unnamed: 0')
df = df[df['status'] != 'NotStarted']
#df['date'] = pd.to_datetime(df['date'])
df.head(10)

Unnamed: 0_level_0,season,session,gamenumber,gcode,status,awayteam,hometeam,awayscore,homescore,date,game.start,game.end,periods,seconds,awaycorsi,homecorsi,awayafteraway,awayafterhome,homeafteraway,homeafterhome
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
13556,20152016,Regular,716,20716,Complete,PHI,NYI,5,2,2016-04-10,7:09 EDT,9:27 EDT,3,3600,47,33,0,0,1,0
13544,20152016,Regular,704,20704,Complete,ANA,WSH,2,0,2016-04-10,7:46 EDT,10:08 EDT,3,3600,45,56,0,0,0,0
14070,20152016,Regular,1230,21230,Complete,ARI,S.J,0,1,2016-04-09,7:38 PDT,9:57 PDT,3,3600,45,69,0,0,0,0
14069,20152016,Regular,1229,21229,Complete,WPG,L.A,4,3,2016-04-09,7:09 PDT,9:55 PDT,5,3900,57,65,0,0,0,0
14068,20152016,Regular,1228,21228,Complete,EDM,VAN,3,4,2016-04-09,7:11 PDT,10:03 PDT,5,3900,63,62,0,0,0,0
14067,20152016,Regular,1227,21227,Complete,NSH,DAL,2,3,2016-04-09,7:07 CDT,9:36 CDT,3,3600,63,68,0,0,0,0
14066,20152016,Regular,1226,21226,Complete,WSH,STL,5,1,2016-04-09,7:07 CDT,9:30 CDT,3,3600,60,41,0,0,0,0
14065,20152016,Regular,1225,21225,Complete,BUF,NYI,4,3,2016-04-09,7:37 EDT,10:06 EDT,4,3766,55,49,0,1,0,0
14064,20152016,Regular,1224,21224,Complete,CGY,MIN,2,1,2016-04-09,6:15 CDT,8:41 CDT,3,3600,39,68,0,0,0,0
14063,20152016,Regular,1223,21223,Complete,CHI,CBJ,4,5,2016-04-09,7:08 EDT,9:39 EDT,4,3748,46,39,0,0,1,0


In [71]:
df.dtypes

season            int64
session          object
gamenumber        int64
gcode             int64
status           object
awayteam         object
hometeam         object
awayscore         int64
homescore         int64
date             object
game.start       object
game.end         object
periods           int64
seconds           int64
awaycorsi         int64
homecorsi         int64
awayafteraway     int64
awayafterhome     int64
homeafteraway     int64
homeafterhome     int64
dtype: object

### Extract win/loss information from the dataframe

In [6]:
def fill_game_list(df, team, season):
    ''' Input is the games database, team name, and season.  Function
    returns a list where each item is a list of elements indexed as:
    0 - game datetime object
    1 - boulian type variable for wins (1/0)
    2 - boulian type variable for losses (1/0) '''

    games = [[] for i in range(len(df.index))]

    i=0
    
    for j in df.index:
    
        if df.loc[j, 'season'] == season:
        
            if df.loc[j, 'awayteam'] == team:
                win_bool = 1*(df.loc[j, 'awayscore'] > df.loc[j, 'homescore'])
                loss_bool = 1*(not win_bool)
                games[i] = [df.loc[j, 'date'], win_bool, loss_bool]
                i += 1

            if df.loc[j, 'hometeam'] == team:
                win_bool = 1*(df.loc[j, 'homescore'] > df.loc[j, 'awayscore'])
                loss_bool = 1*(not win_bool)
                games[i] = [df.loc[j, 'date'], win_bool, loss_bool]
                i += 1
                
    games = [game for game in games if len(game) != 0] # remove all the extras
    return games

In [7]:
CHI_games = fill_game_list(df, team='CHI', season=20142015)
print(CHI_games[:10])

[[Timestamp('2015-06-15 00:00:00'), 1, 0], [Timestamp('2015-06-13 00:00:00'), 1, 0], [Timestamp('2015-06-10 00:00:00'), 1, 0], [Timestamp('2015-06-08 00:00:00'), 0, 1], [Timestamp('2015-06-06 00:00:00'), 0, 1], [Timestamp('2015-06-03 00:00:00'), 1, 0], [Timestamp('2015-05-30 00:00:00'), 1, 0], [Timestamp('2015-05-27 00:00:00'), 1, 0], [Timestamp('2015-05-25 00:00:00'), 0, 1], [Timestamp('2015-05-23 00:00:00'), 1, 0]]


### Extract win/loss streak data

In [72]:
def get_streaks(streak_type, game_list):
    ''' Input a list of games where the 2nd element in each item is a boulian type
    variable for win/loss (1/0).  Output is a dictionary with key=length of streak. '''
    
    streak_list = []
    if streak_type == 'win':
        ig = 1
    elif streak_type == 'loss':
        ig = 2
    else:
        return 0
    
    streak = 0
    for i, g in enumerate(game_list):
        if g[ig] == 1:
            streak += 1
        if g[ig] == 0:
            streak_list.append(streak)
            streak = 0
        if i == len(game_list)-1:
            streak_list.append(streak)
    
    counter = []
    for i in range(0,max(streak_list)+1):
        counter.append(streak_list.count(i))
        
    streak_dict = {key: s for key, s in zip(range(0,len(counter)), counter)}
    del streak_dict[0]
        
    return streak_dict

In [73]:
get_streaks('win', CHI_games)

{1: 12, 2: 11, 3: 3, 4: 2, 5: 1, 6: 0, 7: 0, 8: 1}

### Input results from previous playoffs

In [10]:
def check_team_names(teams):
    ''' Makes sure team name ID's are correct. '''
    for team in teams:
        print(team, team in list(df.awayteam))
        
def check_results(results_dict):
    ''' Makes sure results are consistent. '''
    for i in range(0,5)[::-1]:
        count = []
        for team in results_dict.keys():
            if results_dict[team] == i:
                count.append(team)
        print(count)

In [11]:
teams_all, rounds_all = {}, {}
seasons_all = [20102011, 20112012, 20132014, 20142015]

# Teams that made it to the playoffs
# teams[:8]=east and teams[8:]=west
teams_all[20102011] = ['WSH', 'PHI', 'BOS', 'PIT', 'T.B', 'MTL', 'BUF', 'NYR',
                       'VAN', 'S.J', 'DET', 'ANA', 'NSH', 'PHX', 'L.A', 'CHI']
# Furthest round teams made it past
rounds_all[20102011] = [1, 1, 4, 0, 2, 0, 0, 0,
                        3, 2, 1, 0, 1, 0, 0, 0]

teams_all[20112012] = ['NYR', 'BOS', 'FLA', 'PIT', 'PHI', 'N.J', 'WSH', 'OTT',
                       'VAN', 'STL', 'PHX', 'NSH', 'DET', 'CHI', 'S.J', 'L.A']
rounds_all[20112012] = [2, 0, 0, 0, 1, 3, 1, 0,
                        0, 1, 2, 1, 0, 0, 0, 4]

teams_all[20132014] = ['BOS', 'T.B', 'MTL', 'PIT', 'NYR', 'PHI', 'CBJ', 'DET',
                       'COL', 'STL', 'CHI', 'ANA', 'S.J', 'L.A', 'MIN', 'DAL']
rounds_all[20132014] = [1, 0, 2, 1, 3, 0, 0, 0,
                        0, 0, 2, 1, 0, 4, 1, 0]

teams_all[20142015] = ['MTL', 'T.B', 'DET', 'NYR', 'WSH', 'NYI', 'OTT', 'PIT',
                       'STL', 'NSH', 'CHI', 'ANA', 'VAN', 'CGY', 'MIN', 'WPG']
rounds_all[20142015] = [1, 3, 0, 2, 1, 0, 0, 0,
                        0, 0, 4, 2, 0, 1, 1, 0]

results_all = {s: {team: r for team, r in zip(teams_all[s], rounds_all[s])}
               for s in seasons_all}

In [12]:
#print(check_team_names(teams_all[20132014]))

In [13]:
# results_20142015 = list(zip(teams, results))
# results_20142015 = sorted(results_20142015,
#                           key=operator.itemgetter(1))[::-1]
# results_20142015

### Collect streak data for teams making it to playoffs

In [14]:
def get_all_streaks(df, teams, season):
    ''' Input list of team ID's and return list of streaks for every team.
    This takes a bit of time. '''

    wins, losses = {}, {}
    
    for t in teams:
        
        games = fill_game_list(df, team=t, season=season)
        win_streaks = get_streaks('win', games)
        loss_streaks = get_streaks('loss', games)
        wins[t] = win_streaks
        losses[t] = loss_streaks
        
    return wins, losses

#### Warning: this takes a while (~10 mins)

In [15]:
wins_all, losses_all = {}, {}
for s in seasons_all:
    wins_all[s], losses_all[s] = get_all_streaks(df, teams_all[s], s)

In [16]:
wins_all[20102011]

{'ANA': {0: 19, 1: 8, 2: 3, 3: 7, 4: 2, 5: 0, 6: 1},
 'BOS': {0: 18, 1: 10, 2: 9, 3: 6, 4: 1, 5: 1, 6: 0, 7: 1},
 'BUF': {0: 16, 1: 15, 2: 10, 3: 2, 4: 0, 5: 1},
 'CHI': {0: 20, 1: 12, 2: 5, 3: 3, 4: 2, 5: 0, 6: 0, 7: 0, 8: 1},
 'DET': {0: 18, 1: 7, 2: 5, 3: 5, 4: 3, 5: 2},
 'L.A': {0: 19, 1: 11, 2: 2, 3: 5, 4: 3, 5: 0, 6: 1},
 'MTL': {0: 17, 1: 16, 2: 3, 3: 4, 4: 2, 5: 1},
 'NSH': {0: 22, 1: 10, 2: 7, 3: 3, 4: 0, 5: 1, 6: 2},
 'NYR': {0: 20, 1: 8, 2: 10, 3: 4, 4: 0, 5: 1},
 'PHI': {0: 19, 1: 11, 2: 6, 3: 3, 4: 2, 5: 1, 6: 1},
 'PHX': {0: 23, 1: 15, 2: 2, 3: 0, 4: 1, 5: 1, 6: 0, 7: 1, 8: 1},
 'PIT': {0: 19,
  1: 8,
  2: 4,
  3: 2,
  4: 2,
  5: 2,
  6: 0,
  7: 0,
  8: 0,
  9: 0,
  10: 0,
  11: 0,
  12: 1},
 'S.J': {0: 21, 1: 9, 2: 6, 3: 1, 4: 5, 5: 1, 6: 0, 7: 0, 8: 1},
 'T.B': {0: 19, 1: 14, 2: 3, 3: 3, 4: 1, 5: 2, 6: 1, 7: 0, 8: 1},
 'VAN': {0: 15, 1: 11, 2: 3, 3: 1, 4: 3, 5: 2, 6: 2, 7: 1, 8: 1},
 'WSH': {0: 21, 1: 5, 2: 8, 3: 1, 4: 3, 5: 0, 6: 1, 7: 0, 8: 0, 9: 1}}

In [17]:
losses_all[20102011]

{'ANA': {0: 28, 1: 13, 2: 6, 3: 1, 4: 0, 5: 1, 6: 1},
 'BOS': {0: 35, 1: 16, 2: 8, 3: 3, 4: 1},
 'BUF': {0: 19, 1: 20, 2: 5, 3: 1, 4: 0, 5: 2},
 'CHI': {0: 24, 1: 12, 2: 7, 3: 4, 4: 1},
 'DET': {0: 33, 1: 10, 2: 8, 3: 3, 4: 1},
 'L.A': {0: 27, 1: 12, 2: 5, 3: 3, 4: 1, 5: 1},
 'MTL': {0: 21, 1: 18, 2: 3, 3: 6},
 'NSH': {0: 28, 1: 15, 2: 2, 3: 1, 4: 3, 5: 2},
 'NYR': {0: 23, 1: 10, 2: 10, 3: 2, 4: 0, 5: 0, 6: 1},
 'PHI': {0: 28, 1: 15, 2: 4, 3: 2, 4: 2, 5: 1},
 'PHX': {0: 23, 1: 8, 2: 8, 3: 3, 4: 0, 5: 2},
 'PIT': {0: 33, 1: 8, 2: 8, 3: 3, 4: 1},
 'S.J': {0: 35, 1: 11, 2: 7, 3: 4, 4: 0, 5: 0, 6: 1},
 'T.B': {0: 33, 1: 14, 2: 6, 3: 3, 4: 2},
 'VAN': {0: 45, 1: 17, 2: 5, 3: 1, 4: 2},
 'WSH': {0: 32, 1: 13, 2: 1, 3: 4, 4: 1, 5: 0, 6: 0, 7: 0, 8: 1}}

### An objective function that will be used to ranks teams

In [18]:
def objective_func(win_streaks, loss_streaks,
                    a1=0, a2=0, a3=0,
                    b1=0, b2=0, b3=0):
    ''' Input win and loss lists that are assumed to contain values for
    streak lengths of 0, 1, 2, ... in order.  Returns ranking value float
    number that can be positive or negative. '''
    
#     print('win')
#     print(win_streaks)
#     print('loss')
#     print(loss_streaks)
    
    print_stuff = False
    
    # Lets include 2, 3, and 4+ game winning streaks
    try:
        W1 = win_streaks[2]
    except:
        if print_stuff:
            print('exception raised in objective function')
            print(win_streaks)
        W1 = 0
    try:
        W2 = win_streaks[3]
    except:
        if print_stuff:
            print('exception raised in objective function')
            print(win_streaks)
        W2 = 0
    try:
        W3 = sum(win_streaks[4:])
    except:
        if print_stuff:
            print('exception raised in objective function')
            print(win_streaks)
        W3 = 0
    
    # Lets include 2, 3, and 4+ game losing streaks
    try:
        L1 = loss_streaks[2]
    except:
        if print_stuff:
            print('exception raised in objective function')
            print(loss_streaks)
        L1 = 0    
    try:
        L2 = loss_streaks[3]
    except:
        if print_stuff:
            print('exception raised in objective function')
            print(loss_streaks)
        L2 = 0
    try:
        L3 = sum(loss_streaks[4:])
    except:
        if print_stuff:
            print('exception raised in objective function')
            print(loss_streaks)
        L3 = 0 
    
    #print('W', W1, W2, W3, 'L', L1, L2, L3)
    
    # Use symmetric parameters
    b1, b2, b3 = -a1, -a2, -a3
    
    # Finally, we can compute the objective function
    f = a1*W1 + a2*W2 + a3*W3 + \
        b1*L1 + b2*L2 + b3*L3
        
    
    # Add a small 'perturbation' to break ties
    max_w = len(win_streaks)
    N_max_w = win_streaks[-1]
    max_l = len(loss_streaks)
    N_max_l = loss_streaks[-1]
    
    f = f + 0.01*max_w + 0.004*N_max_w \
          - 0.01*max_l + 0.004*N_max_l
    
    return f

### Predict results using objective function
#### I didn't really consider this but it doesn't make a lot of sense to bother with trying to predict rounds because I'm not accounting for the order in which teams face eachother.  Will have to make a lot of changes to my algorithm next year so this function (below) can be of use.

In [74]:
def predict_rounds(teams, wins, losses,
                   a1=0, a2=0, a3=0,
                   b1=0, b2=0, b3=0,
                   EastWestCorrect=True,
                   PredictRounds=True):
    ''' Input dictionaries of wins and loss streaks along with objective
    function parameters.  This function computes the objective function
    and sorts teams accordingly. Returns a dictionary of round predictions. '''
    
    w = [list(wins[t].values()) for t in teams]
    l = [list(losses[t].values()) for t in teams]

    rank_tuples = []

    for i, t in enumerate(teams):
        # Calculate objective function
        f = objective_func(w[i], l[i],
                           a1, a2, a3)
        rank_tuples.append((t, round(f, 3)))

    # Sort the tuples for round assignment (below)
    ranks = sorted(rank_tuples, key=operator.itemgetter(1))[::-1]

    # Correct for playoff format east vs west
    # ---------------------------------------
    # A BETTER WAY OF DOING THIS is to rank the east
    # and west separately and then zip them together
    # the lists of ordered tuples - where each element
    # is of the form e.g. ('TOR', 34.562) - making
    # sure that highest ranking tem is on top
    # ---------------------------------------
    # Must be feeding in a sorted list of tuples
    # with the largest score as the first element
    E_teams = teams[:8]
    if EastWestCorrect:
        ''' Iterate over list checking "manually". '''
        east_team = lambda team: team[0] in E_teams
        east_west_ranks = []
        for i in range(len(ranks)):
            
            if i == 0:
                if east_team(ranks[0]):
                    E_boo = True
                else:
                    E_boo = False
                east_west_ranks.append(ranks.pop(0))
            elif i == 1:
                if E_boo:
                    for j in range(len(ranks)):
                        if not east_team(ranks[j]):
                            east_west_ranks.append(ranks.pop(j))
                            break
                else:
                    for j in range(len(ranks)):
                        if east_team(ranks[j]):
                            east_west_ranks.append(ranks.pop(j))
                            break
                            
            elif i == 2:
                if east_team(ranks[0]):
                    E_boo = True
                else:
                    E_boo = False
                east_west_ranks.append(ranks.pop(0))
            elif i == 3:
                if E_boo:
                    for j in range(len(ranks)):
                        if not east_team(ranks[j]):
                            east_west_ranks.append(ranks.pop(j))
                            break
                else:
                    for j in range(len(ranks)):
                        if east_team(ranks[j]):
                            east_west_ranks.append(ranks.pop(j))
                            break
            
            elif 4 <= i <= 7:
                if i == 4:
                    EW_list = []
                if EW_list.count('E') == 2:
                    for j in range(len(ranks)):
                        if not east_team(ranks[j]):
                            EW_list.append('W')
                            #print(i, 'east full')
                            #print(ranks[j])
                            #print(EW_list)
                            east_west_ranks.append(ranks.pop(j))
                            break
                elif EW_list.count('W') == 2:
                    for j in range(len(ranks)):
                        if east_team(ranks[j]):
                            EW_list.append('E')
                            #print(i, 'west full')
                            #print(ranks[j])
                            #print(EW_list)
                            east_west_ranks.append(ranks.pop(j))
                            break
                else:
                    if east_team(ranks[0]):
                        EW_list.append('E')
                    else:
                        EW_list.append('W')
                    #print(i, 'none full')
                    #print(ranks[0])
                    #print(EW_list)
                    east_west_ranks.append(ranks.pop(0))
                    
        for i in range(len(ranks)):
            east_west_ranks.append(ranks.pop(0))
        
        ranks = east_west_ranks
            
    if PredictRounds:
        # Predictions for the number of rounds each team will make it past
        predicted_rounds = []
        for i, r in enumerate(ranks):
            if i == 0:
                predicted_rounds.append((r[0], 4))
            elif i == 1:
                predicted_rounds.append((r[0], 3))
            elif 2 <= i <= 3:
                predicted_rounds.append((r[0], 2))
            elif 4 <= i <= 7:
                predicted_rounds.append((r[0], 1))
            elif 8 <= i <= 15:
                predicted_rounds.append((r[0], 0))

        return {key: p for key, p in predicted_rounds}
    else:
        return ranks

### Compare to results (i.e. test data)

In [20]:
def prediction_score(prediction, result):
    ''' Input dictionaries of predictions and results.  This function
    copmares the two and provides an integer value as a score, where
    a perfect match is 0 and a large value indicates a poor match.'''
    
    scores = []
    # Iterate over the team name ID's e.g. 'TOR'
    for k in prediction.keys():
        # Predicted round team will make it past
        p = prediction[k]
        # Actual result
        r = result[k]
        
        # Weights for the score
        # Lets give more weight to correctly
        # predicting the later rounds
        if p == 4 or p == 3:
            w = 3
        elif p == 2:
            w = 2
        elif p == 1 or p == 0:
            w = 1
        
        # Add score to the list
        scores.append(w*abs(p - r))

    return sum(scores)

### Brute force optimization

In [21]:
def brute_force_optimize():
    ''' Iterate over a range of values for the coefficients and find optimal
    values on N-dimensional mesh.  This is done using brute force which will
    be time consuming if rnge is too large and/or we are using too many
    coefficients in the objective function. '''
    
    delta = 0.25
    rnge = 5
    a = [0.5, 1.0, 1.5]
    a = [i - delta for i in a]

    prediction_stack = {}
    for i, s in enumerate(seasons_all):
        results = results_all[s]
        wins = wins_all[s]
        losses = losses_all[s]
        teams = teams_all[s]
        prediction_stack[s] = []

        a1 = a[0]
        for i in range(0,rnge):
            a1 += delta
            a2 = a[1]
            for j in range(0,rnge):
                a2 += delta
                a3 = a[2]
                for k in range(0,rnge):
                    a3 += delta

                    # Only accept combinations where coefficients
                    # gradually increase in size i.e., longer win
                    # streaks more substantially correlate to
                    # playoff success
                    if a1 < a2 < a3:
                        prediction = predict_rounds(teams, wins, losses, a1, a2, a3)
                        score = prediction_score(prediction, results)
                        prediction_stack[s].append((str(a1)+' '+str(a2)+' '+str(a3),
                                                    prediction, score))
                    #print('{0:.2f} {1:.2f} {2:.2f}'.format(a1, a2, a3), ':', score)

    return prediction_stack

a = brute_force_optimize()

In [22]:
len(a[20102011])

66

In [23]:
# Let's look for lowest overall score by creating
# dictionary with ID = str(a1) + str(a2) + str(a3)
# as the key. Using 20102011 as season arbitrarily
b = {ID: 0 for ID in [a[20102011][i][0] for i in range(len(a[20102011]))]}

for s in seasons_all:
    for ID, prediction, score in a[s]:
        b[ID] += score

b_tuples = list(zip(b.keys(), b.values()))
b_sorted = sorted(b_tuples, key=operator.itemgetter(1))
print(b_sorted[:10])
print('')
optimal_ID = b_sorted[0][0]
print('optimal ID:', optimal_ID)
print('')

for s in seasons_all:
    for ID, prediction, score in a[s]:
        if optimal_ID == ID:
            print(s, score)
            print('prediction', prediction)
            print('result', results_all[s])
            print('')

[('1.25 1.5 2.5', 84), ('0.75 1.75 2.25', 85), ('0.75 2.0 2.5', 85), ('1.5 2.0 2.5', 86), ('0.75 1.0 1.5', 86), ('0.75 1.5 2.0', 86), ('1.0 1.75 2.5', 86), ('0.5 1.25 1.5', 86), ('1.25 1.75 2.5', 86), ('1.0 1.5 2.0', 87)]

optimal ID: 1.25 1.5 2.5

20102011 18
prediction {'DET': 1, 'S.J': 2, 'MTL': 0, 'PHX': 0, 'WSH': 3, 'PHI': 1, 'BUF': 1, 'CHI': 0, 'ANA': 1, 'VAN': 4, 'T.B': 0, 'NSH': 0, 'BOS': 2, 'PIT': 0, 'NYR': 0, 'L.A': 0}
result {'DET': 1, 'S.J': 2, 'MTL': 0, 'PHX': 0, 'WSH': 1, 'CHI': 0, 'BUF': 0, 'PHI': 1, 'VAN': 3, 'T.B': 2, 'NSH': 1, 'L.A': 0, 'BOS': 4, 'PIT': 0, 'NYR': 0, 'ANA': 0}

20112012 22
prediction {'DET': 0, 'S.J': 0, 'WSH': 0, 'PHX': 0, 'STL': 3, 'CHI': 2, 'FLA': 0, 'N.J': 4, 'PHI': 1, 'VAN': 1, 'NSH': 1, 'OTT': 0, 'BOS': 0, 'PIT': 1, 'NYR': 2, 'L.A': 0}
result {'DET': 0, 'S.J': 0, 'WSH': 1, 'PHX': 2, 'STL': 1, 'CHI': 0, 'FLA': 0, 'N.J': 3, 'PHI': 1, 'VAN': 0, 'NSH': 1, 'L.A': 4, 'BOS': 0, 'PIT': 0, 'NYR': 2, 'OTT': 0}

20132014 35
prediction {'PHI': 0, 'S.J': 1, '

### Use optimal coefficients to predict 2016 playoff results

In [33]:
# Due to my sorting algorithm, must have east teams on top row
# and west teams on bottom row
teams_20152016 = ['T.B', 'FLA', 'DET', 'WSH', 'NYR', 'PIT', 'NYI', 'PHI',
                  'DAL', 'STL', 'CHI', 'NSH', 'MIN', 'ANA', 'L.A', 'S.J']

In [25]:
# Populate win and loss streak dictionaries for teams above
wins_20152016, losses_20152016 = get_all_streaks(df, teams_20152016, 20152016)

In [35]:
prediction_20152016 = predict_rounds(teams_20152016,
                                     wins_20152016,
                                     losses_20152016,
                                     a1=1.25, a2=1.75, a3=2.5,
                                     PredictRounds=False)
prediction_20152016

[('WSH', 18.318000000000005),
 ('DAL', 14.025999999999998),
 ('STL', 10.267999999999999),
 ('NYI', 6.515999999999999),
 ('S.J', 7.507999999999999),
 ('L.A', 6.555999999999999),
 ('PIT', 6.287999999999999),
 ('NYR', 4.083999999999999),
 ('CHI', 3.588),
 ('FLA', 2.328),
 ('PHI', 1.9919999999999998),
 ('ANA', 1.318),
 ('NSH', -0.742),
 ('T.B', -0.9420000000000001),
 ('DET', -0.988),
 ('MIN', -3.262)]

### Let's instead optimize based on only matching to the top 2 and bottom 8 teams

In [38]:
def prediction_score_2(prediction, result):
    ''' Changed weights for p=1 and p=2.
    Input dictionaries of predictions and results.  This function
    copmares the two and provides an integer value as a score, where
    a perfect match is 0 and a large value indicates a poor match.'''
    
    scores = []
    # Iterate over the team name ID's e.g. 'TOR'
    for k in prediction.keys():
        # Predicted round team will make it past
        p = prediction[k]
        # Actual result
        r = result[k]
        
        # Weights for the score
        # Lets give more weight to correctly
        # predicting the later rounds
        if p == 4 or p == 3:
            w = 3
        elif p == 2 or p == 1:
            w = 0
        elif p == 0:
            w = 1
        
        # Add score to the list
        scores.append(w*abs(p - r))

    return sum(scores)

In [62]:
def brute_force_optimize_2():
    ''' 
    Iterate over a range of values for the coefficients and find optimal
    values on N-dimensional mesh.  This is done using brute force which will
    be time consuming if rnge is too large and/or we are using too many
    coefficients in the objective function. '''
    
    # Define the amount to shift parameters by
    # and the range (i.e. number of iterations)
    delta = 0.25
    rnge = 17
    # Define the starting values for the coefficients
    a = [0.5, 0.5, 0.5]
    a = [i - delta for i in a]

    prediction_stack = {}
    for i, s in enumerate(seasons_all):
        results = results_all[s]
        wins = wins_all[s]
        losses = losses_all[s]
        teams = teams_all[s]
        prediction_stack[s] = []

        a1 = a[0]
        for i in range(0,rnge):
            a1 += delta
            a2 = a[1]
            for j in range(0,rnge):
                a2 += delta
                a3 = a[2]
                for k in range(0,rnge):
                    a3 += delta

                    # Only look for combinations where coefficients
                    # gradually increase in size i.e., longer win
                    # streaks more substantially correlate to
                    # playoff success i.e., a1 < a2 < a3:
                    if True:
                        prediction = predict_rounds(teams, wins, losses, a1, a2, a3)
                        score = prediction_score_2(prediction, results)
                        prediction_stack[s].append((str(a1)+' '+str(a2)+' '+str(a3),
                                                    prediction, score))
                    #print('{0:.2f} {1:.2f} {2:.2f}'.format(a1, a2, a3), ':', score)

    return prediction_stack

In [63]:
a = brute_force_optimize_2()

In [64]:
# Let's look for lowest overall score by creating
# dictionary with ID = str(a1) + str(a2) + str(a3)
# as the key. Using 20102011 as season arbitrarily,
# this should not be concerning because each season
# has scores for the same parameters.
b = {ID: 0 for ID in [a[20102011][i][0] for i in range(len(a[20102011]))]}

for s in seasons_all:
    for ID, prediction, score in a[s]:
        b[ID] += score

b_tuples = list(zip(b.keys(), b.values()))
b_sorted = sorted(b_tuples, key=operator.itemgetter(1))
print(b_sorted[:10])
print('')
optimal_ID = b_sorted[0][0]
print('optimal ID:', optimal_ID)
print('')

for s in seasons_all:
    for ID, prediction, score in a[s]:
        if optimal_ID == ID:
            print(s, score)
            print('prediction', prediction)
            print('result', results_all[s])
            print('')

[('2.5 4.25 4.0', 51), ('1.75 2.75 2.5', 52), ('2.75 4.25 4.0', 52), ('2.5 3.75 3.5', 52), ('2.0 3.0 2.75', 52), ('2.25 3.5 3.25', 52), ('3.0 4.5 4.25', 52), ('3.0 4.5 4.0', 53), ('1.5 2.25 2.0', 53), ('3.75 1.0 4.5', 54)]

optimal ID: 2.5 4.25 4.0

20102011 4
prediction {'DET': 1, 'S.J': 0, 'MTL': 0, 'PHX': 0, 'WSH': 2, 'PHI': 1, 'BUF': 1, 'CHI': 0, 'ANA': 2, 'VAN': 3, 'T.B': 0, 'NSH': 1, 'BOS': 4, 'PIT': 0, 'NYR': 0, 'L.A': 0}
result {'DET': 1, 'S.J': 2, 'MTL': 0, 'PHX': 0, 'WSH': 1, 'CHI': 0, 'BUF': 0, 'PHI': 1, 'VAN': 3, 'T.B': 2, 'NSH': 1, 'L.A': 0, 'BOS': 4, 'PIT': 0, 'NYR': 0, 'ANA': 0}

20112012 16
prediction {'DET': 0, 'S.J': 0, 'WSH': 0, 'PHX': 0, 'STL': 1, 'CHI': 1, 'FLA': 0, 'N.J': 4, 'PHI': 1, 'VAN': 2, 'NSH': 3, 'OTT': 0, 'BOS': 0, 'PIT': 1, 'NYR': 2, 'L.A': 0}
result {'DET': 0, 'S.J': 0, 'WSH': 1, 'PHX': 2, 'STL': 1, 'CHI': 0, 'FLA': 0, 'N.J': 3, 'PHI': 1, 'VAN': 0, 'NSH': 1, 'L.A': 4, 'BOS': 0, 'PIT': 0, 'NYR': 2, 'OTT': 0}

20132014 25
prediction {'PHI': 0, 'S.J': 1, '

#### Optimal coefficients (0.5-4.5) with a1 < a2 < a3: ('3.25 4.0 4.25', 55)

In [65]:
prediction_20152016_2a = predict_rounds(teams_20152016,
                                     wins_20152016,
                                     losses_20152016,
                                     a1=3.25, a2=4.0, a3=4.25,
                                     PredictRounds=False)
prediction_20152016_2a

[('WSH', 34.568),
 ('DAL', 30.275999999999996),
 ('STL', 18.268000000000004),
 ('NYI', 10.766),
 ('S.J', 15.008),
 ('PIT', 9.537999999999998),
 ('NYR', 7.083999999999999),
 ('L.A', 6.805999999999999),
 ('CHI', 5.837999999999999),
 ('PHI', 3.492),
 ('FLA', 3.078),
 ('ANA', 1.068),
 ('DET', -1.988),
 ('NSH', -2.492),
 ('T.B', -6.442000000000001),
 ('MIN', -9.012)]

#### Optimal coefficients (0.5-4.5) unrestricted: ('2.5 4.25 4.0', 51)

In [75]:
prediction_20152016_2b = predict_rounds(teams_20152016, wins_20152016, losses_20152016,
                                        a1=2.5, a2=4.25, a3=4.0, PredictRounds=False)
prediction_20152016_2b

[('WSH', 29.818),
 ('DAL', 28.276),
 ('STL', 19.768),
 ('NYI', 13.266),
 ('S.J', 13.008),
 ('PIT', 9.538),
 ('CHI', 7.588),
 ('NYR', 4.584),
 ('L.A', 6.556),
 ('PHI', 2.242),
 ('ANA', 1.568),
 ('DET', 1.262),
 ('FLA', 0.328),
 ('T.B', -0.692),
 ('NSH', -0.742),
 ('MIN', -5.762)]

### Null hypothesis - wins/losses are the main factor, regardless of streaks

In [28]:
def wins_losses_from_streaks(win_streaks, loss_streaks):
    ''' Since we already have all the information needed to determine the number
    of wins, lets calculate it instead of getting it manually / scraping it. This
    can also serve as a double check for our win/loss streaks. '''
    
    teams = win_streaks.keys()
    wins = {}
    losses = {}
    
    for t in teams:
        w, l = 0, 0
        w_streaks = win_streaks[t]
        l_streaks = loss_streaks[t]
        for s in w_streaks.keys():
            w = w + s*w_streaks[s]
        for s in l_streaks.keys():
            l = l + s*l_streaks[s]
        wins[t], losses[t] = w, l
    
    return wins, losses

In [29]:
win_number_20152016, loss_number_20152016 = wins_losses_from_streaks(wins_20152016, losses_20152016)
print('wins')
print(win_number_20152016)
print('')
print('losses')
print(loss_number_20152016)

wins
{'DET': 41, 'S.J': 46, 'MIN': 38, 'STL': 49, 'NYI': 45, 'WSH': 56, 'CHI': 47, 'FLA': 47, 'PHI': 41, 'T.B': 46, 'DAL': 50, 'NSH': 41, 'L.A': 48, 'PIT': 48, 'NYR': 46, 'ANA': 46}

losses
{'DET': 41, 'S.J': 36, 'MIN': 44, 'STL': 33, 'NYI': 37, 'WSH': 26, 'CHI': 35, 'FLA': 35, 'PHI': 41, 'T.B': 36, 'DAL': 32, 'NSH': 41, 'L.A': 34, 'PIT': 34, 'NYR': 36, 'ANA': 36}


In [30]:
def wins_losses_predict_rounds(teams, wins, losses):
    ''' Input two dictionaries, convert to lists of tuples
    for sorting and then combine into ordered list to output. '''

    ranks = [(t, wins[t] - losses[t]) for t in wins.keys()]
    
    ranks = sorted(ranks, key=operator.itemgetter(1))[::-1]
    
#     # Correct for playoff format east vs west
#     # Must be feeding in a sorted list of tuples
#     # with the largest score as the first element
#     E_teams = teams[:8]
#     ''' Iterate over list checking "manually". '''
#     east_team = lambda team: team[0] in E_teams
#     east_west_ranks = []
#     for i in range(len(ranks)):

#         if i == 0:
#             if east_team(ranks[0]):
#                 E_boo = True
#             else:
#                 E_boo = False
#             east_west_ranks.append(ranks.pop(0))
#         elif i == 1:
#             if E_boo:
#                 for j in range(len(ranks)):
#                     if not east_team(ranks[j]):
#                         east_west_ranks.append(ranks.pop(j))
#                         break
#             else:
#                 for j in range(len(ranks)):
#                     if east_team(ranks[j]):
#                         east_west_ranks.append(ranks.pop(j))
#                         break

#         elif i == 2:
#             if east_team(ranks[0]):
#                 E_boo = True
#             else:
#                 E_boo = False
#             east_west_ranks.append(ranks.pop(0))
#         elif i == 3:
#             if E_boo:
#                 for j in range(len(ranks)):
#                     if not east_team(ranks[j]):
#                         east_west_ranks.append(ranks.pop(j))
#                         break
#             else:
#                 for j in range(len(ranks)):
#                     if east_team(ranks[j]):
#                         east_west_ranks.append(ranks.pop(j))
#                         break

#         elif 4 <= i <= 7:
#             if i == 4:
#                 EW_list = []
#             if EW_list.count('E') == 2:
#                 for j in range(len(ranks)):
#                     if not east_team(ranks[j]):
#                         EW_list.append('W')
#                         #print(i, 'east full')
#                         #print(ranks[j])
#                         #print(EW_list)
#                         east_west_ranks.append(ranks.pop(j))
#                         break
#             elif EW_list.count('W') == 2:
#                 for j in range(len(ranks)):
#                     if east_team(ranks[j]):
#                         EW_list.append('E')
#                         #print(i, 'west full')
#                         #print(ranks[j])
#                         #print(EW_list)
#                         east_west_ranks.append(ranks.pop(j))
#                         break
#             else:
#                 if east_team(ranks[0]):
#                     EW_list.append('E')
#                 else:
#                     EW_list.append('W')
#                 #print(i, 'none full')
#                 #print(ranks[0])
#                 #print(EW_list)
#                 east_west_ranks.append(ranks.pop(0))

#     for i in range(len(ranks)):
#         east_west_ranks.append(ranks.pop(0))

#     ranks = east_west_ranks

    return ranks

ranks = wins_losses_predict_rounds(teams_20152016, win_number_20152016, loss_number_20152016)
ranks

[('WSH', 30),
 ('DAL', 18),
 ('STL', 16),
 ('PIT', 14),
 ('L.A', 14),
 ('FLA', 12),
 ('CHI', 12),
 ('ANA', 10),
 ('NYR', 10),
 ('T.B', 10),
 ('S.J', 10),
 ('NYI', 8),
 ('NSH', 0),
 ('PHI', 0),
 ('DET', 0),
 ('MIN', -6)]

## Secondary future work
### If matching to null hypothesis do one or more of the following
#### - try optimizing loss streak coefficients separately from wins (non-symmetric)
#### - let coefficients vary more rapidly to allow for stronger correlations between large win streaks and success
#### - include more seasons (say, back to 2008)