In [1]:
import pandas as pd
import datetime
from collections import Counter
from collections import defaultdict
import numpy as np
import random
df = pd.read_csv(r'NBA-PBP-2018-2019.csv')
df['Date'] = pd.to_datetime(df.Date)
df['GameID'] =df.groupby('URL').ngroup()


We will need a function to see how many games everyone has played at a certain date. This will let us set a 'minimum n games' need to be played to build a training set.

In [2]:
def games_played(team,date,df):
    """date should be in the form yyyy-mm-dd
    
    This function returns a pandas series of number of games played by team."""
    
    temp_df = df[df.Date<=date]
    
    v1 = temp_df.groupby(['Date','HomeTeam','AwayTeam'],as_index=False)['Quarter'].count()['HomeTeam'].value_counts()
    v2 = temp_df.groupby(['Date','HomeTeam','AwayTeam'],as_index=False)['Quarter'].count()['AwayTeam'].value_counts()
    
    return v1+v2

Creating a column that just contains the events we care about will make this easier to work with, rather than the data being kept in multiple columns.

In [3]:
def make_mmt_columns(df):
    """ 'mmt' column will denote make,miss,turnover for home/away team.
    These are the only events we care about in regards to a markov model.
    """
    df.HomePlay=df.HomePlay.astype(str)
    df.AwayPlay=df.AwayPlay.astype(str)
    
    x1 = df.HomePlay.apply(lambda x: 'make 2' if ' makes 2' in x.lower() else '')
    x2 = df.HomePlay.apply(lambda x: 'miss 2' if ' misses 2' in x.lower() else '')
    x3 = df.HomePlay.apply(lambda x: 'turnover' if 'turnover' in x.lower() else '')
    x4 = df.HomePlay.apply(lambda x: 'make 3' if ' makes 3' in x.lower() else '')
    x5 = df.HomePlay.apply(lambda x: 'miss 3' if ' misses 3' in x.lower() else '')
    x6 = df.HomePlay.apply(lambda x: 'ft make' if 'makes free throw' in x.lower() else '')
    x7 = df.HomePlay.apply(lambda x: 'ft miss' if 'misses free throw' in x.lower() else '')
    
    df['HomeMMT'] = x1+x2+x3+x4+x5+x6+x7
    
    y1 = df.AwayPlay.apply(lambda x: 'make 2' if ' makes 2' in x.lower() else '')
    y2 = df.AwayPlay.apply(lambda x: 'miss 2' if ' misses 2' in x.lower() else '')
    y3 = df.AwayPlay.apply(lambda x: 'turnover' if 'turnover' in x.lower() else '')
    y4 = df.AwayPlay.apply(lambda x: 'make 3' if ' makes 3' in x.lower() else '')
    y5 = df.AwayPlay.apply(lambda x: 'miss 3' if ' misses 3' in x.lower() else '')
    y6 = df.AwayPlay.apply(lambda x: 'ft make' if 'makes free throw' in x.lower() else '')
    y7 = df.AwayPlay.apply(lambda x: 'ft miss' if 'misses free throw' in x.lower() else '')
    
    
    df['AwayMMT'] = y1+y2+y3+y4+y5+y6+y7
    
    
    return df

We can build our sequences. We can just keep a list of the events that happened in the order that they happened, and chunk that up as we wish. 

In [4]:
events_clean=['make 2','miss 2','turnover','make 3','miss 3','ft miss','ft make']

def create_game_sequences(df,team,events_clean=events_clean):
    tempdfH = df[df.HomeTeam ==team]
    tempdfA = df[df.AwayTeam ==team]
    
    home_sequences = list(tempdfH.groupby('GameID')['HomeMMT'].apply(list))
    away_sequences = list(tempdfA.groupby('GameID')['AwayMMT'].apply(list))
    
    #home
    for idx,game in enumerate(home_sequences):
        game_cleaned=[]
        for event in game:
            if event in events_clean:
                game_cleaned.append(event)
        home_sequences[idx]=game_cleaned
    
    #away
    for idx,game in enumerate(away_sequences):
        game_cleaned=[]
        for event in game:
            if event in events_clean:
                game_cleaned.append(event)
        away_sequences[idx]=game_cleaned
        
    
        
    return home_sequences,away_sequences

Chunk up our sequences into whatever size we may want. Build a dictionary of these sequences with the last event being the key of our inner dictionary.

In [5]:
def split_sequences(sequences,group_size=3):
    splits=[]
    for seq in sequences:
        for i in range(len(seq)-(group_size-1)):
            split=tuple(seq[i:i+group_size])
            splits.append(split)
    
    return splits

def team_sequence_dict(sequences,group_size=4):
    """
    Create a dictionary where keys are sequences of length 'group_size-1',values are dictionaries.
    Keys of inner dictionary is next event in sequence, value is number of occurrences.
    """
    team_dict={}
    for k,v in Counter(split_sequences(sequences,group_size=group_size)).items():
        team_dict[k[:group_size-1]] = defaultdict(int)

    for k,v in Counter(split_sequences(sequences,group_size=group_size)).items():
        team_dict[k[:group_size-1]][k[group_size-1]]=v
        
    return team_dict

In [6]:
def todays_games(df,date):
    """
    Returned games will be of the form [(home team, away team),...]
    Date should be of the form yyyy-mm-dd.
    """
    
    today = df[df.Date==date][['HomeTeam','AwayTeam']].drop_duplicates()
    todays_games = list(zip(today.HomeTeam,today.AwayTeam))
    
    return todays_games

In [7]:
def teams_past_games(df,team,date):
    
    team_df = df[(df.Date<date)]
    
    team_df = df[(df.HomeTeam==team) | (df.AwayTeam==team)]
    
    return team_df

In [8]:
def get_first_half(df,home,away,date):
    temp_df = df[df.Date == date]
    temp_df = temp_df[(temp_df.HomeTeam==home) | (temp_df.AwayTeam==away)]
    temp_df = temp_df[(temp_df.Quarter==1) | (temp_df.Quarter==2)]
    home_score = temp_df.iloc[-1]['HomeScore']
    away_score = temp_df.iloc[-1]['AwayScore']
    temp_df = make_mmt_columns(temp_df)

    
    return temp_df,home_score,away_score

In [9]:
def get_second_half(df,home,away,date):
    temp_df = df[df.Date == date]
    temp_df = temp_df[(temp_df.HomeTeam==home) | (temp_df.AwayTeam==away)]
    temp_df = temp_df[(temp_df.Quarter==3) | (temp_df.Quarter==4)]
    home_score = temp_df.iloc[-1]['HomeScore']
    away_score = temp_df.iloc[-1]['AwayScore']
    temp_df = make_mmt_columns(temp_df)

    
    return temp_df,home_score,away_score

In [10]:
def number_possesions_change(df,date):
    temp_df=df[df.Date < date]
    all_teams = set(list(temp_df.HomeTeam.unique())+list(temp_df.AwayTeam.unique()))
    
    change_dict={}
    
    for team in all_teams:
        team_df=temp_df[(temp_df.HomeTeam==team) | (temp_df.HomeTeam==team)]
        team_df1 = team_df[(team_df.Quarter==1) | (team_df.Quarter==2)]
        team_df2 = team_df[(team_df.Quarter==3) | (team_df.Quarter==4)]
        
        home1,away1 = create_game_sequences(team_df1,team,events_clean=events_clean)
        home2,away2 = create_game_sequences(team_df2,team,events_clean=events_clean)
        
        home1_num_poss = [len(game) for game in home1]
        home2_num_poss = [len(game) for game in home2]
        away1_num_poss = [len(game) for game in away1]
        away2_num_poss = [len(game) for game in away2]
        
        change_home = [len(second)-len(first) for first,second in zip(home1,home2)]
        change_away = [len(second)-len(first) for first,second in zip(away1,away2)]
        
        change = change_home+change_away

        mean = np.mean(change)
        std = np.std(change)
                      
        change_dict[team]={'mean':mean,'std':std}
                      
    return change_dict

In [11]:
def score(counts):
    score=0
    score+=counts['make 3']*3
    score+=counts['make 2']*2
    score+=counts['ft make']
    
    return score

In [12]:
df= make_mmt_columns(df)

In [138]:
from datetime import timedelta
from datetime import datetime
import random
date = datetime.strptime('2019-3-01','%Y-%m-%d')
days_to_simulate = 30
ingame_weight = 0
markov_order=3
home_bias=3
wins=0
losses=0
spreads=[]
totals=[]
confidences=[]
winners=[] #items should be of the form (predicted winner, actual winner, halftime winner)

team_correct=defaultdict(int)
team_all=defaultdict(int)
for i in range(days_to_simulate):
    date+=timedelta(1)
    today = todays_games(df,date)
    
    #create dictionary that gives us the mean & std of change from number of possesions in 1st half compared to 2nd
    change_dict = number_possesions_change(df,date) 

    for game in today:
        #Get team abbs
        home_team = game[0]
        away_team = game[1]

        #Get all past games for each team
        home_df = teams_past_games(df,home_team,date)
        away_df = teams_past_games(df,home_team,date)
        
        
        #split past games into lists of sequential events
        home_seq,_ = create_game_sequences(home_df,home_team)
        _,away_seq = create_game_sequences(away_df,away_team)

        """split past game sequences into dictionaries of state probabilities. The order of our markov model = group_size-1.
        An example of an item in the dict is {['make 2','miss 2', 'turnover']:{'make 2':10, 'miss 2':15,....}},
        where the innermost values represent the number of occurrences seen for that event, given that state (outer key).
        """
        home_seq_dict = team_sequence_dict(home_seq,group_size=markov_order+1)
        away_seq_dict = team_sequence_dict(away_seq,group_size=markov_order+1)
        
        #Save first half df, scores
        first_half,home_score,away_score = get_first_half(df,home_team,away_team,date)

        #Extract home/away sequences
        first_half_home_seq,_ = create_game_sequences(first_half,home_team)
        _,first_half_away_seq = create_game_sequences(first_half,away_team)
        
        #First Half transition probabilities
        home_fh_seq_dict = team_sequence_dict(first_half_home_seq,group_size=2)
        away_fh_seq_dict = team_sequence_dict(first_half_away_seq,group_size=2)
        
        #Get the final scores
        end_home = df[(df.HomeTeam==home_team) & (df.AwayTeam==away_team) & (df.Date==date)].iloc[-1]['HomeScore']
        end_away = df[(df.HomeTeam==home_team) & (df.AwayTeam==away_team) & (df.Date==date)].iloc[-1]['AwayScore']

        #Initialize empty list to keep our predictions
        home_predict_list=[]
        away_predict_list=[]
        
        #Simulate our game for each team
        for it in range(1000):
            
            #Get the end of the first half so we have something to start us off, drop these when done
            second_half_sequence = first_half_home_seq[0][-markov_order:] 
            
            #Sample from normal distribution where mean is the mean diff from 1st half possesions to 2nd half,
            #and std is std of the same
            poss_diff_home = np.random.normal(loc=change_dict[home_team]['mean'],scale=change_dict[home_team]['std'])
            
            #Get the estimate/avg possesions in first half, add a random value sampled from our normal dist
            for i in range(round((len(first_half_home_seq[0])+len(first_half_away_seq[0]))/2+poss_diff_home+home_bias)):
                
                #If the previous states are a key in our transition dict, gather the weights
                try:
                    weights = np.array([home_seq_dict[tuple(second_half_sequence[-markov_order:])][event] for event in events_clean])
                    weights = weights/weights.sum()
                #If the previous states have not been seen/not a key, weight all possible events equally
                except:
                    
                    weights=np.array([1 for event in events_clean])
                    weights = weights/weights.sum()
                    
                #Weight season markov transition probabilities with ingame markov transition    
                try:
                    ingame_w = np.array([home_fh_seq_dict[tuple(second_half_sequence[-1:])][event] for event in events_clean])
                    ingame_w = ingame_w/ingame_w.sum()
                    weights = ingame_weighting*ingame_w + (1-ingame_weighting)*weights
                except:
                    pass
                
                #Generate a random choice based on the weights we created, append that to our sequence
                second_half_sequence.append(random.choices(events_clean,weights)[0])

                
            #Do the same thing for the away team
            second_half_sequence2 = first_half_away_seq[0][-markov_order:] #drop these when we are done simulating
            
            poss_diff_away = np.random.normal(loc=change_dict[away_team]['mean'],scale=change_dict[away_team]['std'])
            for i in range(round((len(first_half_home_seq[0])+len(first_half_away_seq[0]))/2+poss_diff_away)):
                try:
                    weights = np.array([away_seq_dict[tuple(second_half_sequence2[-markov_order:])][event] for event in events_clean])
                    weights = weights/weights.sum()
                except:
                    weights=np.array([1 for event in events_clean])
                    weights = weights/weights.sum()
                
                #Weight season markov transition probabilities with ingame markov transition
                try:
                    ingame_w = np.array([away_fh_seq_dict[tuple(second_half_sequence2[-1:])][event] for event in events_clean])
                    ingame_w = ingame_w/ingame_w.sum()
                    weights = ingame_weighting*ingame_w + (1-ingame_weighting)*weights
                except:
                    pass
                second_half_sequence2.append(random.choices(events_clean,weights)[0])
                
            #Score our second half and add it to the first half score, this is our prediction for one iteration
            home_predict_list.append(home_score+score(Counter(second_half_sequence[markov_order:])))
            away_predict_list.append(away_score+score(Counter(second_half_sequence2[markov_order:])))

        #Get the average points scored across all simulations for both teams
        home_pred = sum(home_predict_list)/len(home_predict_list)
        away_pred = sum(away_predict_list)/len(away_predict_list)
        
        #Calculate predicted spread and actual spread
        spread_pred = home_pred-away_pred
        spread_actual = end_home-end_away
        
        #Calculate predicted and actual point total (over/under)
        total_pred = home_pred+away_pred
        total_actual = end_home+end_away
        
        #Keep track of how we are doing
        spreads.append([spread_pred,spread_actual])
        totals.append([total_pred,total_actual])
        
        #calculate win prob
        pair_results = [1 if home_s>away_s else 0for home_s,away_s in zip(home_predict_list,away_predict_list)]
        confidence = sum(pair_results)/len(pair_results)
        if confidence<0.5:
            confidence=1-confidence
        
        confidences.append(confidence)
        #Decide if we picked the correct winner, keep track of accuracy by team
        if (home_pred>away_pred and end_home>end_away) or(home_pred<away_pred and end_home<end_away):
            print('win')
            wins+=1
            team_correct[home_team]+=1
            team_correct[away_team]+=1
            team_all[home_team]+=1
            team_all[away_team]+=1
        else:
            print('incorrect')
            losses+=1
            team_all[home_team]+=1
            team_all[away_team]+=1
        print(home_team+' home predicted '+str(home_pred)+'| home actual '+str(end_home) +'   |win prob= '+str(confidence))
        print(away_team+' away predicted '+str(away_pred)+'| away actual '+str(end_away) + '  |halftime diff '+str(abs(home_score-away_score)))
        print('')

incorrect
DEN home predicted 142.638| home actual 112   |win prob= 0.818
NOP away predicted 129.13| away actual 120  |halftime diff 4

win
UTA home predicted 115.988| home actual 115   |win prob= 0.8
MIL away predicted 102.724| away actual 111  |halftime diff 7

win
MIA home predicted 104.586| home actual 117   |win prob= 0.903
BRK away predicted 87.188| away actual 88  |halftime diff 10

win
DAL home predicted 109.509| home actual 81   |win prob= 0.875
MEM away predicted 125.533| away actual 111  |halftime diff 13

incorrect
PHI home predicted 127.356| home actual 117   |win prob= 0.742
GSW away predicted 117.947| away actual 120  |halftime diff 12

incorrect
IND home predicted 113.081| home actual 112   |win prob= 0.556
ORL away predicted 110.482| away actual 117  |halftime diff 5

win
SAS home predicted 139.071| home actual 116   |win prob= 0.835
OKC away predicted 125.62| away actual 102  |halftime diff 9

win
CLE home predicted 91.143| home actual 93   |win prob= 0.999
DET away pr

win
MIN home predicted 107.43| home actual 103   |win prob= 0.91
NYK away predicted 87.392| away actual 92  |halftime diff 10

incorrect
GSW home predicted 114.738| home actual 111   |win prob= 0.616
PHO away predicted 109.995| away actual 115  |halftime diff 1

incorrect
ATL home predicted 123.076| home actual 128   |win prob= 0.5489999999999999
NOP away predicted 124.219| away actual 116  |halftime diff 1

win
CLE home predicted 119.203| home actual 126   |win prob= 0.575
TOR away predicted 115.867| away actual 101  |halftime diff 4

win
BRK home predicted 118.806| home actual 103   |win prob= 0.997
DET away predicted 82.116| away actual 75  |halftime diff 26

win
HOU home predicted 132.312| home actual 118   |win prob= 0.974
CHO away predicted 101.31| away actual 106  |halftime diff 27

win
UTA home predicted 89.297| home actual 89   |win prob= 0.634
OKC away predicted 93.345| away actual 98  |halftime diff 13

win
LAC home predicted 116.457| home actual 140   |win prob= 0.625
BOS a

win
LAC home predicted 115.413| home actual 115   |win prob= 0.714
IND away predicted 107.702| away actual 109  |halftime diff 6

win
MIL home predicted 126.37| home actual 115   |win prob= 0.844
LAL away predicted 110.726| away actual 101  |halftime diff 3

win
MIN home predicted 102.983| home actual 107   |win prob= 0.791
GSW away predicted 114.285| away actual 117  |halftime diff 12

incorrect
SAC home predicted 129.24| home actual 121   |win prob= 0.78
BRK away predicted 117.096| away actual 123  |halftime diff 8

incorrect
CHI home predicted 111.717| home actual 126   |win prob= 0.638
WAS away predicted 116.166| away actual 120  |halftime diff 0

win
POR home predicted 123.602| home actual 126   |win prob= 0.85
DAL away predicted 107.723| away actual 118  |halftime diff 13

win
CLE home predicted 112.865| home actual 107   |win prob= 0.804
MIL away predicted 100.267| away actual 102  |halftime diff 11

win
MEM home predicted 116.813| home actual 126   |win prob= 0.658
HOU away pre

win
NYK home predicted 90.788| home actual 92   |win prob= 0.9339999999999999
TOR away predicted 109.473| away actual 117  |halftime diff 19

win
HOU home predicted 117.453| home actual 112   |win prob= 0.976
DEN away predicted 90.47| away actual 85  |halftime diff 18

win
MIL home predicted 122.113| home actual 128   |win prob= 0.967
LAC away predicted 96.24| away actual 118  |halftime diff 21

win
PHI home predicted 126.015| home actual 123   |win prob= 0.787
BRK away predicted 112.612| away actual 110  |halftime diff 10

incorrect
NOP home predicted 120.438| home actual 121   |win prob= 0.525
SAC away predicted 121.402| away actual 118  |halftime diff 2

win
DET home predicted 114.086| home actual 115   |win prob= 0.905
ORL away predicted 94.259| away actual 98  |halftime diff 14

win
UTA home predicted 127.54| home actual 128   |win prob= 0.653
WAS away predicted 121.37| away actual 124  |halftime diff 8

win
OKC home predicted 102.709| home actual 105   |win prob= 0.65
DEN away pr

In [136]:
wins/(wins+losses)

0.7557603686635944

In [137]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
y_actual = np.array([spread[1] for spread in spreads]).reshape(-1, 1)
x_predicted = np.array([spread[0] for spread in spreads]).reshape(-1, 1)

model.fit(x_predicted, y_actual)
print(model.score(x_predicted,y_actual))
fig = px.scatter(x=x_predicted.flatten(),y=y_actual.flatten())
fig.update_xaxes(title='predicted spread')
fig.update_yaxes(title='actual spread')

0.5329052733841544


In [139]:
print('median spread diff '+str(np.median(diffs)))


median spread diff -1.186


In [143]:
accs = []
teams=[]

for key in team_all.keys():
    wins=team_correct[key]
    total=team_all[key]
    teams.append(key)
    accs.append(wins/total)
    
px.bar(x=teams,y=accs)
