In [1]:
from py_ball import playbyplay #to get all the PBP data
import numpy as np 
import json
import os #itrate
import time
from sklearn import linear_model #logistic regression model
import pickle #saving model

In [2]:
"""
HTTP headers sent with every request to the NBA stats API, so that the request
looks like a browser request coming from stats.nba.com.
"""
# Passed as `headers=` to playbyplay.PlayByPlay when the play-by-play data is downloaded.
# NOTE(review): the two x-nba-stats-* entries look like site-specific headers - confirm they are still required.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8,ru;q=0.6',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Referer': 'https://stats.nba.com/teams/boxscores-traditional/',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true'
}


In [3]:
# 30 teams play 82 games each and every game involves two teams: 30 * 82 / 2 = 1230 games per season
TOTAL_GAMES_PER_YEAR = (30 * 82) // 2

In [4]:
def get_seconds_left(period, time_string):
    """
    Convert a period number and a period-clock string into seconds left in regulation.

    period: 1-4 for the regular quarters (12 minutes each), 5 and up for overtime (5 minutes each)
    time_string: time remaining in the period, formatted like "11:20"

    Returns an int: 2880 at the start of the game, 0 at the end of the 4th quarter and a
    negative number in overtime (10 seconds into the first overtime is -10, and so on).
    """
    clock_mins, clock_secs = time_string.split(':')
    left_in_period = int(clock_mins) * 60 + int(clock_secs)  # clock converted to seconds

    if period > 4:
        overtime_length = 5 * 60
        played_in_period = overtime_length - left_in_period
        # every overtime that was already completed pushes the value another 300 seconds below zero
        return (5 - period) * overtime_length - played_in_period

    quarter_length = 12 * 60
    # full quarters still to be played after this one, plus what is left on the clock
    return (4 - period) * quarter_length + left_in_period

In [None]:
def get_game_ids(years, games_per_year=None):
    """
    Generate the game ids for every game of the given seasons.

    An id is '002' + the two-digit season start year + the game number zero-padded to
    five digits, e.g. '0021500001' for game 1 of the season given as '15'.

    years: iterable of season start years, e.g. ['15', '16', '17']. Ints work too and
        are zero-padded to two digits (5 -> '05').
    games_per_year: how many games to generate per season. Defaults to
        TOTAL_GAMES_PER_YEAR; pass another value for a season with a different
        number of games.

    Returns a list of id strings ordered by season, then by game number.
    """
    if games_per_year is None:
        games_per_year = TOTAL_GAMES_PER_YEAR
    return [
        '002' + str(year).zfill(2) + str(game).zfill(5)
        for year in years
        for game in range(1, games_per_year + 1)
    ]

In [None]:
def populate_train_test(game_ids, directory='/home/avyayv/data/nba/playbyplay/py_ball/', skip_existing=False):
    """
    Using the py_ball API, save the play-by-play data of every game locally as a json file.

    game_ids: iterable of game id strings (as produced by get_game_ids)
    directory: folder that receives one '<game_id>.json' file per game; it is created
        if it does not exist yet
    skip_existing: when True, a game whose json file is already on disk is not requested
        again, so an interrupted download can be resumed. The default (False) requests
        every game and overwrites existing files.

    Returns (train_x, train_y), which are always two empty lists: nothing is ever added
    to them, they are only returned so that existing callers keep working.
    """
    train_x = []
    train_y = []
    if directory:
        # fail early (before any request is sent) if the target folder cannot be created
        os.makedirs(directory, exist_ok=True)

    for game_id in game_ids:
        # os.path.join instead of string concatenation: also correct without a trailing slash
        path = os.path.join(directory, game_id + '.json')
        if skip_existing and os.path.exists(path):
            continue

        print(game_id)

        pbp = playbyplay.PlayByPlay(headers=headers, game_id=game_id)
        with open(path, 'w') as fp:
            json.dump(pbp.data, fp)

        time.sleep(0.5)  # pause between two requests
    return train_x, train_y

In [None]:
train_gids = get_game_ids(['15','16', '17']) # game ids of the 2015-16, 2016-17 and 2017-18 seasons (each season is given by its two-digit start year)

In [None]:
populate_train_test(train_gids) # download the play-by-play data of the training seasons and save it locally as json (one request per game, with a 0.5 second pause after each)

In [7]:
def _parse_game(pbp):
    """Turn one game's play-by-play dict into {seconds_left: [margin, home_team_won, home_possession]}."""
    events = pbp['PlayByPlay']
    jump_event = events[1]
    home_has_ball = jump_event['HOMEDESCRIPTION'] is not None  # did home win the jump
    current_margin = 0
    home_wins = int(events[-1]['SCOREMARGIN']) > 0  # was home winning in the last event?
    last_second = 2880  # time of the previous event (the game starts with 2880 seconds left)

    # The key is the 10 second time (2870, 2860, ...). A key that is already present is
    # never overwritten, so every timeframe is recorded at most once per game.
    game = {}
    for event in events[2:]:
        seconds_left_in_game = get_seconds_left(event['PERIOD'], event['PCTIMESTRING'])

        # Timeframes strictly between the previous event and this one get the state
        # from BEFORE this event, because nothing had changed yet at those times.
        for sec in range(seconds_left_in_game + 1, last_second):
            if sec % 10 == 0 and sec not in game:
                game[sec] = [current_margin, home_wins, home_has_ball]
        last_second = seconds_left_in_game

        # Possession: if only one team has a description for the event, that team has
        # the ball (e.g. a home rebound). If both have one, home has the ball only when
        # its description mentions a STEAL or a BLOCK.
        home_text = event['HOMEDESCRIPTION']
        visitor_text = event['VISITORDESCRIPTION']
        if home_text is not None and visitor_text is None:
            home_has_ball = True
        elif visitor_text is not None and home_text is None:
            home_has_ball = False
        elif home_text is not None and visitor_text is not None:
            home_has_ball = ('STEAL' in home_text) or ('BLOCK' in home_text)

        # Margin: 'TIE' means the margin is 0 and has to reset the running value too,
        # otherwise the lead from before the tying basket would still be recorded.
        score_margin = event['SCOREMARGIN']
        if score_margin is not None:
            current_margin = 0 if score_margin == 'TIE' else int(score_margin)

        # An event that falls exactly on a 10 second mark is recorded with the updated state.
        if seconds_left_in_game % 10 == 0 and seconds_left_in_game not in game:
            game[seconds_left_in_game] = [current_margin, home_wins, home_has_ball]

    return game


def get_train_test_from_dir(directory='/home/avyayv/data/nba/playbyplay/py_ball/'):
    """
    Build the training data from the play-by-play json files saved in `directory`.

    Our methodology for training is as follows:
    We split the game into 288 10-second time frames (30 more for every overtime period)
    and run a separate logistic regression for EACH of the time frames, because a single
    logistic regression did not capture the time aspect of the WP model very well.

    Each of the logistic regressions is fed (a) the point differential with respect to
    the home team score and (b) whether the home team has possession of the ball.

    directory: folder containing one json file per game; sub-directories are ignored

    Returns a list with one dict per game, mapping seconds left (a multiple of 10,
    negative in overtime) to [margin, home_team_won, home_possession]. Files are read
    in sorted name order so the result is reproducible.

    One or two games have messed up json files: a file that cannot be decoded as text
    or as json is reported and skipped. Any other error is raised.
    """
    train = []
    for file in sorted(os.listdir(directory)):  # the directory we saved the json files in
        path = os.path.join(directory, file)
        if not os.path.isfile(path):
            continue
        try:
            with open(path) as fp:  # closes the file even when parsing fails
                pbp = json.load(fp)
            train.append(_parse_game(pbp))
        except (UnicodeDecodeError, json.JSONDecodeError):
            print('skipping unreadable play-by-play file: ' + file)
            continue

    return train

In [8]:
"""
Get the train data from the default directory: a list with one dict per saved game,
mapping each 10 second timeframe to [margin, home_team_won, home_possession]
"""
train = get_train_test_from_dir()

In [11]:
def train_model(train_x):
    """
    Use the train data to generate a dictionary of models.

    train_x: list of per-game dicts, each mapping a 10 second timeframe to
        [margin, home_team_won, home_possession]

    Returns a dict whose key is the 10 second timeframe and whose value is the logistic
    model fitted on the (margin, home_possession) -> home_team_won samples of that timeframe.
    """
    features_by_time = {}
    labels_by_time = {}
    # regroup the per-game samples so that every timeframe has its own X and y
    for game in train_x:
        for timeframe, sample in game.items():
            features_by_time.setdefault(timeframe, []).append([sample[0], sample[2]])
            labels_by_time.setdefault(timeframe, []).append(sample[1])

    time_to_model = {}
    for timeframe, rows in features_by_time.items():
        classifier = linear_model.LogisticRegression(max_iter=10000, solver='lbfgs')
        # NOTE(review): fit is expected to raise when all labels of a timeframe are the same class - confirm
        classifier.fit(X=np.array(rows), y=np.array(labels_by_time[timeframe]))
        time_to_model[timeframe] = classifier
    return time_to_model

In [12]:
"""
Generate the models: time_to_model maps every 10 second timeframe found in the
train data to the logistic regression fitted for that timeframe
"""
time_to_model = train_model(train)

In [13]:
"""
Save the dictionary of models to 'model.pickle' (relative to the current working directory) using pickle
"""
with open('model.pickle', 'wb') as handle:
    # HIGHEST_PROTOCOL is the newest pickle protocol of the running Python;
    # an older Python version may not be able to load the resulting file
    pickle.dump(time_to_model, handle, protocol=pickle.HIGHEST_PROTOCOL)