# Acquiring NBA Data

* Stephen Curry shots data from 2014-15 season
* Play by Play data for each GSW game last season 
* nbashots package by @savvastj on github
* stats.nba.com API

In [1]:
%matplotlib inline
import requests
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import nbashots as nba # this will throw a warning if using matplotlib 1.5



## First Get All Steph Curry Shots
* only from 2014-15 season
* use nbashots package

In [2]:
curry_id = nba.get_player_id("Curry, Stephen")[0]
curry_id

201939

In [3]:
shots = nba.Shots()

In [4]:
shots.url_paramaters

{'AheadBehind': '',
 'ClutchTime': '',
 'ContextFilter': '',
 'ContextMeasure': 'FGA',
 'DateFrom': '',
 'DateTo': '',
 'EndPeriod': '',
 'EndRange': '',
 'GameID': '',
 'GameSegment': '',
 'LastNGames': 0,
 'LeagueID': '00',
 'Location': '',
 'Month': 0,
 'OpponentTeamID': 0,
 'Outcome': '',
 'Period': 0,
 'PlayerID': 0,
 'PointDiff': '',
 'Position': '',
 'RangeType': '',
 'RookieYear': '',
 'Season': '2015-16',
 'SeasonSegment': '',
 'SeasonType': 'Regular Season',
 'StartPeriod': '',
 'StartRange': '',
 'TeamID': 0,
 'VsConference': '',
 'VsDivision': ''}

In [5]:
# update Season parameter to '2014-15'
shots.update_params({'Season':'2014-15'})
# updated parameters
shots.url_paramaters

{'AheadBehind': '',
 'ClutchTime': '',
 'ContextFilter': '',
 'ContextMeasure': 'FGA',
 'DateFrom': '',
 'DateTo': '',
 'EndPeriod': '',
 'EndRange': '',
 'GameID': '',
 'GameSegment': '',
 'LastNGames': 0,
 'LeagueID': '00',
 'Location': '',
 'Month': 0,
 'OpponentTeamID': 0,
 'Outcome': '',
 'Period': 0,
 'PlayerID': 0,
 'PointDiff': '',
 'Position': '',
 'RangeType': '',
 'RookieYear': '',
 'Season': '2014-15',
 'SeasonSegment': '',
 'SeasonType': 'Regular Season',
 'StartPeriod': '',
 'StartRange': '',
 'TeamID': 0,
 'VsConference': '',
 'VsDivision': ''}

First get all shots taken in the NBA last year

In [6]:
last_season_shots = shots.get_shots()
last_season_shots.head()

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,ACTION_TYPE,SHOT_TYPE,SHOT_ZONE_BASIC,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG
0,Shot Chart Detail,21400001,2,203076,Anthony Davis,1610612740,New Orleans Pelicans,1,11,43,...,Jump Shot,2PT Field Goal,Mid-Range,Center(C),16-24 ft.,20,50,194,1,0
1,Shot Chart Detail,21400001,4,202696,Nikola Vucevic,1610612753,Orlando Magic,1,11,31,...,Jump Bank Shot,2PT Field Goal,Mid-Range,Center(C),16-24 ft.,18,-8,189,1,1
2,Shot Chart Detail,21400001,7,203076,Anthony Davis,1610612740,New Orleans Pelicans,1,11,6,...,Jump Shot,2PT Field Goal,Mid-Range,Left Side Center(LC),16-24 ft.,18,-131,127,1,0
3,Shot Chart Detail,21400001,9,203901,Elfrid Payton,1610612753,Orlando Magic,1,10,54,...,Layup Shot,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,1,-15,4,1,0
4,Shot Chart Detail,21400001,25,203076,Anthony Davis,1610612740,New Orleans Pelicans,1,10,29,...,Dunk Shot,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,0,0,1,1,1


Create a dataframe that contains all of Steph's shots

In [7]:
curry_shots_df = last_season_shots[last_season_shots['PLAYER_ID'] == curry_id]
curry_shots_df.head()

Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,ACTION_TYPE,SHOT_TYPE,SHOT_ZONE_BASIC,SHOT_ZONE_AREA,SHOT_ZONE_RANGE,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG
2166,Shot Chart Detail,21400014,50,201939,Stephen Curry,1610612744,Golden State Warriors,1,7,29,...,Running Jump Shot,2PT Field Goal,Mid-Range,Left Side(L),8-16 ft.,10,-81,72,1,1
2174,Shot Chart Detail,21400014,74,201939,Stephen Curry,1610612744,Golden State Warriors,1,5,9,...,Jump Shot,3PT Field Goal,Above the Break 3,Center(C),24+ ft.,25,65,246,1,0
2206,Shot Chart Detail,21400014,176,201939,Stephen Curry,1610612744,Golden State Warriors,2,8,49,...,Driving Layup Shot,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,2,23,15,1,1
2210,Shot Chart Detail,21400014,205,201939,Stephen Curry,1610612744,Golden State Warriors,2,6,43,...,Driving Layup Shot,2PT Field Goal,Restricted Area,Center(C),Less Than 8 ft.,1,-11,7,1,1
2214,Shot Chart Detail,21400014,227,201939,Stephen Curry,1610612744,Golden State Warriors,2,5,36,...,Jump Shot,3PT Field Goal,Above the Break 3,Left Side Center(LC),24+ ft.,24,-164,187,1,0


Get game IDs for each game Steph played in last season

In [8]:
all_steph_gameIDs = curry_shots_df['GAME_ID']
played_gamesID = all_steph_gameIDs.unique()
np.savetxt('/data/bkotecha/playedGames.txt',played_gamesID,delimiter=", ",fmt="%s") 
len(played_gamesID)

80

Save curry_shots_df to CSV for later use

In [9]:
curry_shots_df.to_csv('/data/bkotecha/curryShots.csv')

## Now Get Play-by-Play Data from NBA.com
* PBP data of every GSW game from last season

Retrieve all game IDs from saved .txt file

In [10]:
steph_game_ids = np.loadtxt('/data/bkotecha/playedGames.txt', dtype= str)

In [11]:
steph_game_ids[0]

"b'0021400014'"

In [12]:
steph_game_ids = list(map(lambda s: s.replace('b' , ''), steph_game_ids))
steph_game_ids = list(map(lambda s: s.replace("'" , ''), steph_game_ids))

In [13]:
steph_game_ids[0]

'0021400014'

Now use requests library to get JSON files from stats.nba.com for each game and save as CSV

In [14]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0'}

for i in range(0,len(steph_game_ids)):
    filePath = "/data/bkotecha/games/"
    pbp_url = 'http://stats.nba.com/stats/playbyplayv2?EndPeriod=10&EndRange=55800&GameID='+steph_game_ids[i]+'&RangeType=2&Season=2014-15&SeasonType=Regular+Season&StartPeriod=1&StartRange=0'
    print("getting json for game: ")
    print(i)
    response = requests.get(pbp_url,headers=headers)
    heads = response.json()['resultSets'][0]['headers']
    response.raise_for_status() # raise exception if invalid response
    pbp = response.json()['resultSets'][0]['rowSet']
    pbp_df = pd.DataFrame(pbp, columns=heads)
    filePath += steph_game_ids[i] + '.csv'
    pbp_df.to_csv(filePath)

getting json for game: 
0
getting json for game: 
1
getting json for game: 
2
getting json for game: 
3
getting json for game: 
4
getting json for game: 
5
getting json for game: 
6
getting json for game: 
7
getting json for game: 
8
getting json for game: 
9
getting json for game: 
10
getting json for game: 
11
getting json for game: 
12
getting json for game: 
13
getting json for game: 
14
getting json for game: 
15
getting json for game: 
16
getting json for game: 
17
getting json for game: 
18
getting json for game: 
19
getting json for game: 
20
getting json for game: 
21
getting json for game: 
22
getting json for game: 
23
getting json for game: 
24
getting json for game: 
25
getting json for game: 
26
getting json for game: 
27
getting json for game: 
28
getting json for game: 
29
getting json for game: 
30
getting json for game: 
31
getting json for game: 
32
getting json for game: 
33
getting json for game: 
34
getting json for game: 
35
getting json for game: 
36
getting jso

View dataframe of first game played

In [15]:
fileName = '/data/bkotecha/games/'+steph_game_ids[0]+'.csv'
temp_df = pd.read_csv(fileName)

In [16]:
temp_df.head()

Unnamed: 0.1,Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,...,PLAYER2_TEAM_CITY,PLAYER2_TEAM_NICKNAME,PLAYER2_TEAM_ABBREVIATION,PERSON3TYPE,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ID,PLAYER3_TEAM_CITY,PLAYER3_TEAM_NICKNAME,PLAYER3_TEAM_ABBREVIATION
0,0,21400014,0,12,0,1,10:15 PM,12:00,,,...,,,,0,0,,,,,
1,1,21400014,1,10,0,1,10:16 PM,12:00,Jump Ball Thompson vs. Bogut: Tip to,,...,Golden State,Warriors,GSW,2,1610612758,,,,,
2,2,21400014,2,2,5,1,10:16 PM,11:43,MISS Cousins 3' Layup,,...,,,,0,0,,,,,
3,3,21400014,3,4,0,1,10:16 PM,11:41,,,...,,,,0,0,,,,,
4,4,21400014,4,2,1,1,10:16 PM,11:35,,,...,,,,0,0,,,,,


Most games have about 450-550 events

In [17]:
len(temp_df)

526

## Manipulating the Data
* Convert PCTIMESTRING to seconds
* place markers for: end of quarter, steph subbed in, and steph subbed out
* add column that indicates time elapsed between consecutive events
* keep count of consecutive seconds steph has played at any given event in a game

A function to get seconds value of game clock

In [18]:
def get_sec(s):
    l = s.split(':')
    return int(l[0]) * 3600 + int(l[1]) * 60 + int(l[2])

A function to determine:
 - Whether GSW is home or away
 - When Steph is subbed in or out of the game
 - If an event is the end of the quarter 
 - The seconds elapsed between each game event

In [19]:
def setupGameDF(game_df):
    #check if warriors are home or visitors
    if(game_df['PLAYER1_TEAM_CITY'].iloc[1] == 'Golden State'):
        game_df['stephSubOut'] = game_df['HOMEDESCRIPTION'].str.contains("FOR Curry",na=False)
        game_df['stephSubIn'] = game_df['HOMEDESCRIPTION'].str.contains("Curry FOR",na=False)
    else:
        game_df['stephSubOut'] = game_df['VISITORDESCRIPTION'].str.contains("FOR Curry",na=False)
        game_df['stephSubIn'] = game_df['VISITORDESCRIPTION'].str.contains("Curry FOR",na=False)
    
    #steph always starts games  
    game_df['stephIn'] = True
    game_df['stephOut'] = False
    game_df.loc[0, 'timeElapsedBetweenEvents'] = 0
    game_df['stephConsecutiveSeconds'] = 0
    #place marker for end of quarter
    game_df['endOfQuarter'] = (game_df['PCTIMESTRING'] == 0)
    game_df['timeElapsedBetweenEvents'] = game_df['PCTIMESTRING'].shift(1) - game_df['PCTIMESTRING']

A function to get the CSV of a specified game

In [20]:
def specificGame(steph_game_ids,i):
    fileName = "/data/bkotecha/games/"
    fileName += steph_game_ids[i] +'.csv'
    game_df =pd.read_csv(fileName)
    return game_df

A function to calculate Steph's consecutive seconds played for specified game

In [21]:
def countConsecutiveSeconds(game_df,stephSecCounter=0):
    for index, row in game_df.iterrows():
        if(game_df['endOfQuarter'].iloc[index] == True):
            game_df.loc[index+1, 'timeElapsedBetweenEvents'] = 0
            stephSecCounter = 0 # reset consecutive seconds player counter at end of quarters
            if(game_df['PERIOD'].iloc[index] == 3):
                row['stephSubIn'] = True
        if(game_df['PERIOD'].iloc[index] == 3):
            if(game_df['PCTIMESTRING'].iloc[index] == 720):
                row = game_df.ix[index:len(game_df)]
                row['stephIn'] = True
                row = game_df.ix[index:len(game_df)]
                row['stephOut'] = False
                stephSecCounter = 0 
        if(game_df['stephSubOut'].ix[index] ==  True):
            row = game_df.ix[index:len(game_df)]
            row['stephIn'] = False
            row = game_df.ix[index:len(game_df)]
            row['stephOut'] = True
            stephSecCounter = 0
            row = game_df.ix[index:index]
            row['stephConsecutiveSeconds'] = stephSecCounter
        elif(game_df['stephSubIn'].ix[index] == True):
            row = game_df.ix[index:len(game_df)]
            row['stephIn'] = True
            row = game_df.ix[index:len(game_df)]
            row['stephOut'] = False
            stephSecCounter = 0
        elif(game_df['stephIn'].iloc[index] == True):
            stephSecCounter += game_df['timeElapsedBetweenEvents'].iloc[index]
            game_df.loc[index, 'stephConsecutiveSeconds'] = stephSecCounter

Calculate Steph's Consecutive Seconds Played for every event in each of his 80 games

In [24]:
for i in range(0,len(steph_game_ids)):
    
    #get csv of specified game
    game_df = specificGame(steph_game_ids,i)
    
    #fix time formatting 
    #put game time in HH:MM:SS format
    game_df['PCTIMESTRING'] = '00:' + game_df['PCTIMESTRING'].astype(str)
    convertToSeconds = lambda x: get_sec(x)
    game_df['PCTIMESTRING'] = game_df['PCTIMESTRING'].apply(convertToSeconds)
    
    #setup game dataframe
    setupGameDF(game_df)
    
    #calculate steph consec seconds
    game_df.loc[0, 'timeElapsedBetweenEvents'] = 0
    countConsecutiveSeconds(game_df,0)
    
    #stack game data frames on top of each other to get one DF with all games
    if(i == 0):
        result = game_df
    else:
        result = pd.concat([result, game_df], ignore_index=True)

In [25]:
len(result)

37063

In [26]:
result.head()

Unnamed: 0.1,Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,...,PLAYER3_TEAM_CITY,PLAYER3_TEAM_NICKNAME,PLAYER3_TEAM_ABBREVIATION,stephSubOut,stephSubIn,stephIn,stephOut,timeElapsedBetweenEvents,stephConsecutiveSeconds,endOfQuarter
0,0,21400014,0,12,0,1,10:15 PM,720,,,...,,,,False,False,True,False,0,0,False
1,1,21400014,1,10,0,1,10:16 PM,720,Jump Ball Thompson vs. Bogut: Tip to,,...,,,,False,False,True,False,0,0,False
2,2,21400014,2,2,5,1,10:16 PM,703,MISS Cousins 3' Layup,,...,,,,False,False,True,False,17,17,False
3,3,21400014,3,4,0,1,10:16 PM,701,,,...,,,,False,False,True,False,2,19,False
4,4,21400014,4,2,1,1,10:16 PM,695,,,...,,,,False,False,True,False,6,25,False


Now lets merge the events table with Steph Curry's shots table 

In [27]:
curry_shots = pd.read_csv('/data/bkotecha/curryShots.csv')
#rename game_event_id to eventnum
curry_shots=curry_shots.rename(columns = {'GAME_EVENT_ID':'EVENTNUM'})
len(curry_shots)

1341

In [28]:
final_df = pd.merge(curry_shots, result, on=['GAME_ID', 'EVENTNUM'])
len(final_df)

1341

Check if there are any shots with seconds played value of 0

In [29]:
len(final_df[final_df['stephConsecutiveSeconds'] == 0])

53

Sometimes Steph gets subbed into the game at the start of the 2nd or 4th quarter. These substitions do not show up on the play by play game log. There were 53 such shots where his consecutive seconds played was 0. To address this for any shots that had a time played value of 0, I subtracted the current seconds left in the quarter from the starting point of a quarter (720 seconds).

In [30]:
for index, row in final_df.iterrows():
    if(final_df['stephConsecutiveSeconds'].iloc[index] == 0):
        final_df.loc[index, 'stephConsecutiveSeconds'] = 720 - final_df['PCTIMESTRING'].iloc[index]

len(final_df[final_df['stephConsecutiveSeconds'] == 0])

0

In [31]:
final_df.head()

Unnamed: 0,Unnamed: 0_x,GRID_TYPE,GAME_ID,EVENTNUM,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD_x,MINUTES_REMAINING,...,PLAYER3_TEAM_CITY,PLAYER3_TEAM_NICKNAME,PLAYER3_TEAM_ABBREVIATION,stephSubOut,stephSubIn,stephIn,stephOut,timeElapsedBetweenEvents,stephConsecutiveSeconds,endOfQuarter
0,2166,Shot Chart Detail,21400014,50,201939,Stephen Curry,1610612744,Golden State Warriors,1,7,...,,,,False,False,True,False,9,271,False
1,2174,Shot Chart Detail,21400014,74,201939,Stephen Curry,1610612744,Golden State Warriors,1,5,...,,,,False,False,True,False,7,411,False
2,2206,Shot Chart Detail,21400014,176,201939,Stephen Curry,1610612744,Golden State Warriors,2,8,...,,,,False,False,True,False,12,12,False
3,2210,Shot Chart Detail,21400014,205,201939,Stephen Curry,1610612744,Golden State Warriors,2,6,...,,,,False,False,True,False,5,138,False
4,2214,Shot Chart Detail,21400014,227,201939,Stephen Curry,1610612744,Golden State Warriors,2,5,...,,,,False,False,True,False,6,205,False


Save final dataframe for use in other notebooks

In [32]:
final_df.to_csv('/data/bkotecha/final_df.csv')