## NBA Shooting Statistics and Styles
By Ashley Radford


In [1]:
%%javascript
// no autoscrolling with output
// Overrides the `_should_scroll` method on the OutputArea prototype with a
// version that ignores its `lines` argument and always returns false.
// NOTE(review): presumably the classic Notebook front end consults this hook
// to decide whether long output is collapsed into a scroll box -- confirm for
// the front end this notebook is opened in.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
# necessary importations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import math

# plots graphs before full cell execution
from ipywidgets.widgets.interaction import show_inline_matplotlib_plots 
from ipywidgets import Dropdown, Button, Layout, Output
from IPython.display import clear_output

import helper_basketball as h
import sklearn.decomposition as skld # for non-negative matrix factorization
import seaborn as sns

from numpy import linalg as LA # to compute Frobenious norm
from scipy.cluster.hierarchy import dendrogram, linkage # for clustering

In [3]:
# every season label offered in the season menu, from '1946-47' through
# '2017-18'; a label is '<start year>-<last two digits of the following year>'
# (so the season starting in 1999 is written '1999-00')
seasons = ['{}-{:02d}'.format(start_year, (start_year + 1) % 100)
           for start_year in range(1946, 2018)]

In [4]:
# function to choose shotchart params
def shotchart_params(teamid='0', playerid='0', gameid='', season=None):
    """Build the query-parameter dictionary for a 'shotchartdetail' request.

    Parameters
    ----------
    teamid : value stored under 'TeamID' (default '0')
    playerid : value stored under 'PlayerID' (default '0')
    gameid : value stored under 'GameID' (default '')
    season : season label such as '2016-17' stored under 'Season'. When left
        as None the module-level `latest_season` is used, which is what every
        existing caller relies on.

    Returns
    -------
    dict with the 22 parameters below; every filter other than the four
    arguments is fixed ('FGA' context, league '00', 'Regular Season', and
    empty / '0' for the rest).
    """
    if season is None:
        # default to the season currently selected for the whole notebook
        season = latest_season

    chosen_params = {'PlayerID':playerid,
                     'PlayerPosition':'',
                     'Season':season,
                     'ContextMeasure':'FGA',
                     'DateFrom':'',
                     'DateTo':'',
                     'GameID':gameid,
                     'GameSegment':'',
                     'LastNGames':'0',
                     'LeagueID':'00',
                     'Location':'',
                     'Month':'0',
                     'OpponentTeamID':'0',
                     'Outcome':'',
                     'Period':'0',
                     'Position':'',
                     'RookieYear':'',
                     'SeasonSegment':'',
                     'SeasonType':'Regular Season',
                     'TeamID':teamid,
                     'VsConference':'',
                     'VsDivision':''}
    return chosen_params

In [5]:
# pulling team data function
def get_data():
    """Fetch the raw teams, players and shot tables through `h.get_nba_data`.

    The season comes from the module-level `latest_season`, both directly
    (player query) and through `shotchart_params()` (shot query).

    Returns
    -------
    (teams, players, shotdata) -- the results of the 'commonTeamYears',
    'commonallplayers' and 'shotchartdetail' requests, in that order.
    NOTE(review): the rest of the notebook treats all three as pandas
    DataFrames; confirm against the helper module.
    """
    league_id = '00'

    # every franchise with the years it existed
    teams = h.get_nba_data('commonTeamYears', {'LeagueID': league_id})

    # 'IsOnlyCurrentSeason' is '0', so the season given here may be any
    # season, not just the current one
    player_query = {'LeagueID': league_id,
                    'Season': latest_season,
                    'IsOnlyCurrentSeason': '0'}
    players = h.get_nba_data('commonallplayers', player_query)

    # league-wide shots: shotchart_params() leaves team, player and game unset
    shotdata = h.get_nba_data('shotchartdetail', shotchart_params())

    return teams, players, shotdata

In [6]:
def get_current(teams, players, shotdata, year=None):
    """Restrict the raw tables to teams and players active in the chosen season.

    Parameters
    ----------
    teams : DataFrame with ABBREVIATION, TEAM_ID, MIN_YEAR and MAX_YEAR columns
    players : DataFrame with TEAM_ID and PERSON_ID columns
    shotdata : DataFrame with a PLAYER_ID column
    year : calendar year the season ends in; defaults to the module-level
        `current_year` when None

    Returns
    -------
    (teams, players, shotdata) as new, independent DataFrames. The inputs are
    left untouched, and the results are real copies rather than views, so
    later column assignments (e.g. in prettier_dates) do not raise
    SettingWithCopyWarning. Row index labels are preserved.
    """
    if year is None:
        year = current_year

    # setting up series types (astype returns a new frame, so the caller's
    # `teams` keeps its original dtypes)
    teams = teams.astype({'ABBREVIATION': 'category',
                          'TEAM_ID': 'category',
                          'MIN_YEAR': 'int',
                          'MAX_YEAR': 'int'})

    # taking current teams: started no later than the previous year and still
    # listed in `year`
    is_current = (teams.MIN_YEAR <= (year - 1)) & (teams.MAX_YEAR >= year)
    teams = teams[is_current].copy()

    # taking current and active players: on a current team and with at least
    # one recorded shot
    is_active = players.TEAM_ID.isin(teams.TEAM_ID) & players.PERSON_ID.isin(shotdata.PLAYER_ID)
    players = players[is_active].copy()

    # taking active players shotdata
    shotdata = shotdata[shotdata.PLAYER_ID.isin(players.PERSON_ID)].copy()

    return teams, players, shotdata

In [7]:
def clean_teams(teams, players):
    """Attach a TEAM_NAME column to `teams`, taken from the players table.

    The distinct (TEAM_ABBREVIATION, TEAM_NAME) pairs found in `players` are
    inner-joined onto `teams` by abbreviation, so teams without a matching
    player row are dropped. The helper TEAM_ABBREVIATION column is removed
    again before returning. Neither input frame is modified.
    """
    # one row per distinct abbreviation/name pair
    name_lookup = players.loc[:, ['TEAM_ABBREVIATION', 'TEAM_NAME']].drop_duplicates()

    with_names = teams.merge(name_lookup,
                             left_on='ABBREVIATION',
                             right_on='TEAM_ABBREVIATION')

    # drop() returns a new frame, which is what gets handed back
    return with_names.drop(columns=['TEAM_ABBREVIATION'])

In [8]:
# prettier dates function
def readable_date(date):
    """Reformat a 'yyyymmdd' string as 'mm/dd/yyyy'.

    The input is only sliced, never validated: characters 0-3 become the
    year, 4-5 the month and everything from position 6 on the day.
    """
    year, month, day = date[:4], date[4:6], date[6:]
    return '/'.join((month, day, year))

def prettier_dates(shotdata):
    """Rewrite the GAME_DATE column as 'mm/dd/yyyy' strings.

    Every value of `shotdata.GAME_DATE` is passed through readable_date and
    the column is overwritten on the frame that was passed in; that same
    frame is returned.
    """
    # changing the GAME_DATE column in shotdata
    shotdata['GAME_DATE'] = [readable_date(raw_date) for raw_date in shotdata.GAME_DATE]
    return shotdata

In [9]:
def get_player_other_dict(shotdata):
    """Map each multi-team player to the teams they played for earlier.

    Parameters
    ----------
    shotdata : DataFrame with PLAYER_NAME and TEAM_ID columns. The last row
        of each player is taken to be their current team, so rows are
        expected in chronological order (not checked here).

    Returns
    -------
    dict {player name: array of TEAM_ID values other than the current team};
    players who only appear with a single team are left out.
    """
    player_other_dd = dict()
    # groupby already hands over each player's rows, so there is no need to
    # filter the full table again for every player
    for name, player_shots in shotdata.groupby('PLAYER_NAME'):
        team_ids = player_shots['TEAM_ID']
        all_teams = np.unique(team_ids)               # every team the player shot for
        if len(all_teams) > 1:
            current_team = team_ids.values[-1]        # team on the player's last row
            player_other_dd[name] = all_teams[all_teams != current_team]

    return player_other_dd

In [10]:
def get_other_teams_df(player_other_dd, teams, players):
    """Build extra player rows for the teams a player was on earlier.

    Parameters
    ----------
    player_other_dd : dict {player name: iterable of former TEAM_ID values}
    teams : DataFrame with TEAM_ID, TEAM_NAME and ABBREVIATION columns
    players : DataFrame with DISPLAY_FIRST_LAST, TEAM_ID, TEAM_NAME,
        TEAM_ABBREVIATION, TEAM_CITY and TEAM_CODE columns

    Returns
    -------
    DataFrame holding, for every (player, former team) pair, a copy of the
    player's row(s) with the team columns of the first row rewritten to the
    former team. Empty DataFrame when there is nothing to add. Names that do
    not occur in `players.DISPLAY_FIRST_LAST` are skipped.

    Raises
    ------
    IndexError if a former team is missing from `teams`, or if no row of
    `players` belongs to it (TEAM_CITY / TEAM_CODE are copied from such a row).
    """
    extra_rows = []
    for player, former_team_ids in player_other_dd.items():
        player_rows = players[players['DISPLAY_FIRST_LAST'] == player]
        if player_rows.empty:
            # shot-table name has no counterpart in the players table
            continue

        for team_id in former_team_ids:
            # grabbing the additional row
            new_row = player_rows.copy()
            index = new_row.index[0]  # finding index label

            # np.where yields *positions*, so both lookups below go through
            # .iloc and work whatever the frames' index labels are
            team_pos = np.where(teams['TEAM_ID'] == team_id)[0][0]
            teammate_pos = np.where(players['TEAM_ID'] == team_id)[0][0]

            # updating the team columns
            new_row.loc[index, 'TEAM_ID'] = team_id
            new_row.loc[index, 'TEAM_NAME'] = teams['TEAM_NAME'].iloc[team_pos]
            new_row.loc[index, 'TEAM_ABBREVIATION'] = teams['ABBREVIATION'].iloc[team_pos]
            # city and code are not in `teams`; borrow them from any player of that team
            new_row.loc[index, 'TEAM_CITY'] = players['TEAM_CITY'].iloc[teammate_pos]
            new_row.loc[index, 'TEAM_CODE'] = players['TEAM_CODE'].iloc[teammate_pos]

            extra_rows.append(new_row)

    if not extra_rows:
        return pd.DataFrame()
    # a single concat replaces the removed DataFrame.append called in a loop
    return pd.concat(extra_rows)

In [11]:
def combine_all_teams(players, other_teams):
    """Stack the former-team rows under `players` and sort by last name.

    Parameters
    ----------
    players : DataFrame with a DISPLAY_LAST_COMMA_FIRST column
    other_teams : DataFrame of additional player rows (may be empty)

    Returns
    -------
    New DataFrame with the rows of both inputs, ordered by
    DISPLAY_LAST_COMMA_FIRST. Index labels are kept as they are.
    """
    if other_teams is None or other_teams.empty:
        # nothing to add; still return a sorted copy like the normal path
        combined = players
    else:
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        combined = pd.concat([players, other_teams])
    return combined.sort_values('DISPLAY_LAST_COMMA_FIRST')

In [12]:
# preparation for correlation matrix, shooting heatmap, dendrogram
def get_ID_indexes(teams, players):
    """Re-index both tables by their ID column and list the IDs in play.

    Returns
    -------
    (teams, teamids, players, playerids) where
    - teams is indexed by TEAM_ID and teamids holds its unique index values,
    - players is indexed by PERSON_ID and playerids holds the unique index
      values of the players whose TEAM_ID is one of teamids.
    The frames passed in are not modified; set_index returns new ones.
    """
    teams_by_id = teams.set_index('TEAM_ID')
    teamids = np.unique(teams_by_id.index.values)

    players_by_id = players.set_index('PERSON_ID')
    on_listed_team = players_by_id.TEAM_ID.isin(teamids)
    playerids = np.unique(players_by_id.loc[on_listed_team].index.values)

    return teams_by_id, teamids, players_by_id, playerids

In [13]:
# following data gathering procedure shown here 
# https://arxiv.org/abs/1401.0942
def get_allshots(playerids):
    """Download per-player shot locations and cache them in a pickle file.

    For each id in `playerids` the player's 'shotchartdetail' table is
    fetched, cut down to the columns from SHOT_DISTANCE through
    SHOT_MADE_FLAG, limited to rows with LOC_Y below 31*12, and kept only if
    the player's SHOT_MADE_FLAG values sum to more than 50. The kept tables
    get a PlayerID column and are concatenated.

    The result is written to 'allshots<latest_season>.pkl' (module-level
    `latest_season`) and also returned, so callers do not have to re-read the
    file they just asked to be created.

    Raises
    ------
    ValueError if no player passes the more-than-50 filter.
    """
    shotdf = []
    for p in playerids:
        # get player p's data
        shotdata = h.get_nba_data('shotchartdetail', shotchart_params(playerid=p))

        # subset columns
        # NOTE(review): the original also called
        # sc.drop('SHOT_ATTEMPTED_FLAG', axis=1) but threw the result away,
        # so that column has always been part of the cached data. It is kept
        # here; drop it for real only after confirming h.bin_shots and any
        # existing cache files do not need it.
        sc = shotdata.loc[:,'SHOT_DISTANCE':'SHOT_MADE_FLAG']

        # filter shots to 31 feet from end zone (31*12, i.e. feet turned into
        # the inch scale make_court_bins uses -- TODO confirm LOC_Y's unit);
        # .copy() so the PlayerID column below is added to a real frame, not
        # to a slice of `shotdata`
        sc = sc.loc[sc.LOC_Y < (31*12)].copy()

        # keep players whose made-shot flags sum to more than 50
        if sc.SHOT_MADE_FLAG.sum() > 50:
            sc['PlayerID'] = p
            shotdf.append(sc)

    if not shotdf:
        raise ValueError('No player has more than 50 made shots; nothing to cache.')

    allshots = pd.concat(shotdf)

    # context manager closes the cache file even if pickling fails
    with open('allshots' + str(latest_season) + '.pkl', 'wb') as cache_file:
        pickle.dump(allshots, cache_file)

    return allshots

In [14]:
def make_court_bins(allshots):
    """Bin every player's shots on a common court grid.

    Parameters
    ----------
    allshots : DataFrame with a PlayerID column plus whatever columns
        h.bin_shots reads (presumably LOC_X / LOC_Y -- confirm in the helper).

    Returns
    -------
    (all_smooth, all_counts, xedges, yedges)
    - all_smooth : dict {PlayerID: flattened result of h.bin_shots called
      with density=True, sigma=2}
    - all_counts : dict {PlayerID: flattened result of h.bin_shots called
      with density=False}
    - xedges, yedges : bin edges as last returned by h.bin_shots, or the
      initial edges below when `allshots` has no rows.
    """
    # making court
    # bin edge definitions in inches: 151 edges over -25..25 ft and 106 edges
    # over -4..31 ft, i.e. 150 x 105 bins
    # (builtin float replaces the np.float alias that NumPy 1.24 removed)
    xedges = np.linspace(start=-25, stop=25, num=151, dtype=float) * 12
    yedges = np.linspace(start= -4, stop=31, num=106, dtype=float) * 12

    # 2d histogram containers for binned counts and smoothed binned counts
    all_smooth = {}
    all_counts = {}

    # data matrix: players (row) by vectorized 2-d court locations (column)
    # pid is player and pdf is the rows associated to the player
    for pid, pdf in allshots.groupby('PlayerID'):
        # the edges handed back by bin_shots are fed into the next call
        tmp1, xedges, yedges = h.bin_shots(pdf, bin_edges=(xedges, yedges), density=True, sigma=2)
        tmp2, xedges, yedges = h.bin_shots(pdf, bin_edges=(xedges, yedges), density=False)

        # vectorize and store into dictionary
        all_smooth[pid] = tmp1.reshape(-1)
        all_counts[pid] = tmp2.reshape(-1)

    return all_smooth, all_counts, xedges, yedges

In [15]:
def correlation_matrix(data):
    """Draw a heatmap of the correlations between the columns of `data`.

    np.corrcoef is called with rowvar off, so each column of `data` is one
    variable. The heatmap is drawn on a new 10x8 figure; nothing is returned.
    """
    # computing the correlation matrix R
    R = np.corrcoef(data, rowvar=False)

    # visualizing the matrix R
    plt.figure(figsize=(10, 8))
    sns.heatmap(R)

In [16]:
def non_negative_matrix(n_components, train_data):
    """Factorize `train_data` with scikit-learn's NMF.

    The model uses 'nndsvda' initialisation, at most 500 iterations and a
    fixed random_state of 0.

    Returns
    -------
    (W, H) -- the output of fit_transform(train_data) and the fitted model's
    components_, respectively.
    """
    nmf = skld.NMF(n_components=n_components,
                   init='nndsvda',
                   max_iter=500,
                   random_state=0)
    transformed = nmf.fit_transform(train_data)
    return transformed, nmf.components_

# non negative matrix parameter tuning
# computing the norm of the difference
def nnmf_paramater_tuning(data):
    """Plot the NMF reconstruction error of `data` for a fixed set of ranks.

    For each r in the list below, `data` is factorized with
    non_negative_matrix and the Frobenius norm of data - W.H is recorded;
    the errors are then plotted against r and the figure is shown.
    """
    r_values = [1,5,7,10,13,15,17,20,25,30]

    def approximation_error(r):
        # Frobenius norm of the residual left by an r-component factorization
        W, H = non_negative_matrix(n_components = r, train_data = data)
        return LA.norm(data - np.matmul(W, H), 'fro')

    error_norm = [approximation_error(r) for r in r_values]

    # plotting the approximation error as a function of r
    plt.plot(r_values, error_norm)
    plt.title('Approximated Error as a Function of r')
    plt.xlabel('r')
    plt.ylabel('Error')
    plt.show()

In [17]:
# constructing player shooting heatmap
def make_heatmap(playerid, players, xedges, yedges, all_smooth):
    """Plot one player's smoothed shot chart and title it with their name.

    Parameters
    ----------
    playerid : player id (anything int() accepts); used as key into
        `all_smooth` and as index label into `players`
    players : DataFrame indexed by player id with a DISPLAY_FIRST_LAST column
    xedges, yedges : bin edges forwarded to h.plot_shotchart
    all_smooth : dict {player id: flattened smoothed histogram}

    Raises
    ------
    KeyError if the player has no entry in `all_smooth`.
    """
    pid = int(playerid)
    shoot_heat = h.plot_shotchart(all_smooth[pid], xedges, yedges)

    name = players.DISPLAY_FIRST_LAST[pid]
    if isinstance(name, str):
        shoot_heat.set_title(name +'\'s Shooting Style')
    else: # multiple teams, so indexing pulls more than one result
        # every row belongs to the same player, so take the first name;
        # Series.any() is no substitute: on current pandas it returns a bool,
        # which cannot be concatenated with the title text
        shoot_heat.set_title(name.iloc[0] + ' Shooting Style')

In [18]:
# hierarchical clustering
def hierarchical_prep(X, r_select, players, all_smooth):
    """Compute each player's normalised NMF coefficients for clustering.

    `X` is factorized with `r_select` components; the H factor is wrapped in
    a DataFrame whose columns are the unique display names of the players
    that appear as keys of `all_smooth`, and every column is divided by its
    sum so it adds up to 1.
    """
    # non negative matrix factorization with r_select base vectors
    W, H = non_negative_matrix(r_select, X)

    # getting cleaner looking data for hierarchical clustering
    player_names = players.DISPLAY_FIRST_LAST[list(all_smooth.keys())].unique()
    raw_coeff = pd.DataFrame(H, columns=player_names)

    # standardizing players (want them to sum to 1)
    column_totals = raw_coeff.sum(0)
    return raw_coeff / column_totals
    
def plot_dendrogram(playerid, players_coeff, players, all_smooth):
    """Draw the player dendrogram and highlight the chosen player's label.

    Parameters
    ----------
    playerid : player id (anything int() accepts), index label in `players`
    players_coeff : DataFrame with one column per player; columns are
        clustered with complete linkage on euclidean distance
    players : DataFrame indexed by player id with a DISPLAY_FIRST_LAST column
    all_smooth : dict keyed by player id; its keys select the leaf labels

    Raises
    ------
    ValueError if the chosen player's name is not among the leaf labels.
    """
    # defining linkage type
    Z = linkage(players_coeff.T, method='complete', metric='euclidean')
    
    # plotting the dendogram
    plt.figure(figsize=(52,20))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.ylabel('distance')
    dendro = dendrogram(
                Z,
                leaf_rotation=90.,  # rotates the x axis labels
                leaf_font_size=12.,  # font size for the x axis labels
                labels = np.array(players.DISPLAY_FIRST_LAST[list(all_smooth.keys())].unique())
    )

    # getting list of player ticks
    player_ticks = dendro['ivl']

    # coloring the player
    player_name = players.DISPLAY_FIRST_LAST[int(playerid)]
    if not isinstance(player_name, str):
        # multiple teams, so indexing pulls more than one result; all rows
        # carry the same name, so use the first. (Series.any() returns a bool
        # on current pandas and would never be found among the labels.)
        player_name = player_name.iloc[0]
    player_loc = list(player_ticks).index(player_name)
    plt.gca().get_xticklabels()[player_loc].set_color('g')

In [19]:
# plotting total shots each game for chosen player
def plot_total_shots(player_shots, date_shots_df, date_location):
    """Bar chart of shots made per game, with one game highlighted.

    Parameters
    ----------
    player_shots : DataFrame whose first PLAYER_NAME and TEAM_NAME values are
        used in the title
    date_shots_df : DataFrame with GAME_DATE (tick labels) and shots_made
        (bar heights) columns
    date_location : position of the bar and tick label to colour green
    """
    game_dates = date_shots_df['GAME_DATE']
    bar_positions = np.arange(len(game_dates))
    bars = plt.bar(bar_positions, date_shots_df['shots_made'], align = 'center')
    plt.xticks(bar_positions, game_dates, rotation=90, fontsize=7)

    plt.xlabel('Game Date')
    plt.ylabel('Shots Made')
    player_name = str(list(player_shots['PLAYER_NAME'])[0])
    team_name = str(list(player_shots['TEAM_NAME'])[0])
    plt.title('Total Shots Made Each Game by ' + player_name
              + ' for ' + team_name)

    # coloring specific date bar and label
    bars.get_children()[date_location].set_color('g')
    plt.gca().get_xticklabels()[date_location].set_color('g')

In [20]:
# plotting percentage of shots made in each period for chosen game
def plot_percent_shots(match_date, gameid, teamid, playerid):
    """Line plot of the share of shots made in each period of one game.

    The shots for (teamid, playerid, gameid) are fetched through
    h.get_nba_data; for every PERIOD the sum of SHOT_MADE_FLAG is divided by
    the sum of SHOT_ATTEMPTED_FLAG. `match_date` ('yyyymmdd') only feeds the
    title. The x axis always shows periods 1-4, followed by any period above
    4 in which a shot was recorded.
    """
    # getting data set up for percentage of shots
    # made in each period for chosen game
    game_shots = h.get_nba_data('shotchartdetail', shotchart_params(teamid, playerid, gameid))
    made_share = {
        period: sum(rows.SHOT_MADE_FLAG) / sum(rows.SHOT_ATTEMPTED_FLAG)
        for period, rows in game_shots.groupby('PERIOD')
    }

    period_percentage_df = pd.DataFrame({
                        'period': sorted(made_share),
                        'percentage_made': list(made_share.values()),
                      })

    plt.plot(period_percentage_df['period'], period_percentage_df['percentage_made'], '-o')
    plt.xlabel('Period')
    plt.ylabel('Percentage of Shots Made')
    plt.title('Percentage of Shots Made Per Period on ' + str(readable_date(match_date)))

    # periods above 4 are overtime; list them in the order they first appear
    overtime = [period for period in game_shots.PERIOD.unique() if period > 4]
    ticks = [1, 2, 3, 4] + overtime

    plt.xticks(ticks)
    plt.xlim((0.5, ticks[-1] + 0.5))
    plt.ylim((-0.1, 1.1))

In [21]:
def widgets_prep(teams, players, shotdata):
    """Build the three lookup dictionaries behind the dropdown menus.

    Parameters
    ----------
    teams : DataFrame indexed by team id with ABBREVIATION and TEAM_NAME
    players : DataFrame indexed by person id with DISPLAY_FIRST_LAST and TEAM_ID
    shotdata : DataFrame with PLAYER_ID, TEAM_ID, GAME_DATE and GAME_ID

    Returns
    -------
    team_dd : {'<abbreviation>, <team name>': team id}, ordered by abbreviation
    plyr_by_team_dd : {team id: {player name: person id}}, names sorted
    game_by_player_dd : {(person id, team id): {game date: game id}}
    """
    # (team: teamid), listed in abbreviation order
    teams_sorted = teams.sort_values('ABBREVIATION')
    team_labels = teams_sorted.ABBREVIATION + ', ' + teams_sorted.TEAM_NAME
    team_dd = dict(zip(team_labels, teams_sorted.index))

    # teamid: {player: personid}, players listed by first name
    players_sorted = players.sort_values('DISPLAY_FIRST_LAST')
    plyr_by_team_dd = {
        team_id: dict(zip(roster.DISPLAY_FIRST_LAST, roster.index))
        for team_id, roster in players_sorted.groupby('TEAM_ID')
    }

    # (personid, teamid): {game date: gameid}
    game_by_player_dd = {
        player_team: dict(zip(games.GAME_DATE, games.GAME_ID))
        for player_team, games in shotdata.groupby(['PLAYER_ID','TEAM_ID'])
    }

    return team_dd, plyr_by_team_dd, game_by_player_dd

In [22]:
# choose season drop down menu
# defaults in effect until a season is picked: the season label that the data
# functions read, and the calendar year that season ends in
latest_season, current_year = '2016-17', 2017

# season picker, pre-selected on the default season, plus its confirm button
season_button = Button(description='Choose Season!', icon='check')
season_menu = Dropdown(options=seasons, label=latest_season)

def select_season(pick):
    """Click handler of the season button: load a season and wire up the UI.

    Steps, all rendered inside the `out00` output widget:
    1. publish the chosen season in the module-level `latest_season` /
       `current_year`, which the data functions read;
    2. download and clean the teams / players / shots tables;
    3. load the per-player shot cache ('allshots<season>.pkl'), building it
       with get_allshots when it cannot be read;
    4. bin the shots, show the correlation heatmap and offer the tuning
       button; each later button (tuning -> number of components -> player
       -> game) is created by the handler of the previous one;
    5. create the linked team / player / game dropdowns.

    `pick` is whatever the button passes to its click handlers; it is unused.
    """
    # Without this declaration the two assignments below would create
    # function locals, and shotchart_params / get_data / get_current /
    # get_allshots would keep working on the default season.
    global latest_season, current_year

    with out00:
        clear_output()
        
        latest_season = season_menu.value
        print('Season picked is ' + season_menu.value + '!')

        # a label is '<start year>-<yy>': the season ends the year after it
        # starts (this also covers '1999-00' -> 2000 without a special case)
        current_year = int(latest_season[:4]) + 1

        # setting up the data
        print('Getting season data...')
        teams, players, shotdata = get_data()
        teams, players, shotdata = get_current(teams, players, shotdata)
        teams = clean_teams(teams, players)
        shotdata = prettier_dates(shotdata)
        player_other_dd = get_player_other_dict(shotdata)
        other_teams = get_other_teams_df(player_other_dd, teams, players)
        players = combine_all_teams(players, other_teams)
        teams, teamids, players, playerids = get_ID_indexes(teams, players)

        # getting allshots data from pickle file
        # (a local cache written by get_allshots; never point this at a
        # pickle from an untrusted source)
        cache_path = 'allshots' + str(latest_season) + '.pkl'
        try:
            with open(cache_path, 'rb') as cache_file:
                allshots = pickle.load(cache_file)
        except Exception:
            # missing or unreadable cache: rebuild it, then read what was
            # just written so `allshots` is defined on this path as well
            print('Please wait while shot data is pulled :)')
            get_allshots(playerids)
            with open(cache_path, 'rb') as cache_file:
                allshots = pickle.load(cache_file)

        # constructing heatmap
        print('Constructing heatmap...')
        all_smooth, all_counts, xedges, yedges = make_court_bins(allshots)

        # non negative matrix factorization
        # creating our X matrix: one column per player
        # (np.stack needs a real sequence, not a dict view)
        X = np.stack(list(all_smooth.values())).T
        correlation_matrix(X)
        show_inline_matplotlib_plots()


        # tuning parameters buton
        # then r selection button
        # then player selection button
        # then game selection button
        def begin_tuning(a):
            def select_r(b):
                def get_player_visuals(c):
                    def get_game_visuals(d):
                        # DEFINTION START: get_game_visuals
                        with out03:
                            clear_output()

                            teamid = str(team_menu.value)
                            playerid = str(plyr_menu.value)
                            gameid = str(game_menu.value)

                            # getting data set up for plot of total 
                            # shots each game for chosen player
                            player_shots = h.get_nba_data('shotchartdetail', shotchart_params(teamid, playerid))
                            player_shots_dd = dict()
                            for game_date, game_rows in player_shots.groupby('GAME_DATE'):
                                player_shots_dd[game_date] = sum(game_rows.SHOT_MADE_FLAG)

                            date_shots_df = pd.DataFrame({
                                                'GAME_DATE': list(player_shots_dd.keys()),
                                                'shots_made': list(player_shots_dd.values()),
                                                })

                            # getting game date (from value) to color later in plot_total_shots()
                            gameid_index = list(player_shots['GAME_ID']).index(gameid)
                            match_date = list(player_shots['GAME_DATE'])[gameid_index]
                            date_location = list(date_shots_df.GAME_DATE).index(match_date)

                            # getting prettier dates
                            date_shots_df = prettier_dates(date_shots_df)

                            plt.figure(figsize = (25, 8))
                            # plotting total shots each game for chosen player
                            plt.subplot(121)
                            plot_total_shots(player_shots, date_shots_df, date_location)
                            # plotting percentage of shots made in each period for chosen game
                            plt.subplot(122)
                            plot_percent_shots(match_date, gameid, teamid, playerid)
                            show_inline_matplotlib_plots()

                    # DEFINITION START: get_player_visuals
                    with out02:
                        clear_output()
                        teamid = str(team_menu.value)
                        playerid = str(plyr_menu.value)

                        if int(playerid) not in all_smooth:
                            # pass on the player with less than 50 shots:
                            # get_allshots never stored anything for them
                            print('Less than 50 shots.')
                        else:
                            try:
                                # constructing player shooting heatmap
                                make_heatmap(playerid, players, xedges, yedges, all_smooth)
                                show_inline_matplotlib_plots()
                                # hierarchical clustering 
                                players_coeff = hierarchical_prep(X, r_select, players, all_smooth)
                                plot_dendrogram(playerid, players_coeff, players, all_smooth)
                                show_inline_matplotlib_plots()
                            except Exception as err:
                                # keep the game widgets below usable, but
                                # report the real failure instead of blaming
                                # the shot count
                                print('Could not draw player visuals: ' + repr(err))

                        # call game button now
                        game_button = Button(description='Get Game Data!', icon='check')
                        display(game_menu, game_button)
                        out03 = Output()
                        display(out03)
                        with out03:
                            game_button.on_click(get_game_visuals)

                # DEFINITION START: select_r 
                with out01:
                    clear_output()

                    # read by get_player_visuals through its closure
                    r_select = r_menu.value
                    print('I have chosen to use ' + str(r_menu.value) + ' base vectors.')

                    # call team and player button now
                    player_button = Button(description='Get Player Data!', icon='check')
                    display(team_menu, plyr_menu, player_button)
                    out02 = Output()
                    display(out02)
                    with out02:
                        player_button.on_click(get_player_visuals)

            # DEFINITION START: begin_tuning
            # get rid of tuning button (no need to repeat it)
            tuning_button.close()

            print('Tuning paramaters...')
            nnmf_paramater_tuning(X)
            show_inline_matplotlib_plots()

            # choosing number of base vectors
            # prep for r selection
            r_menu = Dropdown(options= range(1,31), label='10')
            r_button = Button(description='Pick number of nnmf components!', icon='check', layout=Layout(width='35%'))
            print('Choose the number of base vectors you would like to have (default is 10).')
                
            display(r_menu, r_button)
            out01 = Output()
            display(out01)
            with out01:
                r_button.on_click(select_r)


        print('Let\'s now tune our parameters')
        tuning_button = Button(description='Start Tuning', icon='check')
        display(tuning_button)
        tuning_button.on_click(begin_tuning)

        # getting data setup
        team_dd, plyr_by_team_dd, game_by_player_dd = widgets_prep(teams, players, shotdata)

        # creating widgets, pre-selected on the player / game of the first
        # shot row (.iloc: after filtering, index label 0 may no longer exist)
        # NOTE(review): if that player has rows for several teams in
        # `players`, the .loc lookups below return Series instead of strings
        # -- confirm and pick a single row if that can happen.
        label_hold = shotdata['PLAYER_ID'].iloc[0]
        selected00 = players.loc[label_hold,'TEAM_ABBREVIATION'] + ', ' + players.loc[label_hold,'TEAM_NAME']
        selected01 = players.loc[label_hold, 'DISPLAY_FIRST_LAST']
        selected02 = shotdata['GAME_DATE'].iloc[0]

        team_menu = Dropdown(options=team_dd, label=selected00)
        plyr_menu = Dropdown(options=plyr_by_team_dd[team_dd[selected00]], label=selected01)
        game_menu = Dropdown(options=game_by_player_dd[(plyr_by_team_dd[team_dd[selected00]][selected01], team_dd[selected00])], label=selected02)

        
        # update players list definition
        def update_team(change): 
            plyr_menu.options = plyr_by_team_dd[change['new']]
            plyr_menu.value = list(plyr_by_team_dd[change['new']].values())[0]

        # update game list defintition
        def update_player(change):
            if change['new'] != None: # update game from player only when player is changed, ow will do it through above cell
                game_menu.options = game_by_player_dd[(change['new'], team_menu.value)]
                game_menu.value = list(game_by_player_dd[(change['new'], team_menu.value)].values())[0]   
        
        # updating widgets
        team_menu.observe(update_team, names='value')
        plyr_menu.observe(update_player, names='value')


# output area that select_season clears and writes into
out00 = Output()

# show the season picker and its button first, the output area underneath
display(season_menu, season_button)
display(out00)

# hook the button up to the season loader
with out00:
    season_button.on_click(select_season)

Dropdown(index=70, options=('1946-47', '1947-48', '1948-49', '1949-50', '1950-51', '1951-52', '1952-53', '1953…

Button(description='Choose Season!', icon='check', style=ButtonStyle())

Output()