In [1]:
import requests
import pandas as pd

## GET league-wide player stats from stats.nba.com/stats API
nba.com does not provide documentation for their stats API. However, some great documentation and classes built around the various endpoints exists. For example, the code below are snippets I've cherry-picked from https://github.com/seemethere/nba_py, which provides easy access to several useful endpoints. You can get tremendously granular in-game stats from this API. Some day, I'd love to dig into this [beautiful spatial-temporal model](https://arxiv.org/pdf/1408.0777.pdf) of *expected possession value* by Cervone _et al_... but lets move on with TDA! 

In [2]:
BASE_URL = 'http://stats.nba.com/stats/{endpoint}'
HEADERS  = {'user-agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/45.0.2454.101 Safari/537.36')
           }

## helper function for requests
def get_json(endpoint, params, referer='scores'):
    """
    Args:
        endpoint (str): endpoint to be called from the API
        params (dict): parameters to be passed to the API
    Raises:
        HTTPError: if requests hits a status code != 200
    Returns:
        json (json): json object for selected API call
    """
    h = dict(HEADERS)
    h['referer'] = 'http://stats.nba.com/{ref}/'.format(ref=referer)
    response = requests.get(BASE_URL.format(endpoint=endpoint), params=params, headers=h)
    response.raise_for_status()
    return response.json()

#### Specify parameters for player stats endpoint
The parameters below will query the specified endpoint for the 2015-16 NBA regular season. Note, we're requesting the stats to be normalized by the total number of in-game minutes of each player. 

In [3]:
ENDPOINT = 'leaguedashplayerstats'
PARAMS   = {'LeagueID': '00',       # NBA
            'Season': '2010-11',
            'SeasonType': 'Regular Season',
            'MeasureType': 'Base',  # options: Base, Advanced, Misc, Four Factors, Scoring, Opponent, Usage
            'PerMode': 'PerMinute', # options: PerGame, MinutesPer, PerMinute, PerPossession, ...
            'PlusMinus': 'N',       # ?
            'PaceAdjust': 'N',      # ?
            'Rank': 'N',
            'PORound': '0',         # all playoff rounds, other values pick specific rounds
            'Outcome': '',          # possible to filter by win ('W') or loss ('L') 
            'Location': '',         # possible to filter by 'Home' or 'Away'
            'Month': '0',           # all months, possible to filter by Oct ('1'), Nov ('2'), Dec ('3'), etc...
            'SeasonSegment': '',    # blank uses entire season
            'DateFrom': '',         # begin of date range filter
            'DateTo': '',           # end of date range filter
            'OpponentTeamID': '0',  # all opponents, or filter stats against specific teams
            'VsConference': '',
            'VsDivision': '',
            'TeamID': '0',
            'Conference': '',
            'Division': '',
            'GameSegment': '',      # all segments, other options: 'First Half', 'Second Half', 'Overtime'
            'Period': '0',          # can specifie quarter '1' or overtime period str(4+n)
            'ShotClockRange': '',
            'LastNGames': '0',
            'GameScope': '',
            'PlayerExperience': '',
            'PlayerPosition': '',
            'StarterBench': '',     # can select between 'Starter' or 'Bench' players
            'DraftYear': '',
            'DraftPick': '',
            'College': '',
            'Country': '',
            'Height': '',
            'Weight': ''
           }

#### Transform to a dataframe
Restrict to players who saw more than 30 minutes of playing time in the season.

In [4]:
# make the request and pull out the headers and rows
result = get_json(endpoint=ENDPOINT, params=PARAMS, referer='')
cols   = result['resultSets'][0]['headers']
rows   = result['resultSets'][0]['rowSet']

# restrict feature set to 7 key stats along with player/team info
features = [
 'PLAYER_ID',
 'PLAYER_NAME',
 'TEAM_ID',
 'TEAM_ABBREVIATION',
 'REB',
 'AST',
 'TOV',
 'STL',
 'BLK',
 'PF',
 'PTS'
]

# stats dataframe
df_stats = pd.DataFrame(rows, columns=cols)
df_stats = df_stats[df_stats.MIN > 30][features]
df_stats.head(5)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,REB,AST,TOV,STL,BLK,PF,PTS
0,201985,AJ Price,1610612754,IND,0.09,0.14,0.07,0.04,0.0,0.08,0.41
1,201166,Aaron Brooks,1610612756,PHX,0.06,0.18,0.08,0.03,0.0,0.09,0.49
2,201189,Aaron Gray,1610612740,NOH,0.32,0.03,0.06,0.02,0.02,0.18,0.24
3,201151,Acie Law,1610612744,GSW,0.09,0.12,0.06,0.04,0.0,0.08,0.3
4,1733,Al Harrington,1610612743,DEN,0.2,0.06,0.06,0.02,0.01,0.12,0.46


#### Add additional player info, e.g. position, weight, height, etc.
Heads-up: this takes a couple minutes. There's another endpoint for this and, unfortunately, we have to make a seperate request for each player, which is slow. There's probably a faster way to get this information, but we're only looking at a few hundred players.

In [5]:
# helper function to pull down player details from the API
def get_player_details(player_id, season, fields=['POSITION']):
    # make request
    result  = get_json('commonplayerinfo', params={'PlayerID':player_id})['resultSets'][0]
    allrows = pd.DataFrame(result['rowSet'], columns=result['headers'])
    
    # return the first row on the requested fields in a dictionary
    return dict(allrows[(allrows['FROM_YEAR'] <= season) & (season <= allrows['TO_YEAR'])].ix[0,fields])

# build a dataframe of player details
df_full = df_stats.merge(df_stats.PLAYER_ID.apply(lambda s: pd.Series(get_player_details(s, 2010, ['POSITION', 'WEIGHT', 'HEIGHT']))), 
                         left_index=True, 
                         right_index=True)
df_full.head(5)

Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,REB,AST,TOV,STL,BLK,PF,PTS,HEIGHT,POSITION,WEIGHT
0,201985,AJ Price,1610612754,IND,0.09,0.14,0.07,0.04,0.0,0.08,0.41,,,
1,201166,Aaron Brooks,1610612756,PHX,0.06,0.18,0.08,0.03,0.0,0.09,0.49,6-0,Guard,161.0
2,201189,Aaron Gray,1610612740,NOH,0.32,0.03,0.06,0.02,0.02,0.18,0.24,7-0,Center,270.0
3,201151,Acie Law,1610612744,GSW,0.09,0.12,0.06,0.04,0.0,0.08,0.3,6-3,Guard,202.0
4,1733,Al Harrington,1610612743,DEN,0.2,0.06,0.06,0.02,0.01,0.12,0.46,6-9,Forward,245.0


## Perform TDA analysis on player performance data
The specifications for the [Mapper algorithm](http://danifold.net/mapper/) as described in [Lum _et al_ (2013)](http://www.nature.com/articles/srep01236):
- the __similarity metric__ is variance-normalized Euclidean distance;
- the __filter functions__ are the first and second SVD components of players' stat lines;
- the __clustering__ is preformed with the mapper default, single-linkage hierachical clustering (note: there is some ambiguity over choosing a cutoff for the dendrogram; the biggest gap in the tree height seems justifiable);
- the __cover__ for the domain of the filter functions is a grid cover with the same number of intervals in each dimension and 50% overlap.

In [6]:
import mapper
from scipy.spatial.distance import pdist

Using cmappertools v1.0.24.


In [35]:
# point cloud (just the stats)
pcd = df_stats[['PTS','REB','AST','STL','BLK','PF','TOV']].as_matrix()

# the metric parameters below specify variance-normalized Euclidean (seuclidean) for the
# dissimilarity metric, where the variance (V) is computed automatically
dist = pdist(pcd, metric='seuclidean')

# compute filter values using first & second SVD components
filt = mapper.filters.dm_eigenvector(data=pcd, k=[0,1], metricpar={})

# assign the cover for the filter functions
part = mapper.cover.cube_cover_primitive(intervals=20, overlap=50)(filt)

# compute the mapper output (note: using single-linkage clustering default)
result = mapper.mapper(dist, filt, part, cutoff=mapper.cutoff.biggest_gap(), metricpar={}, verbose=False)

Eigenvalues:
[-24.06490908 -15.29422603]
Number of observations: 436.
There are 614 nodes.


#### Visualize the results

In [36]:
# import visualization utilities from parent directory
import sys

pwd = !pwd
parent_dir = '/'.join(pwd[0].split('/')[:-1])
if parent_dir not in sys.path:
    sys.path.append('/'.join(pwd[0].split('/')[:-1]))

import d3_lib
import tda_mapper_extensions
from IPython.core.display import HTML

One-hot encode the position and team categories for easy labeling in the visualization

In [37]:
df_encode = df_full[['POSITION','TEAM_ABBREVIATION']]

# note -- positions are sometimes blank and sometimes contains two, e.g. forward-guard.
# for blank entries, label unknown. for multiple positions, assume the first one is more achetypical.
df_encode['POSITION'] = df_encode.POSITION.apply(lambda x: 'UNKNOWN' if x == '' else x.split('-')[0].upper())

# encode labels
df_encode = pd.get_dummies(df_encode, columns=['POSITION','TEAM_ABBREVIATION'], prefix={'POSITION':'POSITION', 'TEAM_ABBREVIATION':'TEAM'})

# merge back stats
df_encode = df_full.merge(df_encode, left_index=True, right_index=True)
df_encode.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,REB,AST,TOV,STL,BLK,PF,...,TEAM_OKC,TEAM_ORL,TEAM_PHI,TEAM_PHX,TEAM_POR,TEAM_SAC,TEAM_SAS,TEAM_TOR,TEAM_UTA,TEAM_WAS
0,201985,AJ Price,1610612754,IND,0.09,0.14,0.07,0.04,0.0,0.08,...,0,0,0,0,0,0,0,0,0,0
1,201166,Aaron Brooks,1610612756,PHX,0.06,0.18,0.08,0.03,0.0,0.09,...,0,0,0,1,0,0,0,0,0,0
2,201189,Aaron Gray,1610612740,NOH,0.32,0.03,0.06,0.02,0.02,0.18,...,0,0,0,0,0,0,0,0,0,0
3,201151,Acie Law,1610612744,GSW,0.09,0.12,0.06,0.04,0.0,0.08,...,0,0,0,0,0,0,0,0,0,0
4,1733,Al Harrington,1610612743,DEN,0.2,0.06,0.06,0.02,0.01,0.12,...,0,0,0,0,0,0,0,0,0,0


In [38]:
df_encode.shape

(436, 48)

Build the graphs for visualizing (TODO: allow the various features to be selected interactively)

In [39]:
G_pts = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='PTS')
G_reb = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='REB')
G_ast = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='AST')
G_stl = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='STL')
G_blk = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='BLK')
G_pf  = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='PF')
G_tov = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='TOV')

# position labels
G_guard  = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='POSITION_GUARD')
G_frwrd  = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='POSITION_FORWARD')
G_center = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='POSITION_CENTER')

# a selection of teams
G_team_min = tda_mapper_extensions.custom_d3js_fdgraph(result, df_encode, feature='POSITION_CENTER')

Display with D3

In [40]:
HTML(d3_lib.set_styles('force_directed') +
     '<script src="http://d3js.org/d3.v3.min.js"></script>' +
     '<script src="http://marvl.infotech.monash.edu/webcola/cola.v3.min.js"></script>' +
     d3_lib.draw_graph('force_directed', {'data': G_guard}))

In [22]:
HTML(d3_lib.set_styles('force_directed') +
     '<script src="http://d3js.org/d3.v3.min.js"></script>' +
     '<script src="http://marvl.infotech.monash.edu/webcola/cola.v3.min.js"></script>' +
     d3_lib.draw_graph('force_directed', {'data': G_reb}))

In [23]:
HTML(d3_lib.set_styles('force_directed') +
     '<script src="http://d3js.org/d3.v3.min.js"></script>' +
     '<script src="http://marvl.infotech.monash.edu/webcola/cola.v3.min.js"></script>' +
     d3_lib.draw_graph('force_directed', {'data': G_ast}))

In [24]:
HTML(d3_lib.set_styles('force_directed') +
     '<script src="http://d3js.org/d3.v3.min.js"></script>' +
     '<script src="http://marvl.infotech.monash.edu/webcola/cola.v3.min.js"></script>' +
     d3_lib.draw_graph('force_directed', {'data': G_blk}))

In [25]:
HTML(d3_lib.set_styles('force_directed') +
     '<script src="http://d3js.org/d3.v3.min.js"></script>' +
     '<script src="http://marvl.infotech.monash.edu/webcola/cola.v3.min.js"></script>' +
     d3_lib.draw_graph('force_directed', {'data': G_pf}))

In [34]:
HTML(d3_lib.set_styles('force_directed') +
     '<script src="http://d3js.org/d3.v3.min.js"></script>' +
     '<script src="http://marvl.infotech.monash.edu/webcola/cola.v3.min.js"></script>' +
     d3_lib.draw_graph('force_directed', {'data': G_stl}))