In [101]:
import pandas as pd
import requests
import helper_functions
from sklearn import cluster, preprocessing
import json

In [102]:
# Open the project database connection through the local helper module.
# NOTE(review): presumably returns a SQLAlchemy-style engine, since it is later
# passed as `con=` to pd.read_sql -- confirm in helper_functions.
engine = helper_functions.create_db_engine()

Play Type URLs

In [103]:
# League-wide player bio stats endpoint (2016-17 regular season, per-game mode).
url_player = (
    'http://stats.nba.com/stats/leaguedashplayerbiostats?College=&Conference=&Country='
    '&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0'
    '&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&Period=0'
    '&PlayerExperience=&PlayerPosition=&Season=2016-17&SeasonSegment=&SeasonType=Regular+Season'
    '&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight='
)

# All Synergy play-type endpoints share one shape; only `category` and `q` vary.
# NOTE(review): the meaning of `q` is not evident from this notebook -- the values
# are reproduced exactly as they were originally written.
_SYNERGY_URL_TEMPLATE = (
    'http://stats-prod.nba.com/wp-json/statscms/v1/synergy/player/'
    '?category={category}&limit=500&names=offensive&q={q}&season=2016&seasonType=Reg'
)

url_pnrballhandler = _SYNERGY_URL_TEMPLATE.format(category='PRBallHandler', q=2487153)
url_transition = _SYNERGY_URL_TEMPLATE.format(category='Transition', q=2487275)
url_isolation = _SYNERGY_URL_TEMPLATE.format(category='Isolation', q=2487276)
url_pnrrollman = _SYNERGY_URL_TEMPLATE.format(category='PRRollman', q=2487276)
url_postup = _SYNERGY_URL_TEMPLATE.format(category='Postup', q=2487276)
url_spotup = _SYNERGY_URL_TEMPLATE.format(category='Spotup', q=2487276)
url_handoff = _SYNERGY_URL_TEMPLATE.format(category='Handoff', q=2487277)
url_cut = _SYNERGY_URL_TEMPLATE.format(category='Cut', q=2487277)
url_offscreen = _SYNERGY_URL_TEMPLATE.format(category='OffScreen', q=2487277)
# Putbacks are served under the 'OffRebound' category.
url_putback = _SYNERGY_URL_TEMPLATE.format(category='OffRebound', q=2487278)
url_misc = _SYNERGY_URL_TEMPLATE.format(category='Misc', q=2487293)

In [104]:
# Ordered (old, new) substring replacements applied to every player name so that
# API names match the names used on the database side. Order matters: the generic
# clean-up runs first, then the per-player spelling fixes, then accent stripping.
_PLAYER_NAME_REPLACEMENTS = (
    # generic punctuation / suffix clean-up
    ('-', ' '),
    ('  ', ' '),
    (' Jr.', ''),
    ('.', ''),
    (' III', ''),
    # fix specific names
    ('Jahil Okafor', 'Jahlil Okafor'),
    ('DAngelo Russell', "D'Angelo Russell"),
    ('Jonathan Simmons', 'Jonathon Simmons'),
    ('Malcom Delaney', 'Malcolm Delaney'),
    ('Nene Hilario', 'Nene'),
    # accented e -> plain e
    ('è', 'e'),
    ('é', 'e'),
)


def _normalize_player_name(name):
    """Return `name` with every replacement in _PLAYER_NAME_REPLACEMENTS applied in order."""
    for old, new in _PLAYER_NAME_REPLACEMENTS:
        name = name.replace(old, new)
    return name


def create_df_from_url(url, columns, timeout=30):
    """Download one play-type table and return it as a two-column DataFrame.

    The JSON response must contain a 'results' list whose records carry the
    'PlayerFirstName', 'PlayerLastName' and 'Time' fields. First and last name
    are joined with a space and normalised so the result can be merged on the
    player name later.

    Parameters
    ----------
    url : str
        Endpoint to fetch.
    columns : list of str
        Exactly two labels: the name column and the value column (the API's
        'Time' field, which the notebook treats as a percentage).
    timeout : float or tuple, optional
        Passed to ``requests.get`` so an unresponsive endpoint raises instead
        of hanging the kernel. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per record in 'results', labelled with `columns`.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within `timeout`.
    KeyError
        If the payload lacks 'results' or one of the expected fields.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    results = resp.json()['results']
    df = pd.DataFrame(results)

    # fix and reformat the player name in order to match on it later
    full_names = df['PlayerFirstName'] + ' ' + df['PlayerLastName']
    df['Player'] = full_names.apply(_normalize_player_name)

    # Copy before relabelling so the column rename never touches a view of `df`.
    out = df[['Player', 'Time']].copy()
    out.columns = columns
    return out

In [105]:
# (endpoint, value-column label) for every play type, in the order they are fetched.
_playtype_sources = [
    (url_pnrballhandler, 'PnR_BallHandler_Percent'),
    (url_transition, 'Transition_Percent'),
    (url_isolation, 'Isolation_Percent'),
    (url_pnrrollman, 'PnR_Rollman_Percent'),
    (url_postup, 'PostUp_Percent'),
    (url_spotup, 'SpotUp_Percent'),
    (url_handoff, 'Handoff_Percent'),
    (url_cut, 'Cut_Percent'),
    (url_offscreen, 'Offscreen_Percent'),
    (url_putback, 'Putback_Percent'),
    (url_misc, 'Misc_Percent'),
]

# One request per play type; each frame has columns ['Player', <value label>].
(df_pnrballhandler, df_transition, df_isolation, df_pnrrollman, df_postup, df_spotup,
 df_handoff, df_cut, df_offscreen, df_putback, df_misc) = [
    create_df_from_url(source_url, ['Player', value_label])
    for source_url, value_label in _playtype_sources
]

In [106]:
# Outer-join every play-type frame on Player, so a player who is missing from one
# play type keeps a row (with NaN in that play type's column).
playtype_frames = [df_pnrballhandler, df_transition, df_isolation, df_pnrrollman, df_postup,
                   df_spotup, df_handoff, df_cut, df_offscreen, df_putback, df_misc]
df_playtypes = playtype_frames[0]
for playtype_frame in playtype_frames[1:]:
    df_playtypes = df_playtypes.merge(playtype_frame, on='Player', how='outer')

# Total to make sure that percents add to 100 (NaN play types are skipped by sum)
df_playtypes['Total'] = df_playtypes.sum(axis=1, numeric_only=True)

# Misc had a few duplicate players, so keep only the first row for each player
df_playtypes = df_playtypes.drop_duplicates(subset='Player')

# A few players (about 10) had total value that was way off.  Looks like db is returning data from
# another column b/c there are some duplicate percents.  Keep only rows whose total is within
# one point of 100.
total_is_sane = (df_playtypes['Total'] >= 99) & (df_playtypes['Total'] <= 101)
df_playtypes = df_playtypes[total_is_sane]

# Height and weight are not merged in here; they come from the database query further down.
df_playtypes = df_playtypes.reset_index(drop=True)

# DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20; sort_values is the replacement.
df_playtypes.sort_values('Player')



Unnamed: 0,Player,PnR_BallHandler_Percent,Transition_Percent,Isolation_Percent,PnR_Rollman_Percent,PostUp_Percent,SpotUp_Percent,Handoff_Percent,Cut_Percent,Offscreen_Percent,Putback_Percent,Misc_Percent,Total
156,Aaron Brooks,44.125301,10.182800,10.966100,,,18.799000,4.960840,0.522193,3.655350,1.305480,5.48303,100.000094
304,Aaron Gordon,11.058200,19.923700,6.768350,4.480460,7.435650,22.211599,4.861770,7.530980,3.813160,8.007630,3.90848,99.999980
317,Al Farouq Aminu,2.272730,14.123400,4.870130,7.467530,1.623380,45.941601,2.759740,7.954550,0.811688,6.818180,5.35714,100.000068
184,Al Horford,0.722394,4.953560,4.024770,24.355000,28.173401,18.369499,0.619195,6.707950,3.508770,3.405570,5.15996,100.000068
356,Al Jefferson,,1.848430,1.848430,22.365999,46.210701,1.848430,0.369686,12.199600,,9.426990,3.88170,99.999966
65,Alan Anderson,11.702100,6.382980,19.148899,,1.063830,50.000000,1.063830,2.127660,4.255320,,4.25532,99.999939
346,Alan Williams,,9.604520,0.564972,23.728800,16.949200,0.564972,0.282486,20.903999,,18.079100,9.32203,100.000078
37,Alec Burks,29.936300,15.286600,8.280250,,,22.929899,7.006370,3.503180,5.414010,3.503180,4.14013,99.999919
292,Alex Abrines,6.345180,16.497499,2.030460,2.284260,,43.908600,6.598980,1.015230,11.928900,2.538070,6.85279,99.999969
366,Alex Len,,6.849320,2.283110,18.264799,19.482500,5.783870,0.456621,17.656000,1.369860,15.981700,11.87210,99.999880


In [107]:
# Pull shooting, per-game and advanced season stats plus height/weight for every player.
df_player_season_stats = pd.read_sql('''SELECT g.Player, g.Height, g.Weight, ps.*, p.PTSPerG, p.3PAPerG, a.FTr
                                        FROM BasketballDatabase.PlayerSeasonShootingStatsYTD ps
                                        JOIN BasketballDatabase.PlayerSeasonStatsPerGameYTD p on p.PlayerID = ps.PlayerID
                                        JOIN BasketballDatabase.PlayerSeasonAdvStatsYTD a on a.PlayerID = ps.PlayerID
                                        JOIN BasketballDatabase.GeneralPlayerInfo g on g.PlayerID = ps.PlayerID
                                        ''', con=engine)
# keep=False removes *every* row of a PlayerID that occurs more than once, not just the extras.
# NOTE(review): presumably this is meant to exclude players with several team rows -- confirm.
df_player_season_stats = df_player_season_stats.drop_duplicates(subset=['PlayerID'], keep=False).reset_index(drop=True)


def _height_to_inches(height):
    """Convert a '<a>-<b>' height string to a*12 + b (e.g. '7-0' -> 84).

    Assumes the database stores heights as 'feet-inches' text.
    """
    parts = height.split('-')
    return int(parts[0]) * 12 + int(parts[1])


df_player_season_stats['Height'] = df_player_season_stats['Height'].apply(_height_to_inches)

# Drop columns that are not used downstream (reassign rather than mutate in place).
df_player_season_stats = df_player_season_stats.drop(
    ['FGPercent', 'AvgShotDist', 'Season', 'GameTypeID', 'TeamID'], axis=1)

# Normalise names the same way the play-type frames do, so the two can be merged on Player.
df_player_season_stats['Player'] = df_player_season_stats['Player'].apply(
    lambda s: s.replace('-', ' ').replace('.', '').replace(' III', '').replace(' Jr', ''))

# DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20; sort_values is the replacement.
df_player_season_stats.sort_values('Player')



Unnamed: 0,Player,Height,Weight,PlayerID,Age,2PAPercent,PercentFGA0to2ft,PercentFGA3to9ft,PercentFGA10to15ft,PercentFGA16Plusftto3,...,FGPercent10to15ft,FGPercent16Plusftto3,3PFGPercent,Percent2PAAstByOthers,Percent3PAAstByOthers,Percent3PAFromCorner,3PPercentFromCorner,PTSPerG,3PAPerG,FTr
405,AJ Hammons,84,260,1601,24,0.727,0.273,0.091,0.182,0.182,...,0.250,0.500,0.667,0.800,1.000,0.167,0.000,1.4,0.4,0.455
81,Aaron Brooks,72,161,943,32,0.578,0.192,0.157,0.105,0.118,...,0.533,0.353,0.355,0.099,0.558,0.124,0.333,5.0,2.0,0.132
265,Aaron Gordon,81,220,1391,21,0.686,0.350,0.102,0.103,0.132,...,0.427,0.314,0.284,0.497,0.859,0.264,0.242,12.4,3.3,0.253
321,Aaron Harrison,78,210,1498,22,0.500,0.000,0.000,0.000,0.500,...,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.2,0.4,0.500
290,Adreian Payne,82,237,1440,25,0.756,0.311,0.156,0.089,0.200,...,0.750,0.556,0.182,0.667,1.000,0.091,0.000,4.0,0.8,0.422
141,Al Farouq Aminu,81,220,1092,26,0.552,0.329,0.129,0.031,0.062,...,0.385,0.385,0.324,0.462,1.000,0.372,0.357,8.6,3.4,0.288
74,Al Horford,82,245,913,30,0.693,0.217,0.213,0.130,0.131,...,0.402,0.459,0.358,0.581,0.976,0.148,0.382,14.1,3.6,0.166
27,Al Jefferson,82,289,664,32,0.998,0.289,0.380,0.217,0.108,...,0.324,0.353,0.000,0.536,0.000,0.000,0.000,8.1,0.0,0.180
48,Alan Anderson,78,220,784,34,0.456,0.177,0.089,0.089,0.101,...,0.143,0.250,0.302,0.563,0.846,0.349,0.200,2.9,1.5,0.203
351,Alan Williams,80,260,1541,24,0.996,0.500,0.415,0.076,0.004,...,0.444,0.000,0.000,0.723,0.000,0.000,0.000,6.8,0.0,0.386


In [108]:
# Join season stats with play-type shares on the normalised player name (merge's
# default inner join), then replace every remaining NaN with 0.0.
df = df_player_season_stats.merge(df_playtypes, on='Player').fillna(0.0)

df

Unnamed: 0,Player,Height,Weight,PlayerID,Age,2PAPercent,PercentFGA0to2ft,PercentFGA3to9ft,PercentFGA10to15ft,PercentFGA16Plusftto3,...,Isolation_Percent,PnR_Rollman_Percent,PostUp_Percent,SpotUp_Percent,Handoff_Percent,Cut_Percent,Offscreen_Percent,Putback_Percent,Misc_Percent,Total
0,Jason Terry,74,185,5,39,0.288,0.119,0.062,0.049,0.058,...,2.061860,1.718210,0.000000,41.924400,5.154640,2.061860,9.621990,1.718210,4.46735,100.000021
1,Jamal Crawford,77,200,32,36,0.630,0.092,0.124,0.120,0.298,...,25.300100,0.092336,0.000000,18.928900,5.909510,0.646353,2.585410,1.108030,4.70914,99.999980
2,Dirk Nowitzki,84,245,46,38,0.685,0.064,0.056,0.213,0.353,...,3.641090,27.958401,21.326401,24.707399,1.170350,2.600780,5.331600,1.950590,3.77113,99.999999
3,Vince Carter,78,220,216,40,0.408,0.188,0.095,0.048,0.078,...,4.047220,0.337268,7.082630,33.558201,7.925800,2.866780,10.961200,3.709950,6.40809,99.999969
4,Paul Pierce,79,235,283,39,0.403,0.097,0.016,0.129,0.161,...,7.692310,15.384600,5.494510,45.054901,0.000000,0.000000,6.593410,0.000000,6.59341,99.999941
5,Joe Johnson,79,240,443,35,0.595,0.079,0.229,0.126,0.161,...,15.310300,1.379310,10.482800,32.413799,4.827590,1.517240,5.655170,2.068970,2.48276,100.000029
6,Zach Randolph,81,260,450,35,0.902,0.296,0.287,0.107,0.211,...,13.228900,11.575300,32.288898,11.227200,0.609225,6.875540,1.479550,13.315900,5.04787,99.999991
7,Richard Jefferson,79,233,455,36,0.460,0.298,0.086,0.025,0.052,...,3.854880,5.442180,0.680272,44.217701,0.907029,10.884400,2.721090,2.947850,9.29705,100.000084
8,Tony Parker,74,185,468,34,0.877,0.279,0.228,0.117,0.253,...,6.488010,0.000000,0.282087,17.066299,7.334270,1.269390,3.949220,0.282087,3.80818,99.999942
9,Tyson Chandler,85,240,474,34,1.000,0.895,0.096,0.004,0.004,...,0.000000,16.997200,1.133140,0.283286,1.133140,30.028299,0.000000,26.062300,18.98020,100.000005


In [109]:
# Columns fed to the clustering: body size, shot-location mix, assisted-shot rates,
# free-throw rate and play-type shares (Misc_Percent and Total are left out).
feature_columns = ['Height', 'Weight', 'PercentFGA0to2ft', 'PercentFGA3to9ft', 'PercentFGA10to15ft',
                   'PercentFGA16Plusftto3', 'PercentFGA3P', 'Percent2PAAstByOthers', 'Percent3PAAstByOthers',
                   'Percent3PAFromCorner', 'FTr', 'PnR_BallHandler_Percent', 'Transition_Percent',
                   'Isolation_Percent', 'PnR_Rollman_Percent', 'PostUp_Percent', 'SpotUp_Percent',
                   'Handoff_Percent', 'Cut_Percent', 'Offscreen_Percent', 'Putback_Percent']
data = df[feature_columns]

# NOTE(review): presumably standardises each feature (zero mean / unit variance) -- confirm in helper_functions.
data = helper_functions.standard_scaler(data)

# Fixed seed so the cluster labels are reproducible between runs.
rand_state = 25
y_pred = cluster.KMeans(n_clusters=8, max_iter=1000, n_init=20, random_state=rand_state).fit_predict(data)
df_class = pd.DataFrame({'Class': y_pred})

# Attach the labels with assign() rather than pd.concat(axis=1): re-running this cell
# overwrites 'Class' instead of adding a second column with the same name, and the
# labels are matched to rows by position rather than by index label.
df = df.assign(Class=y_pred)

In [112]:
# Players in cluster 1, highest scorers first. DataFrame.sort was deprecated in
# pandas 0.17 and removed in 0.20; sort_values is the replacement.
df[df['Class'] == 1].sort_values('PTSPerG', ascending=False)

  if __name__ == '__main__':


Unnamed: 0,Player,Height,Weight,PlayerID,Age,2PAPercent,PercentFGA0to2ft,PercentFGA3to9ft,PercentFGA10to15ft,PercentFGA16Plusftto3,...,PnR_Rollman_Percent,PostUp_Percent,SpotUp_Percent,Handoff_Percent,Cut_Percent,Offscreen_Percent,Putback_Percent,Misc_Percent,Total,Class
223,Rudy Gobert,85,245,1353,24,0.998,0.849,0.143,0.005,0.002,...,20.8925,4.15822,1.31846,0.0,30.0203,0.10142,21.8053,12.7789,100.00005,1
28,Dwight Howard,83,265,676,31,0.997,0.764,0.165,0.031,0.037,...,10.0823,27.9835,2.77778,0.102881,20.9877,0.0,21.913601,10.3909,99.999981,1
97,DeAndre Jordan,83,265,997,28,0.996,0.871,0.125,0.0,0.0,...,19.5205,10.6164,0.570776,0.0,21.689501,0.0,24.7717,15.7534,99.999902,1
258,Clint Capela,82,240,1431,22,1.0,0.775,0.214,0.006,0.004,...,28.021999,3.2967,0.824176,0.0,29.945101,0.137363,14.4231,7.14286,100.000087,1
212,Cody Zeller,84,240,1329,24,0.998,0.661,0.198,0.027,0.112,...,33.106998,1.69779,9.33786,1.18845,21.222401,2.54669,9.16808,10.017,100.000039,1
166,Kenneth Faried,80,228,1185,27,0.985,0.647,0.279,0.035,0.027,...,9.80036,11.2523,3.44828,0.907441,26.8603,0.181488,25.2269,9.07441,100.000164,1
278,Montrezl Harrell,80,240,1487,23,0.982,0.687,0.202,0.031,0.061,...,24.5327,3.50467,5.60748,0.233645,29.439301,0.233645,12.1495,6.30841,99.99995,1
9,Tyson Chandler,85,240,474,34,1.0,0.895,0.096,0.004,0.004,...,16.9972,1.13314,0.283286,1.13314,30.028299,0.0,26.0623,18.9802,100.000005,1
362,Shawn Long,81,255,1627,24,0.822,0.671,0.027,0.027,0.082,...,16.6667,7.24638,15.942,1.44928,13.0435,0.724638,26.087,7.24638,100.000086,1
154,Tristan Thompson,81,238,1163,25,0.993,0.696,0.252,0.035,0.009,...,25.5853,1.33779,1.17057,0.167224,28.595301,0.0,21.0702,10.0334,99.999899,1
