In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_score

In [2]:
# Load the RAPTOR-by-team table; the path is relative, so it resolves against
# the kernel's working directory.
all_data = pd.read_csv('../data/historical_RAPTOR_by_team.csv')

In [6]:
all_data.columns

Index(['player_name', 'player_id', 'season', 'season_type', 'team', 'poss',
       'mp', 'raptor_offense', 'raptor_defense', 'raptor_total', 'war_total',
       'war_reg_season', 'war_playoffs', 'predator_offense',
       'predator_defense', 'predator_total', 'pace_impact'],
      dtype='object')

In [33]:
def split_season(df, min_minutes=300):
    """Split a table into regular-season and playoff rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``season_type`` column (``'RS'`` / ``'PO'``) and an
        ``mp`` column.
    min_minutes : number, default 300
        Regular-season rows are kept only when ``mp`` is strictly greater
        than this value. Playoff rows are never filtered on ``mp``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(reg_season, playoffs)``. Both are explicit copies, so adding a
        column to either one later does not emit ``SettingWithCopyWarning``
        and never touches ``df``.
    """
    is_regular = df['season_type'] == 'RS'
    is_playoff = df['season_type'] == 'PO'

    # .copy() detaches the results from ``df`` so callers can safely add
    # columns (e.g. cluster labels) to them.
    reg_season = df[is_regular & (df['mp'] > min_minutes)].copy()
    playoffs = df[is_playoff].copy()
    return reg_season, playoffs

In [34]:
# Unpack the (regular-season, playoffs) pair returned by split_season.
reg_season, playoffs = split_season(all_data)

In [35]:
def dropColumns(df):
    """Return a new DataFrame with the columns listed below removed.

    ``df`` itself is not modified. Every listed column must be present;
    ``DataFrame.drop`` raises ``KeyError`` for a missing label.
    """
    columns_to_remove = [
        'player_name', 'player_id', 'season', 'season_type', 'team',
        'poss', 'mp',
        'raptor_total', 'war_total', 'war_playoffs',
        'predator_total', 'predator_offense', 'predator_defense',
    ]
    return df.drop(columns=columns_to_remove)

In [36]:
# Keep only the columns dropColumns leaves in place; these are the clustering inputs.
reg_season_stats_df = dropColumns(reg_season)

In [37]:
reg_season_stats_df

Unnamed: 0,raptor_offense,raptor_defense,war_reg_season,pace_impact
3,-2.480005,-0.123125,0.070322,0.466668
6,-1.845659,-2.038480,-0.669619,0.444865
8,-2.870680,-0.987076,-0.273787,1.665034
11,4.267101,3.075579,15.456216,-0.513828
13,3.631396,3.605263,11.478972,-0.577012
...,...,...,...,...
27364,-2.073401,-0.690914,-0.007877,-0.331493
27366,-3.358178,-3.040589,-1.141160,0.644128
27367,-2.903709,2.688832,0.531968,0.311576
27369,-3.142927,1.774239,0.369725,0.120889


In [38]:
reg_season

Unnamed: 0,player_name,player_id,season,season_type,team,poss,mp,raptor_offense,raptor_defense,raptor_total,war_total,war_reg_season,war_playoffs,predator_offense,predator_defense,predator_total,pace_impact
3,Alaa Abdelnaby,abdelal01,1992,RS,POR,1948,934,-2.480005,-0.123125,-2.603129,0.070322,0.070322,0.0,-2.416858,-0.634025,-3.050883,0.466668
6,Alaa Abdelnaby,abdelal01,1993,RS,BOS,2304,1152,-1.845659,-2.038480,-3.884140,-0.669619,-0.669619,0.0,-1.804417,-1.907295,-3.711713,0.444865
8,Alaa Abdelnaby,abdelal01,1995,RS,SAC,926,476,-2.870680,-0.987076,-3.857756,-0.273787,-0.273787,0.0,-2.839132,-0.608234,-3.447366,1.665034
11,Kareem Abdul-Jabbar,abdulka01,1977,RS,LAL,6654,3016,4.267101,3.075579,7.342679,15.456216,15.456216,0.0,4.482404,2.556956,7.039361,-0.513828
13,Kareem Abdul-Jabbar,abdulka01,1978,RS,LAL,5064,2265,3.631396,3.605263,7.236659,11.478972,11.478972,0.0,4.130288,2.971010,7.101298,-0.577012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27364,Ante Zizic,zizican01,2019,RS,CLE,2240,1082,-2.073401,-0.690914,-2.764315,-0.007877,-0.007877,0.0,-2.392563,-2.024149,-4.416713,-0.331493
27366,Ivica Zubac,zubaciv01,2017,RS,LAL,1305,609,-3.358178,-3.040589,-6.398767,-1.141160,-1.141160,0.0,-3.533604,-2.932103,-6.465707,0.644128
27367,Ivica Zubac,zubaciv01,2018,RS,LAL,871,410,-2.903709,2.688832,-0.214877,0.531968,0.531968,0.0,-2.616331,0.752875,-1.863456,0.311576
27369,Ivica Zubac,zubaciv01,2019,RS,LAC,1141,524,-3.142927,1.774239,-1.368688,0.369725,0.369725,0.0,-3.105152,0.994436,-2.110716,0.120889


In [10]:
def scale_df(df):
    """Fit a fresh ``StandardScaler`` on ``df`` and return the transformed data.

    The fitted scaler is not kept, so the transform cannot be inverted or
    re-applied to other data later.
    """
    return StandardScaler().fit_transform(df)

In [39]:
# Standardized copy of the stat columns; this is what the KMeans cells below fit on.
reg_season_scale_df = scale_df(reg_season_stats_df)

In [40]:
reg_season_stats_df

Unnamed: 0,raptor_offense,raptor_defense,war_reg_season,pace_impact
3,-2.480005,-0.123125,0.070322,0.466668
6,-1.845659,-2.038480,-0.669619,0.444865
8,-2.870680,-0.987076,-0.273787,1.665034
11,4.267101,3.075579,15.456216,-0.513828
13,3.631396,3.605263,11.478972,-0.577012
...,...,...,...,...
27364,-2.073401,-0.690914,-0.007877,-0.331493
27366,-3.358178,-3.040589,-1.141160,0.644128
27367,-2.903709,2.688832,0.531968,0.311576
27369,-3.142927,1.774239,0.369725,0.120889


In [41]:
reg_season_scale_df

array([[-0.79931692,  0.03245956, -0.65679582,  0.68066241],
       [-0.52718347, -1.20464183, -0.88395258,  0.64044233],
       [-0.96691614, -0.5255545 , -0.7624349 ,  2.89130461],
       ...,
       [-0.98108591,  1.84866307, -0.51507363,  0.39456204],
       [-1.08370982,  1.25793994, -0.56488115,  0.04279832],
       [-0.35769147,  1.1015225 , -0.449591  , -0.21969283]])

In [42]:
# Silhouette score for every candidate cluster count from 2 to 20.
# silhouette[j] holds the score for n_clusters = j + 2.
silhouette = []

for n_clusters in range(2, 21):
    # Fixed random_state so re-running the sweep gives the same labels.
    kmeans = KMeans(n_clusters = n_clusters, random_state = 99)
    cluster_labels = kmeans.fit_predict(reg_season_scale_df)
    
    # NOTE(review): centers is not read anywhere else in this cell.
    centers = kmeans.cluster_centers_

    score = silhouette_score(reg_season_scale_df, cluster_labels)
    silhouette.append(score)
    
    print("For n_clusters = {}, silhouette score is {}".format(n_clusters, score))

For n_clusters = 2, silhouette score is 0.3133639957428435
For n_clusters = 3, silhouette score is 0.22576861117515062
For n_clusters = 4, silhouette score is 0.2341727139950163
For n_clusters = 5, silhouette score is 0.22582238765363372
For n_clusters = 6, silhouette score is 0.2229203449658741
For n_clusters = 7, silhouette score is 0.22055913725811863
For n_clusters = 8, silhouette score is 0.20866373001025657
For n_clusters = 9, silhouette score is 0.2098903286977371
For n_clusters = 10, silhouette score is 0.20709858784245788
For n_clusters = 11, silhouette score is 0.2089558191637835
For n_clusters = 12, silhouette score is 0.20678662177707446
For n_clusters = 13, silhouette score is 0.20965212534611433
For n_clusters = 14, silhouette score is 0.21163965379436242
For n_clusters = 15, silhouette score is 0.2036088803440211
For n_clusters = 16, silhouette score is 0.2012999164158483
For n_clusters = 17, silhouette score is 0.20371909335138447
For n_clusters = 18, silhouette score i

In [43]:
# For each step k-1 -> k, the relative shrinkage of (1 - silhouette).
# The printed value is a fraction (0.01 == 1%), not a percentage.
silhouette_diff = []

for i in range(1, len(silhouette)):
    gap_before = 1 - silhouette[i - 1]
    gap_after = 1 - silhouette[i]
    improvement = 1 - (gap_after / gap_before)
    silhouette_diff.append(improvement)
    # silhouette[0] is the 2-cluster score, so index i maps to i + 2 clusters.
    print("For n_cluster = {}, percent improvement = {}".format(i + 2, improvement))

For n_cluster = 3, percent improvement = -0.12757179062064883
For n_cluster = 4, percent improvement = 0.010854768924599578
For n_cluster = 5, percent improvement = -0.01090366782952179
For n_cluster = 6, percent improvement = -0.0037485489653519277
For n_cluster = 7, percent improvement = -0.0030385658567420037
For n_cluster = 8, percent improvement = -0.01526146217946156
For n_cluster = 9, percent improvement = 0.001550034712166748
For n_cluster = 10, percent improvement = -0.0035333586673833572
For n_cluster = 11, percent improvement = 0.0023423231348169704
For n_cluster = 12, percent improvement = -0.002742194986398827
For n_cluster = 13, percent improvement = 0.003612525516727527
For n_cluster = 14, percent improvement = 0.0025147514303350826
For n_cluster = 15, percent improvement = -0.010186678577877917
For n_cluster = 16, percent improvement = -0.002899283870932967
For n_cluster = 17, percent improvement = 0.003028892803767058
For n_cluster = 18, percent improvement = -0.003334

In [44]:
# Final regular-season model with 6 clusters.
# random_state matches the silhouette sweep so the labels are reproducible
# across kernel restarts (cluster ids are otherwise arbitrary per run).
kmeans = KMeans(n_clusters = 6, random_state = 99)

# A single fit is enough; fit_predict returns the label of every training row.
y_kmeans = kmeans.fit_predict(reg_season_scale_df)

In [45]:
# reg_season is a filtered slice of all_data, so in-place column assignment
# can emit SettingWithCopyWarning. assign() builds a new frame instead and is
# safe to re-run.
reg_season = reg_season.assign(cluster=y_kmeans)

In [None]:
# Cluster every row of all_data into 12 groups.
# The previous version passed `scale_df` (the scaling *function*) to
# KMeans.fit, which only worked while a stale array of that name was left in
# the kernel. Build the scaled matrix explicitly so the cell survives
# Restart & Run All.
# NOTE(review): assumes the same feature columns as the regular-season run
# (dropColumns) are the intended inputs here; confirm.
all_data_scaled = scale_df(dropColumns(all_data))

# Seeded for reproducible labels; one fit_predict replaces the duplicated
# fit/predict pairs.
kmeans = KMeans(n_clusters = 12, random_state = 99)
y_kmeans = kmeans.fit_predict(all_data_scaled)

In [35]:
# y_kmeans must hold exactly one label per row of all_data, in row order,
# for this assignment to succeed. Mutates all_data in place.
all_data['cluster'] = y_kmeans

In [100]:
# Rows assigned to cluster label 0. The selector previously used label 1,
# which contradicted both the variable name and the displayed output below.
cluster_0 = all_data[all_data['cluster']==0]

In [46]:
cluster_0[cluster_0['season']==2019]

Unnamed: 0,player_name,player_id,season,season_type,team,poss,mp,raptor_offense,raptor_defense,raptor_total,war_total,war_reg_season,war_playoffs,predator_offense,predator_defense,predator_total,pace_impact,cluster
132,Jaylen Adams,adamsja01,2019,RS,ATL,952,428,-2.377644,-3.734729,-6.112373,-0.733513,-0.733513,0.0,-2.023145,-4.304853,-6.327998,-0.099018,0
269,DeVaughn Akoon-Purcell,akoonde01,2019,RS,DEN,49,22,-2.229499,-1.01984,-3.249339,-0.005556,-0.005556,0.0,-1.262366,0.66152,-0.600846,-0.894708,0
353,Jarrett Allen,allenja01,2019,RS,BRK,4478,2096,-1.523007,-2.805412,-4.328419,-1.688439,-1.688439,0.0,-1.515219,-1.770966,-3.286186,0.030829,0
358,Kadeem Allen,allenka01,2019,RS,NYK,870,416,2.078138,-1.023795,1.054343,0.801391,0.801391,0.0,1.129783,-1.333298,-0.203515,-0.764299,0
499,Al-Farouq Aminu,aminual01,2019,PO,POR,823,399,-0.879552,-0.971744,-1.851295,0.1872,0.0,0.1872,-1.023039,-0.77789,-1.800928,-0.783185,0
712,Ryan Anderson,anderry01,2019,RS,PHO,570,278,-3.270465,-1.670249,-4.940714,-0.309508,-0.309508,0.0,-2.183461,-2.721894,-4.905355,-0.398038,0
797,Carmelo Anthony,anthoca01,2019,RS,HOU,612,294,-2.506433,-2.628394,-5.134827,-0.357081,-0.357081,0.0,-1.131556,-2.09568,-3.227237,-0.18251,0
863,Ryan Arcidiacono,arcidry01,2019,RS,CHI,4077,1961,-0.091056,-0.403091,-0.494146,2.22707,2.22707,0.0,-1.028347,-0.31001,-1.338357,-1.350893,0
909,Trevor Ariza,arizatr01,2019,RS,PHO,1856,884,-1.445941,-2.092738,-3.538678,-0.355141,-0.355141,0.0,-1.016009,-1.24049,-2.256499,-0.162205,0
910,Trevor Ariza,arizatr01,2019,RS,WAS,3122,1465,0.512549,-1.910558,-1.398009,1.014975,1.014975,0.0,0.663968,-1.607899,-0.943931,0.447631,0


In [99]:
# NOTE(review): the variable is named cluster_1 but selects label 8; confirm
# which one is intended and rename or re-select accordingly.
cluster_1 = all_data[all_data['cluster']==8]
# Show only the 2019 rows of that cluster.
cluster_1[cluster_1['season']==2019]

Unnamed: 0,player_name,player_id,season,season_type,team,poss,mp,raptor_offense,raptor_defense,raptor_total,war_total,war_reg_season,war_playoffs,predator_offense,predator_defense,predator_total,pace_impact,cluster
6074,Tyler Davis,davisty01,2019,RS,OKC,2,1,-40.825921,61.113882,20.287961,0.011423,0.011423,0.0,-26.935305,42.989063,16.053758,-2.873834,8
24683,Tyler Ulis,ulisty01,2019,RS,CHI,2,1,-16.801473,62.469205,45.667732,0.023868,0.023868,0.0,-15.001847,41.704602,26.702755,-3.443889,8


In [50]:
# Rows labelled 3, then the first 50 of them from the 2019 season.
cluster_3 = all_data.loc[all_data['cluster'] == 3]
cluster_3.loc[cluster_3['season'] == 2019].head(50)

Unnamed: 0,player_name,player_id,season,season_type,team,poss,mp,raptor_offense,raptor_defense,raptor_total,war_total,war_reg_season,war_playoffs,predator_offense,predator_defense,predator_total,pace_impact,cluster
349,Grayson Allen,allengr01,2019,PO,UTA,32,14,-9.8357,-5.824538,-15.660238,-0.096257,0.0,-0.096257,-8.832492,-8.682236,-17.514728,1.202344,3
350,Grayson Allen,allengr01,2019,RS,UTA,899,416,-4.608434,-5.53922,-10.147654,-1.583498,-1.583498,0.0,-3.970137,-4.558908,-8.529045,0.869532,3
352,Jarrett Allen,allenja01,2019,PO,BRK,245,110,-1.208032,-7.931457,-9.139489,-0.378198,0.0,-0.378198,-3.278979,-7.242624,-10.521602,2.247688,3
713,Ryan Anderson,anderry01,2019,RS,MIA,90,44,-3.705832,-2.712141,-6.417973,-0.081488,-0.081488,0.0,-3.74913,-4.981757,-8.730887,-1.056914,3
1282,Ron Baker,bakerro01,2019,RS,WAS,98,45,-10.675821,-0.86468,-11.540501,-0.201974,-0.201974,0.0,-9.408844,-3.288406,-12.69725,0.07724,3
1307,Wade Baldwin IV,baldwwa01,2019,RS,POR,199,94,-11.48865,-5.027375,-16.516024,-0.667983,-0.667983,0.0,-10.653276,-5.076885,-15.730161,1.201191,3
1782,Jerryd Bayless,bayleje01,2019,RS,MIN,1365,657,-1.929942,-6.130163,-8.060105,-1.75416,-1.75416,0.0,-1.306543,-5.091015,-6.397558,-1.4769,3
2049,Dairis Bertans,bertada02,2019,RS,NOP,368,167,-5.792376,-5.219272,-11.011647,-0.699774,-0.699774,0.0,-4.934068,-5.509983,-10.444051,-0.600327,3
2289,Antonio Blakeney,blakean01,2019,RS,CHI,1750,829,-3.05917,-4.042692,-7.101862,-1.829112,-1.829112,0.0,-3.451717,-5.196974,-8.648692,-0.638573,3
2486,Jonah Bolden,boldejo01,2019,PO,PHI,175,79,-7.768725,-5.571062,-13.339787,-0.448217,0.0,-0.448217,-5.493739,-5.852479,-11.346218,1.807021,3


In [61]:
# All 2019 rows. The cluster_dfs cell filters this frame on its 'cluster'
# column, so all_data must already carry that column when this runs.
season_2019 = all_data[all_data['season']==2019]

In [66]:
# Hard-coded cluster labels, stored in ascending order.
cluster_lst_2019 = sorted([10, 0, 2, 9, 3, 7, 4, 11, 5, 1, 8, 6])

In [69]:
# One DataFrame of 2019 rows per cluster label, in the order of
# cluster_lst_2019 (cluster_dfs[j] pairs with cluster_lst_2019[j]).
cluster_dfs = []
for i in cluster_lst_2019:
    # The stray `cluster` token before this call was a SyntaxError.
    cluster_dfs.append(season_2019[season_2019['cluster']==i])

In [101]:
# Raise the global row display cap to 100 so the slice below is not truncated.
pd.set_option('display.max_rows', 100)
# First (up to) 100 rows of the first per-cluster frame.
cluster_dfs[0][:100]

Unnamed: 0,player_name,player_id,season,season_type,team,poss,mp,raptor_offense,raptor_defense,raptor_total,war_total,war_reg_season,war_playoffs,predator_offense,predator_defense,predator_total,pace_impact,cluster
132,Jaylen Adams,adamsja01,2019,RS,ATL,952,428,-2.377644,-3.734729,-6.112373,-0.733513,-0.733513,0.0,-2.023145,-4.304853,-6.327998,-0.099018,0
269,DeVaughn Akoon-Purcell,akoonde01,2019,RS,DEN,49,22,-2.229499,-1.01984,-3.249339,-0.005556,-0.005556,0.0,-1.262366,0.66152,-0.600846,-0.894708,0
353,Jarrett Allen,allenja01,2019,RS,BRK,4478,2096,-1.523007,-2.805412,-4.328419,-1.688439,-1.688439,0.0,-1.515219,-1.770966,-3.286186,0.030829,0
358,Kadeem Allen,allenka01,2019,RS,NYK,870,416,2.078138,-1.023795,1.054343,0.801391,0.801391,0.0,1.129783,-1.333298,-0.203515,-0.764299,0
499,Al-Farouq Aminu,aminual01,2019,PO,POR,823,399,-0.879552,-0.971744,-1.851295,0.1872,0.0,0.1872,-1.023039,-0.77789,-1.800928,-0.783185,0
712,Ryan Anderson,anderry01,2019,RS,PHO,570,278,-3.270465,-1.670249,-4.940714,-0.309508,-0.309508,0.0,-2.183461,-2.721894,-4.905355,-0.398038,0
797,Carmelo Anthony,anthoca01,2019,RS,HOU,612,294,-2.506433,-2.628394,-5.134827,-0.357081,-0.357081,0.0,-1.131556,-2.09568,-3.227237,-0.18251,0
863,Ryan Arcidiacono,arcidry01,2019,RS,CHI,4077,1961,-0.091056,-0.403091,-0.494146,2.22707,2.22707,0.0,-1.028347,-0.31001,-1.338357,-1.350893,0
909,Trevor Ariza,arizatr01,2019,RS,PHO,1856,884,-1.445941,-2.092738,-3.538678,-0.355141,-0.355141,0.0,-1.016009,-1.24049,-2.256499,-0.162205,0
910,Trevor Ariza,arizatr01,2019,RS,WAS,3122,1465,0.512549,-1.910558,-1.398009,1.014975,1.014975,0.0,0.663968,-1.607899,-0.943931,0.447631,0
