In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import sklearn
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

## Pretty variable display: with ast_node_interactivity set to "all", IPython
## shows the value of every expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## RANDOM STATE
## Integer seed kept in one place; presumably passed as `random_state` to the
## scikit-learn estimators imported above -- confirm at the call sites.
seed = 3

In [3]:
# Read the clustered player table; the first CSV column is used as the index.
df = pd.read_csv(
    filepath_or_buffer='../bballref_data/bpm_clusters12_all_data.csv',
    index_col=0,
)
df.head()

Unnamed: 0_level_0,player_id,pos,age,team_id,g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,...,cluster_2_bpm,cluster_3_bpm,cluster_4_bpm,cluster_5_bpm,cluster_6_bpm,cluster_7_bpm,cluster_8_bpm,cluster_9_bpm,cluster_10_bpm,cluster_11_bpm
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tariq Abdul-Wahad 2000,abdulta01,SG,25,TOT,61,1578,13.6,0.477,0.036,0.299,...,-1e-06,-0.0,-0.0,-0.0,-0.008897,-0.0,-0.912439,-0.0,-0.278662,-0.0
Shareef Abdur-Rahim 2000,abdursh01,SF,23,VAN,82,3223,20.2,0.547,0.075,0.431,...,2.199971,2.4e-05,0.0,0.0,0.0,0.0,4e-06,0.0,0.0,0.0
Ray Allen 2000,allenra02,SG,24,MIL,82,3070,20.6,0.57,0.288,0.282,...,0.0,3.597134,0.0,0.0,0.0,0.0,0.0,0.002866,0.0,0.0
John Amaechi 2000,amaecjo01,C,29,ORL,80,1684,13.2,0.505,0.009,0.416,...,-5e-06,-0.0,-0.0,-0.0,-0.0,-0.0,-2.454038,-0.0,-0.000212,-0.0
Derek Anderson 2000,anderde01,SG,25,LAC,64,2201,16.9,0.542,0.207,0.359,...,-0.0,-0.0,-0.0,-0.0,-0.099991,-0.0,-0.0,-0.0,-5e-06,-0.0


In [8]:
# Alphabetical listing of every column name available in `df`.
sorted(list(df.columns))

['age',
 'and1s_per_min',
 'ast_pct',
 'ast_per_poss',
 'astd_pts_per_min',
 'avg_dist',
 'blk_pct',
 'blk_per_poss',
 'bpm',
 'cluster_0',
 'cluster_0_bpm',
 'cluster_1',
 'cluster_10',
 'cluster_10_bpm',
 'cluster_11',
 'cluster_11_bpm',
 'cluster_1_bpm',
 'cluster_2',
 'cluster_2_bpm',
 'cluster_3',
 'cluster_3_bpm',
 'cluster_4',
 'cluster_4_bpm',
 'cluster_5',
 'cluster_5_bpm',
 'cluster_6',
 'cluster_6_bpm',
 'cluster_7',
 'cluster_7_bpm',
 'cluster_8',
 'cluster_8_bpm',
 'cluster_9',
 'cluster_9_bpm',
 'dbpm',
 'def_rtg',
 'drawn_shooting_per_min',
 'drb_pct',
 'drb_per_poss',
 'dws',
 'fg2_pct',
 'fg2_per_poss',
 'fg2a_per_poss',
 'fg3_heave',
 'fg3_pct',
 'fg3_per_poss',
 'fg3a_heave',
 'fg3a_per_fga_pct',
 'fg3a_per_poss',
 'fg_dunk_per_min',
 'fg_pct',
 'fg_pct_00_03',
 'fg_pct_03_10',
 'fg_pct_10_16',
 'fg_pct_16_xx',
 'fg_pct_corner3',
 'fg_pct_fg2a',
 'fg_pct_fg3a',
 'fg_per_poss',
 'fga_per_poss',
 'fouls_offensive_per_min',
 'fouls_shooting_per_min',
 'ft_pct',
 'ft_per

### Define some column variables

In [21]:
# Names of the twelve `cluster_<n>` columns and their `cluster_<n>_bpm` counterparts.
cluster_cols = ['cluster_{}'.format(idx) for idx in range(12)]
bpm_cluster_cols = [f'{name}_bpm' for name in cluster_cols]

# One `pct_<position>` column per listed position abbreviation.
position_cols = [f'pct_{abbr}' for abbr in ('c', 'pf', 'pg', 'sf', 'sg')]

# Field-goal percentage columns by shot range / shot type, followed by the
# per-possession field-goal make and attempt columns.
shooting_location_cols = [
    'fg_pct_00_03',
    'fg_pct_03_10',
    'fg_pct_10_16',
    'fg_pct_16_xx',
    'fg_pct_corner3',
    'fg_pct_fg2a',
    'fg_pct_fg3a',
    'fg_per_poss',
    'fga_per_poss',
]

# Shooting columns used to describe a player's shot profile.
# Every name must stay unique: a repeated label makes `df[shooting_cols]`
# return that column twice, which skews any per-column summary or plot.
shooting_cols = [
    'ts_pct',
    'avg_dist',
    'fg2_pct',
    'fg2a_per_poss',
    'fg3_pct',
    'fg3a_per_poss',
    'ft_pct',
    'fta_per_poss',
    'fta_per_fga_pct',
    'pct_fga_fg2a',
    'pct_fga_fg3a',
    'pct_fga_00_03',
    'pct_fga_03_10',
    'pct_fga_10_16',
    'pct_fga_16_xx',
    'pct_fga_dunk',
]

# Biographical columns.
# Some of these columns still need their dtypes switched and missing values
# handled; the commented-out entries are currently excluded.
bio_cols = [
    # 'salary',
    'age',
    'height',
    'weight',
    # 'nationality',
]

# Player-value columns, in the order they are displayed in summary tables.
value_cols = [
    'bpm', 'obpm', 'dbpm',
    'vorp', 'per',
    'off_rtg', 'def_rtg',
    'plus_minus_on', 'plus_minus_net',
]



In [24]:
# NOTE: this cell reads `avg_cluster_players`, which is only assigned in a cell
# further down the notebook; on a fresh kernel run that cell first.
# Value metrics of each cluster's mean player, ordered by ascending `def_rtg`.
(
    avg_cluster_players[value_cols]
    .sort_values(by='def_rtg', ascending=True)
)

Unnamed: 0_level_0,bpm,obpm,dbpm,vorp,per,off_rtg,def_rtg,plus_minus_on,plus_minus_net
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4,0.728873,-0.157746,0.88662,1.475352,17.377465,113.471831,102.626761,2.127465,2.099296
2,1.982573,1.385892,0.59834,2.43112,19.253942,108.966805,103.232365,1.952697,3.263071
7,-1.255345,-1.506294,0.252947,0.422178,14.62008,107.714286,104.118881,-0.751948,-0.300899
0,-1.322936,-1.429969,0.111315,0.352294,14.567278,105.235474,104.180428,-1.172477,-0.343731
9,4.315337,3.847853,0.469018,4.272086,21.863497,111.868098,104.748466,3.576074,5.765337
8,-0.428423,-0.678571,0.252381,0.696875,14.327232,107.940476,106.388393,-0.509673,-0.215327
10,-2.338958,-2.211166,-0.125062,-0.07866,11.541439,99.578164,106.962779,-2.502481,-2.019603
3,1.971181,2.133228,-0.161575,2.59748,17.849764,109.697638,107.434646,2.09811,3.211969
1,-0.527465,-0.394836,-0.134507,0.743662,12.612559,107.65493,108.151408,0.102465,0.269601
5,-0.760099,-0.286589,-0.475828,0.578477,13.359106,105.307947,108.604305,-1.143709,-0.70596


## Generate mean player for each cluster

In [4]:
# Group the player rows by their `label` column.
cluster_groupby = df.groupby(by='label')

In [6]:
# Mean of every numeric column within each `label` group (the "average player").
# `numeric_only=True` is required because `df` also holds string columns
# (e.g. `player_id`, `pos`, `team_id`): recent pandas versions raise a TypeError
# on them instead of silently dropping them as older versions did.
avg_cluster_players = cluster_groupby.mean(numeric_only=True)

In [7]:
# Display the per-`label` mean table computed in the previous cell.
avg_cluster_players

Unnamed: 0_level_0,age,g,mp,per,ts_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,trb_pct,...,cluster_2_bpm,cluster_3_bpm,cluster_4_bpm,cluster_5_bpm,cluster_6_bpm,cluster_7_bpm,cluster_8_bpm,cluster_9_bpm,cluster_10_bpm,cluster_11_bpm
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,27.24159,68.155963,1545.180428,14.567278,0.523263,0.011171,0.332398,9.484709,18.949541,14.204893,...,-0.002178,0.0,-0.001044,0.0,0.0,-0.001458,-0.001871,-2.136697e-06,-0.01832537,0.0
1,26.876761,72.576291,1908.225352,12.612559,0.542762,0.435413,0.209566,3.324883,12.446244,7.876995,...,-0.000245,0.001802,0.0,-0.0009069528,-0.01278361,0.0,-0.010532,0.0006715048,-0.001767163,-0.030341
2,26.763485,71.842324,2375.514523,19.253942,0.544568,0.042303,0.353373,8.23029,20.906224,14.595851,...,1.973992,0.000161,-0.000116,0.0,-6.182573e-08,9e-06,0.007928,0.004667167,-0.006588595,0.0
3,26.738583,73.951181,2592.014173,17.849764,0.547647,0.285017,0.292098,2.779685,11.271969,7.03748,...,-0.000119,1.930045,0.0,0.004755396,0.007498667,0.0,0.000503,0.02869271,-0.0002780446,0.0
4,26.711268,69.908451,1771.556338,17.377465,0.573789,0.011669,0.555232,11.723944,23.334507,17.551408,...,0.002094,0.0,0.728127,0.0,0.0,-0.003304,0.000245,4.049296e-07,-3.951408e-06,0.0
5,28.124172,66.546358,1594.584437,13.359106,0.519998,0.356444,0.212382,1.839238,9.095199,5.461424,...,0.0,0.003103,0.0,-0.7282069,-0.01216468,0.0,1.2e-05,2.09106e-07,-0.007866895,-0.013713
6,26.382677,65.088189,1832.307087,14.988189,0.532027,0.321969,0.2802,2.72315,11.365512,7.029921,...,0.000448,0.001257,0.0,-0.00656081,0.08016247,0.0,-0.000784,0.003137871,-0.005408952,-0.025134
7,26.8002,66.779221,1489.058941,14.62008,0.543252,0.00286,0.39597,10.572727,20.313287,15.43966,...,5.5e-05,0.0,9.1e-05,0.0,0.0,-1.254523,0.0,4.675325e-08,-3.686314e-08,0.0
8,26.061012,66.561012,1544.331845,14.327232,0.542217,0.219528,0.284482,6.929167,16.962202,11.938095,...,0.001152,8.3e-05,-0.000104,8.265476e-06,0.00144723,0.0,-0.375107,0.0008436283,-0.02135789,-0.017916
9,26.607362,73.055215,2649.490798,21.863497,0.563199,0.226156,0.368785,5.16227,18.746626,12.008896,...,0.013667,0.044492,6.2e-05,-2.030675e-07,0.003844912,1e-06,0.000859,4.252418,-5.705521e-08,0.0
