In [1]:
import pandas as pd
import numpy as np

import pickle

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the player statistics table; the first CSV column becomes the row index.
nba_later = pd.read_csv(
    '../data/nba_later.csv',
    index_col=[0],
)

In [3]:
# Preview the first rows of the loaded frame as a sanity check.
nba_later.head()

Unnamed: 0,Player,Age_x,W,L,MP,FGM,FGA,FG%,3PM,3PA,...,PTS_TOV,2ND_CH_PTS,FB_PTS,PAINT_PTS,OPP_PTS_TOV,OPP_2ND_PTS,OPP_FBPTS,OPP_PAINT_PTS,BLKA,PFD
0,James Harden,30,28,15,37.1,10.4,23.7,43.9,4.7,13.0,...,5.2,3.0,3.5,11.1,14.2,10.5,12.6,37.9,1.5,8.0
1,Giannis Antetokounmpo,25,37,6,30.7,11.1,19.9,55.7,1.6,5.0,...,5.1,3.5,6.4,17.9,9.0,6.6,7.9,23.7,1.1,8.4
2,Trae Young,21,9,32,35.1,9.3,20.8,44.5,3.4,9.3,...,4.2,1.9,3.4,10.2,14.2,11.2,10.3,42.0,1.3,7.7
3,Luka Doncic,20,26,14,32.9,9.5,20.4,46.6,3.0,9.2,...,3.1,2.8,2.8,12.2,10.9,9.7,10.7,34.2,0.9,7.3
4,Damian Lillard,29,19,25,37.0,8.9,19.7,45.0,3.7,9.8,...,3.0,1.0,2.5,8.3,11.1,12.3,9.8,36.0,1.1,6.1


In [7]:
# Show every column label currently present in the frame.
nba_later.columns

Index(['Player', 'Age_x', 'W', 'L', 'MP', 'FGM', 'FGA', 'FG%', '3PM', '3PA',
       '3P%', 'FTM', 'FTA', 'FT%_x', 'OREB', 'DREB_x', 'REB', 'AST', 'TOV',
       'PF', 'FP', 'DD2', 'TD3', '+/-', 'DEF_RTG', 'DREB%', 'T_DREB%_x', 'STL',
       'T_STL%_x', 'BLK', 'T_BLK%_x', 'OP_TOV', 'OP_2NDPTS', 'OP_PAINT',
       'DEF_WS', 'USG%', 'T_FGM%', 'T_FGA%', 'T_3PM%', 'T_3PA%', 'T_FTM%',
       'T_FTA%', 'T_OREB%', 'T_REB%', 'T_AST%', 'T_TOV%', 'T_BLKA%', 'T_PF%',
       'T_PFD%', 'T_PTS%', 'FGA_2P%', 'FGA_3P%', '2PT%', '2PT_MR%', '3PT%',
       'FBPTS%', 'OFFTOV%', 'PAINTPTS%', 'AST_2FGM%', 'UAST_2FGM%',
       'AST_3FGM%', 'UAST_3FGM%', 'FGM_AST%', 'FGM_UAST%', 'PTS_TOV',
       '2ND_CH_PTS', 'FB_PTS', 'PAINT_PTS', 'OPP_PTS_TOV', 'OPP_2ND_PTS',
       'OPP_FBPTS', 'OPP_PAINT_PTS', 'BLKA', 'PFD'],
      dtype='object')

In [6]:
# Remove the "_y"-suffixed columns (presumably merge leftovers -- their "_x"
# counterparts are the ones kept and renamed below).
# Rebinding the name instead of mutating in place, combined with
# errors='ignore', keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
nba_later = nba_later.drop(
    columns=['DREB_y', 'T_STL%_y', 'T_BLK%_y', 'T_DREB%_y', 'Age_y', 'FT%_y'],
    errors='ignore',
)

In [9]:
# Strip the "_x" suffix so the stat lists can refer to plain column names.
# The key for the block-percentage column must be 'T_BLK%_x'; a
# 'T_BLK%' -> 'T_BLK%' entry is a no-op and leaves 'T_BLK%_x' unrenamed.
# rename() silently skips keys that are absent, so the cell is safe to re-run.
nba_later = nba_later.rename(
    columns={
        'Age_x': 'Age',
        'FT%_x': 'FT%',
        'DREB_x': 'DREB',
        'T_DREB%_x': 'T_DREB%',
        'T_STL%_x': 'T_STL%',
        'T_BLK%_x': 'T_BLK%',
    }
)

# Categories

In [20]:
# Feature columns passed to nba_stats() for the offensive clustering run.
offensive_stats = [
    'FG%', '3P%', 'OREB', 'TOV', 'DD2',
    'TD3', 'T_FGM%', 'T_FGA%', 'T_3PM%', 'T_3PA%',
    'T_OREB%', 'T_TOV%', '2ND_CH_PTS', 'FB_PTS', 'FP',
    'STL', 'USG%', 'FGM_UAST%', 'AST',
]

In [27]:
# Feature columns passed to nba_stats() for the defensive clustering run.
defensive_stats = [
    'DREB', 'DEF_RTG', 'DREB%', 'T_DREB%',
    'STL', 'T_STL%', 'BLK', 'DEF_WS',
    'T_BLKA%', 'BLKA', 'FP',
]

In [12]:
# Feature columns passed to nba_stats() for the overall clustering run.
overall_stats = [
    'REB', 'AST', 'DD2', 'TD3', 'USG%', 'T_REB%', 'T_AST%',
    'PTS_TOV', 'BLK', 'T_BLKA%', 'AST_2FGM%', 'AST_3FGM%', 'STL', 'T_STL%',
    'FG%', 'FGA_2P%', 'T_FGA%', 'TOV', 'T_DREB%', 'T_OREB%', 'MP',
]

In [13]:
# Feature columns passed to nba_stats() for the shooting clustering run.
shooting_stats = [
    'FGA_2P%', 'FGA_3P%',
    '2PT%', '2PT_MR%', '3P%',
    'FBPTS%', 'OFFTOV%', 'PAINTPTS%',
    'AST_2FGM%', 'UAST_2FGM%',
    'AST_3FGM%', 'UAST_3FGM%',
    'FGM_AST%', 'FGM_UAST%',
]

In [14]:
# Keep only the rows whose 'MP' value is strictly greater than 6, then display
# the resulting (rows, columns) shape.
nba_later = nba_later.loc[nba_later['MP'].gt(6)]
nba_later.shape

(427, 74)

# PCA & KMEANS

In [21]:
# Shared estimators; nba_stats() refits each of them on every call.
sc = StandardScaler()                               # standardises the features
pca = PCA(n_components=6)                           # keeps 6 principal components
clusters = KMeans(n_clusters=3, random_state=248)   # 3 clusters, fixed seed

In [22]:
def nba_stats(category):
    """Scale, PCA-reduce and cluster players on one group of stat columns.

    Reads the module-level ``nba_later`` frame and refits the shared ``sc``,
    ``pca`` and ``clusters`` estimators on every call, so after the call those
    objects hold the fit for ``category`` (e.g. ``clusters.labels_``).

    Prints the cumulative explained-variance ratio of the fitted PCA and the
    silhouette score of the KMeans labels.

    Parameters
    ----------
    category : list of str
        Column names of ``nba_later`` to use as features.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    features = nba_later[category]

    # Scale data
    X_sc = sc.fit_transform(features)

    # Dimensionality reduction
    pca.fit(X_sc)
    X_pca = pca.transform(X_sc)
    print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

    # Create clusters in the PCA-reduced space. fit() already stores the
    # assignments in clusters.labels_, so no separate predict() pass is needed.
    clusters.fit(X_pca)

    # NOTE(review): the silhouette is measured on the scaled features (X_sc),
    # not on X_pca where KMeans was fit -- confirm this is the intended space.
    print("Silhouette Score:", silhouette_score(X_sc, clusters.labels_))

## Offensive

In [23]:
# Run the scale -> PCA -> KMeans pipeline on the offensive feature list.
nba_stats(offensive_stats)

Cumulative Explained Variance: 0.8598057643491612
Silhouette Score: 0.28182522603841853


## Defensive

In [28]:
# Run the scale -> PCA -> KMeans pipeline on the defensive feature list.
nba_stats(defensive_stats)

Cumulative Explained Variance: 0.962632063330381
Silhouette Score: 0.24254407862271896


## Shooting

In [29]:
# Run the scale -> PCA -> KMeans pipeline on the shooting feature list.
nba_stats(shooting_stats)

Cumulative Explained Variance: 0.9335493622178543
Silhouette Score: 0.28157497817930127


## Overall

In [30]:
# Run the scale -> PCA -> KMeans pipeline on the overall feature list.
nba_stats(overall_stats)

Cumulative Explained Variance: 0.8144377238696634
Silhouette Score: 0.2547030793927546


In [31]:
# Write the cleaned and filtered frame to CSV; with to_csv's defaults the row
# index is written as the first column.
nba_later.to_csv('../data/nba_later_eda.csv')