In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

%matplotlib inline

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

## Import DataFrame

In [77]:
# Load the player stats CSV, using the file's first column as the row index.
nba_stats19 = pd.read_csv('data/nba_stats19.csv', index_col = [0])
# Last expression of the cell: display the column labels.
nba_stats19.columns

Index(['Player', 'Age_x', 'W', 'L', 'MP', 'FGM', 'FGA', 'FG%', '3PM', '3PA',
       '3P%', 'FTM', 'FTA', 'FT%_x', 'OREB', 'DREB_x', 'REB', 'AST', 'TOV',
       'PF', 'FP', 'DD2', 'TD3', '+/-', 'DEF_RTG', 'DREB_y', 'DREB%',
       'T_DREB%_x', 'STL', 'T_STL%_x', 'BLK', 'T_BLK%_x', 'OP_TOV',
       'OP_2NDPTS', 'OP_PAINT', 'DEF_WS', 'USG%', 'T_FGM%', 'T_FGA%', 'T_3PM%',
       'T_3PA%', 'T_FTM%', 'T_FTA%', 'T_OREB%', 'T_DREB%_y', 'T_REB%',
       'T_AST%', 'T_TOV%', 'T_STL%_y', 'T_BLK%_y', 'T_BLKA%', 'T_PF%',
       'T_PFD%', 'T_PTS%', 'Age_y', 'FGA_2P%', 'FGA_3P%', '2PT%', '2PT_MR%',
       '3PT%', 'FBPTS%', 'FT%_y', 'OFFTOV%', 'PAINTPTS%', 'AST_2FGM%',
       'UAST_2FGM%', 'AST_3FGM%', 'UAST_3FGM%', 'FGM_AST%', 'FGM_UAST%',
       'PTS_TOV', '2ND_CH_PTS', 'FB_PTS', 'PAINT_PTS', 'OPP_PTS_TOV',
       'OPP_2ND_PTS', 'OPP_FBPTS', 'OPP_PAINT_PTS', 'BLKA', 'PFD', 'Minutes',
       'BSR Off.', 'BSR Def.', 'BSR Total', 'OOR Off.', 'OOR Def.',
       'OOR Total', 'OVR Off.', 'OVR Def.', 'OVR t

In [78]:
# Print each column's dtype and non-null count.
nba_stats19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466 entries, 0 to 465
Data columns (total 92 columns):
Player           466 non-null object
Age_x            466 non-null int64
W                466 non-null int64
L                466 non-null int64
MP               466 non-null float64
FGM              466 non-null float64
FGA              466 non-null float64
FG%              466 non-null float64
3PM              466 non-null float64
3PA              466 non-null float64
3P%              466 non-null float64
FTM              466 non-null float64
FTA              466 non-null float64
FT%_x            466 non-null float64
OREB             466 non-null float64
DREB_x           466 non-null float64
REB              466 non-null float64
AST              466 non-null float64
TOV              466 non-null float64
PF               466 non-null float64
FP               466 non-null float64
DD2              466 non-null int64
TD3              466 non-null int64
+/-              466 non-null fl

## Dropping the duplicated `_y` columns and renaming their `_x` counterparts

In [79]:
# Remove the duplicated `_y` columns and strip the `_x` suffix from the copies
# that are kept. Written as one chained expression that re-binds the name
# instead of mutating the frame in place with inplace=True.
nba_stats19 = (
    nba_stats19
    .drop(columns=['DREB_y', 'T_DREB%_y', 'T_STL%_y', 'T_BLK%_y', 'Age_y', 'FT%_y'])
    .rename(columns={
        'Age_x': 'Age',
        'FT%_x': 'FT%',
        'DREB_x': 'DREB',
        'T_DREB%_x': 'T_DREB%',
        'T_STL%_x': 'T_STL%',
        'T_BLK%_x': 'T_BLK%',
    })
)

In [80]:
# Display (rows, columns) to confirm the column cleanup.
nba_stats19.shape

(466, 86)

## Sorting features into categories

In [81]:
# Columns used for the offensive clustering and the offensive recommender.
offensive_stats = [
    'FG%', '3P%', 'OREB', 'TOV', 'DD2', 'TD3',
    'T_FGM%', 'T_FGA%', 'T_3PM%', 'T_3PA%', 'T_OREB%', 'T_TOV%',
    '2ND_CH_PTS', 'FB_PTS',
    'BSR Off.', 'OOR Off.', 'OVR Off.',
    'FP', 'STL', 'USG%', 'FGM_UAST%', 'AST',
]

In [82]:
# Columns used for the defensive clustering and the defensive recommender.
defensive_stats = [
    'DREB', 'DEF_RTG', 'DREB%', 'T_DREB%',
    'STL', 'T_STL%',
    'BLK', 'DEF_WS', 'T_BLK%', 'T_BLKA%', 'BLKA',
    'BSR Def.', 'OVR Def.', 'WAR', 'FP',
]

In [83]:
# Columns used for the overall clustering and the overall recommender.
overall_stats = [
    'REB', 'AST', 'DD2', 'TD3', 'USG%',
    'T_REB%', 'T_AST%', 'PTS_TOV',
    'BLK', 'T_BLKA%',
    'AST_2FGM%', 'AST_3FGM%',
    'STL', 'T_STL%',
    'FG%', 'FGA_2P%', 'T_FGA%', 'TOV',
    'T_DREB%', 'T_OREB%', 'MP',
]

In [84]:
# Columns used for the shooting clustering and the shooting recommender.
shooting_stats = [
    'FGA_2P%', 'FGA_3P%',
    '2PT%', '2PT_MR%', '3P%',
    'FBPTS%', 'OFFTOV%', 'PAINTPTS%',
    'AST_2FGM%', 'UAST_2FGM%',
    'AST_3FGM%', 'UAST_3FGM%',
    'FGM_AST%', 'FGM_UAST%',
]

In [85]:
# Keep only the rows whose MP value is greater than 5.
# .copy() makes the result an independent frame: later cells add cluster-label
# columns to it, which would otherwise raise SettingWithCopyWarning because a
# boolean-mask selection may be treated as a slice of the original frame.
nba_stats19 = nba_stats19.loc[nba_stats19['MP'] > 5].copy()

### Offensive

In [86]:
# Offensive feature matrix, with the matching player names kept in `y`.
X = nba_stats19.loc[:, offensive_stats]
y = nba_stats19.loc[:, 'Player']

# Standardize each feature (fit the scaler, then apply it to the same data).
sc = StandardScaler().fit(X)
X_sc = sc.transform(X)

# Reduce to 6 principal components and report how much variance they retain.
pca = PCA(n_components=6).fit(X_sc)
X_pca = pca.transform(X_sc)
print(f"Cumulative Explained Variance: {pca.explained_variance_ratio_.sum()}")

Cumulative Explained Variance: 0.8036969810958362


In [87]:
#creating clusters
offensive = KMeans(n_clusters = 3,random_state= 248)
offensive.fit(X_pca)
y_kmeans = offensive.predict(X_pca)
nba_stats19['off_kmean'] = offensive.labels_
off_sil = silhouette_score(X_sc, offensive.labels_)
silhouette_score(X_sc, offensive.labels_)

0.2403663716138714

In [88]:
# nba_stats19[nba_stats19['off_kmean'] == 1]

### Defensive

In [89]:
# Defensive feature matrix, with the matching player names kept in `y`.
X = nba_stats19.loc[:, defensive_stats]
y = nba_stats19.loc[:, 'Player']

# Standardize each feature (fit the scaler, then apply it to the same data).
sc = StandardScaler().fit(X)
X_sc = sc.transform(X)

# Reduce to 5 principal components and report how much variance they retain.
pca = PCA(n_components=5).fit(X_sc)
X_pca = pca.transform(X_sc)
print(f"Cumulative Explained Variance: {pca.explained_variance_ratio_.sum()}")

Cumulative Explained Variance: 0.8302619025224812


In [90]:
#creating clusters
# Cluster the players into 3 groups in the PCA space.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering reproducible across versions.
defensive = KMeans(n_clusters=3, n_init=10, random_state=248)
# fit_predict fits once and returns the training labels (same as labels_),
# replacing the separate fit + predict pass over the same data.
y_kmeans = defensive.fit_predict(X_pca)
# assign() returns a new frame, so no column is written into a filtered frame.
nba_stats19 = nba_stats19.assign(def_kmean=y_kmeans)
# NOTE(review): the model is fitted on X_pca but scored on X_sc; confirm that
# measuring the silhouette in the full scaled feature space is intended.
def_sil = silhouette_score(X_sc, defensive.labels_)
def_sil

0.18719209913001186

### Overall

In [91]:
# Overall feature matrix, with the matching player names kept in `y`.
X = nba_stats19.loc[:, overall_stats]
y = nba_stats19.loc[:, 'Player']

# Standardize each feature (fit the scaler, then apply it to the same data).
sc = StandardScaler().fit(X)
X_sc = sc.transform(X)

# Reduce to 6 principal components and report how much variance they retain.
pca = PCA(n_components=6).fit(X_sc)
X_pca = pca.transform(X_sc)
print(f"Cumulative Explained Variance: {pca.explained_variance_ratio_.sum()}")

Cumulative Explained Variance: 0.7655803283521689


In [92]:
#creating clusters
# Cluster the players into 3 groups in the PCA space.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering reproducible across versions.
overall = KMeans(n_clusters=3, n_init=10, random_state=248)
# fit_predict fits once and returns the training labels (same as labels_),
# replacing the separate fit + predict pass over the same data.
y_kmeans = overall.fit_predict(X_pca)
# assign() returns a new frame, so no column is written into a filtered frame.
nba_stats19 = nba_stats19.assign(ov_kmean=y_kmeans)
# NOTE(review): the model is fitted on X_pca but scored on X_sc; confirm that
# measuring the silhouette in the full scaled feature space is intended.
over_sil = silhouette_score(X_sc, overall.labels_)
over_sil

0.24049331404586816

### Shooting

In [93]:
# Shooting feature matrix, with the matching player names kept in `y`.
X = nba_stats19.loc[:, shooting_stats]
y = nba_stats19.loc[:, 'Player']

# Standardize each feature (fit the scaler, then apply it to the same data).
sc = StandardScaler().fit(X)
X_sc = sc.transform(X)

# Reduce to 5 principal components and report how much variance they retain.
pca = PCA(n_components=5).fit(X_sc)
X_pca = pca.transform(X_sc)
print(f"Cumulative Explained Variance: {pca.explained_variance_ratio_.sum()}")

Cumulative Explained Variance: 0.8501122495646658


In [94]:
#creating clusters
# Cluster the players into 3 groups in the PCA space.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering reproducible across versions.
shoot = KMeans(n_clusters=3, n_init=10, random_state=248)
# fit_predict fits once and returns the training labels (same as labels_),
# replacing the separate fit + predict pass over the same data.
y_kmeans = shoot.fit_predict(X_pca)
# assign() returns a new frame, so no column is written into a filtered frame.
nba_stats19 = nba_stats19.assign(shoot_kmean=y_kmeans)
# NOTE(review): the model is fitted on X_pca but scored on X_sc; confirm that
# measuring the silhouette in the full scaled feature space is intended.
shoot_sil = silhouette_score(X_sc, shoot.labels_)
shoot_sil

0.2597134613093371

In [95]:
# Side-by-side silhouette scores of the four KMeans clusterings.
# The values come from silhouette_score on cluster labels, so the message names
# them as clustering silhouette scores rather than "classification similarity".
print(f'Shooting clustering silhouette score is {shoot_sil}')
print(f'Overall clustering silhouette score is {over_sil}')
print(f'Defensive clustering silhouette score is {def_sil}')
print(f'Offensive clustering silhouette score is {off_sil}')

Shooting classification similarity score is 0.2597134613093371
Overall classification similarity score is 0.24049331404586816
Defensive classification similarity score is 0.18719209913001186
Offensive classification similarity score is 0.2403663716138714


In [21]:
# Write the current frame (row index included) to CSV.
nba_stats19.to_csv('data/nba_2eda.csv')

# Recommender system

In [22]:
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [23]:
# Display (rows, columns) of the frame the recommenders are built from.
nba_stats19.shape

(421, 90)

## Offensive

In [24]:
#create pivot table
off_piv = pd.pivot_table(nba_stats19[offensive_stats], index = nba_stats19['Player'])
#create sparse matrix
off_sparse = sparse.csr_matrix(off_piv.fillna(0))
# #calculate cosine similarity
off_recommender = pairwise_distances(off_sparse, metric = 'cosine')
# #creating recommender dataframe
off_rec_df = pd.DataFrame(off_recommender, columns = off_piv.index, index = off_piv.index)
off_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Admiral Schofield,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,...,Wenyen Gabriel,Wes Iwundu,Wesley Matthews,Will Barton,Willie Cauley-Stein,Willy Hernangomez,Yogi Ferrell,Zach Collins,Zach LaVine,Zylan Cheatham
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.045891,0.081593,0.206225,0.027914,0.045578,0.023659,0.074372,0.100853,0.185649,...,0.075181,0.054471,0.115094,0.011659,0.171274,0.141132,0.044216,0.134059,0.033476,0.399016
Aaron Holiday,0.045891,0.0,0.075416,0.286452,0.098414,0.061022,0.022413,0.05086,0.221889,0.227153,...,0.112756,0.076412,0.10997,0.035506,0.293132,0.233967,0.019867,0.221641,0.030273,0.492779
Abdel Nader,0.081593,0.075416,0.0,0.15965,0.097644,0.125478,0.050126,0.075591,0.152579,0.11222,...,0.135936,0.143595,0.054668,0.081238,0.294863,0.138985,0.086049,0.123291,0.058547,0.334141
Admiral Schofield,0.206225,0.286452,0.15965,0.0,0.142349,0.246102,0.238972,0.20871,0.147776,0.089136,...,0.274845,0.282759,0.132914,0.196606,0.294724,0.12092,0.237436,0.051599,0.273878,0.456186
Al Horford,0.027914,0.098414,0.097644,0.142349,0.0,0.089605,0.051553,0.09055,0.087121,0.113001,...,0.125527,0.098744,0.089819,0.023511,0.159199,0.117414,0.107689,0.074674,0.064411,0.448698


In [25]:
#evaluating matrix
# The 10 players with the smallest offensive cosine distance to LeBron James.
# The player is dropped by label rather than by slicing off position 0, so he
# cannot show up in his own list if another player ties at distance 0.
off_rec_df['LeBron James'].drop('LeBron James').sort_values().head(10)

Player
Luka Doncic        0.006642
Bradley Beal       0.011995
Malcolm Brogdon    0.014210
Kawhi Leonard      0.015378
James Harden       0.018045
Brandon Ingram     0.018469
Kyrie Irving       0.018639
Trae Young         0.019043
Jrue Holiday       0.019785
Lou Williams       0.021475
Name: LeBron James, dtype: float64

## Defensive

In [26]:
# create pivot table
# One row per player: group on the player name and aggregate each defensive
# feature with pivot_table's default aggregation.
def_piv = pd.pivot_table(data=nba_stats19[defensive_stats], index=nba_stats19['Player'])
# Fill missing values with 0 and store the result as a CSR sparse matrix.
def_sparse = sparse.csr_matrix(def_piv.fillna(value=0))
# Pairwise cosine *distance* between players (0 = same direction; larger = less alike).
# NOTE(review): features enter on their raw scales here (no StandardScaler),
# so large-magnitude columns weigh more; confirm this is intended.
def_recommender = pairwise_distances(def_sparse, metric='cosine')
# Label rows and columns with player names so distances can be looked up by name.
def_rec_df = pd.DataFrame(data=def_recommender, index=def_piv.index, columns=def_piv.index)
def_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Admiral Schofield,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,...,Wenyen Gabriel,Wes Iwundu,Wesley Matthews,Will Barton,Willie Cauley-Stein,Willy Hernangomez,Yogi Ferrell,Zach Collins,Zach LaVine,Zylan Cheatham
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.12794,0.125035,0.338637,0.059641,0.114035,0.018636,0.147315,0.170126,0.173718,...,0.28584,0.107935,0.052113,0.016194,0.133023,0.262209,0.146549,0.152544,0.042384,0.493686
Aaron Holiday,0.12794,0.0,0.139915,0.54367,0.176613,0.06497,0.077492,0.034029,0.298218,0.252638,...,0.381575,0.109496,0.041122,0.103631,0.197235,0.402372,0.037287,0.352967,0.124383,0.728638
Abdel Nader,0.125035,0.139915,0.0,0.394764,0.066748,0.025861,0.122983,0.12506,0.075417,0.08308,...,0.284932,0.118638,0.168391,0.117657,0.025046,0.211465,0.069613,0.258267,0.165778,0.525959
Admiral Schofield,0.338637,0.54367,0.394764,0.0,0.417232,0.376123,0.405112,0.629648,0.462958,0.466437,...,0.754817,0.48844,0.47829,0.381823,0.445977,0.62457,0.404533,0.575162,0.567844,0.160504
Al Horford,0.059641,0.176613,0.066748,0.417232,0.0,0.10603,0.08621,0.179687,0.093236,0.171028,...,0.323239,0.178799,0.131989,0.040709,0.066812,0.244416,0.162185,0.147737,0.08519,0.608345


## Overall

In [27]:
#create pivot table
# One row per player: group on the player name and aggregate each overall
# feature with pivot_table's default aggregation.
over_piv = pd.pivot_table(data=nba_stats19[overall_stats], index=nba_stats19['Player'])
# Fill missing values with 0 and store the result as a CSR sparse matrix.
over_sparse = sparse.csr_matrix(over_piv.fillna(value=0))
# Pairwise cosine *distance* between players (0 = same direction; larger = less alike).
# NOTE(review): features enter on their raw scales here (no StandardScaler),
# so large-magnitude columns weigh more; confirm this is intended.
over_recommender = pairwise_distances(over_sparse, metric='cosine')
# Label rows and columns with player names so distances can be looked up by name.
over_rec_df = pd.DataFrame(data=over_recommender, index=over_piv.index, columns=over_piv.index)
over_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Admiral Schofield,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,...,Wenyen Gabriel,Wes Iwundu,Wesley Matthews,Will Barton,Willie Cauley-Stein,Willy Hernangomez,Yogi Ferrell,Zach Collins,Zach LaVine,Zylan Cheatham
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.055557,0.036741,0.120793,0.009587,0.023366,0.006459,0.035918,0.03252,0.073877,...,0.072742,0.038424,0.075757,0.004626,0.219021,0.080406,0.030493,0.032146,0.032598,0.30626
Aaron Holiday,0.055557,0.0,0.090739,0.209019,0.068967,0.063579,0.043259,0.076575,0.129153,0.196221,...,0.128984,0.116631,0.135854,0.045365,0.327939,0.176364,0.028136,0.142189,0.052956,0.482256
Abdel Nader,0.036741,0.090739,0.0,0.052497,0.021963,0.030432,0.036902,0.024529,0.029442,0.041164,...,0.079316,0.039991,0.027971,0.05117,0.307553,0.101335,0.046125,0.031233,0.085492,0.352007
Admiral Schofield,0.120793,0.209019,0.052497,0.0,0.093681,0.111757,0.14161,0.10873,0.087046,0.087111,...,0.153073,0.129385,0.066721,0.138067,0.398027,0.155824,0.146473,0.072851,0.210406,0.421453
Al Horford,0.009587,0.068967,0.021963,0.093681,0.0,0.016438,0.020601,0.035048,0.016493,0.057759,...,0.060898,0.041614,0.067086,0.014363,0.229144,0.063392,0.037182,0.024068,0.051254,0.334746


In [28]:
#evaluating matrix
# The 10 players with the smallest overall cosine distance to LeBron James.
# The player is dropped by label rather than by slicing off position 0, so he
# cannot show up in his own list if another player ties at distance 0.
over_rec_df['LeBron James'].drop('LeBron James').sort_values().head(10)

Player
Russell Westbrook    0.012577
Kyrie Irving         0.012687
Trae Young           0.013453
Malcolm Brogdon      0.015295
Jrue Holiday         0.019057
De'Aaron Fox         0.024164
Spencer Dinwiddie    0.024657
Jeff Teague          0.024957
Damian Lillard       0.025007
Eric Bledsoe         0.026277
Name: LeBron James, dtype: float64

## Shooting

In [29]:
#create pivot table
# One row per player: group on the player name and aggregate each shooting
# feature with pivot_table's default aggregation.
shoot_piv = pd.pivot_table(data=nba_stats19[shooting_stats], index=nba_stats19['Player'])
# Fill missing values with 0 and store the result as a CSR sparse matrix.
shoot_sparse = sparse.csr_matrix(shoot_piv.fillna(value=0))
# Pairwise cosine *distance* between players (0 = same direction; larger = less alike).
# NOTE(review): features enter on their raw scales here (no StandardScaler),
# so large-magnitude columns weigh more; confirm this is intended.
shoot_recommender = pairwise_distances(shoot_sparse, metric='cosine')
# Label rows and columns with player names so distances can be looked up by name.
shoot_rec_df = pd.DataFrame(data=shoot_recommender, index=shoot_piv.index, columns=shoot_piv.index)
shoot_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Admiral Schofield,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,...,Wenyen Gabriel,Wes Iwundu,Wesley Matthews,Will Barton,Willie Cauley-Stein,Willy Hernangomez,Yogi Ferrell,Zach Collins,Zach LaVine,Zylan Cheatham
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.071524,0.036012,0.140548,0.020841,0.013129,0.00285,0.023423,0.066912,0.151636,...,0.033673,0.063105,0.109328,0.009855,0.212723,0.045868,0.039641,0.086732,0.022085,0.279448
Aaron Holiday,0.071524,0.0,0.121354,0.225042,0.123,0.066078,0.066999,0.14191,0.247214,0.350427,...,0.113031,0.215832,0.204906,0.041352,0.401063,0.198642,0.025953,0.254186,0.038483,0.508683
Abdel Nader,0.036012,0.121354,0.0,0.044004,0.026512,0.017881,0.030672,0.012766,0.070684,0.075546,...,0.028697,0.062396,0.022804,0.052048,0.307067,0.044633,0.092562,0.04068,0.060542,0.314373
Admiral Schofield,0.140548,0.225042,0.044004,0.0,0.094362,0.09975,0.121713,0.089187,0.162046,0.061103,...,0.113871,0.117865,0.013192,0.162733,0.460401,0.120189,0.179235,0.060921,0.157972,0.418662
Al Horford,0.020841,0.123,0.026512,0.094362,0.0,0.022882,0.021363,0.020071,0.038163,0.09191,...,0.032398,0.045687,0.079399,0.048165,0.228553,0.019743,0.070021,0.037887,0.066485,0.264509


In [30]:
#evaluating matrix
# The 10 players with the smallest shooting cosine distance to LeBron James.
# The player is dropped by label rather than by slicing off position 0, so he
# cannot show up in his own list if another player ties at distance 0.
player = shoot_rec_df['LeBron James'].drop('LeBron James').sort_values().head(10)

In [31]:
# Player names from the lookup above, kept in the Series' order.
keys = player.to_dict().keys()
key_list = [*keys]

# Pickle Rec systems

In [32]:
# Serialize each distance matrix to its own pickle file.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes; the bare open() calls left the handles unclosed.
with open('pickles/offensive.p', 'wb') as fh:
    pickle.dump(off_rec_df, fh)
with open('pickles/defensive.p', 'wb') as fh:
    pickle.dump(def_rec_df, fh)
with open('pickles/overall.p', 'wb') as fh:
    pickle.dump(over_rec_df, fh)
with open('pickles/shoot.p', 'wb') as fh:
    pickle.dump(shoot_rec_df, fh)

## Creating a salary pickle

In [33]:
# Two-column frame: each player's name and salary.
player_salary = nba_stats19[['Player', 'salary']]

In [34]:
# Display the player/salary frame.
player_salary

Unnamed: 0,Player,salary
0,Ante Zizic,"$2,281,800"
1,Boban Marjanovic,"$3,500,000"
2,Bogdan Bogdanovic,"$8,529,386"
3,Bojan Bogdanovic,"$17,000,000"
6,CJ Miles,"$8,730,158"
...,...,...
446,Caleb Martin,"$898,310"
448,Jaylen Nowell,"$1,400,000"
451,Ed Davis,"$4,767,000"
453,Jared Dudley,"$2,564,753"


## Index player salaries by player name

In [73]:
# Use the player name as the row index so a salary can be looked up by name.
play_sal = player_salary.set_index(keys='Player')
play_sal.head()

Unnamed: 0_level_0,salary
Player,Unnamed: 1_level_1
Ante Zizic,"$2,281,800"
Boban Marjanovic,"$3,500,000"
Bogdan Bogdanovic,"$8,529,386"
Bojan Bogdanovic,"$17,000,000"
CJ Miles,"$8,730,158"


In [74]:
# Serialize the salary lookup table; the `with` block closes the file handle
# once the dump finishes, which the bare open() call did not.
with open('pickles/salaries.p', 'wb') as fh:
    pickle.dump(play_sal, fh)