In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

%matplotlib inline

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

## Import DataFrame

In [2]:
# Load the player stats table; the first CSV column becomes the row index.
nba_stats19 = pd.read_csv('data/nba_stats19.csv', index_col = [0])
# Display the column labels to see which stats are available.
nba_stats19.columns

Index(['Player', 'Age_x', 'W', 'L', 'MP', 'FGM', 'FGA', 'FG%', '3PM', '3PA',
       '3P%', 'FTM', 'FTA', 'FT%_x', 'OREB', 'DREB_x', 'REB', 'AST', 'TOV',
       'PF', 'FP', 'DD2', 'TD3', '+/-', 'DEF_RTG', 'DREB_y', 'DREB%',
       'T_DREB%_x', 'STL', 'T_STL%_x', 'BLK', 'T_BLK%_x', 'OP_TOV',
       'OP_2NDPTS', 'OP_PAINT', 'DEF_WS', 'USG%', 'T_FGM%', 'T_FGA%', 'T_3PM%',
       'T_3PA%', 'T_FTM%', 'T_FTA%', 'T_OREB%', 'T_DREB%_y', 'T_REB%',
       'T_AST%', 'T_TOV%', 'T_STL%_y', 'T_BLK%_y', 'T_BLKA%', 'T_PF%',
       'T_PFD%', 'T_PTS%', 'Age_y', 'FGA_2P%', 'FGA_3P%', '2PT%', '2PT_MR%',
       '3PT%', 'FBPTS%', 'FT%_y', 'OFFTOV%', 'PAINTPTS%', 'AST_2FGM%',
       'UAST_2FGM%', 'AST_3FGM%', 'UAST_3FGM%', 'FGM_AST%', 'FGM_UAST%',
       'PTS_TOV', '2ND_CH_PTS', 'FB_PTS', 'PAINT_PTS', 'OPP_PTS_TOV',
       'OPP_2ND_PTS', 'OPP_FBPTS', 'OPP_PAINT_PTS', 'BLKA', 'PFD', 'Minutes',
       'BSR Off.', 'BSR Def.', 'BSR Total', 'OOR Off.', 'OOR Def.',
       'OOR Total', 'OVR Off.', 'OVR Def.', 'OVR t

In [3]:
# Per-column dtype and non-null count overview of the loaded frame.
nba_stats19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 461 entries, 0 to 460
Data columns (total 92 columns):
Player           461 non-null object
Age_x            461 non-null float64
W                461 non-null float64
L                461 non-null float64
MP               461 non-null float64
FGM              461 non-null float64
FGA              461 non-null float64
FG%              461 non-null float64
3PM              461 non-null float64
3PA              461 non-null float64
3P%              461 non-null float64
FTM              461 non-null float64
FTA              461 non-null float64
FT%_x            461 non-null float64
OREB             461 non-null float64
DREB_x           461 non-null float64
REB              461 non-null float64
AST              461 non-null float64
TOV              461 non-null float64
PF               461 non-null float64
FP               461 non-null float64
DD2              461 non-null float64
TD3              461 non-null float64
+/-              461 n

## Dropping duplicated `_y` columns and renaming their `_x` counterparts

In [4]:
# Six stats are present twice, once with an `_x` suffix and once with `_y`.
# Keep the `_x` version of each, discard the `_y` version, and strip the suffix.
duplicate_cols = ['DREB_y', 'T_DREB%_y', 'T_STL%_y', 'T_BLK%_y', 'Age_y', 'FT%_y']
suffix_renames = {
    'Age_x': 'Age',
    'FT%_x': 'FT%',
    'DREB_x': 'DREB',
    'T_DREB%_x': 'T_DREB%',
    'T_STL%_x': 'T_STL%',
    'T_BLK%_x': 'T_BLK%',
}
nba_stats19 = (
    nba_stats19
    .drop(columns=duplicate_cols)
    .rename(columns=suffix_renames)
)

In [5]:
# Confirm the six suffixed duplicate columns are gone (92 columns before the drop).
nba_stats19.shape

(461, 86)

## Sorting features into categories

In [6]:
# Feature columns used for the offensive clustering and the offensive recommender.
offensive_stats = [
    'FG%', '3P%', 'OREB', 'TOV',
    'DD2', 'TD3',
    'T_FGM%', 'T_FGA%', 'T_3PM%', 'T_3PA%', 'T_OREB%', 'T_TOV%',
    '2ND_CH_PTS', 'FB_PTS',
    'BSR Off.', 'OOR Off.', 'OVR Off.',
    'FP', 'STL', 'USG%', 'FGM_UAST%', 'AST',
]

In [7]:
# Feature columns used for the defensive clustering and the defensive recommender.
defensive_stats = [
    'DREB', 'DEF_RTG', 'DREB%', 'T_DREB%',
    'STL', 'T_STL%',
    'BLK', 'DEF_WS', 'T_BLK%', 'T_BLKA%', 'BLKA',
    'BSR Def.', 'OVR Def.',
    'WAR', 'FP',
]

In [8]:
# Feature columns used for the overall clustering and the overall recommender.
overall_stats = [
    'REB', 'AST', 'FP', 'DD2', 'TD3', 'USG%',
    'T_REB%', 'T_AST%', 'PTS_TOV',
    'BSR Total', 'OOR Total', 'OVR total',
    'WAR',
]

In [9]:
# Feature columns used for the shooting clustering and the shooting recommender.
# NOTE(review): the frame also has a '3PT%' column sitting between '2PT_MR%' and
# 'FBPTS%'; confirm that '3P%' (and not '3PT%') is the column intended here.
shooting_stats = [
    'FGA_2P%', 'FGA_3P%',
    '2PT%', '2PT_MR%', '3P%',
    'FBPTS%', 'OFFTOV%', 'PAINTPTS%',
    'AST_2FGM%', 'UAST_2FGM%',
    'AST_3FGM%', 'UAST_3FGM%',
    'FGM_AST%', 'FGM_UAST%',
]

In [10]:
# Keep only rows whose 'MP' value exceeds 15
# (presumably minutes played per game -- confirm the unit).
# .copy() makes the filtered result an independent frame, so the cluster-label
# columns assigned to it further down are written without pandas raising
# SettingWithCopyWarning.
nba_stats19 = nba_stats19[nba_stats19['MP'] > 15].copy()

### Offensive

In [11]:
# Offensive feature matrix, with the player names kept alongside as labels.
X, y = nba_stats19[offensive_stats], nba_stats19['Player']

# Standardize every feature (zero mean, unit variance) so no column dominates PCA.
sc = StandardScaler()
X_sc = sc.fit_transform(X)

# Project onto the first 6 principal components
# (about 83% of the variance in the recorded run).
pca = PCA(n_components=6).fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.8269327373860982


In [12]:
# Cluster players into 3 offensive groups on the PCA projection.
# n_init=10 is pinned explicitly: it was the long-standing default, but newer
# scikit-learn releases default to 'auto', which would change the clustering.
offensive = KMeans(n_clusters=3, n_init=10, random_state=248)
# fit_predict fits the model and returns the training labels in one step.
y_kmeans = offensive.fit_predict(X_pca)
nba_stats19['off_kmean'] = offensive.labels_
# The silhouette is scored on the standardized features (X_sc), not on the PCA
# projection the clusters were fit on. Computed once and reused for display.
off_sil = silhouette_score(X_sc, offensive.labels_)
off_sil

0.2666019771105569

In [13]:
# nba_stats19[nba_stats19['off_kmean'] == 1]

### Defensive

In [14]:
# Defensive feature matrix, with the player names kept alongside as labels.
X, y = nba_stats19[defensive_stats], nba_stats19['Player']

# Standardize every feature (zero mean, unit variance) so no column dominates PCA.
sc = StandardScaler()
X_sc = sc.fit_transform(X)

# Project onto the first 5 principal components
# (about 85% of the variance in the recorded run).
pca = PCA(n_components=5).fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.8532930183795493


In [15]:
# Cluster players into 5 defensive groups on the PCA projection.
# n_init=10 is pinned explicitly: it was the long-standing default, but newer
# scikit-learn releases default to 'auto', which would change the clustering.
defensive = KMeans(n_clusters=5, n_init=10, random_state=248)
# fit_predict fits the model and returns the training labels in one step.
y_kmeans = defensive.fit_predict(X_pca)
nba_stats19['def_kmean'] = defensive.labels_
# The silhouette is scored on the standardized features (X_sc), not on the PCA
# projection the clusters were fit on. Computed once and reused for display.
def_sil = silhouette_score(X_sc, defensive.labels_)
def_sil

0.1814029945105203

### Overall

In [16]:
# Overall feature matrix, with the player names kept alongside as labels.
X, y = nba_stats19[overall_stats], nba_stats19['Player']

# Standardize every feature (zero mean, unit variance) so no column dominates PCA.
sc = StandardScaler()
X_sc = sc.fit_transform(X)

# Project onto the first 5 principal components
# (about 89% of the variance in the recorded run).
pca = PCA(n_components=5).fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.8929885103737824


In [17]:
# Cluster players into 5 overall groups on the PCA projection.
# n_init=10 is pinned explicitly: it was the long-standing default, but newer
# scikit-learn releases default to 'auto', which would change the clustering.
overall = KMeans(n_clusters=5, n_init=10, random_state=248)
# fit_predict fits the model and returns the training labels in one step.
y_kmeans = overall.fit_predict(X_pca)
nba_stats19['ov_kmean'] = overall.labels_
# The silhouette is scored on the standardized features (X_sc), not on the PCA
# projection the clusters were fit on. Computed once and reused for display.
over_sil = silhouette_score(X_sc, overall.labels_)
over_sil

0.19895394328713362

### Shooting

In [18]:
# Shooting feature matrix, with the player names kept alongside as labels.
X, y = nba_stats19[shooting_stats], nba_stats19['Player']

# Standardize every feature (zero mean, unit variance) so no column dominates PCA.
sc = StandardScaler()
X_sc = sc.fit_transform(X)

# Project onto the first 5 principal components
# (about 91% of the variance in the recorded run).
pca = PCA(n_components=5).fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.9058273274405296


In [19]:
# Cluster players into 3 shooting groups on the PCA projection.
# n_init=10 is pinned explicitly: it was the long-standing default, but newer
# scikit-learn releases default to 'auto', which would change the clustering.
shoot = KMeans(n_clusters=3, n_init=10, random_state=248)
# fit_predict fits the model and returns the training labels in one step.
y_kmeans = shoot.fit_predict(X_pca)
nba_stats19['shoot_kmean'] = shoot.labels_
# The silhouette is scored on the standardized features (X_sc), not on the PCA
# projection the clusters were fit on. Computed once and reused for display.
shoot_sil = silhouette_score(X_sc, shoot.labels_)
shoot_sil

0.2819348600021086

In [20]:
# Side-by-side silhouette scores of the four clusterings.
for category, score in (
    ('Shooting', shoot_sil),
    ('Overall', over_sil),
    ('Defensive', def_sil),
    ('Offensive', off_sil),
):
    print(f'{category} classification similarity score is {score}')

Shooting classification similarity score is 0.2819348600021086
Overall classification similarity score is 0.19895394328713362
Defensive classification similarity score is 0.1814029945105203
Offensive classification similarity score is 0.2666019771105569


In [21]:
# Write the filtered frame, now carrying the four *_kmean cluster-label columns,
# to CSV. The row index is written too (to_csv default).
nba_stats19.to_csv('data/nba_2eda.csv')

# Recommender system

In [22]:
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [23]:
# Size check before building the recommenders: rows remaining after the MP > 15
# filter, and the column count including the four cluster-label columns.
nba_stats19.shape

(304, 90)

## Offensive

In [24]:
# One row per player holding the offensive feature columns
# (pivot_table aggregates with its default, the mean, and sorts rows by player name).
off_piv = nba_stats19[offensive_stats].pivot_table(index=nba_stats19['Player'])
# Replace any missing values with 0 and store the table as a CSR matrix.
off_sparse = sparse.csr_matrix(off_piv.fillna(0))
# Pairwise cosine *distance* between players: 0 means identical direction,
# so smaller values mean more similar players.
# NOTE(review): unlike the clustering cells, the features are not standardized
# here, so large-valued columns weigh more -- confirm this is intended.
off_recommender = pairwise_distances(off_sparse, metric='cosine')
# Square player-by-player frame, labelled with player names on both axes.
off_rec_df = pd.DataFrame(off_recommender, index=off_piv.index, columns=off_piv.index)
off_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Allen Crabbe,Andre Drummond,Andrew Wiggins,...,Tyler Herro,Tyler Johnson,Tyus Jones,Vince Carter,Wendell Carter Jr.,Wesley Matthews,Will Barton,Willie Cauley-Stein,Zach Collins,Zach LaVine
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.050602,0.018192,0.048679,0.027935,0.084755,0.101685,0.115984,0.148126,0.022978,...,0.038098,0.05446,0.102097,0.059992,0.108337,0.118603,0.012188,0.166274,0.125472,0.033758
Aaron Holiday,0.050602,0.0,0.098019,0.059705,0.029962,0.064961,0.238918,0.154017,0.288632,0.058054,...,0.041244,0.068376,0.055183,0.025586,0.258423,0.129872,0.040021,0.302237,0.230022,0.036361
Al Horford,0.018192,0.098019,0.0,0.082284,0.046881,0.093478,0.087619,0.088389,0.148801,0.045739,...,0.051745,0.069685,0.174045,0.096819,0.094235,0.092933,0.023778,0.150577,0.08468,0.060965
Al-Farouq Aminu,0.048679,0.059705,0.082284,0.0,0.081773,0.08702,0.126822,0.144422,0.167006,0.115233,...,0.106348,0.090631,0.085157,0.083337,0.161339,0.190527,0.069675,0.176854,0.206195,0.09941
Alec Burks,0.027935,0.029962,0.046881,0.081773,0.0,0.058452,0.180773,0.080924,0.266155,0.025411,...,0.00766,0.028573,0.108905,0.027702,0.225225,0.0753,0.022631,0.268419,0.163592,0.010684


In [25]:
# Ten players closest to LeBron James by offensive cosine distance, nearest first.
# His own zero-distance entry is removed by label instead of assuming it sorts
# into position 0, which would break if another player tied at distance 0.
off_rec_df['LeBron James'].drop('LeBron James').sort_values().head(10)

Player
Luka Doncic        0.006992
Malcolm Brogdon    0.013774
Bradley Beal       0.016039
James Harden       0.016929
Kawhi Leonard      0.017814
Kyrie Irving       0.018259
Trae Young         0.018510
Brandon Ingram     0.020582
Pascal Siakam      0.020649
Lou Williams       0.021372
Name: LeBron James, dtype: float64

## Defensive

In [26]:
# One row per player holding the defensive feature columns
# (pivot_table aggregates with its default, the mean, and sorts rows by player name).
def_piv = nba_stats19[defensive_stats].pivot_table(index=nba_stats19['Player'])
# Replace any missing values with 0 and store the table as a CSR matrix.
def_sparse = sparse.csr_matrix(def_piv.fillna(0))
# Pairwise cosine *distance* between players: 0 means identical direction,
# so smaller values mean more similar players.
# NOTE(review): unlike the clustering cells, the features are not standardized
# here, so large-valued columns weigh more -- confirm this is intended.
def_recommender = pairwise_distances(def_sparse, metric='cosine')
# Square player-by-player frame, labelled with player names on both axes.
def_rec_df = pd.DataFrame(def_recommender, index=def_piv.index, columns=def_piv.index)
def_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Allen Crabbe,Andre Drummond,Andrew Wiggins,...,Tyler Herro,Tyler Johnson,Tyus Jones,Vince Carter,Wendell Carter Jr.,Wesley Matthews,Will Barton,Willie Cauley-Stein,Zach Collins,Zach LaVine
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.133236,0.044854,0.116557,0.008302,0.135305,0.163977,0.075157,0.032511,0.032141,...,0.028971,0.066659,0.188469,0.146976,0.090507,0.050998,0.024973,0.134814,0.138457,0.023388
Aaron Holiday,0.133236,0.0,0.173268,0.082713,0.100389,0.046802,0.359507,0.119363,0.183246,0.225896,...,0.108417,0.217794,0.044258,0.159812,0.321814,0.040881,0.124643,0.245351,0.384317,0.134632
Al Horford,0.044854,0.173268,0.0,0.089443,0.057646,0.170066,0.112493,0.086819,0.033572,0.067386,...,0.085937,0.061656,0.255786,0.109698,0.061065,0.111569,0.020023,0.071805,0.152242,0.063312
Al-Farouq Aminu,0.116557,0.082713,0.089443,0.0,0.100408,0.096033,0.173279,0.034831,0.08734,0.228095,...,0.127842,0.11982,0.148854,0.024517,0.163224,0.109655,0.097505,0.080563,0.312597,0.158756
Alec Burks,0.008302,0.100389,0.057646,0.100408,0.0,0.120373,0.203283,0.054018,0.056218,0.051058,...,0.014355,0.077862,0.128593,0.127109,0.124089,0.037562,0.038311,0.154624,0.192685,0.028677


## Overall

In [27]:
# One row per player holding the overall feature columns
# (pivot_table aggregates with its default, the mean, and sorts rows by player name).
over_piv = nba_stats19[overall_stats].pivot_table(index=nba_stats19['Player'])
# Replace any missing values with 0 and store the table as a CSR matrix.
over_sparse = sparse.csr_matrix(over_piv.fillna(0))
# Pairwise cosine *distance* between players: 0 means identical direction,
# so smaller values mean more similar players.
# NOTE(review): unlike the clustering cells, the features are not standardized
# here, so large-valued columns weigh more -- confirm this is intended.
over_recommender = pairwise_distances(over_sparse, metric='cosine')
# Square player-by-player frame, labelled with player names on both axes.
over_rec_df = pd.DataFrame(over_recommender, index=over_piv.index, columns=over_piv.index)
over_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Allen Crabbe,Andre Drummond,Andrew Wiggins,...,Tyler Herro,Tyler Johnson,Tyus Jones,Vince Carter,Wendell Carter Jr.,Wesley Matthews,Will Barton,Willie Cauley-Stein,Zach Collins,Zach LaVine
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.09253,0.026746,0.054973,0.006386,0.089115,0.110502,0.092795,0.069455,0.047695,...,0.012839,0.111435,0.198167,0.083081,0.102671,0.064194,0.100352,0.050329,0.089934,0.043238
Aaron Holiday,0.09253,0.0,0.082794,0.166356,0.06742,0.021476,0.214931,0.209275,0.229337,0.121859,...,0.09695,0.11346,0.103761,0.178264,0.303155,0.090841,0.126986,0.184291,0.056308,0.06698
Al Horford,0.026746,0.082794,0.0,0.120561,0.028042,0.064512,0.194956,0.201784,0.071896,0.035098,...,0.034304,0.202752,0.249215,0.135358,0.109739,0.02803,0.029273,0.079105,0.035956,0.074616
Al-Farouq Aminu,0.054973,0.166356,0.120561,0.0,0.083015,0.192647,0.018063,0.028833,0.081721,0.187595,...,0.098701,0.114252,0.255376,0.115496,0.121327,0.198017,0.227936,0.023361,0.190548,0.128949
Alec Burks,0.006386,0.06742,0.028042,0.083015,0.0,0.067273,0.136446,0.112207,0.106177,0.030806,...,0.005413,0.101818,0.172811,0.084448,0.142632,0.047126,0.096035,0.082757,0.07878,0.022495


In [28]:
# Ten players closest to LeBron James by overall cosine distance, nearest first.
# His own zero-distance entry is removed by label instead of assuming it sorts
# into position 0, which would break if another player tied at distance 0.
over_rec_df['LeBron James'].drop('LeBron James').sort_values().head(10)

Player
Ricky Rubio        0.014532
Luka Doncic        0.014983
Devonte' Graham    0.017321
Malcolm Brogdon    0.019002
Damian Lillard     0.019670
James Harden       0.019692
Ben Simmons        0.021973
Jimmy Butler       0.024029
Kawhi Leonard      0.027981
Jrue Holiday       0.028989
Name: LeBron James, dtype: float64

## Shooting

In [29]:
# One row per player holding the shooting feature columns
# (pivot_table aggregates with its default, the mean, and sorts rows by player name).
shoot_piv = nba_stats19[shooting_stats].pivot_table(index=nba_stats19['Player'])
# Replace any missing values with 0 and store the table as a CSR matrix.
shoot_sparse = sparse.csr_matrix(shoot_piv.fillna(0))
# Pairwise cosine *distance* between players: 0 means identical direction,
# so smaller values mean more similar players.
# NOTE(review): unlike the clustering cells, the features are not standardized
# here, so large-valued columns weigh more -- confirm this is intended.
shoot_recommender = pairwise_distances(shoot_sparse, metric='cosine')
# Square player-by-player frame, labelled with player names on both axes.
shoot_rec_df = pd.DataFrame(shoot_recommender, index=shoot_piv.index, columns=shoot_piv.index)
shoot_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Allen Crabbe,Andre Drummond,Andrew Wiggins,...,Tyler Herro,Tyler Johnson,Tyus Jones,Vince Carter,Wendell Carter Jr.,Wesley Matthews,Will Barton,Willie Cauley-Stein,Zach Collins,Zach LaVine
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.09287,0.023151,0.019876,0.004826,0.030749,0.062958,0.114798,0.179527,0.023773,...,0.03857,0.040412,0.033246,0.075272,0.047615,0.117669,0.014877,0.199302,0.084045,0.020967
Aaron Holiday,0.09287,0.0,0.143256,0.081359,0.078386,0.190578,0.264404,0.305844,0.31207,0.039165,...,0.080427,0.157333,0.063858,0.095503,0.206462,0.247987,0.043811,0.411826,0.28067,0.04276
Al Horford,0.023151,0.143256,0.0,0.024102,0.024259,0.02321,0.034491,0.066077,0.221069,0.064555,...,0.032623,0.01999,0.069588,0.071328,0.036298,0.082207,0.054279,0.225359,0.038337,0.066773
Al-Farouq Aminu,0.019876,0.081359,0.024102,0.0,0.01327,0.035285,0.077409,0.094875,0.259843,0.044208,...,0.016866,0.024372,0.054031,0.036601,0.072018,0.07435,0.02957,0.291336,0.078764,0.034754
Alec Burks,0.004826,0.078386,0.024259,0.01327,0.0,0.038731,0.079994,0.112286,0.217847,0.024913,...,0.022045,0.031329,0.045192,0.060579,0.066483,0.101533,0.01142,0.241448,0.079969,0.014224


In [30]:
# Ten players closest to LeBron James by shooting cosine distance, nearest first.
# His own zero-distance entry is removed by label instead of assuming it sorts
# into position 0, which would break if another player tied at distance 0.
shoot_rec_df['LeBron James'].drop('LeBron James').sort_values().head(10)

Player
Jrue Holiday             0.003714
Russell Westbrook        0.006408
Kyrie Irving             0.011316
Ja Morant                0.011485
Dennis Smith Jr.         0.011739
Eric Bledsoe             0.012138
Jeff Teague              0.013092
De'Aaron Fox             0.013229
Giannis Antetokounmpo    0.016624
Kawhi Leonard            0.020934
Name: LeBron James, dtype: float64

# Pickle the recommender systems

In [31]:
# Persist each player-by-player distance frame to its own pickle file.
# The context manager closes every file handle as soon as its dump finishes;
# plain 'wb' is sufficient because nothing is read back from the file.
recommender_pickles = {
    'pickles/offensive.p': off_rec_df,
    'pickles/defensive.p': def_rec_df,
    'pickles/overall.p': over_rec_df,
    'pickles/shoot.p': shoot_rec_df,
}
for pickle_path, rec_frame in recommender_pickles.items():
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(rec_frame, pickle_file)

## Creating a salary pickle

In [51]:
# Save the salary table to its own file: 'pickles/shoot.p' already holds the
# shooting recommender and must not be overwritten with salary data.
# NOTE: player_salary is built in the cell that follows this one; run that
# cell first on a fresh kernel.
with open('pickles/salary.p', 'wb') as salary_file:
    pickle.dump(player_salary, salary_file)

In [50]:
# Two-column salary lookup table: player name and salary.
player_salary = nba_stats19[['Player', 'salary']]
# set_index returns a new frame that is only displayed here; player_salary
# itself keeps 'Player' as a regular column.
player_salary.set_index('Player')

Unnamed: 0_level_0,salary
Player,Unnamed: 1_level_1
Bogdan Bogdanovic,"$8,529,386"
Bojan Bogdanovic,"$17,000,000"
CJ Miles,"$8,730,158"
Dzanan Musa,"$1,911,600"
Damion Lee,0
...,...
Patrick McCaw,"$4,000,000"
Rodney McGruder,"$4,629,630"
Matisse Thybulle,"$2,582,160"
Anthony Tolliver,"$2,564,753"


In [None]:
pi