In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

%matplotlib inline

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

## Import DataFrame

In [2]:
nba_stats19 = pd.read_csv('data/nba_stats19.csv', index_col = [0])
nba_stats19.columns

Index(['Player', 'Age_x', 'W', 'L', 'MP', 'FGM', 'FGA', 'FG%', '3PM', '3PA',
       '3P%', 'FTM', 'FTA', 'FT%_x', 'OREB', 'DREB_x', 'REB', 'AST', 'TOV',
       'PF', 'FP', 'DD2', 'TD3', '+/-', 'DEF_RTG', 'DREB_y', 'DREB%',
       'T_DREB%_x', 'STL', 'T_STL%_x', 'BLK', 'T_BLK%_x', 'OP_TOV',
       'OP_2NDPTS', 'OP_PAINT', 'DEF_WS', 'USG%', 'T_FGM%', 'T_FGA%', 'T_3PM%',
       'T_3PA%', 'T_FTM%', 'T_FTA%', 'T_OREB%', 'T_DREB%_y', 'T_REB%',
       'T_AST%', 'T_TOV%', 'T_STL%_y', 'T_BLK%_y', 'T_BLKA%', 'T_PF%',
       'T_PFD%', 'T_PTS%', 'Age_y', 'FGA_2P%', 'FGA_3P%', '2PT%', '2PT_MR%',
       '3PT%', 'FBPTS%', 'FT%_y', 'OFFTOV%', 'PAINTPTS%', 'AST_2FGM%',
       'UAST_2FGM%', 'AST_3FGM%', 'UAST_3FGM%', 'FGM_AST%', 'FGM_UAST%',
       'PTS_TOV', '2ND_CH_PTS', 'FB_PTS', 'PAINT_PTS', 'OPP_PTS_TOV',
       'OPP_2ND_PTS', 'OPP_FBPTS', 'OPP_PAINT_PTS', 'BLKA', 'PFD', 'Minutes',
       'BSR Off.', 'BSR Def.', 'BSR Total', 'OOR Off.', 'OOR Def.',
       'OOR Total', 'OVR Off.', 'OVR Def.', 'OVR t

In [3]:
nba_stats19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 461 entries, 0 to 460
Data columns (total 92 columns):
Player           461 non-null object
Age_x            461 non-null float64
W                461 non-null float64
L                461 non-null float64
MP               461 non-null float64
FGM              461 non-null float64
FGA              461 non-null float64
FG%              461 non-null float64
3PM              461 non-null float64
3PA              461 non-null float64
3P%              461 non-null float64
FTM              461 non-null float64
FTA              461 non-null float64
FT%_x            461 non-null float64
OREB             461 non-null float64
DREB_x           461 non-null float64
REB              461 non-null float64
AST              461 non-null float64
TOV              461 non-null float64
PF               461 non-null float64
FP               461 non-null float64
DD2              461 non-null float64
TD3              461 non-null float64
+/-              461 n

## Renaming columns that somehow got messed up

In [4]:
nba_stats19.drop(columns = ['DREB_y', 'T_DREB%_y', 'T_STL%_y', 'T_BLK%_y', 'Age_y','FT%_y'], inplace = True)
nba_stats19.rename(columns = {'Age_x': 'Age', 'FT%_x': 'FT%', 'DREB_x': 'DREB', 'T_DREB%_x' : 'T_DREB%',
                             'T_STL%_x': 'T_STL%', 'T_BLK%_x': 'T_BLK%'}, inplace = True)

In [5]:
nba_stats19.shape

(461, 86)

In [6]:
nba_stats19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 461 entries, 0 to 460
Data columns (total 86 columns):
Player           461 non-null object
Age              461 non-null float64
W                461 non-null float64
L                461 non-null float64
MP               461 non-null float64
FGM              461 non-null float64
FGA              461 non-null float64
FG%              461 non-null float64
3PM              461 non-null float64
3PA              461 non-null float64
3P%              461 non-null float64
FTM              461 non-null float64
FTA              461 non-null float64
FT%              461 non-null float64
OREB             461 non-null float64
DREB             461 non-null float64
REB              461 non-null float64
AST              461 non-null float64
TOV              461 non-null float64
PF               461 non-null float64
FP               461 non-null float64
DD2              461 non-null float64
TD3              461 non-null float64
+/-              461 n

## Sorting features into categories

In [7]:
offensive_stats = ['FG%', '3P%', 'OREB', 'TOV', 'DD2',
            'TD3', 'T_FGM%', 'T_FGA%', 'T_3PM%', 'T_3PA%', 'T_OREB%',
            'T_TOV%', 'T_STL%', '2ND_CH_PTS', 'FB_PTS', 'BSR Off.',
            'OOR Off.', 'OVR Off.', 'WAR', 'FP', 'STL']

In [8]:
defensive_stats = ['DREB', 'DEF_RTG', 'DREB%', 'T_DREB%', 'STL', 'T_STL%',
            'BLK', 'DEF_WS', 'T_DREB%', 'T_BLK%', 'T_BLKA%',
            'BLKA', 'BSR Def.', 'OVR Def.', 'OVR Def.', 'WAR', 'FP']

In [9]:
overall_stats = ['REB','AST', 'FP', 'DD2', 'TD3', 'USG%', 'T_REB%',
          'T_AST%', 'PTS_TOV', 'BSR Total', 'OOR Total',
          'OVR total', 'WAR']

In [10]:
shooting_stats = ['FGA_2P%', 'FGA_3P%', '2PT%', '2PT_MR%', '3P%',
           'FBPTS%', 'OFFTOV%', 'PAINTPTS%', 'AST_2FGM%', 'UAST_2FGM%',
           'AST_3FGM%', 'UAST_3FGM%', 'FGM_AST%', 'FGM_UAST%'
           ]

### Offensive

In [14]:
X = nba_stats19[offensive_stats]
y = nba_stats19['Player']

#scale data
sc = StandardScaler()
X_sc = sc.fit_transform(X)

#dimensionality reduction
pca = PCA(n_components = 5)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.70960900264003


In [15]:
#creating clusters
offensive = KMeans(n_clusters = 5, random_state= 248)
offensive.fit(X_pca)
y_kmeans = offensive.predict(X_pca)
nba_stats19['off_kmean'] = offensive.labels_
off_sil = silhouette_score(X_sc, offensive.labels_)
silhouette_score(X_sc, offensive.labels_)

0.20118878456482017

### Defensive

In [16]:
X = nba_stats19[defensive_stats]
y = nba_stats19['Player']

#scale data
sc = StandardScaler()
X_sc = sc.fit_transform(X)

#dimensionality reduction
pca = PCA(n_components = 5)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.8244629789328966


In [17]:
#creating clusters
defensive = KMeans(n_clusters = 5, random_state= 248)
defensive.fit(X_pca)
y_kmeans = defensive.predict(X_pca)
nba_stats19['def_kmean'] = defensive.labels_
def_sil = silhouette_score(X_sc, defensive.labels_)
silhouette_score(X_sc, defensive.labels_)

0.17014285371550952

### Overall

In [18]:
X = nba_stats19[overall_stats]
y = nba_stats19['Player']

#scale data
sc = StandardScaler()
X_sc = sc.fit_transform(X)

#dimensionality reduction
pca = PCA(n_components = 5)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.8297771297051779


In [19]:
#creating clusters
overall = KMeans(n_clusters = 5, random_state = 248)
overall.fit(X_pca)
y_kmeans = overall.predict(X_pca)
nba_stats19['ov_kmean'] = overall.labels_
over_sil = silhouette_score(X_sc, overall.labels_)
silhouette_score(X_sc, overall.labels_)

0.24889287162905363

### Shooting

In [20]:
X = nba_stats19[shooting_stats]
y = nba_stats19['Player']

#scale data
sc = StandardScaler()
X_sc = sc.fit_transform(X)

#dimensionality reduction
pca = PCA(n_components = 8)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.9635171166779746


In [21]:
#creating clusters
shoot = KMeans(n_clusters = 5, random_state = 248)
shoot.fit(X_pca)
y_kmeans = shoot.predict(X_pca)
nba_stats19['shoot_kmean'] = shoot.labels_
shoot_sil = silhouette_score(X_sc, shoot.labels_)
silhouette_score(X_sc, shoot.labels_)

0.28667910678720615

In [22]:
print(f'Shooting classification similarity score is {shoot_sil}')
print(f'Overall classification similarity score is {over_sil}')
print(f'Defensive classification similarity score is {def_sil}')
print(f'Offensive classification similarity score is {off_sil}')

Shooting classification similarity score is 0.28667910678720615
Overall classification similarity score is 0.24889287162905363
Defensive classification similarity score is 0.17014285371550952
Offensive classification similarity score is 0.20118878456482017


In [None]:
nba_stats19.to_csv('data/nba_2eda.csv')

# Rec system

In [23]:
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [24]:
nba_stats19 = nba_stats19[nba_stats19['MP'] > 10]

## Offensive

In [25]:
#create pivot table
off_piv = pd.pivot_table(nba_stats19[offensive_stats], index = nba_stats19['Player'])
#create sparse matrix
off_sparse = sparse.csr_matrix(off_piv.fillna(0))
# #calculate cosine similarity
off_recommender = pairwise_distances(off_sparse, metric = 'cosine')
# #creating recommender dataframe
off_rec_df = pd.DataFrame(off_recommender, columns = off_piv.index, index = off_piv.index)
off_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,Allen Crabbe,...,Vince Carter,Wayne Ellington,Wendell Carter Jr.,Wes Iwundu,Wesley Matthews,Will Barton,Willie Cauley-Stein,Yogi Ferrell,Zach Collins,Zach LaVine
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.062865,0.080335,0.007044,0.064425,0.036614,0.09233,0.084181,0.072362,0.066153,...,0.0832,0.15396,0.122313,0.060687,0.081352,0.016259,0.165188,0.046776,0.06961,0.043767
Aaron Holiday,0.062865,0.0,0.045158,0.080047,0.087496,0.022096,0.037552,0.220683,0.072213,0.05656,...,0.028649,0.073519,0.304102,0.095852,0.043742,0.047453,0.309201,0.034812,0.144879,0.028244
Abdel Nader,0.080335,0.045158,0.0,0.106623,0.114022,0.042895,0.095226,0.180826,0.075633,0.03939,...,0.043248,0.051349,0.308359,0.134435,0.052355,0.074623,0.341217,0.021788,0.110415,0.060702
Al Horford,0.007044,0.080047,0.106623,0.0,0.076409,0.049792,0.109774,0.093948,0.073274,0.078137,...,0.104513,0.179263,0.109948,0.081838,0.089144,0.014569,0.166322,0.069269,0.069727,0.067315
Al-Farouq Aminu,0.064425,0.087496,0.114022,0.076409,0.0,0.103917,0.085332,0.095789,0.080195,0.076858,...,0.116115,0.233386,0.183394,0.05321,0.150223,0.095413,0.16398,0.072323,0.144689,0.125585


In [26]:
#evaluating matrix
off_rec_df['LeBron James'].sort_values()[1:11]

Player
Luka Doncic          0.010247
Bradley Beal         0.016804
Malcolm Brogdon      0.016933
Pascal Siakam        0.021469
James Harden         0.024222
Kyrie Irving         0.024472
Brandon Ingram       0.024619
Kawhi Leonard        0.025052
Damian Lillard       0.025092
Spencer Dinwiddie    0.026511
Name: LeBron James, dtype: float64

## Defensive

In [27]:
#create pivot table
# def_piv = pd.pivot_table(nba_stats19[defensive_stats], index = nba_stats19['Player'])
# #create sparse matrix
# off_sparse = sparse.csr_matrix(off_piv.fillna(0))
# # #calculate cosine similarity
# off_recommender = pairwise_distances(off_sparse, metric = 'cosine')
# # #creating recommender dataframe
# off_rec_df = pd.DataFrame(off_recommender, columns = off_piv.index, index = off_piv.index)
# off_rec_df.head()

## Overall

In [28]:
#create pivot table
over_piv = pd.pivot_table(nba_stats19[overall_stats], index = nba_stats19['Player'])
# #create sparse matrix
over_sparse = sparse.csr_matrix(over_piv.fillna(0))
# # #calculate cosine similarity
over_recommender = pairwise_distances(over_sparse, metric = 'cosine')
# # #creating recommender dataframe
over_rec_df = pd.DataFrame(over_recommender, columns = over_piv.index, index = over_piv.index)
over_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,Allen Crabbe,...,Vince Carter,Wayne Ellington,Wendell Carter Jr.,Wes Iwundu,Wesley Matthews,Will Barton,Willie Cauley-Stein,Yogi Ferrell,Zach Collins,Zach LaVine
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.09253,0.09048,0.026746,0.054973,0.006386,0.089115,0.110502,0.19838,0.092795,...,0.083081,0.080277,0.102671,0.093784,0.064194,0.100352,0.050329,0.128649,0.089934,0.043238
Aaron Holiday,0.09253,0.0,0.149864,0.082794,0.166356,0.06742,0.021476,0.214931,0.408697,0.209275,...,0.178264,0.104511,0.303155,0.182218,0.090841,0.126986,0.184291,0.04784,0.056308,0.06698
Abdel Nader,0.09048,0.149864,0.0,0.181571,0.061875,0.087203,0.195861,0.0495,0.181469,0.038813,...,0.08231,0.086597,0.227903,0.039373,0.19844,0.28901,0.1195,0.155811,0.22957,0.095114
Al Horford,0.026746,0.082794,0.181571,0.0,0.120561,0.028042,0.064512,0.194956,0.276635,0.201784,...,0.135358,0.130399,0.109739,0.194251,0.02803,0.029273,0.079105,0.141386,0.035956,0.074616
Al-Farouq Aminu,0.054973,0.166356,0.061875,0.120561,0.0,0.083015,0.192647,0.018063,0.138439,0.028833,...,0.115496,0.125491,0.121327,0.036173,0.198017,0.227936,0.023361,0.1808,0.190548,0.128949


In [29]:
#evaluating matrix
over_rec_df['LeBron James'].sort_values()[1:11]

Player
Ricky Rubio        0.014532
Luka Doncic        0.014983
Devonte' Graham    0.017321
Malcolm Brogdon    0.019002
Damian Lillard     0.019670
James Harden       0.019692
Ben Simmons        0.021973
Jimmy Butler       0.024029
Kawhi Leonard      0.027981
Jrue Holiday       0.028989
Name: LeBron James, dtype: float64

## Shooting

In [32]:
#create pivot table
shoot_piv = pd.pivot_table(nba_stats19[shooting_stats], index = nba_stats19['Player'])
# #create sparse matrix
shoot_sparse = sparse.csr_matrix(shoot_piv.fillna(0))
# # #calculate cosine similarity
shoot_recommender = pairwise_distances(shoot_sparse, metric = 'cosine')
# # #creating recommender dataframe
shoot_rec_df = pd.DataFrame(shoot_recommender, columns = shoot_piv.index, index = shoot_piv.index)
shoot_rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,Allen Crabbe,...,Vince Carter,Wayne Ellington,Wendell Carter Jr.,Wes Iwundu,Wesley Matthews,Will Barton,Willie Cauley-Stein,Yogi Ferrell,Zach Collins,Zach LaVine
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.09287,0.042087,0.023151,0.019876,0.004826,0.030749,0.062958,0.149612,0.114798,...,0.075272,0.220554,0.047615,0.060434,0.117669,0.014877,0.199302,0.038932,0.084045,0.020967
Aaron Holiday,0.09287,0.0,0.120645,0.143256,0.081359,0.078386,0.190578,0.264404,0.383822,0.305844,...,0.095503,0.341734,0.206462,0.23629,0.247987,0.043811,0.411826,0.052004,0.28067,0.04276
Abdel Nader,0.042087,0.120645,0.0,0.033051,0.015478,0.028008,0.031096,0.086863,0.096213,0.063715,...,0.039069,0.105551,0.107338,0.075154,0.031795,0.051129,0.331465,0.055226,0.054305,0.05399
Al Horford,0.023151,0.143256,0.033051,0.0,0.024102,0.024259,0.02321,0.034491,0.095512,0.066077,...,0.071328,0.193677,0.036298,0.04366,0.082207,0.054279,0.225359,0.042366,0.038337,0.066773
Al-Farouq Aminu,0.019876,0.081359,0.015478,0.024102,0.0,0.01327,0.035285,0.077409,0.142578,0.094875,...,0.036601,0.161971,0.072018,0.063288,0.07435,0.02957,0.291336,0.02778,0.078764,0.034754


In [33]:
#evaluating matrix
shoot_rec_df['LeBron James'].sort_values()[1:11]

Player
Jrue Holiday             0.003714
Russell Westbrook        0.006408
Kyrie Irving             0.011316
Ja Morant                0.011485
Dennis Smith Jr.         0.011739
Eric Bledsoe             0.012138
Jeff Teague              0.013092
De'Aaron Fox             0.013229
Giannis Antetokounmpo    0.016624
Kawhi Leonard            0.020934
Name: LeBron James, dtype: float64

In [50]:
players = shoot_rec_df['LeBron James'].sort_values()[1:4]
player_dict = players.to_dict().index
player_keys = [i for i in player_dict.keys()]

['Jrue Holiday', 'Russell Westbrook', 'Kyrie Irving']

In [55]:
for i in player_dict:
    print(i)

Jrue Holiday
Russell Westbrook
Kyrie Irving


In [54]:
list(shoot_rec_df['LeBron James'].sort_values()[1:4].index)

['Jrue Holiday', 'Russell Westbrook', 'Kyrie Irving']

In [None]:
keys=[i for i in mydictionary.keys()]

# Pickle Rec systems

In [103]:
pickle.dump(off_rec_df, open('pickles/offensive.p', 'wb+'))
pickle.dump(over_rec_df, open('pickles/overall.p', 'wb+'))
pickle.dump(shoot_rec_df, open('pickles/shoot.p', 'wb+'))