In [1]:
import pandas as pd
import numpy as np

import pickle

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Import DataFrame

In [2]:
# Read the player stats CSV, using its first column as the DataFrame index
nba_stats19 = pd.read_csv('../data/nba_stats19.csv', index_col = [0])

Some tables contained the same metrics. I will drop one copy of each duplicated column and rename the remaining ones.

In [3]:
# Metrics that were present in more than one source table and therefore carry _x/_y suffixes
duplicated_metrics = ['DREB', 'T_DREB%', 'T_STL%', 'T_BLK%', 'Age', 'FT%']
# Drop the `_y` copy of each metric and strip the `_x` suffix from the surviving copy.
# Both column sets are derived from the single list above so they cannot drift apart,
# and the frame is reassigned instead of being mutated with inplace=True.
nba_stats19 = (
    nba_stats19
    .drop(columns = [metric + '_y' for metric in duplicated_metrics])
    .rename(columns = {metric + '_x': metric for metric in duplicated_metrics})
)

In [4]:
# Check the frame's dimensions after removing the duplicated columns
nba_stats19.shape

(466, 87)

## Sorting features into categories

I want to make a recommendation system for offensive, defensive, overall, and shooting styles on the court. I am sorting the features into these four categories to use in my clustering models.

In [5]:
# Columns that make up the offensive profile used for clustering and the offensive recommender
offensive_stats = [
    'FG%', '3P%', 'OREB', 'TOV', 'DD2', 'TD3',
    'T_FGM%', 'T_FGA%', 'T_3PM%', 'T_3PA%', 'T_OREB%', 'T_TOV%',
    '2ND_CH_PTS', 'FB_PTS', 'BSR Off.', 'OOR Off.', 'OVR Off.',
    'FP', 'STL', 'USG%', 'FGM_UAST%', 'AST',
]

In [6]:
# Columns that make up the defensive profile used for clustering and the defensive recommender
defensive_stats = [
    'DREB', 'DEF_RTG', 'DREB%', 'T_DREB%',
    'STL', 'T_STL%',
    'BLK', 'DEF_WS', 'T_BLK%', 'T_BLKA%', 'BLKA',
    'BSR Def.', 'OVR Def.', 'WAR', 'FP',
]

In [7]:
# Columns that make up the overall profile used for clustering and the overall recommender
overall_stats = [
    'REB', 'AST', 'DD2', 'TD3', 'USG%', 'T_REB%', 'T_AST%',
    'PTS_TOV', 'BLK', 'T_BLKA%', 'AST_2FGM%', 'AST_3FGM%', 'STL', 'T_STL%',
    'FG%', 'FGA_2P%', 'T_FGA%', 'TOV', 'T_DREB%', 'T_OREB%', 'MP',
]

In [8]:
# Columns that make up the shooting profile used for clustering and the shooting recommender
shooting_stats = [
    'FGA_2P%', 'FGA_3P%', '2PT%', '2PT_MR%', '3P%',
    'FBPTS%', 'OFFTOV%', 'PAINTPTS%',
    'AST_2FGM%', 'UAST_2FGM%', 'AST_3FGM%', 'UAST_3FGM%',
    'FGM_AST%', 'FGM_UAST%',
]

I am only including players who, on average, play more than 6 minutes a game. I am using this metric since some teams have played fewer than 20 games of the 82-game season. In the future I plan on narrowing my player search based on how many games a player has participated in during the season.

In [9]:
# Players must exceed this value in the `MP` column to stay in the analysis
MIN_MINUTES_PER_GAME = 6
plays_enough = nba_stats19['MP'] > MIN_MINUTES_PER_GAME
nba_stats19 = nba_stats19.loc[plays_enough]

In [10]:
# Check how many players remain after the minutes filter
nba_stats19.shape

(416, 87)

# KMEANS & PCA

The strength of our clusters will determine how accurate our recommendation systems will be

In [11]:
# Shared estimators: nba_stats() refits these same three objects on every call,
# so after a call they hold the state of the most recently processed category.
sc = StandardScaler()
pca = PCA(n_components = 6)
# Fixed random_state so the k-means initialisation is repeatable between runs
clusters = KMeans(n_clusters = 3, random_state =248)

In [12]:
def nba_stats(category):
    """Scale, PCA-reduce and k-means cluster one feature category, printing fit diagnostics.

    Relies on the notebook-level ``nba_stats19`` frame and on the shared ``sc``,
    ``pca`` and ``clusters`` estimators, all of which are refit on every call.

    Parameters
    ----------
    category : list of str
        Column names of ``nba_stats19`` to cluster on.

    Returns
    -------
    None
        The cumulative explained variance of the PCA and the silhouette score
        of the clustering are printed rather than returned.
    """
    X = nba_stats19[category]
    # Scale data
    X_sc = sc.fit_transform(X)
    # Dimensionality reduction
    pca.fit(X_sc)
    X_pca = pca.transform(X_sc)
    print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())
    # Create clusters. Fitting already stores the training assignments in
    # `labels_`, so no separate predict() pass over the same data is needed.
    clusters.fit(X_pca)
    # NOTE: the clusters are fit on the PCA projection, but the silhouette is
    # measured against the full scaled feature space (X_sc).
    print("Silhouette Score:", silhouette_score(X_sc, clusters.labels_))

### Offensive Clusters

In [13]:
# Scale, reduce and cluster the offensive features; prints explained variance and silhouette score
nba_stats(offensive_stats)

Cumulative Explained Variance: 0.8053770412125152
Silhouette Score: 0.2405207113200336


### Defensive

In [14]:
# Scale, reduce and cluster the defensive features; prints explained variance and silhouette score
nba_stats(defensive_stats)

Cumulative Explained Variance: 0.8889370304947651
Silhouette Score: 0.18771922566788357


### Shooting

In [15]:
# Scale, reduce and cluster the shooting features; prints explained variance and silhouette score
nba_stats(shooting_stats)

Cumulative Explained Variance: 0.9023852707439991
Silhouette Score: 0.2589192755988837


### Overall

In [16]:
# Scale, reduce and cluster the overall features; prints explained variance and silhouette score
nba_stats(overall_stats)

Cumulative Explained Variance: 0.7704451289818448
Silhouette Score: 0.243719363925963


Category | PCA Explained Variance | KMEANS Silhouette Score
------- | ------ | -----
Offensive | .80538 | .240521
Defensive | .88894 | .187719
Shooting | .90238 | .25892
Overall | .77044 | .24372

This DataFrame can be used for EDA of players and their clusters. Note that it contains only the filtered player stats; the cluster labels computed above are not stored in it.

In [17]:
# Written after the MP > 6 filter, so the CSV holds only the retained players
nba_stats19.to_csv('../data/nba_2eda.csv')

# Creating Recommendation DataFrames

#### Offensive

In [18]:
#create pivot table
off_piv = pd.pivot_table(nba_stats19[offensive_stats], index = nba_stats19['Player'])
#create sparse matrix
off_sparse = sparse.csr_matrix(off_piv.fillna(0))
# #calculate cosine similarity
off_recommender = pairwise_distances(off_sparse, metric = 'cosine')
# #creating recommender dataframe
off_rec_df = pd.DataFrame(off_recommender, columns = off_piv.index, index = off_piv.index)

### Defensive

In [19]:
# One row per player holding the mean of each defensive feature (pivot_table's default aggregation)
def_piv = nba_stats19[defensive_stats].pivot_table(index = nba_stats19['Player'])
# Missing values become 0 before the table is packed into CSR form
def_sparse = sparse.csr_matrix(def_piv.fillna(value = 0))
# Pairwise cosine *distance* between players: smaller values mean more similar players
def_recommender = pairwise_distances(def_sparse, metric = 'cosine')
# Label both axes with player names so the matrix can be queried by name
def_rec_df = pd.DataFrame(data = def_recommender, index = def_piv.index, columns = def_piv.index)

### Overall

In [20]:
# One row per player holding the mean of each overall feature (pivot_table's default aggregation)
over_piv = nba_stats19[overall_stats].pivot_table(index = nba_stats19['Player'])
# Missing values become 0 before the table is packed into CSR form
over_sparse = sparse.csr_matrix(over_piv.fillna(value = 0))
# Pairwise cosine *distance* between players: smaller values mean more similar players
over_recommender = pairwise_distances(over_sparse, metric = 'cosine')
# Label both axes with player names so the matrix can be queried by name
over_rec_df = pd.DataFrame(data = over_recommender, index = over_piv.index, columns = over_piv.index)

### Shooting

In [21]:
# One row per player holding the mean of each shooting feature (pivot_table's default aggregation)
shoot_piv = nba_stats19[shooting_stats].pivot_table(index = nba_stats19['Player'])
# Missing values become 0 before the table is packed into CSR form
shoot_sparse = sparse.csr_matrix(shoot_piv.fillna(value = 0))
# Pairwise cosine *distance* between players: smaller values mean more similar players
shoot_recommender = pairwise_distances(shoot_sparse, metric = 'cosine')
# Label both axes with player names so the matrix can be queried by name
shoot_rec_df = pd.DataFrame(data = shoot_recommender, index = shoot_piv.index, columns = shoot_piv.index)

# Pickles

Pickles of all given recommendation systems

In [22]:
# Persist each recommender matrix. The `with` block guarantees every file handle is
# flushed and closed; passing a bare open(...) straight to pickle.dump left them open.
# Plain 'wb' is enough because the files are only written, never read back here.
recommender_pickles = [
    ('../pickles/offensive.p', off_rec_df),
    ('../pickles/defensive.p', def_rec_df),
    ('../pickles/overall.p', over_rec_df),
    ('../pickles/shoot.p', shoot_rec_df),
]
for pickle_path, rec_df in recommender_pickles:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(rec_df, pickle_file)

Pickling Salary 

In [23]:
# Player -> salary lookup table. `player_salary` is bound only to the DataFrame
# (it used to hold the column list first, then the frame).
player_salary = nba_stats19[['Player', 'salary']]
play_sal = player_salary.set_index('Player')
# Context manager closes the file handle once the pickle has been written
with open('../pickles/salaries.p', 'wb') as salary_file:
    pickle.dump(play_sal, salary_file)

In [24]:
# Three closest players to LeBron James by offensive cosine distance. His own entry is
# dropped by name instead of assuming it sorts first and slicing it off positionally,
# so he stays out of the result even if another player ties at distance 0.
players = off_rec_df['LeBron James'].drop('LeBron James').sort_values().iloc[:3].index
players

Index(['Luka Doncic', 'Bradley Beal', 'Kawhi Leonard'], dtype='object', name='Player')

In [25]:
# Preview the salary lookup table, which is indexed by player name
play_sal.head()

Unnamed: 0_level_0,salary
Player,Unnamed: 1_level_1
Ante Zizic,"$2,281,800"
Boban Marjanovic,"$3,500,000"
Bogdan Bogdanovic,"$8,529,386"
Bojan Bogdanovic,"$17,000,000"
CJ Miles,"$8,730,158"


--------------------------------------

Pickling team names - still in progress - need to figure out how best to shape my database for the SQL query in the Flask app

In [None]:
# Player -> team lookup table. `player_team` is bound only to the DataFrame
# (it used to hold the column list first, then the frame).
player_team = nba_stats19[['Player', 'Team']]
play_team = player_team.set_index('Player')
# Context manager closes the file handle once the pickle has been written
with open('../pickles/teams.p', 'wb') as team_file:
    pickle.dump(play_team, team_file)

In [None]:
# `Player` is the index of play_sal (set via set_index), not a column, so
# play_sal['Player'] raises KeyError; select the rows by label instead.
# A single .loc call also keeps the salary of every recommended player, whereas
# the loop rebound `listen` on each pass and could only ever keep the last one.
listen = play_sal.loc[players]

In [None]:
# Display the salary lookup result assigned in the cell above
listen