In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

## Basketball Reference Dataframes

In [2]:
poss = pd.read_csv('data/season1819.csv')
ad_1819 = pd.read_csv('data/ad_1819.csv')

In [3]:
poss = poss.drop_duplicates(subset = 'Player', keep = 'first')
ad_1819 = ad_1819.drop_duplicates(subset = 'Player', keep = 'first')

In [4]:
poss = poss.drop(['Unnamed: 0', 'Unnamed: 29'], axis = 1)
ad_1819 = ad_1819.drop(['Unnamed: 0', 'Pos', 'Age', 'Tm', 'G',
                       'MP'], axis = 1)

In [5]:
season = pd.merge(poss, ad_1819, on = 'Player')

In [6]:
season.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg',
       'url_list', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', ' ', 'OWS', 'DWS', 'WS', 'WS/48', ' .1',
       'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [7]:
season = season.fillna(0)

## Unsupervised Learning - Kmeans + PCA

In [83]:
season.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg',
       'url_list', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', ' ', 'OWS', 'DWS', 'WS', 'WS/48', ' .1',
       'OBPM', 'DBPM', 'BPM', 'VORP', 'cluster'],
      dtype='object')

In [84]:
X = season.drop(['Player', 'Pos', 'Tm', 'MP', 'FG',
                'FG%', 'FGA', '3P%', '3PA', '2P', '2P%', 'FT', 'FTA',
                'TRB', 'STL', 'BLK', 'PTS', 'PF',
                'url_list', 'PER', 'TS%', 'OWS', 'DWS', 'WS/48', 'BLK%',
                'TOV%', 'FTr', '3PAr'], axis = 1)
y = season['Player']

In [85]:
#scale data
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [86]:
#dimensionality reduction
pca = PCA(n_components = 2)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)

In [87]:
#creating clusters
km = KMeans(n_clusters = 10, random_state= 248)
km.fit(X_pca)
y_kmeans = km.predict(X_pca)
silhouette_score(X_sc, km.labels_)

0.0924809560867558

In [88]:
#creating columns from KM labels
season['cluster'] = km.labels_

In [89]:
season['cluster'].value_counts(normalize = True)

1    0.209040
5    0.188324
2    0.173258
4    0.129944
9    0.094162
7    0.084746
8    0.045198
6    0.043315
3    0.026365
0    0.005650
Name: cluster, dtype: float64

# 538 Dataframe - KMeans + PCA

In [24]:
scrape538 = pd.read_csv('data/scrape538.csv') 
scrape538 = scrape538.drop(['Unnamed: 0'], axis = 1)

In [25]:
scrape538.shape

(447, 14)

In [26]:
scrape538.columns

Index(['Player', 'Team', 'Position(s)', 'Minutes', 'BSR Off.', 'BSR Def.',
       'BSR Total', 'OOR Off.', 'OOR Def.', 'OOR Total', 'OVR Off.',
       'OVR Def.', 'OVR total', 'WAR'],
      dtype='object')

In [77]:
X = scrape538.drop(['Player', 'Team', 'Position(s)'], axis = 1)
y = scrape538['Player']

In [78]:
# scale data
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [79]:
#dimensionality reduction
pca = PCA(n_components = 3)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)

In [80]:
#creating clusters
km = KMeans(n_clusters = 10, random_state = 248)
km.fit(X_pca)
y_kmeans = km.predict(X_pca)
silhouette_score(X_sc, km.labels_)

0.1628076682082633

In [81]:
#creating columns for km labels
scrape538['cluster'] = km.labels_

In [82]:
scrape538['cluster'].value_counts(normalize = True)

8    0.290828
0    0.196868
1    0.187919
4    0.129754
3    0.067114
7    0.055928
5    0.040268
6    0.020134
9    0.006711
2    0.004474
Name: cluster, dtype: float64

# Rec System

In [8]:
season.shape

(369, 53)

In [11]:
df= season.drop(columns = ['Pos', 'Age', 'Tm', 'G', 'MP'],
               axis = 1)
df.dropna(inplace = True)

In [22]:
#create pivot table
pivot = pd.pivot_table(df, index = 'Player')
pivot.drop(0, inplace = True)
pivot.head()

Unnamed: 0_level_0,2P,2P%,2PA,3P,3P%,3PA,3PAr,AST,AST%,BLK,...,TOV%,TRB,TRB%,TS%,USG%,VORP,WS,WS/48,Unnamed: 20_level_0,.1
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,6.0,0.491,12.2,1.7,0.283,6.1,0.332,4.6,16.6,0.8,...,12.3,10.0,11.7,0.538,21.8,2.0,5.1,0.093,0.0,0.0
Aaron Holiday,4.6,0.387,11.9,3.3,0.412,8.1,0.485,8.1,19.3,0.2,...,12.3,6.2,5.8,0.518,21.9,-0.1,0.9,0.065,0.0,0.0
Abdel Nader,2.8,0.429,6.5,2.3,0.313,7.5,0.465,1.4,3.8,1.4,...,10.1,7.0,8.6,0.522,15.1,-0.5,0.9,0.062,0.0,0.0
Al Horford,7.1,0.523,13.5,2.2,0.328,6.6,0.281,6.3,21.2,1.3,...,11.8,10.9,12.4,0.605,18.9,3.4,7.5,0.181,0.0,0.0
Al-Farouq Aminu,2.1,0.333,6.4,1.2,0.286,4.3,0.472,2.6,6.0,1.1,...,9.7,12.1,14.2,0.568,13.7,1.7,5.8,0.121,0.0,0.0


In [23]:
#create a sparse matrix
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))

In [24]:
#calculating cosine similarity
recommender = pairwise_distances(sparse_pivot, metric = 'cosine')

In [25]:
#creating recommender dataframe
rec_df = pd.DataFrame(recommender, columns = pivot.index, index = pivot.index)
rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,Alize Johnson,...,Wendell Carter,Wesley Iwundu,Wesley Matthews,Will Barton,Willie Cauley-Stein,Willy Hernangómez,Yogi Ferrell,Yuta Watanabe,Zach Collins,Zach LaVine
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.005508,0.020118,0.003694,0.011642,0.0045,0.011562,0.005331,0.013644,0.028263,...,0.008148,0.014846,0.011967,0.00364,0.005933,0.010947,0.010553,0.069778,0.010281,0.009264
Aaron Holiday,0.005508,0.0,0.019212,0.010442,0.020382,0.004763,0.006024,0.01284,0.017534,0.034828,...,0.018336,0.015945,0.010452,0.004879,0.016615,0.020439,0.005305,0.068544,0.013497,0.008194
Abdel Nader,0.020118,0.019212,0.0,0.036826,0.011709,0.013402,0.016522,0.011456,0.009037,0.040812,...,0.034966,0.005808,0.024727,0.025143,0.027946,0.01993,0.011744,0.107485,0.011488,0.03251
Al Horford,0.003694,0.010442,0.036826,0.0,0.019572,0.010897,0.018032,0.013445,0.022973,0.03119,...,0.008859,0.027808,0.017593,0.006163,0.006702,0.0164,0.018317,0.061359,0.017922,0.013224
Al-Farouq Aminu,0.011642,0.020382,0.011709,0.019572,0.0,0.011787,0.016396,0.00556,0.003324,0.023214,...,0.016146,0.006911,0.019805,0.01702,0.008216,0.008681,0.014773,0.092105,0.004997,0.035931


In [26]:
#evaluating matrix
rec_df['LeBron James'].sort_values()[1:11]

Player
Kyrie Irving         0.004916
Luka Dončić          0.005689
Nikola Jokić         0.006092
Russell Westbrook    0.007458
Eric Bledsoe         0.007458
Kemba Walker         0.008246
Devin Booker         0.008270
Damian Lillard       0.009166
D'Angelo Russell     0.010287
Chris Paul           0.010674
Name: LeBron James, dtype: float64