In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

## Basketball Reference Dataframes

In [55]:
# Read the two 2018-19 CSV exports into DataFrames: `poss` comes from
# season1819.csv and `ad_1819` from ad_1819.csv (read in that order).
poss, ad_1819 = (
    pd.read_csv(csv_name) for csv_name in ('season1819.csv', 'ad_1819.csv')
)

In [56]:
# Collapse each table to one row per player. When a name appears more than once,
# only its first occurrence is kept (pandas' default `keep='first'`).
# NOTE(review): presumably the repeats are per-team rows for traded players — confirm.
poss = poss.drop_duplicates(subset=['Player'])
ad_1819 = ad_1819.drop_duplicates(subset=['Player'])

In [57]:
# Discard the two leftover unnamed columns from `poss`.
poss = poss.drop(columns=['Unnamed: 0', 'Unnamed: 29'])

# Strip the unnamed column plus 'Pos', 'Age', 'Tm', 'G' and 'MP' from `ad_1819`.
# Those five also exist in `poss`, so removing them here keeps the later merge on
# 'Player' from producing suffixed duplicate columns.
ad_1819 = ad_1819.drop(columns=['Unnamed: 0', 'Pos', 'Age', 'Tm', 'G', 'MP'])

In [58]:
# Inner-join the two tables on the player name; a player missing from either
# table does not appear in `season`.
season = poss.merge(ad_1819, how='inner', on='Player')

In [59]:
# Display the column names of the merged table to check the join result.
season.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg',
       'url_list', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', ' ', 'OWS', 'DWS', 'WS', 'WS/48', ' .1',
       'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [60]:
# Replace every missing value in the merged table with 0 (applied to all columns).
season = season.fillna(value=0)

## Unsupervised Learning - KMeans + PCA

In [61]:
# Display the available columns again as a reference for choosing clustering features.
season.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg',
       'url_list', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', ' ', 'OWS', 'DWS', 'WS', 'WS/48', ' .1',
       'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [62]:
# Build the clustering feature matrix `X` by removing the identifier/text columns
# and the stats that are excluded from clustering. `y` keeps the player names.
X = season.drop(columns=[
    'Player', 'Pos', 'Tm', 'MP', 'FG',
    'FG%', 'FGA', '3P%', '3PA', '2P', '2P%', 'FT', 'FTA',
    'TRB', 'STL', 'BLK', 'PTS', 'PF',
    'url_list', 'PER', 'TS%', 'OWS', 'DWS', 'WS/48', 'BLK%',
    'TOV%', 'FTr', '3PAr',
    # Blank-named columns carried over from the source table; they are not stats
    # and must not be counted as features.
    # NOTE(review): presumably empty spacer columns (all 0 after fillna) — confirm.
    ' ', ' .1',
])
# `season` gains a 'cluster' column further down. Drop it when present so that
# re-running this cell never feeds previous cluster labels back in as a feature.
X = X.drop(columns=['cluster'], errors='ignore')
y = season['Player']

In [63]:
# Standardize the features: fit the scaler on X, then apply it to the same data.
sc = StandardScaler().fit(X)
X_sc = sc.transform(X)

In [64]:
# Dimensionality reduction: fit a 2-component PCA on the scaled features, then
# project those same features onto the fitted components.
pca = PCA(n_components=2).fit(X_sc)
X_pca = pca.transform(X_sc)

In [65]:
# Create clusters: fit a 5-cluster KMeans on the 2-D PCA projection.
# n_init is pinned to 10 (the historical default) because scikit-learn changed
# the default to 'auto' in 1.4; pinning keeps results comparable across versions.
km = KMeans(n_clusters=5, n_init=10, random_state=248)
km.fit(X_pca)
# Cluster index for every row, obtained by assigning each point to its nearest centroid.
y_kmeans = km.predict(X_pca)
# NOTE(review): the clusters are fitted on X_pca but the silhouette is measured on
# X_sc (the full scaled feature space). Confirm this is intended; scoring on X_pca
# would evaluate the clustering in the space it was actually fitted in.
silhouette_score(X_sc, km.labels_)

0.13368471097482784

In [66]:
# Attach each row's KMeans label to the season table as a 'cluster' column.
season = season.assign(cluster=km.labels_)

In [67]:
# Share of players in each cluster (fractions sum to 1 because normalize=True).
season['cluster'].value_counts(normalize = True)

4    0.393597
0    0.261770
3    0.218456
1    0.120527
2    0.005650
Name: cluster, dtype: float64

## 538 Dataframe - KMeans + PCA

In [33]:
# Load the 538 scrape and discard its leftover 'Unnamed: 0' column in one step.
scrape538 = pd.read_csv('data/scrape538.csv').drop(columns=['Unnamed: 0'])

In [34]:
# Display (rows, columns) of the 538 table after dropping the unnamed column.
scrape538.shape

(447, 14)

In [35]:
# Display the 538 column names as a reference for choosing clustering features.
scrape538.columns

Index(['Player', 'Team', 'Position(s)', 'Minutes', 'BSR Off.', 'BSR Def.',
       'BSR Total', 'OOR Off.', 'OOR Def.', 'OOR Total', 'OVR Off.',
       'OVR Def.', 'OVR total', 'WAR'],
      dtype='object')

In [36]:
# Build the 538 feature matrix by removing the three text columns; everything
# else is kept as a clustering feature. `y` keeps the player names.
# NOTE: this rebinds `X` and `y`, replacing the values from the previous section.
X = scrape538.drop(columns=['Player', 'Team', 'Position(s)'])
# `scrape538` gains a 'cluster' column further down. Drop it when present so that
# re-running this cell never feeds previous cluster labels back in as a feature.
X = X.drop(columns=['cluster'], errors='ignore')
y = scrape538['Player']

In [37]:
# Standardize the 538 features with a freshly fitted scaler (rebinds `sc` and `X_sc`).
sc = StandardScaler().fit(X)
X_sc = sc.transform(X)

In [38]:
# Dimensionality reduction: fit a 4-component PCA on the scaled 538 features,
# then project those same features onto the fitted components.
pca = PCA(n_components=4).fit(X_sc)
X_pca = pca.transform(X_sc)

In [46]:
# Create clusters: fit a 5-cluster KMeans on the 4-D PCA projection.
# n_init is pinned to 10 (the historical default) because scikit-learn changed
# the default to 'auto' in 1.4; pinning keeps results comparable across versions.
km = KMeans(n_clusters=5, n_init=10, random_state=248)
km.fit(X_pca)
# Cluster index for every row, obtained by assigning each point to its nearest centroid.
y_kmeans = km.predict(X_pca)
# NOTE(review): the clusters are fitted on X_pca but the silhouette is measured on
# X_sc (the full scaled feature space). Confirm this is intended; scoring on X_pca
# would evaluate the clustering in the space it was actually fitted in.
silhouette_score(X_sc, km.labels_)

0.2352700137580218

In [47]:
# Attach each row's KMeans label to the 538 table as a 'cluster' column.
scrape538 = scrape538.assign(cluster=km.labels_)

In [48]:
# Share of players in each cluster (fractions sum to 1 because normalize=True).
scrape538['cluster'].value_counts(normalize = True)

3    0.483221
0    0.272931
4    0.123043
1    0.093960
2    0.026846
Name: cluster, dtype: float64