In [44]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Dataframe 2019

this dataframe consists of up to date stasitcs of the 2019 season. The stats from this dataframe is included from basketball-reference satistcs per 100 team possessions and their advanced statistics. 538 raptor scor ratings are included in  this as well. 

Note: 538 raptor score ratings turn individual players into team talent estimates. The on/off ratpor scores evaluates how a player's team performs while he was on the floor, how the player's courtmates performed without player, how the courtmates' other courtmates performed when they were on the floor without player's courtmates. The raptor box scores takes into account individual player statistics. 

In [65]:
df19 = pd.read_csv('data/fixed19.csv', index_col = [0])
df19 = df19.iloc[:, :-3]
df19['G'] = df19['G'].astype(int)

In [66]:
df19.shape

(470, 64)

## Data Preprocessing

I'm dropping all players who haven't played more than 30 minutes. There has been less than 20 games per season so I am taking minutes into account

In [67]:
df_bline = (df19[df19['MP'] > 48])

In [68]:
df_bline.shape

(378, 64)

### Quick feature engineering of specific values

There are some stats that are valuable on the court but low in numbers. For example, steals are valuable for any competitive game, but even the best players would have less than 3 steals per game. So we want to put more emphasis on these stats.

In [69]:
df_bline['AST_2x'] = (df_bline['AST'] * 2)
df_bline['STL_2x'] = (df_bline['STL'] * 2)
df_bline['BLK_2x'] = (df_bline['BLK'] * 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Sorting features into categories

In [70]:
df_bline.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg',
       'url_list', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'salary', 'Team', 'Minutes', 'BSR Off.',
       'BSR Def.', 'BSR Total', 'OOR Off.', 'OOR Def.', 'OOR Total',
       'OVR Off.', 'OVR Def.', 'OVR total', 'WAR', 'AST_2x', 'STL_2x',
       'BLK_2x'],
      dtype='object')

In [71]:
offensive = ['FG%', '3PA', '2PA', 'ORB', 'AST_2x', 'TOV',
            'ORtg', 'DRtg', '3PAr', 'ORB%', 'AST%', 'TOV%',
            'OWS', 'OBPM', 'BSR Off.', 'OOR Off.', 'OVR Off.',
            'WAR']

In [72]:
defensive = ['DRB', 'STL_2x', 'BLK_2x', 'DRtg', 'DRB%', 
            'BLK%', 'DWS', 'DBPM', 'BSR Def.', 'OOR Def.',
            'OVR Def.', 'WAR']

In [73]:
overall = ['FG', 'FT%', 'TRB', 'PER', 'TS%', 'TRB%',
          'WS/48', 'BPM', 'BSR Total', 'OOR Total',
          'OVR total', 'WAR']

### Offensive Practice

In [74]:
X = df_bline[offensive]
y = df_bline['Player']

#scale data
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [75]:
#dimensionality reduction
pca = PCA(n_components = 5)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.8336167309711154


In [76]:
#creating clusters
km = KMeans(n_clusters = 10, random_state= 248)
km.fit(X_pca)
y_kmeans = km.predict(X_pca)
silhouette_score(X_sc, km.labels_)

0.13628004697176288

### Defensive Practice

In [77]:
X = df_bline[defensive]
X_sc = sc.fit_transform(X)

#dimensionality reduction
pca = PCA(n_components = 5)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)

In [78]:
#creating clusters
km = KMeans(n_clusters = 10, random_state= 248)
km.fit(X_pca)
y_kmeans = km.predict(X_pca)
silhouette_score(X_sc, km.labels_)

0.14006248737510987

### Overall Practice

In [79]:
X = df_bline[overall]
X_sc = sc.fit_transform(X)

#dimensionality reduction
pca = PCA(n_components = 5)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)

#dimensionality reduction
pca = PCA(n_components = 5)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.9019742486885843


In [80]:
#creating clusters
km = KMeans(n_clusters = 10, random_state = 248)
km.fit(X_pca)
y_kmeans = km.predict(X_pca)
silhouette_score(X_sc, km.labels_)

0.14741566799713737