In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import PolynomialFeatures, StandardScaler


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Dataframes 2019

This dataframe consists of stats from basketball reference & 538. There are some players in the 2019 dataframe that do not have stats because they have not played any games this season/are injured. For now, all of their values are filled with a 0

In [2]:
df19 = pd.read_csv('data/fixed19.csv', index_col = [0])
df19 = df19.iloc[:, :-3]
df19['G'] = df19['G'].astype(int)

### Data Preprocessing

In [3]:
df_bline = (df19[df19['MP'] > 30])

In [4]:
df_bline.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg',
       'url_list', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'salary', 'Team', 'Minutes', 'BSR Off.',
       'BSR Def.', 'BSR Total', 'OOR Off.', 'OOR Def.', 'OOR Total',
       'OVR Off.', 'OVR Def.', 'OVR total', 'WAR'],
      dtype='object')

In [6]:
X = df_bline.drop(['Player', 'Pos', 'Tm', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
              'AST', 'STL', 'BLK', 'PTS', 'url_list', '3PAr', 'FTr', 'STL%', 'BLK%', 'TOV%',
              'USG%', 'OWS', 'DWS', 'WS/48', 'salary', 'Team', 'Minutes', 'BSR Off.',
              'BSR Def.', 'OOR Off.', 'OOR Def.', 'OVR Off.', 'OVR Def.'], axis = 1)
y = df_bline['Player']

In [7]:
#scale data
sc = StandardScaler()
X_sc = sc.fit_transform(X)

### Unsupervised Learning - Kmeans + PCA

In [8]:
#dimensionality reduction
pca = PCA(n_components = 5)
pca.fit(X_sc)
X_pca = pca.transform(X_sc)
print("Cumulative Explained Variance:", pca.explained_variance_ratio_.sum())

Cumulative Explained Variance: 0.7196903770947045


In [9]:
#creating clusters
km = KMeans(n_clusters = 10, random_state= 248)
km.fit(X_pca)
y_kmeans = km.predict(X_pca)
silhouette_score(X_sc, km.labels_)

0.08525708825135944

In [10]:
#creating columns from KM labels
df_bline['cluster'] = km.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
df_bline['cluster'].value_counts(normalize = True)

8    0.146154
2    0.141026
5    0.120513
6    0.107692
7    0.102564
3    0.102564
1    0.094872
9    0.082051
4    0.082051
0    0.020513
Name: cluster, dtype: float64

In [13]:
df_bline.to_csv('baseline.csv')

In [12]:
df_bline.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg',
       'url_list', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM',
       'DBPM', 'BPM', 'VORP', 'salary', 'Team', 'Minutes', 'BSR Off.',
       'BSR Def.', 'BSR Total', 'OOR Off.', 'OOR Def.', 'OOR Total',
       'OVR Off.', 'OVR Def.', 'OVR total', 'WAR', 'cluster'],
      dtype='object')

In [192]:
check_col = ['FG%', 'PER', 'VORP', 'OVR total', 'WAR']
newplot = df_bline[check_col]

# Rec System

## Basketball Ref

In [30]:
season.shape

(453, 54)

In [31]:
df= season.drop(columns = ['Pos', 'Age', 'Tm', 'G', 'MP'],
               axis = 1)
df.dropna(inplace = True)

In [32]:
#create pivot table
pivot = pd.pivot_table(df, index = 'Player')
pivot.drop(0, inplace = True)
pivot.head()

Unnamed: 0_level_0,2P,2P%,2PA,3P,3P%,3PA,3PAr,AST,AST%,BLK,...,TRB,TRB%,TS%,USG%,VORP,WS,WS/48,cluster,Unnamed: 20_level_0,.1
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,6.0,0.491,12.2,1.7,0.283,6.1,0.333,4.6,15.0,0.8,...,10.0,10.7,0.509,20.2,0.2,0.8,0.092,3,0.0,0.0
Aaron Holiday,4.6,0.387,11.9,3.3,0.412,8.1,0.405,8.1,23.6,0.2,...,6.2,6.9,0.498,21.1,0.1,0.6,0.088,3,0.0,0.0
Abdel Nader,2.8,0.429,6.5,2.3,0.313,7.5,0.533,1.4,4.2,1.4,...,7.0,7.9,0.501,19.1,-0.1,-0.1,-0.058,0,0.0,0.0
Admiral Schofield,2.7,1.0,2.7,3.6,0.571,6.3,0.7,0.9,2.4,0.0,...,7.2,8.5,0.935,9.4,0.0,0.2,0.191,6,0.0,0.0
Al Horford,7.1,0.523,13.5,2.2,0.328,6.6,0.327,6.3,19.5,1.3,...,10.9,12.7,0.527,20.6,0.6,1.6,0.161,3,0.0,0.0


In [33]:
#create a sparse matrix
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))

In [34]:
#calculating cosine similarity
recommender = pairwise_distances(sparse_pivot, metric = 'cosine')

In [35]:
#creating recommender dataframe
rec_df = pd.DataFrame(recommender, columns = pivot.index, index = pivot.index)
rec_df.head()

Player,Aaron Gordon,Aaron Holiday,Abdel Nader,Admiral Schofield,Al Horford,Al-Farouq Aminu,Alec Burks,Alex Caruso,Alex Len,Alfonzo McKinnie,...,Wesley Matthews,Will Barton,Willie Cauley-Stein,Willy Hernangómez,Yogi Ferrell,Yuta Watanabe,Zach Collins,Zach LaVine,Zach Norvell,Zylan Cheatham
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Gordon,0.0,0.005092,0.026005,0.047285,0.003141,0.012515,0.003444,0.013058,0.006664,0.01292,...,0.012861,0.002353,0.007116,0.010475,0.009878,0.069752,0.01375,0.009428,0.312032,0.088275
Aaron Holiday,0.005092,0.0,0.026657,0.054931,0.007413,0.019344,0.00607,0.008788,0.015671,0.01947,...,0.017408,0.007885,0.016705,0.015469,0.006675,0.081204,0.014766,0.007269,0.32132,0.088866
Abdel Nader,0.026005,0.026657,0.0,0.090071,0.043484,0.015041,0.024012,0.013019,0.015141,0.013001,...,0.031536,0.037641,0.035885,0.019408,0.014721,0.134351,0.013156,0.033452,0.22484,0.033618
Admiral Schofield,0.047285,0.054931,0.090071,0.0,0.042325,0.052243,0.041556,0.051112,0.05851,0.043458,...,0.029014,0.034575,0.038074,0.051008,0.049259,0.052896,0.041505,0.078151,0.480893,0.187254
Al Horford,0.003141,0.007413,0.043484,0.042325,0.0,0.021287,0.008154,0.021519,0.014818,0.022236,...,0.019196,0.001104,0.008033,0.01608,0.018828,0.055075,0.023191,0.01252,0.359079,0.118702


In [36]:
#evaluating matrix
rec_df['LeBron James'].sort_values()[1:11]

Player
Malcolm Brogdon      0.004274
Luka Dončić          0.004310
Kyrie Irving         0.006605
Nikola Jokić         0.008260
Trae Young           0.008595
Kawhi Leonard        0.010061
Eric Bledsoe         0.010674
Russell Westbrook    0.010957
D'Angelo Russell     0.012731
Stephen Curry        0.012859
Name: LeBron James, dtype: float64

## 538 rec system

In [37]:
scrape538.shape

(447, 15)

In [38]:
scrape538.columns

Index(['Player', 'Team', 'Position(s)', 'Minutes', 'BSR Off.', 'BSR Def.',
       'BSR Total', 'OOR Off.', 'OOR Def.', 'OOR Total', 'OVR Off.',
       'OVR Def.', 'OVR total', 'WAR', 'cluster'],
      dtype='object')