In [94]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

In [116]:
# import csv 

nba_data = pd.read_csv("nba_data.csv")
nba_data.count()

player_id           2625
Player              2625
Season              2625
G                   2625
GS                  2625
MP                  2625
FG                  2625
FGA                 2625
fgpercentage        2625
threePM             2625
threePA             2625
threePPercentage    2625
twoPM               2625
twoPA               2625
twoPPercentage      2625
efgpercentage       2625
FT                  2625
FTA                 2625
ftpercentage        2625
ORB                 2625
DRB                 2625
TRB                 2625
AST                 2625
STL                 2625
BLK                 2625
TOV                 2625
PF                  2625
PTS                 2625
PER                 2625
BPM                 2625
VORP                2625
dtype: int64

In [117]:
# Remove extreme outlier
nba_data['G'] = nba_data['G'].astype(float)
#Using drop() to delete rows based on column value
nba_data = nba_data.drop(nba_data[nba_data['G'] <= 6].index) 
nba_data = nba_data.drop(nba_data[nba_data['MP'] <= 3].index)
nba_data.count()

player_id           2393
Player              2393
Season              2393
G                   2393
GS                  2393
MP                  2393
FG                  2393
FGA                 2393
fgpercentage        2393
threePM             2393
threePA             2393
threePPercentage    2393
twoPM               2393
twoPA               2393
twoPPercentage      2393
efgpercentage       2393
FT                  2393
FTA                 2393
ftpercentage        2393
ORB                 2393
DRB                 2393
TRB                 2393
AST                 2393
STL                 2393
BLK                 2393
TOV                 2393
PF                  2393
PTS                 2393
PER                 2393
BPM                 2393
VORP                2393
dtype: int64

In [118]:
# create advanced columns df for the machine learning model

advanced = [nba_data["G"],nba_data["MP"], nba_data["PER"],nba_data["BPM"],nba_data["VORP"]]
advanced_df = pd.concat(advanced, axis=1)
advanced_df

Unnamed: 0,G,MP,PER,BPM,VORP
0,68.0,15.5,10.1,-1.6,0.1
1,38.0,14.7,11.8,-2.1,0.0
2,80.0,29.9,16.5,-0.2,1.1
3,61.0,25.9,8.9,-3.6,-0.7
4,39.0,15.0,12.9,-3.3,-0.2
...,...,...,...,...,...
2620,63.0,27.7,16.3,2.2,1.8
2621,68.0,24.3,20.3,3.3,2.2
2622,63.0,33.7,23.0,3.7,3.0
2623,48.0,20.9,18.2,-0.5,0.4


In [119]:
# scale the advanced df

nba_scaled = StandardScaler().fit_transform(advanced_df)
print(nba_scaled[0:5])

[[ 0.76476195 -0.58127607 -0.66293725 -0.1377741  -0.39371766]
 [-0.61545477 -0.67283399 -0.32734393 -0.29229857 -0.47298456]
 [ 1.31684864  1.06676642  0.60047289  0.29489442  0.39895133]
 [ 0.44271139  0.60897684 -0.89982665 -0.75587198 -1.02785284]
 [-0.56944754 -0.63849977 -0.11019531 -0.6631573  -0.63151835]]


In [120]:
# Initialize PCA model
pca = PCA(n_components=3)

In [121]:
# Get 3 principal components for the data.
nba_pca = pca.fit_transform(nba_scaled)

In [122]:
# create PCA df
df_nba_pca = pd.DataFrame(
    data = nba_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=nba_data.index)
df_nba_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.54903,-0.691784,-0.762504
1,-1.025293,0.415434,-0.103336
2,1.531965,-1.005637,-0.089367
3,-0.877664,-1.394199,0.340511
4,-1.153116,0.310684,-0.114554


In [123]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_nba_pca)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [125]:
# make "Player" and "Year" its own df

player = [nba_data["Player"],nba_data["Season"]]
player_name = pd.concat(player, axis=1)
player_name.head()

Unnamed: 0,Player,Season
0,Álex Abrines,2016-17
1,Quincy Acy,2016-17
2,Steven Adams,2016-17
3,Arron Afflalo,2016-17
4,Alexis Ajinça,2016-17


In [126]:
# Initialize the K-means model
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(df_nba_pca)

# Predict clusters
predictions = model.predict(df_nba_pca)

# Add the predicted class columns
player_name["class"] = model.labels_
player_name.head()

Unnamed: 0,Player,Season,class
0,Álex Abrines,2016-17,0
1,Quincy Acy,2016-17,1
2,Steven Adams,2016-17,3
3,Arron Afflalo,2016-17,0
4,Alexis Ajinça,2016-17,1


In [127]:
#sort by class

sorted_class = player_name.sort_values(["class"])
class_one = sorted_class[(sorted_class["class"] == 0)]
class_two = sorted_class[(sorted_class["class"] == 1)]
class_three = sorted_class[(sorted_class["class"] == 2)]
class_four = sorted_class[(sorted_class["class"] == 3)]
class_five = sorted_class[(sorted_class["class"] == 4)]

In [133]:
# class 1 df

df_one = pd.concat([class_one, advanced_df], axis=1)
class_one_df = df_one.dropna()
class_one_df.tail()

Unnamed: 0,Player,Season,class,G,MP,PER,BPM,VORP
2607,Grant Williams,2020-21,0.0,63.0,18.1,7.5,-3.7,-0.5
2608,Kenrich Williams,2020-21,0.0,66.0,21.6,14.0,-0.7,0.4
2609,Lou Williams,2020-21,0.0,66.0,21.6,14.0,-2.5,-0.2
2610,Patrick Williams,2020-21,0.0,71.0,27.9,10.5,-2.4,-0.2
2617,James Wiseman,2020-21,0.0,39.0,21.4,13.1,-5.0,-0.6


In [134]:
# class 2 df

df_two = pd.concat([class_two, advanced_df], axis=1)
class_two_df = df_two.dropna()
class_two_df.tail()

Unnamed: 0,Player,Season,class,G,MP,PER,BPM,VORP
2599,Paul Watson,2020-21,1.0,27.0,11.0,10.9,-1.3,0.1
2604,Hassan Whiteside,2020-21,1.0,36.0,15.2,19.2,-2.2,0.0
2613,D.J. Wilson,2020-21,1.0,35.0,12.4,11.5,-3.3,-0.1
2614,Dylan Windler,2020-21,1.0,31.0,16.5,10.8,-1.7,0.0
2623,Cody Zeller,2020-21,1.0,48.0,20.9,18.2,-0.5,0.4


In [135]:
# class 3 df

df_three = pd.concat([class_three, advanced_df], axis=1)
class_three_df = df_three.dropna()
class_three_df.tail()

Unnamed: 0,Player,Season,class,G,MP,PER,BPM,VORP
2586,Gabe Vincent,2020-21,2.0,50.0,13.1,7.6,-5.4,-0.6
2600,Quinndary Weatherspoon,2020-21,2.0,20.0,6.1,9.6,-5.0,-0.1
2615,Justise Winslow,2020-21,2.0,26.0,19.5,6.2,-6.0,-0.5
2616,Cassius Winston,2020-21,2.0,22.0,4.5,10.8,-3.2,0.0
2619,Robert Woodard II,2020-21,2.0,13.0,3.5,12.5,-5.3,0.0


In [136]:
# class 4 df

df_four = pd.concat([class_four, advanced_df], axis=1)
class_four_df = df_four.dropna()
class_four_df.tail()

Unnamed: 0,Player,Season,class,G,MP,PER,BPM,VORP
2606,Andrew Wiggins,2020-21,3.0,71.0,33.3,15.0,-0.4,1.0
2618,Christian Wood,2020-21,3.0,41.0,32.3,20.0,1.4,1.1
2620,Delon Wright,2020-21,3.0,63.0,27.7,16.3,2.2,1.8
2621,Thaddeus Young,2020-21,3.0,68.0,24.3,20.3,3.3,2.2
2624,Ivica Zubac,2020-21,3.0,72.0,22.3,19.1,1.0,1.2


In [137]:
# class 5 df

df_five = pd.concat([class_five, advanced_df], axis=1)
class_five_df = df_five.dropna()
class_five_df.tail()

Unnamed: 0,Player,Season,class,G,MP,PER,BPM,VORP
2588,Nikola Vučević,2020-21,4.0,70.0,33.5,22.9,5.3,4.4
2601,Russell Westbrook,2020-21,4.0,65.0,36.4,19.5,3.7,3.4
2611,Robert Williams,2020-21,4.0,52.0,18.9,25.7,6.0,2.0
2612,Zion Williamson,2020-21,4.0,61.0,33.2,27.1,5.8,4.0
2622,Trae Young,2020-21,4.0,63.0,33.7,23.0,3.7,3.0


In [139]:
# Plotting OVR Model
fig = px.scatter_3d(
    nba_data,
    x="vorp",
    y="bpm",
    z="per",
    color="class",
    hover_name= nba_data.index,
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['player_id', 'Player', 'Season', 'G', 'GS', 'MP', 'FG', 'FGA', 'fgpercentage', 'threePM', 'threePA', 'threePPercentage', 'twoPM', 'twoPA', 'twoPPercentage', 'efgpercentage', 'FT', 'FTA', 'ftpercentage', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PER', 'BPM', 'VORP'] but received: vorp