In [41]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

In [42]:
# Load the average-player CSV, then use its "Unnamed: 0" column as the row
# index and clear the index name so it is not shown as a header.

average_data = pd.read_csv("average_player.csv")
average_player_df = average_data.set_index("Unnamed: 0").rename_axis(None)
average_player_df.head()

Unnamed: 0,Player,Season,class,G,MP,PER,BPM,VORP
0,Álex Abrines,2016-17,0.0,68.0,15.5,10.1,-1.6,0.1
3,Arron Afflalo,2016-17,0.0,61.0,25.9,8.9,-3.6,-0.7
7,Lavoy Allen,2016-17,0.0,61.0,14.3,11.6,-1.7,0.1
9,Al-Farouq Aminu,2016-17,0.0,61.0,29.1,11.3,-1.1,0.4
12,Justin Anderson,2016-17,0.0,75.0,16.4,13.9,-1.3,0.2


 The class-one DataFrame has 720 players. Looking at the data, these players tend to be average NBA players.
 I plan to run an elbow curve and add clusters within this DataFrame.

In [43]:
# Collect the stat columns fed to the machine learning model into one DataFrame

feature_names = ("G", "MP", "PER", "BPM", "VORP")
average = [average_player_df[name] for name in feature_names]
average_df = pd.concat(average, axis=1)
average_df.head()

Unnamed: 0,G,MP,PER,BPM,VORP
0,68.0,15.5,10.1,-1.6,0.1
3,61.0,25.9,8.9,-3.6,-0.7
7,61.0,14.3,11.6,-1.7,0.1
9,61.0,29.1,11.3,-1.1,0.4
12,75.0,16.4,13.9,-1.3,0.2


In [21]:
# Standardize the feature columns with StandardScaler and preview the first five rows

scaler = StandardScaler()
average_scaled = scaler.fit_transform(average_df)
print(average_scaled[:5])

[[ 0.45109532 -1.07660714 -0.50496959  0.48092718  0.3733581 ]
 [-0.16320363  0.87910336 -1.04293966 -1.09984236 -1.48536502]
 [-0.16320363 -1.30226604  0.167493    0.4018887   0.3733581 ]
 [-0.16320363  1.48086044  0.03300048  0.87611956  1.07037927]
 [ 1.06539428 -0.90736296  1.1986023   0.71804261  0.60569849]]


In [22]:
# Set up a PCA model that keeps three components
n_components = 3
pca = PCA(n_components=n_components)

In [23]:
# Fit the PCA model on the scaled features and project them onto its components
average_pca = pca.fit_transform(X=average_scaled)

In [25]:
# Wrap the PCA output in a DataFrame that reuses the player table's index
pca_column_names = [
    "principal component 1",
    "principal component 2",
    "principal component 3",
]
df_average_pca = pd.DataFrame(
    data=average_pca,
    index=average_player_df.index,
    columns=pca_column_names,
)
df_average_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.32659,-0.800373,-1.037976
3,2.085626,0.804853,0.387227
7,-0.51439,-1.221254,-0.485757
9,-1.245178,0.93009,0.847339
12,-1.466461,-0.178891,-1.110267


In [44]:
# Elbow analysis: record the K-means inertia for each candidate number of clusters
k = list(range(1, 11))

# One model per K; fit() returns the fitted estimator, so inertia_ is read inline
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(df_average_pca).inertia_
    for n_clusters in k
]

# Plot inertia against K
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [45]:
# Keep the identifying columns ("Player" and "Season") in their own DataFrame

player = [average_player_df[column] for column in ("Player", "Season")]
player_name = pd.concat(player, axis=1)
player_name.head()

Unnamed: 0,Player,Season
0,Álex Abrines,2016-17
3,Arron Afflalo,2016-17
7,Lavoy Allen,2016-17
9,Al-Farouq Aminu,2016-17
12,Justin Anderson,2016-17


In [46]:
# Elbow curve is 2 so we will have two clusters

# Initialize the K-means model
model = KMeans(n_clusters=2, random_state=0)

# Fit the model and take the training labels in a single pass. fit_predict
# returns the labels that fitting stores on model.labels_, so a separate
# predict() call over the same data is not needed.
predictions = model.fit_predict(df_average_pca)

# Add the predicted class column. The labels are a plain array and are assigned
# by position, so this assumes player_name has the same row order as
# df_average_pca.
player_name["class"] = predictions
player_name.head()

Unnamed: 0,Player,Season,class
0,Álex Abrines,2016-17,0
3,Arron Afflalo,2016-17,1
7,Lavoy Allen,2016-17,0
9,Al-Farouq Aminu,2016-17,0
12,Justin Anderson,2016-17,0


In [47]:
# Order the players by cluster label, then split them into one frame per label

sorted_class = player_name.sort_values(by="class")
is_label_zero = sorted_class["class"].eq(0)
is_label_one = sorted_class["class"].eq(1)
class_one = sorted_class.loc[is_label_zero]
class_two = sorted_class.loc[is_label_one]
class_one.head()

Unnamed: 0,Player,Season,class
0,Álex Abrines,2016-17,0
1139,Quinn Cook,2018-19,0
1134,Zach Collins,2018-19,0
1130,Gary Clark,2018-19,0
1125,Tyson Chandler,2018-19,0


In [49]:
# class 1 df

# Attach the stat columns to the class_one players. concat's default outer join
# on the index yields NaN in Player/Season/class for every row of average_df
# that is not in class_one (the NaN also upcasts "class" to float).
df_one = pd.concat([class_one, average_df], axis=1)

# Dropping rows with any missing value therefore keeps only the class_one
# players. Note that it would also drop a class_one player with a missing stat.
top_average = df_one.dropna()
top_average.tail(50)

Unnamed: 0,Player,Season,class,G,MP,PER,BPM,VORP
2324,Andre Iguodala,2020-21,0.0,63.0,21.3,9.2,-0.4,0.5
2342,Cameron Johnson,2020-21,0.0,60.0,24.0,11.8,-0.2,0.7
2343,James Johnson,2020-21,0.0,51.0,20.5,11.4,-1.2,0.2
2344,Keldon Johnson,2020-21,0.0,69.0,28.5,13.8,-1.6,0.2
2349,Derrick Jones Jr.,2020-21,0.0,58.0,22.7,11.9,-1.4,0.2
2354,Cory Joseph,2020-21,0.0,63.0,23.0,12.5,-1.8,0.1
2357,Luke Kennard,2020-21,0.0,63.0,19.6,12.7,-0.2,0.6
2359,Maxi Kleber,2020-21,0.0,50.0,26.8,10.6,0.0,0.7
2363,Furkan Korkmaz,2020-21,0.0,55.0,19.3,12.8,-0.1,0.5
2374,Damion Lee,2020-21,0.0,57.0,18.9,11.8,0.2,0.6


In [50]:
# class 2 df

# Attach the stat columns to the class_two players. concat's default outer join
# on the index yields NaN in Player/Season/class for every row of average_df
# that is not in class_two (the NaN also upcasts "class" to float).
df_two = pd.concat([class_two, average_df], axis=1)

# Dropping rows with any missing value therefore keeps only the class_two
# players. Note that it would also drop a class_two player with a missing stat.
bottom_average = df_two.dropna()
bottom_average.tail(50)

Unnamed: 0,Player,Season,class,G,MP,PER,BPM,VORP
2132,Bismack Biyombo,2020-21,1.0,66.0,20.4,11.7,-3.0,-0.4
2155,Dillon Brooks,2020-21,1.0,67.0,29.8,12.1,-3.7,-0.9
2215,Luguentz Dort,2020-21,1.0,52.0,29.7,10.1,-3.5,-0.6
2216,Damyean Dotson,2020-21,1.0,46.0,19.7,8.1,-4.8,-0.6
2219,PJ Dozier,2020-21,1.0,50.0,21.8,10.4,-2.6,-0.2
2242,Malachi Flynn,2020-21,1.0,47.0,19.7,10.9,-2.6,-0.1
2265,Brandon Goodwin,2020-21,1.0,47.0,13.2,9.2,-3.1,-0.2
2278,Rui Hachimura,2020-21,1.0,57.0,31.5,11.4,-3.1,-0.5
2283,R.J. Hampton,2020-21,1.0,51.0,17.4,11.1,-3.8,-0.4
2289,Gary Harris,2020-21,1.0,39.0,27.7,9.1,-4.2,-0.6
