In [None]:
# -------------------------------
#    Importing Dependencies 
# -------------------------------

# SQL Libraries
import psycopg2

from getpass import getpass

# ML Libraries
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Using psycopg2 to pull from the NBA_Analysis DB
password = getpass()
cn = psycopg2.connect(host = "127.0.0.1", port = "5432", database = "NBA_Analysis", user = "postgres", password = password)
cur = cn.cursor()


#Querying test Data
cur.execute("""
SELECT main.id, main.player, main.u_player, misc.stl, misc.drb, misc.blk
FROM main  
LEFT JOIN misc
ON main.id = misc.id;
""")
test_results = cur.fetchall()

# Getting the column headers for test Table
cur.execute("""
SELECT main.id, main.player, main.u_player, misc.stl, misc.drb, misc.blk
FROM main  
LEFT JOIN misc
ON main.id = misc.id LIMIT 0;
""")
test_colnames = [desc[0] for desc in cur.description]

#NEW DATA ------------------------------------------s
# Over all player query
cur.execute("""
SELECT main.id, main.player, main.u_player, misc.vorp, misc.per, misc.bpm
FROM main  
LEFT JOIN misc
ON main.id = misc.id
where main.g >= 3;
""")
ovr_results = cur.fetchall()

# Getting the column headers for test Table
cur.execute("""
SELECT main.id, main.player, main.u_player, misc.vorp, misc.per, misc.bpm
FROM main  
LEFT JOIN misc
ON main.id = misc.id LIMIT 0;
""")
ovr_colnames = [desc[0] for desc in cur.description]

#PCA Query ------------------------------------------
# Over all player query
cur.execute("""
SELECT t1.*, t2.ft, t2.fta, t2.ft_perc, t2.orb,
t2.drb, t2.trb, t2.ast, t2.stl, t2.blk, t2.tov, t2.pf,
t2.pts, t2.per, t2.bpm, t2.vorp
FROM main as t1 
LEFT JOIN misc as t2
ON t1.id = t2.id
WHERE t1.g >= 3;
""")
pca_results = cur.fetchall()

# Getting the column headers for test Table
cur.execute("""
SELECT t1.*, t2.ft, t2.fta, t2.ft_perc, t2.orb,
t2.drb, t2.trb, t2.ast, t2.stl, t2.blk, t2.tov, t2.pf,
t2.pts, t2.per, t2.bpm, t2.vorp
FROM main as t1 
LEFT JOIN misc as t2
ON t1.id = t2.id LIMIT 0;
""")


pca_colnames = [desc[0] for desc in cur.description]


# PCA Query ------------------------------------------
# Over all player query
cur.execute("""
SELECT t1.*, t2.ft, t2.fta, t2.ft_perc, t2.orb,
t2.drb, t2.trb, t2.ast, t2.stl, t2.blk, t2.tov, t2.pf,
t2.pts, t2.per, t2.bpm, t2.vorp
FROM main as t1 
LEFT JOIN misc as t2
ON t1.id = t2.id
WHERE t1.g >= 3;
""")
analysis = cur.fetchall()

# Getting the column headers for test Table
cur.execute("""
SELECT t1.*, t2.ft, t2.fta, t2.ft_perc, t2.orb,
t2.drb, t2.trb, t2.ast, t2.stl, t2.blk, t2.tov, t2.pf,
t2.pts, t2.per, t2.bpm, t2.vorp
FROM main as t1 
LEFT JOIN misc as t2
ON t1.id = t2.id LIMIT 0;
""")

analysis_cols = [desc[0] for desc in cur.description]



In [None]:
# Assigning SQL data to df's
test_df = pd.DataFrame(test_results)
test_df.columns = test_colnames
test_df.head()

#overall df
ovr_df = pd.DataFrame(ovr_results)
ovr_df.columns = ovr_colnames
ovr_df.head()

#overall df
pca_df = pd.DataFrame(pca_results)
pca_df.columns = pca_colnames
ovr_df.head()

#overall df
nba_data = pd.DataFrame(analysis)
nba_data.columns = analysis_cols
nba_data.head()
print(nba_data.columns)

In [None]:
# Remove extreme outlier
nba_data['g'] = nba_data['g'].astype(float)
#Using drop() to delete rows based on column value
nba_data = nba_data.drop(nba_data[nba_data['g'] <= 6].index) 
nba_data = nba_data.drop(nba_data[nba_data['mp'] <= 3].index)
nba_data.count()

In [None]:
# create traditional columns df for the machine learning model

traditional = [nba_data["g"],nba_data["mp"], nba_data["pts"],nba_data["ast"],nba_data["trb"],nba_data["tov"]]
traditional_df = pd.concat(traditional, axis=1)
traditional_df

In [None]:
# scale the traditional df

nba_scaled = StandardScaler().fit_transform(traditional_df)
print(nba_scaled[0:5])

In [None]:
# Initialize PCA model
pca = PCA(n_components=3)

In [None]:
# Get 3 principal components for the data.
nba_pca = pca.fit_transform(nba_scaled)

In [None]:
# create PCA df
df_nba_pca = pd.DataFrame(
    data = nba_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=nba_data.index)
df_nba_pca.head()

In [None]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_nba_pca)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [None]:
# make "Player" and "Year" its own df

player = [nba_data["player"],nba_data["year"]]
player_name = pd.concat(player, axis=1)
player_name.head()

In [None]:
# Initialize the K-means model
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(df_nba_pca)

# Predict clusters
predictions = model.predict(df_nba_pca)

# Add the predicted class columns
player_name["class"] = model.labels_
player_name.head()

In [None]:
#sort by class

sorted_class = player_name.sort_values(["class"])
class_one = sorted_class[(sorted_class["class"] == 0)]
class_two = sorted_class[(sorted_class["class"] == 1)]
class_three = sorted_class[(sorted_class["class"] == 2)]
class_four = sorted_class[(sorted_class["class"] == 3)]
class_five = sorted_class[(sorted_class["class"] == 4)]

In [None]:
# class 1 df

df_one = pd.concat([class_one, traditional_df], axis=1)
class_one_df = df_one.dropna()
class_one_df.sample(20)

In [98]:
# create 2 groups for class one for average guards and good guards

# split class one into 2
trad_g_classes =[class_one_df["g"],class_one_df["mp"], class_one_df["pts"],class_one_df["ast"],class_one_df["trb"],class_one_df["tov"]]
trad_g_classes_df = pd.concat(trad_g_classes, axis=1)


# scale the guard df

g_scaled = StandardScaler().fit_transform(trad_g_classes_df)
# Initialize PCA model
pca_g = PCA(n_components=3)
# Get 3 principal components for the data.
g_pca = pca.fit_transform(g_scaled)

# create PCA df
df_g_pca = pd.DataFrame(
    data = g_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=class_one_df.index)
df_g_pca.head()

# make "Player" and "Year" its own df for av

player_g = [class_one_df["player"],class_one_df["year"]]
player_name_g = pd.concat(player_g, axis=1)
player_name_g.head()


# Initialize the K-means model
model_g = KMeans(n_clusters=2, random_state=0)

# Fit the model
model_g.fit(df_g_pca)

# Predict clusters
predictions_g = model.predict(df_g_pca)

# Add the predicted class columns
player_name_g["class"] = model_g.labels_
player_name_g.head()

#sort by class av

sorted_class_g = player_name_g.sort_values(["class"])
class_one_g = sorted_class_g[(sorted_class_g["class"] == 0)]
class_two_g = sorted_class_g[(sorted_class_g["class"] == 1)]


In [110]:
# class 1 df for guards

df_one_g = pd.concat([class_one_g, trad_g_classes_df], axis=1)
top_g = pd.DataFrame(index=class_one_df.index)
top_g = df_one_g.dropna()
top_g.drop('class', axis=1, inplace=True)

top_g["class"] = "good guards"

top_g.sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,player,year,g,mp,pts,ast,trb,tov,class
2219,Joe Harris,2020-21,69.0,31.0,14.1,1.9,3.6,0.9,good guards
1419,Collin Sexton,2018-19,82.0,31.8,16.7,3.0,2.9,2.3,good guards
1238,Jaren Jackson Jr.,2018-19,58.0,26.1,13.8,1.1,4.7,1.7,good guards
1013,Trevor Ariza,2018-19,69.0,34.0,12.5,3.7,5.4,1.5,good guards
2140,Hamidou Diallo,2020-21,52.0,23.6,11.6,1.9,5.2,1.5,good guards
2515,Lonnie Walker IV,2020-21,60.0,25.4,11.2,1.7,2.6,1.1,good guards
1578,Alec Burks,2019-20,66.0,26.6,15.0,2.9,4.3,1.4,good guards
1342,Emmanuel Mudiay,2018-19,59.0,27.2,14.8,3.9,3.3,2.4,good guards
2027,Kyle Anderson,2020-21,69.0,27.3,12.4,3.6,5.7,1.2,good guards
2246,Kevin Huerter,2020-21,69.0,30.8,11.9,3.5,3.3,1.1,good guards


In [111]:
# class 2 df for guards

df_two_g = pd.concat([class_two_g, trad_g_classes_df], axis=1)
low_g = pd.DataFrame(index=class_one_df.index)
low_g = df_two_g.dropna()
low_g.drop('class', axis=1, inplace=True)

low_g["class"] = "average"

low_g.sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,player,year,g,mp,pts,ast,trb,tov,class
1099,Darren Collison,2018-19,76.0,28.2,11.2,6.0,3.1,1.6,average
250,Ty Lawson,2016-17,69.0,25.1,9.9,4.8,2.6,1.9,average
308,Emmanuel Mudiay,2016-17,55.0,25.6,11.0,3.9,3.2,2.2,average
2407,Elfrid Payton,2020-21,63.0,23.6,10.1,3.2,3.4,1.6,average
261,Jeremy Lin,2016-17,36.0,24.5,14.5,5.1,3.8,2.4,average
736,Cory Joseph,2017-18,82.0,27.0,7.9,3.2,3.2,1.1,average
605,Spencer Dinwiddie,2017-18,80.0,28.8,12.6,6.6,3.2,1.6,average
2036,D.J. Augustin,2020-21,57.0,19.8,7.7,3.3,1.6,1.1,average
2094,Jalen Brunson,2020-21,68.0,25.0,12.6,3.5,3.4,1.2,average
220,Brandon Jennings,2016-17,81.0,22.2,7.1,4.9,2.4,1.6,average


In [106]:
# class 2 df

df_two = pd.concat([class_two, traditional_df], axis=1)
class_two_df = df_two.dropna()
class_two_df.tail()
class_two_df.drop('class', axis=1, inplace=True)

class_two_df["class"] = "poor"

class_two_df.sample(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,player,year,g,mp,pts,ast,trb,tov,class
1525,J.J. Barea,2019-20,29.0,15.5,7.7,3.9,1.8,1.3,poor
1379,Theo Pinson,2018-19,18.0,11.7,4.5,1.2,2.0,1.0,poor
2483,Tyrell Terry,2020-21,11.0,5.1,1.0,0.5,0.5,0.2,poor
429,Jarrod Uthoff,2016-17,9.0,12.8,4.4,1.0,2.4,0.3,poor
1865,David Nwaba,2019-20,20.0,13.4,5.2,0.4,2.3,0.4,poor
1461,Jarred Vanderbilt,2018-19,17.0,4.1,1.4,0.2,1.4,0.5,poor
62,Reggie Bullock,2016-17,31.0,15.1,4.5,0.9,2.1,0.3,poor
1094,Gary Clark,2018-19,51.0,12.6,2.9,0.4,2.3,0.1,poor
1058,Isaiah Briscoe,2018-19,39.0,14.3,3.5,2.2,1.9,0.8,poor
2464,Chris Silva,2020-21,15.0,6.1,2.1,0.4,1.8,0.7,poor


In [None]:
# class 3 df

df_three = pd.concat([class_three, traditional_df], axis=1)
class_three_df = df_three.dropna()
class_three_df.count()

#create two classes for average and slightly below average
# split class three into 2
trad_average_classes =[class_three_df["g"],class_three_df["mp"], class_three_df["pts"],class_three_df["ast"],class_three_df["trb"],class_three_df["tov"]]
trad_average_classes_df = pd.concat(trad_average_classes, axis=1)


# scale the average df

average_scaled = StandardScaler().fit_transform(trad_average_classes_df)
# Initialize PCA model
pca_av = PCA(n_components=3)
# Get 3 principal components for the data.
average_pca = pca.fit_transform(average_scaled)

# create PCA df
df_average_pca = pd.DataFrame(
    data = average_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=class_three_df.index)
df_average_pca.head()

# make "Player" and "Year" its own df for av

player_av = [class_three_df["player"],class_three_df["year"]]
player_name_av = pd.concat(player_av, axis=1)
player_name_av.head()


# Initialize the K-means model
model_av = KMeans(n_clusters=2, random_state=0)

# Fit the model
model_av.fit(df_average_pca)

# Predict clusters
predictions_av = model.predict(df_average_pca)

# Add the predicted class columns
player_name_av["class"] = model_av.labels_
player_name_av.head()

#sort by class av

sorted_class_av = player_name_av.sort_values(["class"])
class_one_av = sorted_class_av[(sorted_class_av["class"] == 0)]
class_two_av = sorted_class_av[(sorted_class_av["class"] == 1)]
class_two_av.count()

In [None]:
# class 1 df for averages

df_one_av = pd.concat([class_one_av, trad_average_classes_df], axis=1)
top_average = pd.DataFrame(index=class_three_df.index)
top_average = df_one_av.dropna()
top_average.drop('class', axis=1, inplace=True)

top_average["class"] = "average"

top_average.sample(30)

In [None]:
# class 2 df for averages

df_two_av = pd.concat([class_two_av, trad_average_classes_df], axis=1)
low_average = pd.DataFrame(index=class_three_df.index)
low_average = df_two_av.dropna()
low_average.drop('class', axis=1, inplace=True)

low_average["class"] = "below average"

low_average.sample(30)

In [None]:
# class 4 df

df_four = pd.concat([class_four, traditional_df], axis=1)
class_four_df = df_four.dropna()
class_four_df.sample(30)

In [None]:
#create two classes for elite and great
# split class 4 into 2
trad_top_classes =[class_four_df["g"],class_four_df["mp"], class_four_df["pts"],class_four_df["ast"],class_four_df["trb"],class_four_df["tov"]]
trad_top_classes_df = pd.concat(trad_top_classes, axis=1)


# scale the top df

top_scaled = StandardScaler().fit_transform(trad_top_classes_df)
# Initialize PCA model
pca_top = PCA(n_components=3)
# Get 3 principal components for the data.
top_pca = pca.fit_transform(top_scaled)

# create PCA df
df_top_pca = pd.DataFrame(
data = top_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=class_four_df.index)
df_top_pca.head()

# make "Player" and "Year" its own df for topp

player_top = [class_four_df["player"],class_four_df["year"]]
player_name_top = pd.concat(player_top, axis=1)
player_name_top.head()


# Initialize the K-means model
model_top = KMeans(n_clusters=2, random_state=0)

# Fit the model
model_top.fit(df_top_pca)

# Predict clusters
predictions_top = model_top.predict(df_top_pca)

# Add the predicted class columns
player_name_top["class"] = model_top.labels_
player_name_top.head()

#sort by class av

sorted_class_top = player_name_top.sort_values(["class"])
class_one_top = sorted_class_top[(sorted_class_top["class"] == 0)]
class_two_top = sorted_class_top[(sorted_class_top["class"] == 1)]


In [None]:
# class 1 df for top

df_one_top = pd.concat([class_one_top, trad_top_classes_df], axis=1)
great_top = pd.DataFrame(index=class_four_df.index)
great_top = df_one_top.dropna()
great_top.drop('class', axis=1, inplace=True)

great_top["class"] = "great"

great_top.sample(30)

In [None]:
# class 2 df for top

df_two_top = pd.concat([class_two_top, trad_top_classes_df], axis=1)
elite_top = pd.DataFrame(index=class_four_df.index)
elite_top = df_two_top.dropna()
elite_top.drop('class', axis=1, inplace=True)

elite_top["class"] = "elite"

elite_top.tail(30)

In [None]:
# class 5 df

df_five = pd.concat([class_five, traditional_df], axis=1)
class_five_df = df_five.dropna()
class_five_df.sample(30)

In [None]:
# split class 5 into two clusters. This group is good players that are mainly C's and Fowards
#create two classes for above average  and good fowards and centers
# split class 5 into 2
trad_c_classes =[class_five_df["g"],class_five_df["mp"], class_five_df["pts"],class_five_df["ast"],class_five_df["trb"],class_five_df["tov"]]
trad_c_classes_df = pd.concat(trad_c_classes, axis=1)


# scale the top df

c_scaled = StandardScaler().fit_transform(trad_c_classes_df)
# Initialize PCA model
pca_c = PCA(n_components=3)
# Get 3 principal components for the data.
c_pca = pca.fit_transform(c_scaled)

# create PCA df
df_c_pca = pd.DataFrame(
data = c_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=class_five_df.index)
df_c_pca.head()

# make "Player" and "Year" its own df for topp

player_c = [class_five_df["player"],class_five_df["year"]]
player_name_c = pd.concat(player_c, axis=1)
player_name_c.head()


# Initialize the K-means model
model_c = KMeans(n_clusters=2, random_state=0)

# Fit the model
model_c.fit(df_c_pca)

# Predict clusters
predictions_c = model_top.predict(df_c_pca)

# Add the predicted class columns
player_name_c["class"] = model_c.labels_
player_name_c.head()

#sort by class av

sorted_class_c = player_name_c.sort_values(["class"])
class_one_c = sorted_class_c[(sorted_class_c["class"] == 0)]
class_two_c = sorted_class_c[(sorted_class_c["class"] == 1)]



In [None]:
# class 1 df for c and fowards

df_c_top = pd.concat([class_one_c, trad_c_classes_df], axis=1)
above_av_big = pd.DataFrame(index=class_five_df.index)
above_av_big = df_c_top.dropna()
above_av_big.drop('class', axis=1, inplace=True)
above_av_big["class"] = "above average foward/centers"

above_av_big.sample(30)

In [None]:
# class 2 df for c and fowards

df_c_low = pd.concat([class_two_c, trad_c_classes_df], axis=1)
very_good_big = pd.DataFrame(index=class_five_df.index)
very_good_big = df_c_low.dropna()
very_good_big.drop('class', axis=1, inplace=True)

very_good_big["class"] = "very good foward/centers"

very_good_big.sample(30)

In [113]:
# create one data frame with every class

traditional_stats_ml = pd.concat([elite_top, great_top, very_good_big, above_av_big, top_g, low_g, top_average, low_average, class_two_df], ignore_index=False)

In [114]:
traditional_stats_df = traditional_stats_ml.reset_index(drop=True)
traditional_stats_df.head()

Unnamed: 0,player,year,g,mp,pts,ast,trb,tov,class
0,Giannis Antetokounmpo,2016-17,80.0,35.6,22.9,5.4,8.8,2.9,elite
1,DeMarcus Cousins,2016-17,72.0,34.2,27.0,4.6,11.0,3.7,elite
2,James Harden,2016-17,81.0,36.4,29.1,11.2,8.1,5.7,elite
3,LeBron James,2016-17,74.0,37.8,26.4,8.7,8.6,4.1,elite
4,Damian Lillard,2016-17,75.0,35.9,27.0,5.9,4.9,2.6,elite


In [115]:
# import into a csv

traditional_stats_df.to_csv('traditional_stats_all.csv')