In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

In [None]:
# -------------------------------
#    Importing Dependencies 
# -------------------------------

# Organisation Libraries 
import pandas as pd
import os
# SQL Libraries
import psycopg2

from getpass import getpass

# ML Libraryies
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
import plotly.express as px

In [None]:
# Using psycopg2 to pull from the NBA_Analysis DB
password = getpass()
cn = psycopg2.connect(host = "127.0.0.1", port = "5432", database = "NBA_Analysis", user = "postgres", password = password)
cur = cn.cursor()


#Querying test Data
cur.execute("""
SELECT main.id, main.player, main.u_player, misc.stl, misc.drb, misc.blk
FROM main  
LEFT JOIN misc
ON main.id = misc.id;
""")
test_results = cur.fetchall()

# Getting the column headers for test Table
cur.execute("""
SELECT main.id, main.player, main.u_player, misc.stl, misc.drb, misc.blk
FROM main  
LEFT JOIN misc
ON main.id = misc.id LIMIT 0;
""")
test_colnames = [desc[0] for desc in cur.description]

#NEW DATA ------------------------------------------s
# Over all player query
cur.execute("""
SELECT main.id, main.player, main.u_player, misc.vorp, misc.per, misc.bpm
FROM main  
LEFT JOIN misc
ON main.id = misc.id
where main.g >= 3;
""")
ovr_results = cur.fetchall()

# Getting the column headers for test Table
cur.execute("""
SELECT main.id, main.player, main.u_player, misc.vorp, misc.per, misc.bpm
FROM main  
LEFT JOIN misc
ON main.id = misc.id LIMIT 0;
""")
ovr_colnames = [desc[0] for desc in cur.description]

#PCA Query ------------------------------------------
# Over all player query
cur.execute("""
SELECT t1.*, t2.ft, t2.fta, t2.ft_perc, t2.orb,
t2.drb, t2.trb, t2.ast, t2.stl, t2.blk, t2.tov, t2.pf,
t2.pts, t2.per, t2.bpm, t2.vorp
FROM main as t1 
LEFT JOIN misc as t2
ON t1.id = t2.id
WHERE t1.g >= 3;
""")
pca_results = cur.fetchall()

# Getting the column headers for test Table
cur.execute("""
SELECT t1.*, t2.ft, t2.fta, t2.ft_perc, t2.orb,
t2.drb, t2.trb, t2.ast, t2.stl, t2.blk, t2.tov, t2.pf,
t2.pts, t2.per, t2.bpm, t2.vorp
FROM main as t1 
LEFT JOIN misc as t2
ON t1.id = t2.id LIMIT 0;
""")


pca_colnames = [desc[0] for desc in cur.description]


# PCA Query ------------------------------------------
# Over all player query
cur.execute("""
SELECT t1.*, t2.ft, t2.fta, t2.ft_perc, t2.orb,
t2.drb, t2.trb, t2.ast, t2.stl, t2.blk, t2.tov, t2.pf,
t2.pts, t2.per, t2.bpm, t2.vorp
FROM main as t1 
LEFT JOIN misc as t2
ON t1.id = t2.id
WHERE t1.g >= 3;
""")
analysis = cur.fetchall()

# Getting the column headers for test Table
cur.execute("""
SELECT t1.*, t2.ft, t2.fta, t2.ft_perc, t2.orb,
t2.drb, t2.trb, t2.ast, t2.stl, t2.blk, t2.tov, t2.pf,
t2.pts, t2.per, t2.bpm, t2.vorp
FROM main as t1 
LEFT JOIN misc as t2
ON t1.id = t2.id LIMIT 0;
""")

analysis_cols = [desc[0] for desc in cur.description]



In [None]:
# Assigning SQL data to df's
test_df = pd.DataFrame(test_results)
test_df.columns = test_colnames
test_df.head()

#overall df
ovr_df = pd.DataFrame(ovr_results)
ovr_df.columns = ovr_colnames
ovr_df.head()

#overall df
pca_df = pd.DataFrame(pca_results)
pca_df.columns = pca_colnames
ovr_df.head()

#overall df
nba_data = pd.DataFrame(analysis)
nba_data.columns = analysis_cols
nba_data.head()
print(nba_data.columns)

In [None]:
# import csv 

#nba_data = pd.read_csv("nba_data.csv")
#nba_data.count()

In [None]:
# Remove extreme outlier
nba_data['g'] = nba_data['g'].astype(float)
#Using drop() to delete rows based on column value
nba_data = nba_data.drop(nba_data[nba_data['g'] <= 6].index) 
nba_data = nba_data.drop(nba_data[nba_data['mp'] <= 3].index)
nba_data.count()

In [None]:
# create advanced columns df for the machine learning model

advanced = [nba_data["g"],nba_data["mp"], nba_data["per"],nba_data["bpm"],nba_data["vorp"]]
advanced_df = pd.concat(advanced, axis=1)
advanced_df

In [None]:
# scale the advanced df

nba_scaled = StandardScaler().fit_transform(advanced_df)
print(nba_scaled[0:5])

In [None]:
# Initialize PCA model
pca = PCA(n_components=3)

In [None]:
# Get 3 principal components for the data.
nba_pca = pca.fit_transform(nba_scaled)

In [None]:
# create PCA df
df_nba_pca = pd.DataFrame(
    data = nba_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=nba_data.index)
df_nba_pca.head()

In [None]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_nba_pca)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [None]:
# make "Player" and "Year" its own df

player = [nba_data["player"],nba_data["year"]]
player_name = pd.concat(player, axis=1)
player_name.head()

In [None]:
# Initialize the K-means model
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(df_nba_pca)

# Predict clusters
predictions = model.predict(df_nba_pca)

# Add the predicted class columns
player_name["class"] = model.labels_
player_name.head()

In [307]:
#sort by class

sorted_class = player_name.sort_values(["class"])
class_one = sorted_class[(sorted_class["class"] == 0)]
class_two = sorted_class[(sorted_class["class"] == 1)]
class_three = sorted_class[(sorted_class["class"] == 2)]
class_four = sorted_class[(sorted_class["class"] == 3)]
class_five = sorted_class[(sorted_class["class"] == 4)]

In [None]:
# class 1 df

df_one = pd.concat([class_one, advanced_df], axis=1)
class_one_df = df_one.dropna()
class_one_df.sample(60)

# split class one into 2
advanced_average_classes = [class_one_df["g"],class_one_df["mp"], class_one_df["per"],class_one_df["bpm"],class_one_df["vorp"]]
advanced_average_classes_df = pd.concat(advanced_average_classes, axis=1)


# scale the average df

average_scaled = StandardScaler().fit_transform(advanced_average_classes_df)
# Initialize PCA model
pca_av = PCA(n_components=3)
# Get 3 principal components for the data.
average_pca = pca.fit_transform(average_scaled)

# create PCA df
df_average_pca = pd.DataFrame(
    data = average_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=class_one_df.index)
df_average_pca.head()




In [None]:
# make "Player" and "Year" its own df

player_av = [class_one_df["player"],class_one_df["year"]]
player_name_av = pd.concat(player_av, axis=1)
player_name_av.head()

In [None]:
# Elbow curve is 2 so we will have two clusters

# Initialize the K-means model
model_av = KMeans(n_clusters=2, random_state=0)

# Fit the model
model_av.fit(df_average_pca)

# Predict clusters
predictions_av = model.predict(df_average_pca)

# Add the predicted class columns
player_name_av["class"] = model_av.labels_
player_name_av.drop('class', axis=1, inplace=True)
player_name_av.head()

In [None]:
#sort by class av

sorted_class_av = player_name_av.sort_values(["class_av"])
class_one_av = sorted_class_av[(sorted_class_av["class_av"] == 0)]
class_two_av = sorted_class_av[(sorted_class_av["class_av"] == 1)]

In [None]:
# class 1 df for averages

df_one_av = pd.concat([class_one_av, advanced_average_classes_df], axis=1)
top_average = pd.DataFrame(index=class_one_df.index)
top_average = df_one_av.dropna()
top_average.drop('class_av', axis=1, inplace=True)

top_average["class"] = "average"

top_average.head()

In [308]:
# class 2 df for averages 

df_two_av = pd.concat([class_two_av, advanced_average_classes_df], axis=1)
low_average = pd.DataFrame(index=class_one_df.index)
low_average = df_two_av.dropna()
low_average.drop('class_av', axis=1, inplace=True)

low_average["class"] = "below average"

low_average.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,player,year,g,mp,per,bpm,vorp,class
3,Arron Afflalo,2016-17,61.0,25.9,8.9,-3.6,-0.7,below average
21,D.J. Augustin,2016-17,78.0,19.7,11.0,-3.0,-0.4,below average
22,Luke Babbitt,2016-17,68.0,15.7,8.3,-2.0,0.0,below average
23,Ron Baker,2016-17,52.0,16.5,7.5,-4.7,-0.6,below average
25,Leandro Barbosa,2016-17,67.0,14.4,11.5,-2.5,-0.1,below average


In [309]:
# class 2 df

df_two = pd.concat([class_two, advanced_df], axis=1)
below_average_df = df_two.dropna()
below_average_df.drop('class', axis=1, inplace=True)
below_average_df["class"] = "below average"

below_average_df.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,player,year,g,mp,per,bpm,vorp,class
1,Quincy Acy,2016-17,38.0,14.7,11.8,-2.1,0.0,below average
4,Alexis Ajinça,2016-17,39.0,15.0,12.9,-3.3,-0.2,below average
5,Cole Aldrich,2016-17,62.0,8.6,12.7,-0.8,0.2,below average
17,Joel Anthony,2016-17,19.0,6.4,11.6,-2.1,0.0,below average
19,Darrell Arthur,2016-17,41.0,15.6,12.8,-0.1,0.3,below average


In [310]:
# class 3 df

df_three = pd.concat([class_three, advanced_df], axis=1)
poor_average_df = df_three.dropna()
poor_average_df.drop('class', axis=1, inplace=True)
poor_average_df["class"] = "poor"

poor_average_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,player,year,g,mp,per,bpm,vorp,class
10,Chris Andersen,2016-17,12.0,9.5,11.6,-2.5,0.0,poor
11,Alan Anderson,2016-17,30.0,10.3,5.0,-6.1,-0.3,poor
20,Ömer Aşık,2016-17,31.0,15.5,9.8,-3.7,-0.2,poor
24,Wade Baldwin,2016-17,33.0,12.3,6.5,-5.5,-0.3,poor
39,DeAndre' Bembry,2016-17,38.0,9.8,8.8,-3.5,-0.1,poor


In [311]:
# class 4 df

df_four = pd.concat([class_four, advanced_df], axis=1)
good_players_df = df_four.dropna()
good_players_df.drop('class', axis=1, inplace=True)
good_players_df["class"] = "good"

good_players_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,player,year,g,mp,per,bpm,vorp,class
2,Steven Adams,2016-17,80.0,29.9,16.5,-0.2,1.1,good
6,LaMarcus Aldridge,2016-17,72.0,32.4,18.6,0.8,1.7,good
8,Tony Allen,2016-17,71.0,27.0,13.3,-1.0,0.5,good
13,Kyle Anderson,2016-17,72.0,14.2,12.5,1.8,1.0,good
14,Ryan Anderson,2016-17,72.0,29.4,13.5,-0.4,0.9,good


In [312]:
# class 5 df

df_five = pd.concat([class_five, advanced_df], axis=1)
class_five_df = df_five.dropna()
#class_five_df.sample(50)


In [313]:
# split class 5 into great and elite

# create great columns df for the machine learning model


top = [class_five_df["g"],class_five_df["mp"], class_five_df["per"],class_five_df["bpm"],class_five_df["vorp"]]
top_df = pd.concat(top, axis=1)
top_df.head()

Unnamed: 0,g,mp,per,bpm,vorp
15,80.0,35.6,26.1,7.3,6.7
35,77.0,34.9,20.1,3.3,3.5
48,66.0,33.0,20.5,3.6,3.1
65,76.0,37.0,25.1,7.3,6.6
86,69.0,33.2,23.2,6.8,5.1


In [314]:
# scale the great df

top_scaled = StandardScaler().fit_transform(top_df)
print(top_scaled[0:5])

[[ 1.0552057   0.8091452   0.83070905  0.87086442  1.6426105 ]
 [ 0.82725969  0.55691364 -1.06389054 -0.9399764  -0.37538986]
 [-0.00854236 -0.12771488 -0.9375839  -0.80416333 -0.6276399 ]
 [ 0.75127769  1.31360831  0.51494245  0.87086442  1.57954799]
 [ 0.21940366 -0.05564872 -0.08501408  0.64450932  0.63361032]]


In [315]:
# Initialize PCA model
pca_top = PCA(n_components=3)

In [316]:
# Get 3 principal components for the data.
top_pca = pca.fit_transform(top_scaled)

In [317]:
# create PCA df top
df_top_pca = pd.DataFrame(
    data = top_pca, columns = ["principal component 1", "principal component 2", "principal component 3"], index=class_five_df.index)
df_top_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
15,2.21333,-0.981045,0.20748
35,-1.064168,-1.411882,-0.059114
48,-1.338686,-0.343116,-0.020048
65,2.079154,-1.123076,-0.396937
86,0.705463,-0.096314,0.243921


In [318]:
# make "Player" and "Year" its own df

player_top = [class_five_df["player"],class_five_df["year"]]
player_name_top = pd.concat(player_top, axis=1)
player_name_top.head()

Unnamed: 0,player,year
15,Giannis Antetokounmpo,2016-17
35,Bradley Beal,2016-17
48,Eric Bledsoe,2016-17
65,Jimmy Butler,2016-17
86,Mike Conley,2016-17


In [319]:
# Initialize the K-means model
model_top = KMeans(n_clusters=2, random_state=0)

# Fit the model
model_top.fit(df_top_pca)

# Predict clusters
predictions_top = model_top.predict(df_top_pca)

# Add the predicted class columns
player_name_top["class"] = model_top.labels_
player_name_top.head()

Unnamed: 0,player,year,class
15,Giannis Antetokounmpo,2016-17,0
35,Bradley Beal,2016-17,1
48,Eric Bledsoe,2016-17,1
65,Jimmy Butler,2016-17,0
86,Mike Conley,2016-17,0


In [320]:
#sort by class top

sorted_class_top = player_name_top.sort_values(["class"])
class_one_top = sorted_class_top[(sorted_class_top["class"] == 0)]
class_two_top = sorted_class_top[(sorted_class_top["class"] == 1)]

In [321]:
#elite class

df_one_top = pd.concat([class_one_top, top_df], axis=1)
elite = pd.DataFrame(index=class_five_df.index)
elite_df = df_one_top.dropna()


In [322]:
#elite class df

elite_df.drop('class', axis=1, inplace=True)

elite_df["class"] = "elite"

elite_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,player,year,g,mp,per,bpm,vorp,class
15,Giannis Antetokounmpo,2016-17,80.0,35.6,26.1,7.3,6.7,elite
65,Jimmy Butler,2016-17,76.0,37.0,25.1,7.3,6.6,elite
86,Mike Conley,2016-17,69.0,33.2,23.2,6.8,5.1,elite
89,DeMarcus Cousins,2016-17,72.0,34.2,25.8,6.8,5.5,elite
97,Stephen Curry,2016-17,79.0,33.4,24.6,6.9,5.9,elite


In [323]:
#great class

df_two_top = pd.concat([class_two_top, top_df], axis=1)
great = pd.DataFrame(index=class_five_df.index)
great_df = df_two_top.dropna()


In [324]:
#great class df

great_df.drop('class', axis=1, inplace=True)

great_df["class"] = "great"

great_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,player,year,g,mp,per,bpm,vorp,class
35,Bradley Beal,2016-17,77.0,34.9,20.1,3.3,3.5,great
48,Eric Bledsoe,2016-17,66.0,33.0,20.5,3.6,3.1,great
107,DeMar DeRozan,2016-17,74.0,35.4,24.0,3.0,3.3,great
113,Goran Dragić,2016-17,73.0,33.7,19.8,2.6,2.9,great
144,Marc Gasol,2016-17,74.0,34.2,20.3,5.2,4.6,great


In [325]:
# create one data frame with every class

advanced_stats_ml = pd.concat([elite_df, great_df, good_players_df, top_average, low_average, below_average_df, poor_average_df], ignore_index=True)

In [326]:

advanced_stats_df = advanced_stats_ml.reset_index(drop=True)
advanced_stats_df.head()


Unnamed: 0,player,year,g,mp,per,bpm,vorp,class
0,Giannis Antetokounmpo,2016-17,80.0,35.6,26.1,7.3,6.7,elite
1,Jimmy Butler,2016-17,76.0,37.0,25.1,7.3,6.6,elite
2,Mike Conley,2016-17,69.0,33.2,23.2,6.8,5.1,elite
3,DeMarcus Cousins,2016-17,72.0,34.2,25.8,6.8,5.5,elite
4,Stephen Curry,2016-17,79.0,33.4,24.6,6.9,5.9,elite


In [328]:
# import into a csv

advanced_stats_df.to_csv('advanced_stats_all.csv')