In [7]:
import pandas as pd
import requests
import time
from time import sleep
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

%matplotlib inline

# Practice scraping 538

538 only publishes player ratings for the current season, so this analysis is based on their 2020 ratings.

In [8]:
# Download the 2020 player-ratings page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast on an HTTP error instead of letting later cells
# try to parse an error page.
url = 'https://projects.fivethirtyeight.com/2020-nba-player-ratings/'
res = requests.get(url, timeout=30)
res.raise_for_status()
res.status_code

200

In [9]:
# Parse the response body and locate the ratings table (the <table> whose id is "table").
soup = BeautifulSoup(res.content, 'lxml')
table538 = soup.find('table', {'id' : 'table'})
if table538 is None:
    # find() returns None when nothing matches; stop here with a clear message
    # rather than with an AttributeError in a later cell.
    raise ValueError('ratings table (id="table") not found in the downloaded page')
# Total number of <tr> rows in the table, heading rows included.
len(table538.find_all('tr'))

457

Grabbing column names: some of the labels sit under a two-level heading. For the sake of time, I'm only grabbing the last heading row and will rename the columns later.

In [10]:
# Collect every <tr> row of the table.
# Skip the first two rows (indices 0 and 1), which are heading rows; player rows
# begin at index 2, i.e. the third row, which is where James Harden starts.
players = table538.find_all('tr')[2:]

In [11]:
# Column labels come from the second heading row (index 1). Only its direct
# <th>/<td> children are read, so whitespace text nodes sitting between the
# cells can never end up as bogus column names.
header_row = table538.find_all('tr')[1]
col_names = [cell.text for cell in header_row.find_all(['th', 'td'], recursive=False)]

# One list of raw cell strings per player row.
scrape = [[cell.text for cell in row.find_all('td')] for row in players]

In [12]:
# Turn the scraped rows into a DataFrame, using the scraped heading labels as column names.
# Every value is still the raw cell text (a string) at this point.
scrape538  = pd.DataFrame(scrape, columns = col_names)

In [13]:
# Preview the first rows to sanity-check that the table was parsed correctly.
scrape538.head()

Unnamed: 0,Player,Team,Position(s),Minutes,Off.,Def.,Total,Off..1,Def..1,Total.1,Off..2,Def..2,Total.2,WAR
0,James Harden,Rockets,"PG, SG",669,10.2,-0.1,10.1,2.7,9.5,12.1,9.4,1.7,11.1,4.7
1,LeBron James,Lakers,"PG, SF, PF",636,7.5,0.5,8.0,9.8,4.9,14.7,8.2,1.3,9.5,4.0
2,Luka Doncic,Mavericks,"PG, SG, SF",578,10.2,1.4,11.6,7.2,-5.1,2.2,10.0,0.3,10.3,3.9
3,Giannis Antetokounmpo,Bucks,"SF, PF",595,5.7,2.6,8.3,3.7,6.3,10.0,5.5,3.4,8.9,3.6
4,Montrezl Harrell,Clippers,C,572,1.5,6.6,8.2,5.0,1.8,6.8,2.3,6.0,8.3,3.2


In [14]:
# Confirm the frame's dimensions as (rows, columns).
scrape538.shape

(455, 14)

In [15]:
# Spot-check the label at column position 1 before renaming the columns.
scrape538.columns[1]

'Team'

In [16]:
# Replace the scraped labels with unique names: the three repeated
# Off./Def./Total groups get a BSR / OOR / OVR prefix so they can be told apart.
# NOTE(review): 'OVR total' is lowercase, unlike 'BSR Total' and 'OOR Total';
# left as-is because the column listing shown later uses this exact spelling.
scrape538.columns = ['Player', 'Team', 'Position(s)', 'Minutes', 'BSR Off.', 'BSR Def.', 'BSR Total',
                    'OOR Off.', 'OOR Def.', 'OOR Total', 'OVR Off.', 'OVR Def.', 'OVR total', 'WAR']

In [17]:
# Persist the scraped table to disk.
# With to_csv's default settings the row index is written as an unnamed first
# column, and the 'data/' directory must already exist.
scrape538.to_csv('data/scrape538.csv')

In [18]:
# Preview again to verify the renamed column labels.
scrape538.head()

Unnamed: 0,Player,Team,Position(s),Minutes,BSR Off.,BSR Def.,BSR Total,OOR Off.,OOR Def.,OOR Total,OVR Off.,OVR Def.,OVR total,WAR
0,James Harden,Rockets,"PG, SG",669,10.2,-0.1,10.1,2.7,9.5,12.1,9.4,1.7,11.1,4.7
1,LeBron James,Lakers,"PG, SF, PF",636,7.5,0.5,8.0,9.8,4.9,14.7,8.2,1.3,9.5,4.0
2,Luka Doncic,Mavericks,"PG, SG, SF",578,10.2,1.4,11.6,7.2,-5.1,2.2,10.0,0.3,10.3,3.9
3,Giannis Antetokounmpo,Bucks,"SF, PF",595,5.7,2.6,8.3,3.7,6.3,10.0,5.5,3.4,8.9,3.6
4,Montrezl Harrell,Clippers,C,572,1.5,6.6,8.2,5.0,1.8,6.8,2.3,6.0,8.3,3.2


# Perform K-means on the 538 data only

In [2]:
# Import from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import PolynomialFeatures, StandardScaler


In [17]:
# Number of player rows going into the clustering.
# NOTE(review): the recorded output (447) differs from the 455 rows scraped
# earlier in this notebook - presumably the frame was reloaded or filtered in a
# cell that is not shown; confirm before relying on a fresh top-to-bottom run.
len(scrape538)

447

In [18]:
# List the column labels available for feature selection.
scrape538.columns

Index(['Player', 'Team', 'Position(s)', 'Minutes', 'BSR Off.', 'BSR Def.',
       'BSR Total', 'OOR Off.', 'OOR Def.', 'OOR Total', 'OVR Off.',
       'OVR Def.', 'OVR total', 'WAR'],
      dtype='object')

In [21]:
# Feature matrix: everything except the identifying text columns.
# The 'cluster' column is also dropped when present so that re-running this cell
# after the K-means labels have been attached to scrape538 does not feed those
# labels back in as a feature. errors='ignore' applies only to that optional
# column; a missing Player/Team/Position(s) column still raises.
X = (
    scrape538
    .drop(columns=['Player', 'Team', 'Position(s)'])
    .drop(columns=['cluster'], errors='ignore')
)
# Player names, kept separately to label the rows.
y = scrape538['Player']

In [22]:
# Standardise the features: learn the scaling parameters from X, then apply
# them to the same data.
sc = StandardScaler()
X_sc = sc.fit(X).transform(X)

In [34]:
# Dimensionality reduction: fit a 4-component PCA on the scaled features and
# project those same features onto the components.
pca = PCA(n_components=4).fit(X_sc)
X_pca = pca.transform(X_sc)

In [35]:
# Cluster the players into 5 groups in the 4-component PCA space.
# n_init is pinned to 10 (the historical scikit-learn default) so the result does
# not shift on newer releases, where the default became 'auto' (a single run
# with k-means++ initialisation).
km = KMeans(n_clusters = 5, n_init = 10, random_state = 248)
km.fit(X_pca)
y_kmeans = km.predict(X_pca)
# NOTE(review): the silhouette is measured on the scaled features (X_sc) while
# the model was fit on X_pca - confirm this is intentional; scoring on X_pca
# would evaluate the clusters in the space they were formed in.
silhouette_score(X_sc, km.labels_)

0.2352700137580218

In [36]:
# Cluster centroids: one row per cluster, expressed in the PCA component space
# that km was fit on.
km.cluster_centers_

array([[-1.82930892, -0.23108414,  0.49488871,  0.90045467],
       [ 3.21311114, -0.55796388, -0.88598129, -0.24273692],
       [ 8.84519924,  1.45931632,  1.43564058,  1.08692484],
       [ 0.35189478, -0.26045362, -0.01350523, -0.32217047],
       [-1.70775714,  1.64314428, -0.68137757, -0.78390537]])

In [37]:
# Extra clustering utilities imported while practicing with K-means cluster labels;
# neither import is used yet in the cells shown here.
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances_argmin

In [29]:
# Attach each row's K-means cluster label as a new 'cluster' column.
# This mutates scrape538 in place, so make sure the feature-selection cell
# excludes 'cluster' whenever it is re-run after this one.
# NOTE(review): this cell's execution count (29) is lower than that of the cells
# that fit km - confirm the notebook still works with Restart & Run All.
scrape538['cluster'] = km.labels_