In [1]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
# Per-player, per-season stats table. The relative path is resolved against the
# kernel's working directory, so start the notebook from the directory that
# contains `mvp-predict/`.
mvp_field = pd.read_csv(filepath_or_buffer='mvp-predict/data/mvpfield.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'mvp-predict/data/mvpfield.csv'

In [None]:
def download(filename, url, timeout=30):
    """Save the body of ``url`` to ``filename`` unless that path already exists.

    Parameters
    ----------
    filename : str or path-like
        Destination path. When it already exists nothing is requested or written.
    url : str
        Address fetched with ``requests.get``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error (default 30). Without it a stalled connection blocks forever.

    Returns
    -------
    str
        ``filename + " already exists!"`` when the file was already present,
        ``"Bad url!"`` when the response status is not OK (nothing is written),
        otherwise ``filename + " created!"``.
    """
    if os.path.exists(filename):
        return str(filename) + " already exists!"

    rslt = requests.get(url, timeout=timeout)
    if not rslt.ok:
        return "Bad url!"

    # The context manager closes the handle even if the write raises.
    with open(filename, 'wb') as out_file:
        out_file.write(rslt.content)

    return str(filename) + " created!"

def parse_mvp():
    """Scrape the MVP award table into a list of per-row dicts.

    Downloads the Basketball Reference MVP page to ``mvp.html`` (reusing the
    file if it is already on disk), reads the first ``<table>`` and converts
    every row after the two header rows into a ``{column name: value}`` dict.

    Returns
    -------
    list of dict
        One dict per non-empty table row. Columns listed in ``ints`` are
        converted with ``int``, columns listed in ``floats`` with ``float``,
        everything else is kept as text. Empty cells become ``0.0`` whatever
        the column. Cells beyond the last header column are ignored.
    """
    download('mvp.html', 'https://www.basketball-reference.com/awards/mvp.html')
    # `download` stores the raw response bytes; handing bytes to BeautifulSoup
    # lets it pick the document's encoding instead of the locale default.
    with open('mvp.html', 'rb') as f:
        doc = BeautifulSoup(f.read(), 'html.parser')

    rows = doc.find_all("table")[0].find_all('tr')
    # rows[0] is skipped; the column names are read from the second row.
    header = [th.get_text() for th in rows[1].find_all('th')]

    ints = ['Age', 'G']
    floats = ['MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'FG%', '3P%', 'FT%', 'WS', 'WS/48']

    mvp_list = []
    for tr in rows[2:]:
        # Only direct <th>/<td> children are read, so whitespace text nodes
        # between cells cannot shift values into the wrong column.
        cells = tr.find_all(['th', 'td'], recursive=False)

        curr_dict = {}
        for column, cell in zip(header, cells):
            text = cell.get_text()
            if text == '':
                curr_dict[column] = 0.0
            elif column in floats:
                curr_dict[column] = float(text)
            elif column in ints:
                curr_dict[column] = int(text)
            else:
                curr_dict[column] = text

        if curr_dict:
            mvp_list.append(curr_dict)
    return mvp_list


In [None]:
def season_convert(season):
    """Convert a season label such as ``'1999-00'`` to its ending year (``2000``).

    The ending year is the starting year plus one, which handles every
    century rollover and also labels that spell the end year in full
    (``'2019-2020'``).

    Parameters
    ----------
    season : str
        Label of the form ``'<start year>-<end year>'``.

    Returns
    -------
    int
        The calendar year in which the season ends.

    Raises
    ------
    ValueError
        If the label has no ``'-'`` or the start year is not an integer.
    """
    start_year, separator, _ = season.strip().partition("-")
    if not separator:
        raise ValueError("season must look like 'YYYY-YY', got " + repr(season))
    return int(start_year) + 1


In [None]:
# Scrape the MVP award table and key each row by the year its season ends.
mvp_history = pd.DataFrame(parse_mvp())
mvp_history['Season'] = mvp_history['Season'].apply(season_convert)
mvp_history = mvp_history.rename(columns={'G': 'Games Played', 'WS': 'Win Shares'})

# Flag every scraped row with MVP = 1. A scalar broadcasts to however many rows
# the page currently has, instead of assuming exactly 65.
mvp_history["MVP"] = 1

# Rows 2-10 of the scraped table.
# NOTE(review): presumably the seasons covered by `mvp_field` -- confirm.
mvp_history_slice = mvp_history.iloc[2:11]

# Left-join the flag onto the field; rows without a match get MVP = 0 (fillna
# also zero-fills any other missing value, as before). Dropping a previous
# 'MVP' column first keeps the cell re-runnable without MVP_x / MVP_y columns.
mvp_field = (
    mvp_field
    .drop(columns='MVP', errors='ignore')
    .merge(mvp_history_slice[['Player', 'Season', 'MVP']], on=['Player', 'Season'], how='left')
    .fillna(0)
)

# Share of rows in the MVP table with more than 70 games played.
percent_70_played = len(mvp_history[mvp_history['Games Played'] > 70])/len(mvp_history)

In [None]:
# Figure 1: games played per season, coloured by win shares.
ax = mvp_history.plot.scatter(
    x='Season',
    y='Games Played',
    c='Win Shares',
    cmap='plasma',
    sharex=False,
    figsize=(5, 4),
)
ax.set_xlabel("Season")
ax.set_ylabel("Games Played")
ax.set_title("Figure 1: MVPs Win Shares and Games Played per Season", pad=20)
ax.get_figure().savefig('mvp_history.svg')

In [None]:
def parse_2021_advanced():
    """Scrape the NBA 2021 advanced-stats table into one dict per player.

    Downloads the Basketball Reference page to ``stats2021_advanced.html``
    (reusing the file if it is already on disk), reads the first ``<table>``
    and converts each row after the header row into a
    ``{column name: value}`` dict.

    Only the first row seen for a player name is kept. Later rows with the
    same name, repeated header rows (second cell reads ``"Player"``) and rows
    with fewer than two cells are skipped and contribute nothing to the result.

    Returns
    -------
    list of dict
        ``'Player'``, ``'Pos'`` and ``'Tm'`` are kept as text; ``'Age'``,
        ``'G'``, ``'GS'`` and ``'Rk'`` are converted with ``int``; every other
        column with ``float``. Empty cells become ``0.0`` whatever the column.
    """
    download('stats2021_advanced.html', 'https://www.basketball-reference.com/leagues/NBA_2021_advanced.html')
    # `download` stores the raw response bytes; handing bytes to BeautifulSoup
    # lets it pick the document's encoding instead of the locale default.
    with open('stats2021_advanced.html', 'rb') as f:
        doc = BeautifulSoup(f.read(), 'html.parser')

    rows = doc.find_all("table")[0].find_all('tr')
    header = [th.get_text() for th in rows[0].find_all('th')]

    ints = ['Age', 'G', 'GS', 'Rk']
    strings = ['Player', 'Pos', 'Tm']

    players_2021 = []
    seen_names = set()
    for tr in rows[1:]:
        # Read direct <th>/<td> children only. Skipping rows therefore no
        # longer depends on text nodes raising AttributeError from get_text().
        cells = [cell.get_text() for cell in tr.find_all(['th', 'td'], recursive=False)]
        if len(cells) < 2:
            continue

        name = cells[1]
        if name == "Player" or name in seen_names:
            continue
        seen_names.add(name)

        curr_dict = {}
        for column, text in zip(header, cells):
            if text == '':
                curr_dict[column] = 0.000
            elif column in strings:
                curr_dict[column] = text
            elif column in ints:
                curr_dict[column] = int(text)
            else:
                curr_dict[column] = float(text)

        players_2021.append(curr_dict)

    return players_2021

# Scrape this season's advanced stats; incomplete rows are dropped and the old
# row labels are kept in an 'index' column.
players_2021 = (
    pd.DataFrame(parse_2021_advanced())
    .dropna()
    .reset_index()
)

# Reproducible train/test split of the labelled field.
train, test = train_test_split(mvp_field, random_state=0)

# Rescale games and win shares by (x / 64) * 72 * (82 / 72).
# NOTE(review): looks like a 64-game sample projected to an 82-game season --
# confirm the constants.
players_2021['G'] = players_2021['G'].apply(lambda games: int(((games/64)*72)*(82/72)))
players_2021['WS'] = ((players_2021['WS'] / 64)*72)*(82/72)

# Keep players with at least 60 projected games and 8 projected win shares.
players_2021 = players_2021.loc[(players_2021.G >= 60) & (players_2021.WS >= 8)]



In [None]:

# Standardise the features, then fit a logistic-regression MVP classifier.
model = Pipeline([
    ("scale", StandardScaler()),
    ("logr", LogisticRegression())
])

fields = ["VORP", "PER", "WS", "BPM", 'G']

model.fit(train[fields], train["MVP"])

# `.assign` returns a new frame, so nothing is written into a filtered slice.
# `prediction` is the 0/1 class label; `mvp_probability` is the fitted
# probability of the second class in `model.classes_` (MVP = 1 when both
# labels occur in `train`).
players_2021 = players_2021.assign(
    prediction=model.predict(players_2021[fields]),
    mvp_probability=model.predict_proba(players_2021[fields])[:, 1],
)
# Sorting by the label alone leaves ties in arbitrary order; the probability
# breaks them so the first row is the strongest candidate.
# Original author's note: Nikola Jokic is projected to win this year's MVP
mvp_predicted = players_2021.sort_values(by=["prediction", "mvp_probability"], ascending=False)

# Run the 2-fold cross-validation once and derive both statistics from it.
# NOTE(review): cross_val_score refits clones of the pipeline on folds of
# `test`; it does not score the fit above.
cv_scores = cross_val_score(model, test[fields], test["MVP"], cv=2)
mean_score = cv_scores.mean()
sd_score = cv_scores.std()

mean_score

In [None]:
# Figure 3: one horizontal bar per feature, showing its fitted logistic-regression weight.
ax = (
    pd.Series(model.named_steps['logr'].coef_.ravel(), index=fields)
    .plot.barh(figsize=(5, 4))
)
ax.set_ylabel("Feature")
ax.set_xlabel("Weight")
ax.set_title("Figure 3: Logistic Regression Coefficients", pad=20)
ax.get_figure().savefig('coefficients.svg')


In [None]:
# Table of past MVP seasons; the relative path is resolved against the
# kernel's working directory.
past_mvps = pd.read_csv(filepath_or_buffer='mvp-predict/data/mvps.csv')
# Show the fitted logistic-regression weights as a flat array (same order as `fields`).
model.named_steps['logr'].coef_.ravel()

In [None]:
# Two clusters over (PER, WS). A fixed seed and an explicit n_init keep the
# cluster labels identical across kernel restarts and scikit-learn versions.
km = KMeans(n_clusters=2, n_init=10, random_state=0)

# Everything after the first space of the name (the whole name if it has no space).
past_mvps['last_name'] = past_mvps['Name'].apply(lambda x: x[x.find(" ")+1:])
past_mvps['cluster'] = km.fit_predict(past_mvps[['PER', 'WS']])

def label_point(x, y, val, ax):
    """Write ``str(val)`` at each ``(x, y)`` position on ``ax``.

    Parameters
    ----------
    x, y : pandas.Series
        Coordinates of the points to annotate.
    val : pandas.Series
        Label for each point; converted with ``str`` before drawing.
    ax : object with a ``text(x, y, s)`` method
        Target axes.

    The three series are aligned on their index by ``pd.concat`` before the
    labels are drawn.
    """
    labelled = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, row in labelled.iterrows():
        x_pos, y_pos, text = row['x'], row['y'], str(row['val'])
        ax.text(x_pos, y_pos, text)


# Figure 2: past MVP seasons in (PER, WS) space, coloured by k-means cluster.
ax = past_mvps.plot.scatter(x = 'PER', y = 'WS', c = past_mvps['cluster'], cmap = 'Paired', sharex=False, figsize = (5,4))

# Each row of cluster_centers_ is one centroid and each column one feature the
# model was fitted on (PER, then WS), so the columns are named after the
# features and plotted by name rather than by position.
centroids = pd.DataFrame(km.cluster_centers_, columns = ["PER", "WS"])
centroids.plot.scatter(x = "PER", y = "WS", c = 'red', s = 100, ax = ax)

ax.set_xlabel("Player Efficiency Rating")
ax.set_ylabel("Win Shares")
label_point(past_mvps.PER, past_mvps.WS, past_mvps.last_name, ax)
ax.set_title("Figure 2: Elite vs. Average MVP Performances", pad=20)
ax.get_figure().savefig('performances.svg')

In [None]:
# Keep as many principal components as are needed to explain 90% of the variance.
# Scaling is currently disabled, so the PCA is fitted on the raw feature values:
# scale = model['scale']
# scale.fit(train[fields])
# scaled = scale.transform(train[fields])
pca = PCA(n_components=0.9).fit(train[fields])

# One row per retained component, one column per entry of `fields`.
key_df = pd.DataFrame(data=pca.components_)
key_df

In [None]:
# Project the rows the PCA was fitted on into component space. `scaled` only
# exists in commented-out code, so referencing it raised NameError on a fresh
# kernel; `train[fields]` is the data `pca` was fitted on.
arr = pca.transform(train[fields])
data1_df = pd.DataFrame(arr)

# Map the component scores back to feature space (approximate reconstruction).
new_df = data1_df @ key_df + pca.mean_

# Share of the variance explained by each retained component.
pca.explained_variance_ratio_.round(2)

In [None]:
# Reconstruct the held-out rows with the PCA fitted on the training data.
# `transform` (not `fit_transform`) leaves `pca` untouched, so the scores stay
# consistent with `key_df` and `pca.mean_`.
pca.transform(test[fields]) @ key_df + pca.mean_

In [None]:
# Display the model's input columns for every row of the field.
mvp_field.loc[:, fields]

In [None]:
# Display the past-MVP table, including the `last_name` and `cluster` columns
# added during clustering.
past_mvps