In [138]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import random

import pickle

from bs4 import BeautifulSoup
import requests
import lxml
import unicodedata

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

# Mean Absolute Error (MAE)
def mae(y_true, y_pred):
    '''Return the mean absolute error between targets and predictions.

    Both inputs are converted to float NumPy arrays before subtracting, so:
    * plain Python lists/tuples are accepted (list - list is a TypeError),
    * unsigned-integer arrays cannot wrap around on subtraction,
    * pandas Series are compared position by position instead of being
      aligned on their index labels.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed values.
    y_pred : array-like of numbers
        Predicted values; must broadcast against ``y_true``.

    Returns
    -------
    float
        mean(|y_pred - y_true|).
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_pred - y_true))


In [150]:
def strip_accents_and_periods(text):
    '''Normalize player name spellings.

    Decomposes accented characters (Unicode NFD), discards every character
    that cannot be encoded as ASCII, and removes periods.
    '''
    try:
        # Python 2 only: decode a UTF-8 byte string into a unicode object.
        text = unicode(text, 'utf-8')
    except NameError:
        # Python 3 has no `unicode` builtin; nothing to convert.
        pass

    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode("utf-8")
    without_periods = str(ascii_text).replace('.', '')
    return without_periods
        
        
def lowercasestrip(string):
    '''Lowercase a player name and remove spaces/punctuation.

    Removes hyphens, underscores, spaces, periods and apostrophes, then
    lowercases what is left.

    Values that do not behave like strings (for example None or a float NaN)
    are returned unchanged.
    '''
    try:
        cleaned = string
        for char in ('-', '_', ' ', '.', '\''):
            cleaned = cleaned.replace(char, '')
        return cleaned.lower()
    except (AttributeError, TypeError):
        # Only "this is not a string" failures are tolerated. A bare except
        # would also swallow KeyboardInterrupt and genuine bugs.
        return string

def dollarstoint(string):
    '''Convert hoopshype salary info to an integer number of dollars.

    The value is turned into text, '$' signs and thousands separators (',')
    are removed, and the remainder is parsed with int().

    Returns
    -------
    int or float
        The parsed dollar amount, or np.nan when the text is not a whole
        number (e.g. 'N/A', None, NaN, or a value with a decimal point).
    '''
    try:
        cleaned = str(string).replace('$', '').replace(',', '')
        return int(cleaned)
    except (TypeError, ValueError):
        # Unparseable salary: report it as missing. Other exceptions
        # (including KeyboardInterrupt) are deliberately not caught.
        return np.nan

## Generate predictions on saved current-season player stats

In [142]:
# Old data: this cell contains no code; the salary table shown below is output kept from an earlier run.

Unnamed: 0_level_0,Name,Salary
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
stephencurry,Stephen Curry,"$45,780,966"
johnwall,John Wall,"$44,310,840"
russellwestbrook,Russell Westbrook,"$44,211,146"
jamesharden,James Harden,"$43,848,000"
lebronjames,LeBron James,"$41,180,544"
...,...,...
georgioskalaitzakis,Georgios Kalaitzakis,"$462,629"
samdekker,Sam Dekker,"$350,000"
troywilliams,Troy Williams,"$122,741"
stanleyjohnson,Stanley Johnson,"$118,048"


In [151]:
# Directory holding the project's CSV files. Defined once so the location
# only has to be changed in one place when the notebook runs elsewhere.
DATA_DIR = r'/Users/andrei/Dropbox/Metis/7-Engineering/andrei-eng-project/data'

# Salary table, indexed by its 'Player' column.
dfcurrentsal = pd.read_csv(f'{DATA_DIR}/dfsal2021_bbr.csv').set_index('Player', drop=True)

# Cleaned player-season table, indexed by its 'name-year' column.
dfnba = pd.read_csv(f'{DATA_DIR}/df_clean.csv').set_index('name-year', drop=True)

# Cleaned current-season stats, indexed by its 'Player' column.
dfcurrent = pd.read_csv(f'{DATA_DIR}/df_current_clean.csv').set_index('Player', drop=True)

def add_new_features(df_):
    '''Return a copy of ``df_`` with four derived columns added.

    Added columns:
    * 'GS/G'   - games started divided by games played
    * 'G/MaxG' - games played divided by the largest 'G' value in ``df_``
    * 'MP_sq'  - 'MP' squared
    * 'PTS_sq' - 'PTS' squared

    ``df_`` itself is not modified.
    '''
    most_games = df_['G'].max()
    derived = {
        'GS/G': df_['GS'] / df_['G'],
        'G/MaxG': df_['G'] / most_games,
        'MP_sq': df_['MP'] ** 2,
        'PTS_sq': df_['PTS'] ** 2,
    }
    return df_.copy().assign(**derived)

# Add the derived feature columns to both tables, then preview the first
# rows of each.
dfnba, dfcurrent = add_new_features(dfnba), add_new_features(dfcurrent)

display(dfnba.head(), dfcurrent.head())
    

Unnamed: 0_level_0,pos,name,weight,height,Age,G,GS,MP,FG%,3P,...,Next_Sal,min_sal_curr,min_sal_next,sal_cap_curr,sal_cap_next,height_ft,GS/G,G/MaxG,MP_sq,PTS_sq
name-year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Stephen_Curry_2016,G,Stephen-Curry,185,6-2,28.0,79.0,79.0,33.4,0.468,4.1,...,39.3449,875000.0,1313000.0,94143000.0,99093000.0,6.166667,1.0,0.963415,1115.56,640.09
Kevin_Durant_2016,F-G,Kevin-Durant,240,6-10,28.0,62.0,62.0,33.4,0.537,1.9,...,28.360732,875000.0,1313000.0,94143000.0,99093000.0,6.833333,1.0,0.756098,1115.56,630.01
Gordon_Hayward_2016,F-G,Gordon-Hayward,225,6-7,26.0,73.0,73.0,34.5,0.471,2.0,...,33.7242,875000.0,1313000.0,94143000.0,99093000.0,6.583333,1.0,0.890244,1190.25,479.61
Kyle_Lowry_2016,G,Kyle-Lowry,196,6-0,30.0,60.0,60.0,37.4,0.464,3.2,...,32.789208,875000.0,1313000.0,94143000.0,99093000.0,6.0,1.0,0.731707,1398.76,501.76
Otto_Porter_2016,F,Otto-Porter,198,6-8,23.0,80.0,80.0,32.6,0.516,1.9,...,28.1035,875000.0,1313000.0,94143000.0,99093000.0,6.666667,1.0,0.97561,1062.76,179.56


Unnamed: 0_level_0,Rk,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,AST,STL,BLK,TOV,PF,PTS,GS/G,G/MaxG,MP_sq,PTS_sq
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Precious Achiuwa,1,C,22,TOR,21,17,26.5,3.4,8.8,0.386,...,1.6,0.5,0.6,1.1,2.5,8.0,0.809524,0.7,702.25,64.0
Steven Adams,2,C,28,MEM,29,29,24.9,2.6,4.9,0.535,...,2.6,0.9,0.6,1.8,1.7,7.0,1.0,0.966667,620.01,49.0
Bam Adebayo,3,C,24,MIA,18,18,32.9,7.0,13.5,0.519,...,3.2,1.1,0.3,2.9,3.3,18.7,1.0,0.6,1082.41,349.69
Santi Aldama,4,PF,21,MEM,16,0,9.8,1.5,4.1,0.364,...,0.8,0.1,0.2,0.3,1.1,3.6,0.0,0.533333,96.04,12.96
LaMarcus Aldridge,5,C,36,BRK,25,8,23.6,6.0,10.4,0.573,...,0.9,0.4,1.2,0.8,1.7,14.0,0.32,0.833333,556.96,196.0


In [152]:
# Columns of dfcurrent that are passed to the saved model.
feats = ['Age', 'G/MaxG','GS/G','MP','MP_sq','3P','3P%','FT','TRB','AST', 'TOV','PTS','PTS_sq']

lr_filename = r'/Users/andrei/Dropbox/Metis/7-Engineering/andrei-eng-project/data/lr_model.sav'
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that were produced by this project.
# The context manager closes the file handle once the model is loaded.
with open(lr_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Removed `result = loaded_model.predict(X)`: `X` is not defined anywhere in
# the visible notebook, so the line raised NameError on a fresh kernel, and
# `result` was never used.

# Sanity check from an earlier session:
# test = dfnba.loc[['Evan_Fournier_2020']][feats]
# float(loaded_model.predict(test))   --- 16.458, this is good!

# Store the model's prediction for every current-season player.
market_vals = loaded_model.predict(dfcurrent[feats])
dfcurrent['Market_Val'] = market_vals


In [153]:
# Re-key the stats table by a normalized player name (accents and periods
# stripped, then lowercased with spaces/punctuation removed). The previous
# index is turned back into a regular column by reset_index().
# The bare `dfcurrent` expression that used to follow was dropped: it was not
# the last statement of the cell, so it displayed nothing.
dfcurrent = (
    dfcurrent
    .reset_index()
    .assign(Name=lambda frame: frame['Player']
            .apply(strip_accents_and_periods)
            .apply(lowercasestrip))
    .set_index('Name', drop=True)
)

def addcurrentsal(dfcurrent_, dfcurrentsal_):
    '''Return a copy of ``dfcurrent_`` with a 'Current_Sal' column added.

    For every index label of ``dfcurrent_`` the matching 'Salary' entry of
    ``dfcurrentsal_`` is parsed with ``dollarstoint`` and divided by 1e6.

    'Current_Sal' is NaN when
    * the label is absent from ``dfcurrentsal_``,
    * the label occurs more than once in ``dfcurrentsal_`` (ambiguous), or
    * the salary text cannot be parsed.

    The column is always created, even when ``dfcurrent_`` has no rows.

    Parameters
    ----------
    dfcurrent_ : pandas.DataFrame
        Table to extend; it is not modified.
    dfcurrentsal_ : pandas.DataFrame
        Table with a 'Salary' column, indexed by the same kind of labels as
        ``dfcurrent_``.

    Raises
    ------
    KeyError
        If ``dfcurrentsal_`` has no 'Salary' column.
    '''
    dfnew = dfcurrent_.copy()

    salaries = dfcurrentsal_['Salary']
    # A label listed several times has no single salary; drop every copy so
    # the lookup below yields NaN for it.
    salaries = salaries[~salaries.index.duplicated(keep=False)]
    salaries_millions = (salaries.map(dollarstoint) / 1e6).astype(float)

    # reindex() gives NaN for labels missing from the salary table.
    # to_numpy() assigns by position, so repeated labels in dfnew cannot
    # trigger an index-alignment error.
    dfnew['Current_Sal'] = salaries_millions.reindex(dfnew.index).to_numpy()
    return dfnew
            
# Attach the 'Current_Sal' column to the stats table.
dfcurrent = addcurrentsal(dfcurrent_=dfcurrent, dfcurrentsal_=dfcurrentsal)

In [154]:
# Write the table (including the Market_Val and Current_Sal columns) to disk.
dfcurrent.to_csv(path_or_buf=r'/Users/andrei/Dropbox/Metis/7-Engineering/andrei-eng-project/data/df_marketvalues.csv')

In [155]:
# Rows for which no salary could be attached.
dfcurrent.loc[dfcurrent['Current_Sal'].isna()]

Unnamed: 0_level_0,Player,Rk,Pos,Age,Tm,G,GS,MP,FG,FGA,...,BLK,TOV,PF,PTS,GS/G,G/MaxG,MP_sq,PTS_sq,Market_Val,Current_Sal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
josealvarado,Jose Alvarado,9,PG,23,NOP,8,0,5.1,0.6,1.6,...,0.0,0.1,0.4,1.8,0.0,0.266667,26.01,3.24,4.234767,
joelayayi,Joel Ayayi,18,SG,21,WAS,5,0,2.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.166667,6.76,0.0,6.564728,
keljinblevins,Keljin Blevins,46,SF,26,POR,3,0,4.7,0.7,1.3,...,0.0,0.7,0.0,2.0,0.0,0.1,22.09,4.0,0.136127,
ignasbrazdeikis,Ignas Brazdeikis,58,SF,23,ORL,13,0,8.5,0.6,2.4,...,0.0,0.2,1.0,1.8,0.0,0.433333,72.25,3.24,3.433267,
devontaecacok,Devontae Cacok,77,PF,25,SAS,1,0,2.0,1.0,1.0,...,0.0,0.0,0.0,2.0,0.0,0.033333,4.0,4.0,7.250017,
justinchampagnie,Justin Champagnie,87,SF,20,TOR,14,0,6.8,0.9,1.9,...,0.1,0.0,0.8,2.2,0.0,0.466667,46.24,4.84,4.720338,
chrischiozza,Chris Chiozza,88,PG,26,GSW,11,0,11.3,1.0,2.7,...,0.0,0.8,1.4,2.9,0.0,0.366667,127.69,8.41,1.73391,
amircoffey,Amir Coffey,94,SG,24,LAC,17,3,12.5,0.8,2.4,...,0.1,0.3,0.8,3.3,0.176471,0.566667,156.25,10.89,2.287818,
tylercook,Tyler Cook,98,PF,24,CHI,2,0,8.5,1.0,2.0,...,0.0,0.5,0.5,3.0,0.0,0.066667,72.25,9.0,2.8907,
sharifecooper,Sharife Cooper,99,PG,20,ATL,6,0,2.3,0.3,1.0,...,0.0,0.3,0.2,0.7,0.0,0.2,5.29,0.49,5.9467,


In [156]:
# Number of rows without a salary.
dfcurrent['Current_Sal'].isna().sum()

59

In [125]:
# NOTE(review): this repeats the missing-salary filter from the In [155] cell,
# but its execution count (125) is lower, so the output shown under it comes
# from an earlier run.
dfcurrent.loc[dfcurrent['Current_Sal'].isna()]

Unnamed: 0_level_0,Player,Rk,Pos,Age,Tm,G,GS,MP,FG,FGA,...,BLK,TOV,PF,PTS,GS/G,G/MaxG,MP_sq,PTS_sq,Market_Val,Current_Sal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
santialdama,Santi Aldama,4,PF,21,MEM,16,0,9.8,1.5,4.1,...,0.2,0.3,1.1,3.6,0.0,0.533333,96.04,12.96,4.213738,
marvinbagleyiii,Marvin Bagley III,21,PF,22,SAC,13,1,19.6,3.2,6.6,...,0.3,0.5,1.2,8.1,0.076923,0.433333,384.16,65.61,3.879232,
brandonbostonjr,Brandon Boston Jr,53,SG,20,LAC,16,0,12.6,1.9,4.9,...,0.2,0.7,0.9,5.8,0.0,0.533333,158.76,33.64,0.990304,
gregbrowniii,Greg Brown III,66,SF,20,POR,11,0,4.6,0.6,1.8,...,0.4,0.2,0.6,1.5,0.0,0.366667,21.16,2.25,5.518955,
troybrownjr,Troy Brown Jr,70,SF,22,CHI,18,1,12.8,1.7,4.1,...,0.1,0.4,0.8,4.3,0.055556,0.6,163.84,18.49,3.000929,
vernoncareyjr,Vernon Carey Jr,82,C,20,CHO,2,1,6.0,1.5,3.0,...,0.0,0.5,0.5,3.5,0.5,0.066667,36.0,12.25,3.135281,
wendellcarterjr,Wendell Carter Jr,84,C,22,ORL,29,29,29.4,5.0,10.0,...,0.6,1.5,2.6,12.9,1.0,0.966667,864.36,166.41,14.996341,
enesfreedom,Enes Freedom,147,C,29,BOS,17,0,11.2,1.6,3.4,...,0.5,0.6,1.1,4.1,0.0,0.566667,125.44,16.81,1.95844,
mauriceharkless,Maurice Harkless,178,SF,28,SAC,19,14,18.8,1.4,3.6,...,0.4,0.5,1.5,3.6,0.736842,0.633333,353.44,12.96,3.548215,
derrickjonesjr,Derrick Jones Jr,235,PF,24,CHI,22,2,18.5,2.6,4.5,...,1.0,0.5,2.4,7.0,0.090909,0.733333,342.25,49.0,3.004549,


In [114]:
# Independent (deep) copy of the table for further work, displayed in full.
df = dfcurrent.copy(deep=True)
df


Unnamed: 0_level_0,Player,Rk,Pos,Age,Tm,G,GS,MP,FG,FGA,...,BLK,TOV,PF,PTS,GS/G,G/MaxG,MP_sq,PTS_sq,Market_Val,Current_Sal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
preciousachiuwa,Precious Achiuwa,1,C,22,TOR,21,17,26.5,3.4,8.8,...,0.6,1.1,2.5,8.0,0.809524,0.700000,702.25,64.00,8.086684,2.711280
stevenadams,Steven Adams,2,C,28,MEM,29,29,24.9,2.6,4.9,...,0.6,1.8,1.7,7.0,1.000000,0.966667,620.01,49.00,8.711486,17.073171
bamadebayo,Bam Adebayo,3,C,24,MIA,18,18,32.9,7.0,13.5,...,0.3,2.9,3.3,18.7,1.000000,0.600000,1082.41,349.69,19.186280,28.103500
santialdama,Santi Aldama,4,PF,21,MEM,16,0,9.8,1.5,4.1,...,0.2,0.3,1.1,3.6,0.000000,0.533333,96.04,12.96,4.213738,
lamarcusaldridge,LaMarcus Aldridge,5,C,36,BRK,25,8,23.6,6.0,10.4,...,1.2,0.8,1.7,14.0,0.320000,0.833333,556.96,196.00,10.022009,2.641691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
thaddeusyoung,Thaddeus Young,477,PF,33,SAS,22,0,14.1,2.9,5.1,...,0.3,1.0,1.4,6.2,0.000000,0.733333,198.81,38.44,5.858925,14.190000
traeyoung,Trae Young,478,PG,23,ATL,28,28,34.2,9.4,20.4,...,0.1,4.1,1.7,27.0,1.000000,0.933333,1169.64,729.00,39.743363,8.326471
omeryurtseven,Omer Yurtseven,479,C,23,MIA,20,0,7.2,1.3,2.4,...,0.5,0.4,0.7,3.1,0.000000,0.666667,51.84,9.61,4.989822,1.489065
codyzeller,Cody Zeller,480,C,29,POR,24,0,13.3,1.8,3.2,...,0.3,0.7,2.1,5.1,0.000000,0.800000,176.89,26.01,4.057463,2.389641
