In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
import statsmodels.api as sm
import dmba
from dmba import regressionSummary, classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, accuracy_score, r2_score

from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor


from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# merge dataset with advanced stats
def merge(first_dict, adv_stat):
    """Left-join every table in ``first_dict`` with its counterpart in ``adv_stat``.

    Parameters
    ----------
    first_dict : dict
        Maps a key (the season, in this notebook) to a DataFrame that has a
        'Player' column.
    adv_stat : dict
        Maps the same keys to the DataFrame joined on the right-hand side.
        It must contain every key of ``first_dict``.

    Returns
    -------
    dict
        A new dict with the keys of ``first_dict`` (same order) holding the
        merged DataFrames. Neither input dict is modified.

    Raises
    ------
    KeyError
        If a key of ``first_dict`` is missing from ``adv_stat``.
    """
    # Build a fresh dict instead of assigning into `first_dict`, so the
    # caller's dict keeps its un-merged frames and the call can be repeated.
    # Columns present on both sides (other than 'Player') receive pandas'
    # default '_x' / '_y' suffixes.
    # NOTE(review): joining on 'Player' alone repeats a left row once per
    # matching right row — confirm each right-hand table lists a player once.
    return {
        key: pd.merge(left_frame, adv_stat[key], on='Player', how='left')
        for key, left_frame in first_dict.items()
    }
# Drop duplicated and useless columns
def drop_col(dict_input, columns=None):
    """Remove a fixed set of columns from every DataFrame in ``dict_input``.

    Parameters
    ----------
    dict_input : dict
        Maps a key to a DataFrame containing all columns to be dropped.
    columns : list of str, optional
        Column labels to drop. Defaults to the duplicated / unused columns
        left over after merging an award table with the advanced stats.

    Returns
    -------
    dict
        A new dict with the same keys holding the reduced DataFrames. The
        input dict and its frames are left untouched.

    Raises
    ------
    KeyError
        If any of the columns is missing from a frame (pandas' default for
        ``DataFrame.drop``).
    """
    if columns is None:
        columns = ['Age_x', 'Pos', 'Tm_x', 'First', 'PtsWon', 'PtsMax',
                   'Age_y', 'Tm_y', 'G_y', 'MP_y', 'WS_y', 'WS/48_y', 'year_x']
    to_drop = list(columns)
    # `DataFrame.drop` returns a new frame; collecting the results in a new
    # dict keeps the caller's dict intact.
    return {key: frame.drop(to_drop, axis=1) for key, frame in dict_input.items()}
# Deal with NA values
def fill_na_val(dict_input):
    """Replace missing values in the '3P%' column with 0 for every DataFrame.

    Parameters
    ----------
    dict_input : dict
        Maps a key to a DataFrame that has a '3P%' column.

    Returns
    -------
    dict
        A new dict with the same keys; each value is a copy of the input
        frame whose '3P%' column has NaN replaced by 0. Other columns and the
        column order are unchanged, and the input frames are not modified.

    Raises
    ------
    KeyError
        If a frame has no '3P%' column.
    """
    # `assign` returns a new frame and keeps an existing column in its
    # original position, so nothing is written back into the caller's data.
    return {
        key: frame.assign(**{'3P%': frame['3P%'].fillna(0)})
        for key, frame in dict_input.items()
    }
def check_na(data_dict):
    """Print the total number of missing cells of every DataFrame in ``data_dict``.

    Output is a single line of space-separated ``key:count`` pairs followed
    by a newline. Nothing is returned.
    """
    for key, frame in data_dict.items():
        # First sum counts NaN per column, second sum adds the columns up.
        missing_total = frame.isna().sum().sum()
        print("{}:{}".format(key, missing_total), end = ' ')
    print('')
def merge_year(data_dict):
    """Stack all DataFrames of ``data_dict`` vertically into one DataFrame.

    Parameters
    ----------
    data_dict : dict
        Maps a key to a DataFrame. Frames are stacked in dict order.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of all frames. Each frame keeps its own
        index labels, so the result's index may contain duplicates. An empty
        dict yields an empty DataFrame.
    """
    frames = list(data_dict.values())
    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    # One concat over the whole list: avoids re-copying the accumulated rows
    # on every iteration and avoids seeding with an empty DataFrame, which
    # can interfere with dtype inference.
    return pd.concat(frames)
# reindex vote share to the last column
def reindexed(data, target):
    """Return a copy of ``data`` with the ``target`` column moved to the end.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is not modified.
    target : hashable
        Label of the column to move to the last position.

    Returns
    -------
    pandas.DataFrame
        All other columns in their original order, followed by ``target``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    target_var = data[target]
    # Take an explicit copy before adding the column: assigning onto a bare
    # slice of `data` is what triggers pandas' chained-assignment warning.
    result = data.loc[:, data.columns != target].copy()
    result[target] = target_var
    return result

In [4]:
# Season labels double as file-name suffixes, e.g. 1999 -> '1999-2000'.
years = list(range(1999, 2020))
# NOTE(review): the 2017-2018 season is skipped — presumably there are no
# files for it; confirm.
years.remove(2017)
year_names = ['{}-{}'.format(year, year + 1) for year in years]


def _season_csv_paths(kind):
    """Return one 'data/NBA_<kind>_<season>.csv' path per entry of `year_names`."""
    return ['data/NBA_' + kind + '_' + season + '.csv' for season in year_names]


def _read_seasons(paths):
    """Read one CSV per season into a dict keyed by the season's starting year.

    Every frame gets a 'year' column holding its season label.
    """
    frames = {}
    for year, season, path in zip(years, year_names, paths):
        frame = pd.read_csv(path)
        frame['year'] = season
        frames[year] = frame
    return frames


def _prepare_award(award_dict, adv_stat):
    """Join the advanced stats, drop the redundant columns and fill missing '3P%'."""
    return fill_na_val(drop_col(merge(award_dict, adv_stat)))


# File names for reading the CSVs, one list per table type
mvp_names = _season_csv_paths('MVP')
dpoy_names = _season_csv_paths('DPOY')
roy_names = _season_csv_paths('ROY')
adv_names = _season_csv_paths('advanced')

# Save the datasets in dictionaries keyed by starting year
MVP_dict = _read_seasons(mvp_names)
DPOY_dict = _read_seasons(dpoy_names)
ROY_dict = _read_seasons(roy_names)
adv_dict = _read_seasons(adv_names)

MVP_dict = _prepare_award(MVP_dict, adv_dict)
DPOY_dict = _prepare_award(DPOY_dict, adv_dict)
ROY_dict = _prepare_award(ROY_dict, adv_dict)

# Notice that 2018 ROY data has incomplete data, drop the incomplete rows
ROY_dict[2018] = ROY_dict[2018].dropna()

# One table per award across all seasons, with the vote share as last column
ROY_data = reindexed(merge_year(ROY_dict), 'Share')
MVP_data = reindexed(merge_year(MVP_dict), 'Share')
DPOY_data = reindexed(merge_year(DPOY_dict), 'Share')

# Create all players' stat DF for visualization
all_adv = merge_year(adv_dict)
# Keep only rows whose MP is at or above the 75th percentile of MP
all_adv = all_adv[all_adv.MP >= np.percentile(all_adv.MP, 75)]

In [5]:
# Replace the per-season index labels with a fresh 0..n-1 range.
# NOTE(review): ROY_data and DPOY_data are not reset here and keep their
# repeated per-season labels — confirm that is intended.
all_adv, MVP_data = (
    frame.reset_index(drop=True) for frame in (all_adv, MVP_data)
)

In [6]:
# Raw 2022 inputs: basic stats, advanced stats and the rookie list.
all_2022 = pd.read_excel('./2022pred/2022stats.xlsx')
adv_2022 = pd.read_excel('./2022pred/2022adv.xlsx')
rook_2022 = pd.read_excel('./2022pred/2022Rookies.xlsx')

# Join the advanced stats onto the basic stats, remove the columns duplicated
# by the join, and replace every missing value with 0. The unfiltered result
# is kept as `rook_ref` (presumably for the rookie analysis — TODO confirm).
rook_ref = (
    pd.merge(all_2022, adv_2022, on = 'Player', how='left')
    .drop(['Age_x','Pos','Tm_x', 'Age_y', 'Tm_y', 'G_y', 'MP_y', 'WS_y', 'WS/48_y'], axis = 1)
    .fillna(0)
)

# Keep only the rows whose MP_x is at least 30.
data_2022 = rook_ref[rook_ref.MP_x >= 30]