# NBA All Star Prediction

### Data Cleanup / Imports / Initialization

In [1]:

# data manipulation/visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# modelling and blackbox analysis
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import BorderlineSMOTE
from xgboost import XGBClassifier as XGBC
import shap
from pdpbox import pdp 
# pdp_plot_utils.py in the pdpbox source code requires a small tweak
# (weird conflict with matplotlib)
# Fix:
# line 251: parameter should be called "fontsize", not "contour_label_fontsize"
#
# for more info see: https://github.com/SauceCat/PDPbox/issues/40
from scipy.special import expit

# classification metrics and utils
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.metrics import roc_curve, plot_confusion_matrix

# basic utils
import os.path
import pickle
from math import exp

from IPython.display import display

# silence pandas' chained-assignment (SettingWithCopy) warning; the default mode is 'warn'
pd.set_option('mode.chained_assignment', None)

# set GPU to True to select the 'gpu_hist' tree method instead of 'auto'
# (presumably consumed by the XGBoost classifier further down - confirm)
GPU = False
if GPU:
    tree_method = 'gpu_hist'
else:
    tree_method = 'auto'

# True when a model file from a prior execution is present on disk,
# in which case the model can be loaded instead of retrained from scratch
already_trained = os.path.exists('./ASG_predictor.model')

# make sure the directory for model evaluation / interpretation plots exists
if not os.path.exists('../Plots'):
    os.makedirs('../Plots')

# labeled dataset
df_train = pd.read_csv('../Data/ASG_train.csv')

# current year's dataset (unlabelled)
df_to_predict = pd.read_csv('../Data/ASG_to_predict.csv')

In [2]:
# preview the first 10 rows of the labeled training data
df_train.head(10)

Unnamed: 0,Year,Avg. Pace,PLAYER,TEAM,Team Conference Rank,GP,Team GP,W,PTS,REB,...,BLK,TOV,TS%,3PM,DEFWS,USG%,PIE,Prior ASG Appearances,AS Last Year?,Selected?
0,1996,90.1,Michael Jordan,CHI,1,40,40,35,30.9,5.8,...,0.5,1.7,56.4,1.2,0.004,33.6,19.8,10,1,1
1,1996,90.1,Shaquille O'Neal,LAL,3,40,41,28,26.2,13.2,...,3.1,3.1,55.7,0.0,0.003,30.4,18.4,4,1,1
2,1996,90.1,Latrell Sprewell,GSW,7,39,39,16,25.9,4.9,...,0.8,4.0,57.1,2.2,0.001,28.2,14.5,2,0,1
3,1996,90.1,Karl Malone,UTA,4,40,40,27,25.8,10.8,...,0.7,3.1,57.6,0.0,0.003,31.5,20.8,9,1,1
4,1996,90.1,Hakeem Olajuwon,HOU,1,37,41,28,24.1,9.4,...,2.2,3.7,54.8,0.1,0.003,32.2,16.3,11,1,1
5,1996,90.1,Mitch Richmond,SAC,8,40,40,16,24.1,3.9,...,0.3,2.8,56.0,1.8,0.002,28.4,14.3,4,1,1
6,1996,90.1,Glen Rice,CHH,6,37,40,21,24.0,4.2,...,0.4,2.2,57.8,2.5,0.002,25.4,11.3,1,1,1
7,1996,90.1,Allen Iverson,PHI,15,34,39,7,22.8,4.5,...,0.4,4.9,50.3,2.2,0.0,29.0,10.3,0,0,0
8,1996,90.1,Vin Baker,MIL,10,36,39,18,22.4,10.7,...,1.4,3.6,56.2,0.2,0.003,26.9,14.2,2,1,1
9,1996,90.1,Gary Payton,SEA,2,40,40,29,22.2,4.4,...,0.1,2.5,55.1,1.4,0.004,26.0,15.6,3,1,1


In [3]:
# preview the first 10 rows of the unlabelled data we want predictions for
df_to_predict.head(10)

Unnamed: 0,Year,Avg. Pace,Player,Team,Team Conference Rank,GP,Team GP,W,PTS,REB,...,STL,BLK,TOV,TS%,3PM,DEFWS,USG%,PIE,Prior ASG Appearances,AS Last Year
0,2020,99.5,Aaron Gordon,ORL,13,19,33,13,13.8,7.2,...,0.7,0.8,2.8,0.528,1.6,0.7,23.3,10.8,0,0
1,2020,99.5,Aaron Holiday,IND,4,30,30,15,7.4,1.3,...,0.5,0.1,0.8,0.465,1.1,0.4,19.1,4.3,0,0
2,2020,99.5,Aaron Nesmith,BOS,9,17,32,15,4.4,2.6,...,0.2,0.3,0.6,0.557,1.0,0.2,12.2,3.4,0,0
3,2020,99.5,Abdel Nader,PHO,4,15,31,20,6.0,2.1,...,0.3,0.1,0.6,0.609,0.7,0.2,19.5,8.9,0,0
4,2020,99.5,Adam Mokoka,CHI,7,7,31,15,1.6,0.6,...,0.3,0.1,0.3,0.393,0.1,0.0,16.3,4.0,0,0
5,2020,99.5,Al Horford,OKC,12,22,32,13,14.6,6.9,...,0.9,0.8,1.1,0.541,2.1,0.9,22.5,12.9,5,0
6,2020,99.5,Al-Farouq Aminu,ORL,13,6,33,13,3.2,3.5,...,1.2,0.5,1.3,0.47,0.3,0.2,13.0,7.7,0,0
7,2020,99.5,Alec Burks,NYK,6,21,33,16,11.6,4.7,...,0.7,0.2,1.1,0.57,2.1,0.7,19.7,10.2,0,0
8,2020,99.5,Aleksej Pokusevski,OKC,12,17,32,13,3.3,3.5,...,0.5,1.1,1.3,0.301,0.6,0.5,16.9,3.9,0,0
9,2020,99.5,Alex Caruso,LAL,3,26,33,22,5.3,2.5,...,1.0,0.2,1.2,0.531,0.9,0.9,13.8,8.0,0,0


In [4]:
# Outlier and Game Adjustment Code taken from: https://github.com/cjporteo/ml-NBA-asg-predictor/

# keep the 'Player' and 'Team' columns aside for traceability:
# they get dropped for the prediction process and are meant to be added back in later
# NOTE(review): this snapshot is taken before the GP filter further down, so it still
# holds rows for players that are later removed - rejoin on the index, not by position
names_and_teams = df_to_predict[['Player', 'Team']]

# year used for file naming: the 'Year' value of the row labelled 0, plus one
# (presumably the season's starting year -> the calendar year of the game; confirm)
prediction_year = df_to_predict.at[0, 'Year'] + 1

# (player, Year) pairs to exclude from the training data, oldest first.
# The A / B tags are the original author's categories:
#   A - see the Dirk Nowitzki entry, B - see the Yao Ming entry.
outliers = {
    ('Anfernee Hardaway', 1996),  # A/B
    ('Anfernee Hardaway', 1997),  # A/B
    ('Grant Hill', 2000),  # B
    ('Vince Carter', 2002),  # B
    ('Jamaal Magloire', 2003),  # infamously questionable pick
    ('Ray Allen', 2003),  # B
    ("Shaquille O'Neal", 2006),  # A/B
    ('Allen Iverson', 2008),  # A
    ('Allen Iverson', 2009),  # A
    ('Yao Ming', 2010),  # B: China rallying for him despite playing in only 5 out of a possible 44 games (11.3%)
    ('Kobe Bryant', 2013),  # A/B
    ('Kobe Bryant', 2015),  # A
    ('Dirk Nowitzki', 2018),  # A: Dirk's selection in the 2019 ASG was very much a "respect" pick - career tribute
}

# index labels of the df_train rows that match an entry in `outliers`
outlier_indices = []

def process_outliers(row):
    """Append the row's index label to `outlier_indices` if its (PLAYER, Year) pair is in `outliers`."""
    player_season = (row['PLAYER'], row['Year'])
    if player_season not in outliers:
        return
    outlier_indices.append(row.name)

# walk the training rows to collect the matches, then drop them from df_train in place
df_train[['PLAYER', 'Year']].apply(process_outliers, axis=1)
df_train.drop(index=outlier_indices, inplace=True)

# minimum number of games a player must have appeared in to be kept
MIN_GAMES_PLAYED = 7

# NOTE(review): the head() previews show TS% as 56.4 in df_train but 0.528 in
# df_to_predict (and DEFWS as 0.004 vs 0.7) - the two frames may be on different
# scales; confirm they are reconciled before the model is applied.
for df in [df_train, df_to_predict]:
    # drop any player who appeared in fewer than MIN_GAMES_PLAYED games
    df.drop(df[df['GP'] < MIN_GAMES_PLAYED].index, inplace=True)

    # share of the team's games the player appeared in, capped at 1
    # (vectorized clip instead of a per-element Python lambda)
    df['Play Pct.'] = (df['GP'] / df['Team GP']).clip(upper=1)

    # normalized via the league average pace for that year
    for col in ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', '3PM']:
        df['Adjusted ' + col] = df[col] / df['Avg. Pace']

# final features used for this model: the pace-adjusted box-score stats first,
# followed by the remaining columns
# NOTE(review): the df_to_predict preview shows the column as 'AS Last Year' (no '?'),
# so selecting these names from that frame needs a rename somewhere - confirm
features = ['Adjusted ' + stat for stat in ('PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', '3PM')] + [
    'DEFWS',
    'TS%',
    'USG%',
    'PIE',
    'Play Pct.',
    'Team Conference Rank',
    'Prior ASG Appearances',
    'AS Last Year?',
]

### Exploratory Data Analysis


In [5]:
# Keep only the model features plus the target label.
# .copy() detaches the result from the frame it was sliced from, so later in-place
# edits (e.g. dropping duplicates) act on an independent DataFrame rather than on a
# slice, which is the situation pandas' chained-assignment warning exists to flag.
df_train = df_train[features + ['Selected?']].copy()

# summary statistics for every remaining column
df_train.describe()

Unnamed: 0,Adjusted PTS,Adjusted REB,Adjusted AST,Adjusted STL,Adjusted BLK,Adjusted TOV,Adjusted 3PM,DEFWS,TS%,USG%,PIE,Play Pct.,Team Conference Rank,Prior ASG Appearances,AS Last Year?,Selected?
count,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0,9185.0
mean,0.093283,0.040734,0.020493,0.007382,0.004826,0.013729,0.006442,0.001681,51.58417,18.262635,8.628285,0.759407,7.872183,0.494611,0.060316,0.063255
std,0.064757,0.027551,0.019777,0.004985,0.005487,0.008912,0.007396,0.001247,7.441232,4.985169,3.935962,0.249638,4.286073,1.707676,0.238084,0.243435
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003,0.0,4.3,-16.7,0.145833,1.0,0.0,0.0,0.0
25%,0.042599,0.020675,0.006615,0.00333,0.0011,0.006979,0.0,0.001,47.9,14.7,6.3,0.588235,4.0,0.0,0.0,0.0
50%,0.077742,0.034079,0.014024,0.006515,0.003222,0.011905,0.004,0.001,52.2,17.8,8.7,0.853659,8.0,0.0,0.0,0.0
75%,0.130676,0.053838,0.027203,0.009923,0.006263,0.018681,0.010953,0.002,56.0,21.5,11.0,0.97561,12.0,0.0,0.0,0.0
max,0.38453,0.18313,0.142237,0.032223,0.048405,0.060166,0.053,0.014,105.0,40.7,28.6,1.0,15.0,16.0,1.0,1.0


In [6]:
# Check for missing values: number of nulls in each column
# (this only reports the counts; nothing is imputed or dropped here)
df_train.isnull().sum()

Adjusted PTS             0
Adjusted REB             0
Adjusted AST             0
Adjusted STL             0
Adjusted BLK             0
Adjusted TOV             0
Adjusted 3PM             0
DEFWS                    0
TS%                      0
USG%                     0
PIE                      0
Play Pct.                0
Team Conference Rank     0
Prior ASG Appearances    0
AS Last Year?            0
Selected?                0
dtype: int64

In [23]:
# Handle duplicate records: report how many fully duplicated rows exist,
# show them, then remove them
dup = df_train.duplicated()
print(dup.sum())

# a bare expression in the middle of a cell is never rendered, so display explicitly
if dup.any():
    display(df_train[dup])

# rebind instead of mutating in place so the cell does not modify a possible slice
df_train = df_train.drop_duplicates()

0


In [29]:
%%capture
# Categorical scatter plot of every feature against the 'Selected?' label,
# saved as one PNG per feature
scatter_dir = '../Plots/ScatterPlots'
# savefig does not create missing directories, so make sure the target exists
os.makedirs(scatter_dir, exist_ok=True)

plt.ion()
for feat in features:
    # catplot is a figure-level function: it builds its own figure, which becomes
    # the current one, so no separate plt.subplots() call is needed
    sns.catplot(x="Selected?", y=feat, data=df_train)

    # strip '?' and '%' from the file name, matching what the violin-plot cell does
    # ('?' is not a legal file-name character on Windows)
    file_stem = feat.replace('?', '').replace('%', '')
    plt.savefig('{}/{}-scatterplot.png'.format(scatter_dir, file_stem))

    # close the figure just saved; otherwise one figure per feature stays open
    plt.close()

In [40]:
%%capture
# Annotated heat map of the pairwise correlations between the columns of df_train
import plotly.figure_factory as ff
from plotly.offline import iplot

corrs = df_train.corr()

# cells are annotated with the correlation rounded to two decimals
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True,
)
# extra left/top margin so the long feature names are not clipped
figure.layout.margin = dict(l=200, t=200)
figure.layout.height = 800
figure.layout.width = 1000

# write_image does not create missing directories, so make sure the target exists
os.makedirs('../Plots/HeatMap', exist_ok=True)
# the documented format values are bare names such as 'png';
# a leading dot ('.png') is not accepted by every export engine
figure.write_image(file='../Plots/HeatMap/heatMap.png', format='png')
iplot(figure)

In [31]:
%%capture
# Split violin plot per feature, comparing the distribution for the two values of
# 'Selected?'; saved as one PNG per feature
violin_dir = '../Plots/ViolinPlots'
# savefig does not create missing directories, so make sure the target exists
os.makedirs(violin_dir, exist_ok=True)

legend_labels = ['Not All-Star', 'All-Star']

for feat in features:
    fig, ax = plt.subplots(figsize=(5,5))
    # a constant dummy x value puts both hue groups on a single violin, one per half
    sns.violinplot(ax=ax, data=df_train, y=feat, x=[''] * len(df_train), hue='Selected?', split=True)

    # replace the raw legend entries with readable labels
    ax.legend_.set_title('Result')
    for text, label in zip(ax.legend_.texts, legend_labels):
        text.set_text(label)

    # strip '?' and '%' for the file name without rebinding the loop variable
    file_stem = feat.replace('?', '').replace('%', '')
    fig.savefig('{}/{}-violinplot.png'.format(violin_dir, file_stem))
    plt.show()
    # release the figure explicitly so figures do not accumulate across iterations
    plt.close(fig)