# NBA All Star Prediction

### Data Cleanup / Imports / Initialization

In [2]:

# data manipulation/visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# modelling and blackbox analysis
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier as XGBC
import shap
from pdpbox import pdp 
from scipy.special import expit

# classification metrics and utils
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.metrics import roc_curve, plot_confusion_matrix

# basic utils
import os.path
import pickle
from math import exp

from IPython.display import display

pd.options.mode.chained_assignment = None  # default='warn'

GPU = False
tree_method = 'gpu_hist' if GPU else 'auto'

# check if this is our first time running the script
# if the model exists from a prior execution, we can load the model from memory instead of retraining it from scratch
already_trained = os.path.exists('./ASG_predictor.model')

# directory to store plots concerning model evaluation and interpretation
if not os.path.exists('../Plots'):
    os.makedirs('../Plots')

# labeled dataset
df_train = pd.read_csv('../Data/ASG_train.csv')

# current year's dataset (unlabelled)
df_to_predict = pd.read_csv('../Data/ASG_to_predict.csv')

In [3]:
df_train.head(10)

Unnamed: 0,Year,Avg. Pace,PLAYER,TEAM,Team Conference Rank,GP,Team GP,W,PTS,REB,...,BLK,TOV,TS%,3PM,DEFWS,USG%,PIE,Prior ASG Appearances,AS Last Year?,Selected?
0,1996,90.1,Michael Jordan,CHI,1,40,40,35,30.9,5.8,...,0.5,1.7,56.4,1.2,0.004,33.6,19.8,10,1,1
1,1996,90.1,Shaquille O'Neal,LAL,3,40,41,28,26.2,13.2,...,3.1,3.1,55.7,0.0,0.003,30.4,18.4,4,1,1
2,1996,90.1,Latrell Sprewell,GSW,7,39,39,16,25.9,4.9,...,0.8,4.0,57.1,2.2,0.001,28.2,14.5,2,0,1
3,1996,90.1,Karl Malone,UTA,4,40,40,27,25.8,10.8,...,0.7,3.1,57.6,0.0,0.003,31.5,20.8,9,1,1
4,1996,90.1,Hakeem Olajuwon,HOU,1,37,41,28,24.1,9.4,...,2.2,3.7,54.8,0.1,0.003,32.2,16.3,11,1,1
5,1996,90.1,Mitch Richmond,SAC,8,40,40,16,24.1,3.9,...,0.3,2.8,56.0,1.8,0.002,28.4,14.3,4,1,1
6,1996,90.1,Glen Rice,CHH,6,37,40,21,24.0,4.2,...,0.4,2.2,57.8,2.5,0.002,25.4,11.3,1,1,1
7,1996,90.1,Allen Iverson,PHI,15,34,39,7,22.8,4.5,...,0.4,4.9,50.3,2.2,0.0,29.0,10.3,0,0,0
8,1996,90.1,Vin Baker,MIL,10,36,39,18,22.4,10.7,...,1.4,3.6,56.2,0.2,0.003,26.9,14.2,2,1,1
9,1996,90.1,Gary Payton,SEA,2,40,40,29,22.2,4.4,...,0.1,2.5,55.1,1.4,0.004,26.0,15.6,3,1,1


In [4]:
df_to_predict.head(10)

Unnamed: 0,Year,Avg. Pace,PLAYER,TEAM,Team Conference Rank,GP,Team GP,W,PTS,REB,...,STL,BLK,TOV,TS%,3PM,DEFWS,USG%,PIE,Prior ASG Appearances,AS Last Year?
0,2020,99.5,Aaron Gordon,ORL,13,19,33,13,13.8,7.2,...,0.7,0.8,2.8,0.528,1.6,0.7,23.3,10.8,0,0
1,2020,99.5,Aaron Holiday,IND,4,30,30,15,7.4,1.3,...,0.5,0.1,0.8,0.465,1.1,0.4,19.1,4.3,0,0
2,2020,99.5,Aaron Nesmith,BOS,9,17,32,15,4.4,2.6,...,0.2,0.3,0.6,0.557,1.0,0.2,12.2,3.4,0,0
3,2020,99.5,Abdel Nader,PHO,4,15,31,20,6.0,2.1,...,0.3,0.1,0.6,0.609,0.7,0.2,19.5,8.9,0,0
4,2020,99.5,Adam Mokoka,CHI,7,7,31,15,1.6,0.6,...,0.3,0.1,0.3,0.393,0.1,0.0,16.3,4.0,0,0
5,2020,99.5,Al Horford,OKC,12,22,32,13,14.6,6.9,...,0.9,0.8,1.1,0.541,2.1,0.9,22.5,12.9,5,0
6,2020,99.5,Al-Farouq Aminu,ORL,13,6,33,13,3.2,3.5,...,1.2,0.5,1.3,0.47,0.3,0.2,13.0,7.7,0,0
7,2020,99.5,Alec Burks,NYK,6,21,33,16,11.6,4.7,...,0.7,0.2,1.1,0.57,2.1,0.7,19.7,10.2,0,0
8,2020,99.5,Aleksej Pokusevski,OKC,12,17,32,13,3.3,3.5,...,0.5,1.1,1.3,0.301,0.6,0.5,16.9,3.9,0,0
9,2020,99.5,Alex Caruso,LAL,3,26,33,22,5.3,2.5,...,1.0,0.2,1.2,0.531,0.9,0.9,13.8,8.0,0,0


In [5]:
# Game Adjustment Code taken from: https://github.com/cjporteo/ml-NBA-asg-predictor/

prediction_year = df_to_predict.loc[0, 'Year'] + 1

names_and_teams = df_to_predict[['PLAYER', 'TEAM']]

for df in [df_train, df_to_predict]:

    # the percent of team's games the player played in
    df['Play Pct.'] = (df['GP'] / df['Team GP']).map(lambda pct : min(pct, 1))

    # nomalized via league average pace for that year
    for col in ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', '3PM']:
        df['Adjusted ' + col] = df[col] / df['Avg. Pace']

# final features used for this model
features = [
    'Adjusted PTS',
    'Adjusted REB',
    'Adjusted AST',
    'Adjusted STL',
    'Adjusted BLK',
    'Adjusted TOV',
    'Adjusted 3PM',
    'DEFWS',
    'TS%',
    'USG%',
    'PIE',
    'Play Pct.',
    'Team Conference Rank',
    'Prior ASG Appearances',
    'AS Last Year?'
]

### Exploratory Data Analysis¶


In [6]:
df_train = df_train[features + ['Selected?']]
df_train.describe()

Unnamed: 0,Adjusted PTS,Adjusted REB,Adjusted AST,Adjusted STL,Adjusted BLK,Adjusted TOV,Adjusted 3PM,DEFWS,TS%,USG%,PIE,Play Pct.,Team Conference Rank,Prior ASG Appearances,AS Last Year?,Selected?
count,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0,10081.0
mean,0.08749,0.038476,0.019285,0.006954,0.004555,0.013013,0.006036,0.001604,50.466333,18.149449,8.154568,0.700878,7.898621,0.471779,0.05674,0.058923
std,0.065466,0.027777,0.019575,0.005097,0.005429,0.009079,0.007294,0.001311,11.361965,5.410693,8.842835,0.304265,4.286288,1.680603,0.231357,0.235492
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.007,0.0,0.0,-400.0,0.020833,1.0,0.0,0.0,0.0
25%,0.036209,0.018458,0.005537,0.003257,0.001087,0.00639,0.0,0.001,47.1,14.5,5.9,0.475,4.0,0.0,0.0,0.0
50%,0.071665,0.031868,0.012945,0.006,0.003112,0.010953,0.003222,0.001,51.9,17.7,8.5,0.813953,8.0,0.0,0.0,0.0
75%,0.125385,0.051479,0.025779,0.009783,0.005549,0.018,0.01,0.002,56.0,21.5,10.9,0.974359,12.0,0.0,0.0,0.0
max,0.38453,0.18313,0.142237,0.032967,0.048405,0.060703,0.053,0.014,150.0,66.7,300.0,1.0,15.0,17.0,1.0,1.0


In [7]:
# Handle missing values
df_train.isnull().sum()

Adjusted PTS             0
Adjusted REB             0
Adjusted AST             0
Adjusted STL             0
Adjusted BLK             0
Adjusted TOV             0
Adjusted 3PM             0
DEFWS                    0
TS%                      0
USG%                     0
PIE                      0
Play Pct.                0
Team Conference Rank     0
Prior ASG Appearances    0
AS Last Year?            0
Selected?                0
dtype: int64

In [8]:
# Handle duplicate records
dup = df_train.duplicated()
print(dup.sum())
df_train[dup]
df_train.drop_duplicates(inplace=True)

5


In [9]:
%%capture
# Scatterplots between each feature and selected
import seaborn as sns
for feat in features:
    plt.ion()
    fig, ax = plt.subplots(figsize=(5,5))
    sns.catplot(x="Selected?", y=feat, data=df_train)
    plt.savefig('../Plots/ScatterPlots/{}-scatterplot.png'.format(feat))

In [10]:
%%capture
# Heat Map to show correlation between different features
import plotly.figure_factory as ff
from plotly.offline import iplot
corrs=df_train.corr()
# Plotting Heatmap to know about Correlation
figure=ff.create_annotated_heatmap(z=corrs.values,x=list(corrs.columns),
    y=list(corrs.index),
    annotation_text=corrs.round(2).values,
    showscale=True)
figure.layout.margin = dict(l=200, t=200)
figure.layout.height = 800
figure.layout.width = 1000
figure.write_image(file='../Plots/HeatMap/heatMap.png', format='.png')
iplot(figure)

In [11]:
%%capture
# Violin plots to show how each feature contributes to selection
for feat in features:
    fig, ax = plt.subplots(figsize=(5,5))
    sns.violinplot(ax=ax, data=df_train, y=feat, x=[''] * len(df_train), hue='Selected?', split=True)
    ax.legend_.set_title('Result')
    new_labels = ['Not All-Star', 'All-Star']
    for t, l in zip(ax.legend_.texts, new_labels):
        t.set_text(l)
    feat = feat.replace('?', '').replace('%', '')
    plt.savefig('../Plots/ViolinPlots/{}-violinplot.png'.format(feat))
    plt.show()

### Apply PCA to Reduce Dimensionality

In [12]:
from sklearn.decomposition import PCA

# Want to keep 95% of the variance rather than manually declaring number of components
pca = PCA(n_components = 0.95)
pca.fit(df_train[features].values)
reduced = pca.transform(df_train[features].values)

### Divide Data into Test, Training, and Validation

In [13]:
# 70% train, 15% test, 15% val split
train, other = train_test_split(df_train, train_size=0.7, stratify=df_train['Selected?'], shuffle=True, random_state=0)
validation, test = train_test_split(other, train_size=0.5, stratify=other['Selected?'], shuffle=True, random_state=0)

X_train, y_train = train[features], train['Selected?']
X_val, y_val = validation[features], validation['Selected?']
X_test, y_test = test[features], test['Selected?']

tuned_model = None

### Model Creation

In [17]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

if not tuned_model:
    X_train_oversampled, y_train_oversampled = SMOTE(random_state=0).fit_resample(X_train, y_train)

    # Apply feature scaling on the data
    scaler = StandardScaler()
    scaler.fit(X_train_oversampled)
    X_train_oversampled = scaler.transform(X_train_oversampled)

    # Tune hyperparameters using GridSearchCV
    clf = GridSearchCV(SGDClassifier(loss='log', ), {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'alpha': [a * (10**b) for b in range(-6, -1) for a in range(1, 10)] + [0.1],
        'n_jobs': [-1]
    }, cv=5, return_train_score=False)
    clf.fit(X_train_oversampled, y_train_oversampled)
    clf.best_params_

    # Build model with optimal parameters 
    tuned_model = SGDClassifier(loss='log', alpha=clf.best_params_['alpha'], penalty=clf.best_params_['penalty'], n_jobs=-1)
    tuned_model.fit(X=X_train_oversampled, y=y_train_oversampled)

    # serialize the model and export to local memory
    pickle.dump(tuned_model, open('ASG_predictor.model', 'wb'))

### Apply Model to Current Season

In [18]:
# Apply the model to the current season and get a prediction of current all-stars

scaler = StandardScaler()

df_to_predict_new = df_to_predict[features]
scaler.fit(df_to_predict_new)
df_to_predict_new = scaler.transform(df_to_predict_new)

results_dict = {'Player': [], 'AS Prob': [], 'Model Prediction': []}
df_results = pd.DataFrame(data=results_dict)

df_to_predict['AS Prob.'] = [prob[1] for prob in tuned_model.predict_proba(df_to_predict_new)]

# Pick top 12 probabilities from the East and West 

eastern_conference = set(['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DET', 'IND', 'MIA',
                          'MIL', 'NYK', 'ORL', 'PHI', 'TOR', 'WAS'])

df_to_predict['Conf.'] = df_to_predict['TEAM'].map(lambda x : 'East' if x in eastern_conference else 'West')

df_east = df_to_predict[df_to_predict['Conf.'] == 'East'].sort_values('AS Prob.', ascending=False).reset_index(drop=True)
df_west = df_to_predict[df_to_predict['Conf.'] == 'West'].sort_values('AS Prob.', ascending=False).reset_index(drop=True)

final = [df_east[0:12], df_west[0:12]]
df_results_final = pd.concat(final, ignore_index=True)

df_to_predict.to_csv('2021_NBA_ASG_predictor_full.csv')
df_results_final.to_csv('2021_NBA_ASG_predictor_results.csv')

In [21]:
# Final predicted results
print(df_results_final['PLAYER'])

0     Giannis Antetokounmpo
1               Joel Embiid
2              Bradley Beal
3               Zach LaVine
4          Domantas Sabonis
5         Russell Westbrook
6                Trae Young
7             Julius Randle
8            Nikola Vučević
9              Jayson Tatum
10              Ben Simmons
11             Jimmy Butler
12             LeBron James
13             Kevin Durant
14             James Harden
15            Stephen Curry
16             Nikola Jokić
17             Kyrie Irving
18           Damian Lillard
19            Anthony Davis
20              Luka Dončić
21            Kawhi Leonard
22              Paul George
23         Donovan Mitchell
Name: PLAYER, dtype: object


### Evaluation Metrics

#### Accuracy

The model correctly predicted <strong>21/24</strong> all stars. <br><br>
Omitted All-Stars: Zion Williamson, Jaylen Brown, Chris Paul.

In [29]:
# Classification Metrics function taken from https://github.com/cjporteo/ml-NBA-asg-predictor/
tuned_model = pickle.load(open('ASG_predictor.model', 'rb'))

# run the tuned model on the test set and get all-star prediction probabilities
y_test_proba = [prob[1] for prob in tuned_model.predict_proba(X_test)]

def show_classification_metrics():

    # range of classification thresholds to consider
    thresholds = np.arange(0.1, 1, 0.0025)

    highest_f1_score = -float('inf')
    optimal_threshold = thresholds[0]
    
    for threshold in thresholds:
        y_pred = [0 if prob < threshold else 1 for prob in y_test_proba]
        curr_f1_score = f1_score(y_test, y_pred)
        if curr_f1_score > highest_f1_score:
            highest_f1_score = curr_f1_score
            optimal_threshold = threshold
    print('Optimal threshold: {}'.format(optimal_threshold))
        
    y_pred = [0 if prob < optimal_threshold else 1 for prob in y_test_proba]

    metrics = [
        'Accuracy',
        'Precision',
        'Recall',
        'F1 Score',
        'Log. Loss',
        'ROC AUC'
    ]

    scores = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
  
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        log_loss(y_test, y_test_proba),
        roc_auc_score(y_test, y_test_proba)
    ]

    df_classification_metrics = pd.DataFrame({'Metric' : metrics, 'Score' : scores}).set_index('Metric')

    return df_classification_metrics

In [30]:
show_classification_metrics()

Optimal threshold: 0.9975000000000008


Unnamed: 0_level_0,Score
Metric,Unnamed: 1_level_1
Accuracy,0.939815
Precision,0.489362
Recall,0.516854
F1 Score,0.502732
Log. Loss,0.96728
ROC AUC,0.854746
