## Waiver Prediction Models


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

In [2]:
player_data = pd.read_csv("Data/merged_data/train_data.csv")

As a pilot test for Collin's idea, let's run these models again with only data since 2015

In [3]:
len(player_data)

12655

In [4]:
player_data = player_data.loc[player_data['SEASON_START'] > 2012]

In [5]:
len(player_data)

4626

In [6]:
player_data['SEASON_START'].max()

np.int64(2023)

In [7]:
player_data['WAIVED_BY_START_OF_NEXT_SEASON'] = player_data[['WAIVED', 'RELEASED']].any(axis=1).astype(int)

In [8]:
player_data.sample()

Unnamed: 0,NAME,PLAYER_ID,SEASON_START,TEAMS_LIST,PLAYER_AGE,EXPERIENCE,POS,GP,GS,MIN,...,WAIVED_REG,WAIVED_POST,RELEASED_OFF,RELEASED_REG,RELEASED_POST,TRADED_OFF,TRADED_REG,TRADED_POST,IN_LEAGUE_NEXT,WAIVED_BY_START_OF_NEXT_SEASON
6080,Jonah Bolden,1628413,2019,"['PHI', 'PHX']",24.0,2,C,7,0.0,47.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [9]:
player_data = player_data.dropna(subset=['SALARY'])

In [10]:
len(player_data)

4543

In [11]:
player_data = player_data[player_data['MIN'] != 0]

In [12]:
len(player_data)

4543

In [13]:
columns_to_normalize = ['FGM', 'FGA', 'PTS', 'PF', 'DREB', 'OREB', 'REB', 'FTA', 'FTM', 'STL', 'TOV', 'BLK', 'AST', 'FG3A', 'FG3M']

# Normalize the selected columns by dividing by 'MIN'
player_data[columns_to_normalize] = player_data[columns_to_normalize].div(player_data['MIN'], axis=0)

# Rename columns to include "per minute"
rename_dict = {col: f"{col} / MIN" for col in columns_to_normalize}
player_data.rename(columns=rename_dict, inplace=True)

In [14]:
player_data.sample()

Unnamed: 0,NAME,PLAYER_ID,SEASON_START,TEAMS_LIST,PLAYER_AGE,EXPERIENCE,POS,GP,GS,MIN,...,WAIVED_REG,WAIVED_POST,RELEASED_OFF,RELEASED_REG,RELEASED_POST,TRADED_OFF,TRADED_REG,TRADED_POST,IN_LEAGUE_NEXT,WAIVED_BY_START_OF_NEXT_SEASON
2013,Patrick McCaw,1627775,2016,['GSW'],21.0,1,SG,71,20.0,1074.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


Now let's find the most correlated features

In [15]:
numeric_data = player_data.select_dtypes(include=['number'])

In [16]:
correlations = numeric_data.corr()['WAIVED_BY_START_OF_NEXT_SEASON']
sorted_correlations = correlations.sort_values(ascending=False)
pd.set_option('display.max_rows', None)

# Print the sorted correlations
print(sorted_correlations)

# Reset display options if needed
pd.reset_option('display.max_rows')

WAIVED_BY_START_OF_NEXT_SEASON    1.000000
WAIVED                            0.998538
WAIVED_REG                        0.717978
WAIVED_OFF                        0.700122
RELEASED                          0.093613
RELEASED_REG                      0.093613
PF / MIN                          0.093301
PLAYER_ID                         0.073586
SEASON_START                      0.070676
WAIVED_POST                       0.069744
TRADED_REG                        0.058513
TOV_PERCENT                       0.056068
X3P_AR                            0.045967
TRADED                            0.034448
PLAYER_AGE                        0.031438
F_TR                              0.028910
OREB / MIN                        0.026912
ORB_PERCENT                       0.025480
STL / MIN                         0.018781
STL_PERCENT                       0.016971
TRADED_POST                      -0.004234
TRADED_OFF                       -0.006054
FG3A / MIN                       -0.024734
REB / MIN  

Since player_ID is correlated at 10% despite being complete random, let's take anything that's correlated at 15% or higher

In [17]:
Xfeatures = ['GP', 'MIN', 'DWS', 'GS', 'WS', 'OBPM', 'BPM', 'PER', 'OWS', 'WS_48', 'FT_PCT', 'FG_PCT', 'FGM / MIN', 'PTS / MIN', 'VORP', 'TS_PERCENT', 'SALARY', 'FTM / MIN', 'USG_PERCENT', 'FTA / MIN', 'FGA / MIN' ]
len(Xfeatures)

21

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
train_player_data, test_player_data = train_test_split(player_data, test_size=0.2, stratify=player_data['WAIVED_BY_START_OF_NEXT_SEASON'], random_state=812)
print(len(train_player_data))
print(len(test_player_data))

3634
909


In [20]:
train_player_data_waived = train_player_data.loc[train_player_data['WAIVED_BY_START_OF_NEXT_SEASON'] == 1]
p = len(train_player_data_waived)/len(train_player_data)
print(p)
print(p*(1 - p))

0.18464501926252064
0.15055123612406401


About 14% of the players are waived, so a randomly guessing waived or not with that probability would have an F1-score of about 14% and a Brier score of about .1175

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, brier_score_loss, balanced_accuracy_score

In [22]:

train_player_data = train_player_data.dropna(subset = Xfeatures)
print(len(train_player_data))
X = train_player_data[Xfeatures]
y = train_player_data['WAIVED_BY_START_OF_NEXT_SEASON']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=812, stratify = y)
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors = 15))
])
knn_pipeline.fit(X_train, y_train)
y_pred = knn_pipeline.predict(X_test)
f1score = f1_score(y_test, y_pred)
print("F1 score: ", f1score)

3616
F1 score:  0.3626943005181347


In [23]:
y_prob = knn_pipeline.predict_proba(X_test)
y_prob = y_prob[:, 1]
brier_score = brier_score_loss(y_test, y_prob)
print("Brier Score: ", brier_score)

Brier Score:  0.11810926949048497


In [24]:
from sklearn.linear_model import LogisticRegression
X = train_player_data[Xfeatures]
y = train_player_data['WAIVED_BY_START_OF_NEXT_SEASON']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=813, stratify = y)


In [25]:
log_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
f1score = f1_score(y_test, y_pred)
print("F1 score: ", f1score)

F1 score:  0.5


In [26]:
y_prob = log_reg.predict_proba(X_test)

In [27]:

y_prob
y_prob = y_prob[:, 1]
brier_score = brier_score_loss(y_test, y_prob)
print("Brier Score: ", brier_score)

Brier Score:  0.1016985287825115


In [28]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [29]:
tree_cfr = DecisionTreeClassifier(max_depth=6, random_state=814)
X = train_player_data[Xfeatures]
y = train_player_data['WAIVED_BY_START_OF_NEXT_SEASON']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=815, stratify = y)
tree_cfr.fit(X_train, y_train)
y_pred = tree_cfr.predict(X_test)
f1score = f1_score(y_test, y_pred)
print("F1 score: ", f1score)
print("Balanced Accuracy: ", balanced_accuracy_score(y_test, y_pred))

F1 score:  0.5391304347826087
Balanced Accuracy:  0.7054439195190711


In [30]:
y_prob = tree_cfr.predict_proba(X_test)
y_prob = y_prob[:, 1]
brier_score = brier_score_loss(y_test, y_prob)
print("Brier Score: ", brier_score)

Brier Score:  0.11119775532112296


In [31]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    n_estimators = 50, 
    max_depth = 3, 
    max_features = 6, 
    bootstrap= True, 
    max_samples = 500,
    random_state = 816
    )

In [32]:
X = train_player_data[Xfeatures]
y = train_player_data['WAIVED_BY_START_OF_NEXT_SEASON']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=817, stratify = y)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
f1score = f1_score(y_test, y_pred)
print("F1 score: ", f1score)

F1 score:  0.5024154589371981


In [33]:
y_prob = tree_cfr.predict_proba(X_test)
y_prob = y_prob[:, 1]
brier_score = brier_score_loss(y_test, y_prob)
print("Brier Score: ", brier_score)

Brier Score:  0.08532659428264887


Of the non-ensemble methods, the decision tree works best, so let's boost that one with AdaBoost

In [34]:
from sklearn.ensemble import AdaBoostClassifier

In [35]:
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=6),
                                n_estimators=100,
                                algorithm = 'SAMME',
                                learning_rate = 0.05,
                                random_state=818)

In [36]:
X = train_player_data[Xfeatures]
y = train_player_data['WAIVED_BY_START_OF_NEXT_SEASON']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=817, stratify = y)
ada_clf.fit(X_train, y_train)

In [37]:
y_pred = tree_cfr.predict(X_test)
f1score = f1_score(y_test, y_pred)
print("F1 score: ", f1score)

F1 score:  0.6491228070175439


In [38]:
y_prob = ada_clf.predict_proba(X_test)
y_prob = y_prob[:, 1]
brier_score = brier_score_loss(y_test, y_prob)
print("Brier Score: ", brier_score)

Brier Score:  0.11921158634682348
