# Import Dependencies

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Column names for the salary/box-score data, assembled from small chunks
# purely for readability; the resulting list order is what matters.
# NOTE(review): some of these names (e.g. "Outcome in wins", "FGA", "3PA")
# do not appear in the dtypes listing of the loaded frame, and the only use
# of `columns` in this notebook is commented out -- confirm it is still needed.
columns = (
    ["Salary cap", "Outcome in wins", "Playoffs", "Wins", "Losses", "PCT"]
    + ["FG", "FGA", "FG%"]
    + ["3P", "3PA", "3P%"]
    + ["2P", "2PA", "2P%"]
    + ["FT", "FTA", "FT%"]
    + ["ORB", "DRB", "TRB"]
    + ["AST", "STL", "BLK", "TOV", "PF", "PTS"]
)

# Name of the label column, kept as a one-element list.
target = ["Playoffs"]

In [5]:
# Load the data and clean it in one chain:
#   1. read the CSV,
#   2. discard its final two rows,
#   3. drop columns in which every value is null,
#   4. drop any remaining row that contains a null.
# No column subsetting is applied; every non-empty column of the CSV is kept.
# NOTE(review): the reason for discarding the last two rows is not visible
# here -- presumably trailing non-data rows in the file; confirm.
file_path = Path('../Data/NBA_Salary3.csv')
df = (
    pd.read_csv(file_path)
    .iloc[:-2]
    .dropna(axis='columns', how='all')
    .dropna()
)

In [6]:
# Show the dtype of every column that remains after cleaning.
df.dtypes

Salary cap      int64
Playoffs       object
Wins            int64
Losses          int64
PCT           float64
FG            float64
FG%           float64
3P            float64
3P%           float64
2P            float64
2P%           float64
FT            float64
FT%           float64
ORB           float64
DRB           float64
TRB           float64
AST           float64
STL           float64
BLK           float64
TOV           float64
PF            float64
PTS           float64
dtype: object

# Split the Data into Training and Testing

In [7]:
# Create our features: every column of df except the Playoffs label.
# DataFrame.drop returns a new frame, so df itself is left unchanged.
# NOTE(review): X still contains Wins, Losses and PCT, which rank as the
# three most important features at the end of this notebook -- confirm that
# using the season record to predict playoff qualification is intended.
X = df.drop(columns="Playoffs")

# Create our target: the Playoffs column exactly as loaded.
y = df["Playoffs"]

# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Salary cap,Wins,Losses,PCT,FG,FG%,3P,3P%,2P,2P%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,178980766,53,29,0.646,40.5,0.469,14.3,0.364,26.2,0.557,...,0.769,9.8,35.7,45.5,27.1,8.8,4.5,14.9,21.0,111.0
1,174811922,44,38,0.537,42.0,0.475,11.5,0.361,30.5,0.538,...,0.805,10.3,34.1,44.4,25.3,7.1,5.5,14.1,20.4,112.0
2,168378382,42,40,0.512,40.1,0.458,12.8,0.374,27.3,0.512,...,0.793,9.1,34.9,44.0,24.0,7.4,5.0,13.7,18.6,108.4
3,164409293,33,49,0.402,41.6,0.469,12.0,0.347,29.7,0.546,...,0.732,9.5,34.5,44.0,24.0,7.6,5.2,14.5,20.2,112.1
4,162135421,51,31,0.622,41.8,0.468,14.1,0.366,27.8,0.544,...,0.776,10.2,36.5,46.7,23.9,7.6,4.0,13.4,18.2,115.5


In [8]:
# Convert the target column values to the integers 1 and 0.
#
# - Only the Playoffs column is rewritten, so a 'Yes'/'No' string in any
#   other column can never be altered by accident.
# - The codes are real integers rather than the strings '1'/'0', so the
#   column is numeric after this cell.
# - Labels outside the mapping are passed through unchanged, which also
#   makes the cell safe to re-run once the column is already encoded.
#
# NOTE(review): X and y were taken from df before this cell runs, so the
# model below is still trained on the 'Yes'/'No' labels; this encoding only
# affects df.
playoff_codes = {'Yes': 1, 'No': 0}
df = df.assign(
    Playoffs=df['Playoffs'].map(lambda label: playoff_codes.get(label, label))
).reset_index(drop=True)  # renumber rows 0..n-1 without keeping the old index

df.head()

Unnamed: 0,Salary cap,Playoffs,Wins,Losses,PCT,FG,FG%,3P,3P%,2P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,178980766,1,53,29,0.646,40.5,0.469,14.3,0.364,26.2,...,0.769,9.8,35.7,45.5,27.1,8.8,4.5,14.9,21.0,111.0
1,174811922,1,44,38,0.537,42.0,0.475,11.5,0.361,30.5,...,0.805,10.3,34.1,44.4,25.3,7.1,5.5,14.1,20.4,112.0
2,168378382,0,42,40,0.512,40.1,0.458,12.8,0.374,27.3,...,0.793,9.1,34.9,44.0,24.0,7.4,5.0,13.7,18.6,108.4
3,164409293,0,33,49,0.402,41.6,0.469,12.0,0.347,29.7,...,0.732,9.5,34.5,44.0,24.0,7.6,5.2,14.5,20.2,112.1
4,162135421,1,51,31,0.622,41.8,0.468,14.1,0.366,27.8,...,0.776,10.2,36.5,46.7,23.9,7.6,4.0,13.4,18.2,115.5


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the feature columns.
X.describe()

Unnamed: 0,Salary cap,Wins,Losses,PCT,FG,FG%,3P,3P%,2P,2P%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,...,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0
mean,124188500.0,39.054054,38.608108,0.501682,40.714189,0.462068,11.833108,0.359,28.883108,0.524311,...,0.772446,10.091892,34.377027,44.462162,24.365541,7.65,4.860811,14.114865,20.114865,110.510811
std,17155050.0,11.675769,11.160935,0.14321,1.608608,0.013792,1.681245,0.015259,2.141599,0.02092,...,0.028816,1.014354,1.636409,1.927545,1.928085,0.780851,0.697384,1.068368,1.396369,4.246105
min,79180080.0,15.0,17.0,0.207,36.7,0.429,8.0,0.326,23.1,0.476,...,0.694,8.0,31.0,39.8,19.5,6.1,2.4,11.1,17.2,98.8
25%,115003800.0,31.0,30.0,0.402,39.4,0.451,10.675,0.349,27.375,0.509,...,0.75375,9.4,33.4,43.0,23.175,7.1,4.4,13.4,19.1,107.275
50%,123859600.0,41.5,38.0,0.524,40.7,0.464,11.6,0.357,29.0,0.525,...,0.7745,10.0,34.3,44.35,24.1,7.6,4.9,14.1,20.05,111.2
75%,132083700.0,48.0,46.0,0.61025,41.9,0.471,12.925,0.369,30.525,0.54025,...,0.792,10.625,35.4,45.5,25.725,8.2,5.3,14.8,21.125,113.7
max,178980800.0,65.0,65.0,0.793,44.7,0.503,16.7,0.411,33.4,0.575,...,0.839,14.1,42.2,51.7,29.4,10.0,7.5,17.0,23.6,120.1


In [10]:
# Check the balance of our target values: number of rows per Playoffs label.
y.value_counts()

Yes    79
No     69
Name: Playoffs, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

# Split features and target using train_test_split's default proportions
# (no test_size is given); random_state=1 fixes the shuffle.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Class counts in the training portion.
Counter(y_train)

Counter({'No': 51, 'Yes': 60})

# Balanced Random Forest Classifier

In [12]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Build a 100-estimator BalancedRandomForestClassifier with a fixed seed and
# fit it on the training split; fit() hands back the fitted estimator, which
# is what gets bound to rando_forest_model.
rando_forest_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
).fit(X_train, y_train)

# Print the training-label counts again (same y_train as in the split cell).
print(Counter(y_train))

Counter({'Yes': 60, 'No': 51})


In [13]:
# accuracy_score and RandomForestClassifier are imported here but not used
# in this cell; confusion_matrix is used by the following cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Predict labels for the held-out test features.
y_pred = rando_forest_model.predict(X_test)

# Calculate the balanced accuracy score of those predictions.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9736842105263157

In [14]:
# Display the confusion matrix.
#
# The class order is passed to confusion_matrix explicitly and the row and
# column names are derived from that same list, so the names always match
# the matrix layout and the matrix is 2x2 even if one class happens to be
# absent from the test split.
class_order = ["No", "Yes"]
cm = confusion_matrix(y_test, y_pred, labels=class_order)

# Create a DataFrame from the confusion matrix: rows are the actual classes,
# columns are the predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}_Playoffs" for label in class_order],
    columns=[f"Predicted {label}_Playoffs" for label in class_order],
)
cm_df

Unnamed: 0,Predicted No_Playoffs,Predicted Yes_Playoffs
Actual No_Playoffs,18,0
Actual Yes_Playoffs,1,18


In [15]:
# Build the imbalanced-learn classification report for the test predictions,
# then print it so the text table keeps its column alignment.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

         No       0.95      1.00      0.95      0.97      0.97      0.95        18
        Yes       1.00      0.95      1.00      0.97      0.97      0.94        19

avg / total       0.97      0.97      0.97      0.97      0.97      0.95        37



In [16]:
# List the features sorted in descending order by feature importance.
# Each entry pairs the fitted forest's importance score with the matching
# column name of X; sorting the (score, name) tuples in reverse puts the
# highest score first.
features_rank = sorted(
    zip(rando_forest_model.feature_importances_, X.columns),
    reverse=True,
)
for importance, column_name in features_rank:
    print(f"{column_name}: ({importance})")

PCT: (0.24235275544328821)
Losses: (0.1982509642267184)
Wins: (0.17379346284855346)
3P%: (0.050737682641564044)
Salary cap: (0.040758574188756856)
FT%: (0.03757087352460228)
TOV: (0.03462708428600668)
DRB: (0.03256322108991682)
TRB: (0.02305405018045281)
STL: (0.019222598823752646)
FG%: (0.01858309764446913)
FT: (0.017302746566066694)
BLK: (0.016789715559538936)
2P%: (0.015466644512926526)
PF: (0.014030867163444673)
ORB: (0.01211611436207838)
PTS: (0.011698258562187978)
FG: (0.011518962638296095)
2P: (0.011512454521434292)
3P: (0.009790560991291546)
AST: (0.008259310224653493)
