# Import Dependencies

In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [17]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [18]:
columns = [
    "Salary cap", "Season", "Outcome in wins", "Playoffs", "Wins", "Losses", "PCT", "FG",
    "FGA", "FG%", "3P", "3PA", "3P%","2P", "2PA", "2P%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", 
    "AST", "STL", "BLK", "TOV", "PF", "PTS"
]

target = ["Playoffs"]

In [19]:
# Load the data
file_path = Path('Data/NBA_Salary.csv')
df = pd.read_csv(file_path)[:-2]

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

In [20]:
df.dtypes

Salary cap           int64
Outcome in wins      int64
Playoffs            object
Wins                 int64
Losses               int64
PCT                float64
FG                 float64
FGA                float64
FG%                float64
3P                 float64
3PA                float64
3P%                float64
2P                 float64
2PA                float64
2P%                float64
FT                 float64
FTA                float64
FT%                float64
ORB                float64
DRB                float64
TRB                float64
AST                float64
STL                float64
BLK                float64
TOV                float64
PF                 float64
PTS                float64
dtype: object

# Split the Data into Training and Testing

In [21]:
# Create our features
X = pd.get_dummies(df, columns= ["Salary cap",
                                "PCT",
                                "FG%",
                                "3P%",
                                "2P%",
                                "FT%",
                                "TRB",
                                "AST",
                                "STL",
                                "BLK",
                                "TOV",
                                "PTS"]).drop('Playoffs', axis=1)

# Create our target

y = df['Playoffs']
X.head()

Unnamed: 0,Outcome in wins,Wins,Losses,FG,FGA,3P,3PA,2P,2PA,FT,...,PTS_116.3,PTS_116.4,PTS_116.6,PTS_117.0,PTS_117.7,PTS_117.8,PTS_118.0,PTS_118.6,PTS_118.7,PTS_120.1
0,0,53,29,40.5,86.4,14.3,39.4,26.2,47.0,15.6,...,0,0,0,0,0,0,0,0,0,0
1,0,44,38,42.0,88.4,11.5,31.7,30.5,56.7,17.5,...,0,0,0,0,0,0,0,0,0,0
2,0,42,40,40.1,87.4,12.8,34.2,27.3,53.3,15.5,...,0,0,0,0,0,0,0,0,0,0
3,0,33,49,41.6,88.8,12.0,34.5,29.7,54.3,16.8,...,0,0,0,0,0,0,0,0,0,0
4,0,51,31,41.8,89.4,14.1,38.4,27.8,51.0,17.8,...,0,0,0,0,0,0,0,0,0,0


In [22]:
X.describe()

Unnamed: 0,Outcome in wins,Wins,Losses,FG,FGA,3P,3PA,2P,2PA,FT,...,PTS_116.3,PTS_116.4,PTS_116.6,PTS_117.0,PTS_117.7,PTS_117.8,PTS_118.0,PTS_118.6,PTS_118.7,PTS_120.1
count,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,...,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0,148.0
mean,2.209459,39.054054,38.608108,40.714189,88.12027,11.833108,32.962838,28.883108,55.154054,17.268243,...,0.006757,0.006757,0.006757,0.006757,0.006757,0.006757,0.006757,0.006757,0.006757,0.006757
std,4.069554,11.675769,11.160935,1.608608,2.282795,1.681245,4.430495,2.141599,4.466982,1.529352,...,0.082199,0.082199,0.082199,0.082199,0.082199,0.082199,0.082199,0.082199,0.082199,0.082199
min,0.0,15.0,17.0,36.7,82.8,8.0,22.5,23.1,41.9,12.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,31.0,30.0,39.4,86.475,10.675,29.975,27.375,52.475,16.275,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,41.5,38.0,40.7,88.1,11.6,32.7,29.0,55.3,17.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.25,48.0,46.0,41.9,89.625,12.925,35.95,30.525,58.3,18.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,16.0,65.0,65.0,44.7,94.4,16.7,45.4,33.4,64.4,22.6,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
# Check the balance of our target values
y.value_counts()

Yes    79
No     69
Name: Playoffs, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'No': 51, 'Yes': 60})

# Balanced Random Forest Classifier

In [29]:
# Resample the training data with the BalancedRandomForestClassifier

from imblearn.ensemble import BalancedRandomForestClassifier
rando_forest_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
rando_forest_model.fit(X_train, y_train)
print(Counter(y_train))

AttributeError: can't set attribute

In [27]:
# Calculated the balanced accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score

y_pred = rando_forest_model.predict(X_test)
balanced_accuracy_score(y_test,y_pred)

AttributeError: 'BalancedRandomForestClassifier' object has no attribute 'n_classes_'

In [None]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index=["Actual Yes_Playoffs", "Actual No_Playoffs"], columns=["Predicted Yes_Playoffs", "Predicted No_Playoffs"])
cm_df

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
features_sorted = sorted(zip(rando_forest_model.feature_importances_, X.columns), reverse=True)
for importance in importances:
    print(f'{importance[1]}:  {importance[0]*100:.1f}%')