# General Overview - Over Sampling Methods

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (classification_report,
                             confusion_matrix)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import imblearn
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE,
                                    ADASYN, 
                                    BorderlineSMOTE)



In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [4]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [5]:
tree.shape

(651535, 26)

## separate variables using train test split

In [6]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [7]:
# target variable = health
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 24) (488651,)
(162884, 24) (162884,)


## algorithm functions

In [8]:
# logistic regression

def logreg(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    print('Logistic Regression \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix: \n', cm)
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [9]:
# k-nearest neighbors

def knn(X_train, X_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    print('KNN Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix: \n', cm)
    
    # classificatin report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [10]:
# decision tree classifier

def decision_tree(X_train, X_test, y_train, y_test):
    decision_tree = DecisionTreeClassifier(random_state=42)
    decision_tree.fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    
    print('Decision Tree Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix: \n', cm)
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [11]:
# random forest classifier

def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    print('Random Forest Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', rf.score(X_train, y_train))
    print('Accuracy Score, Test Set:', rf.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix: \n', cm)
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [12]:
# Gaussian naive bayes

def gaussian(X_train, X_test, y_train, y_test):
    gaussian = GaussianNB()
    gaussian.fit(X_train, y_train)
    y_pred = gaussian.predict(X_test)
    
    print('Gaussian Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
    print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix: \n', cm)
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

# Baseline - DummyClassifier

This is the control for our models. The accuracy scores show the success rates we should expect based on the strategies used for simple guessing.

In [13]:
strategies = ['most_frequent', 'stratified', 'uniform', 'constant'] # strategies available
  
for s in strategies: 
    if s =='constant': 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42, constant='Good') 
    else: 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42) 
    dummy_classifier.fit(X_train, y_train) 
    score = dummy_classifier.score(X_test, y_test) 
    print(s, score)

most_frequent 0.8108960978364972
stratified 0.6815525159008865
uniform 0.3338879202377152
constant 0.8108960978364972


# Random Over Sampler

This standard method of over sampling selects minority samples and replicates them until they match the number of samples of the majority class.

In [14]:
# initialize
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_ros)))

# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_ros, y_ros, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)



Resampled dataset shape: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})
(1188735, 24) (1188735,)
(396246, 24) (396246,)


## Logistic Regression

In [15]:
logreg(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.41811673754032647
Accuracy Score, Test Set:  0.41762945241087607
Confusion Matrix: 
 [[22399 57764 52337]
 [17426 72130 41971]
 [17132 44132 70955]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.17      0.24    132500
        Good       0.41      0.55      0.47    131527
        Poor       0.43      0.54      0.48    132219

    accuracy                           0.42    396246
   macro avg       0.41      0.42      0.40    396246
weighted avg       0.41      0.42      0.39    396246



## KNN Classifier

In [16]:
knn(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

KNN Classifier 

Accuracy Score, Training Set:  0.45203262291427443
Accuracy Score, Test Set:  0.44320699767316263
Confusion Matrix: 
 [[61902 42854 27744]
 [49181 58340 24006]
 [40267 36575 55377]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.41      0.47      0.44    132500
        Good       0.42      0.44      0.43    131527
        Poor       0.52      0.42      0.46    132219

    accuracy                           0.44    396246
   macro avg       0.45      0.44      0.44    396246
weighted avg       0.45      0.44      0.44    396246



## Decision Tree Classifier

In [17]:
decision_tree(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5073813760005383
Accuracy Score, Test Set: 0.4940945776108781
Confusion Matrix: 
 [[42823 51598 38079]
 [22784 76455 32288]
 [15741 39973 76505]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.53      0.32      0.40    132500
        Good       0.46      0.58      0.51    131527
        Poor       0.52      0.58      0.55    132219

    accuracy                           0.49    396246
   macro avg       0.50      0.49      0.49    396246
weighted avg       0.50      0.49      0.49    396246



## Random Forest Classifier

In [18]:
random_forest(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5073645513928672
Accuracy Score, Test Set: 0.49455893561070646
Confusion Matrix: 
 [[42082 52014 38404]
 [22240 77021 32266]
 [15405 39950 76864]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.53      0.32      0.40    132500
        Good       0.46      0.59      0.51    131527
        Poor       0.52      0.58      0.55    132219

    accuracy                           0.49    396246
   macro avg       0.50      0.49      0.49    396246
weighted avg       0.50      0.49      0.49    396246



## Gaussian Naive Bayes

In [19]:
gaussian(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.39208654578186053
Accuracy Score, Test Set: 0.391463888594459
Confusion Matrix: 
 [[  8068  99386  25046]
 [  6005 114101  11421]
 [  6122  93150  32947]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.06      0.11    132500
        Good       0.37      0.87      0.52    131527
        Poor       0.47      0.25      0.33    132219

    accuracy                           0.39    396246
   macro avg       0.42      0.39      0.32    396246
weighted avg       0.42      0.39      0.32    396246



# SMOTE - Synthetic Minority Over-sampling Technique

The over sampling method SMOTE, or Synthetic Minority Over-sampling Technique, selects minority samples and increases them randomly to match the majority classes.

In [20]:
# initialize
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_sm)))

# train test split
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)

print(X_train_sm.shape, y_train_sm.shape)
print(X_test_sm.shape, y_test_sm.shape)



Resampled dataset shape: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})
(1188735, 24) (1188735,)
(396246, 24) (396246,)


## Logistic Regression

In [21]:
logreg(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4214496923199872
Accuracy Score, Test Set:  0.42101119001832193
Confusion Matrix: 
 [[23522 55238 53740]
 [17543 68883 45101]
 [16377 41423 74419]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.41      0.18      0.25    132500
        Good       0.42      0.52      0.46    131527
        Poor       0.43      0.56      0.49    132219

    accuracy                           0.42    396246
   macro avg       0.42      0.42      0.40    396246
weighted avg       0.42      0.42      0.40    396246



## KNN Classifier

In [22]:
knn(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

KNN Classifier 

Accuracy Score, Training Set:  0.448361493520423
Accuracy Score, Test Set:  0.4374176647839978
Confusion Matrix: 
 [[67601 37578 27321]
 [56202 51329 23996]
 [47934 29890 54395]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.51      0.44    132500
        Good       0.43      0.39      0.41    131527
        Poor       0.51      0.41      0.46    132219

    accuracy                           0.44    396246
   macro avg       0.45      0.44      0.44    396246
weighted avg       0.45      0.44      0.44    396246



## Decision Tree Classifier

In [23]:
decision_tree(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5036522017102214
Accuracy Score, Test Set: 0.4879594999066237
Confusion Matrix: 
 [[42766 49811 39923]
 [24014 74305 33208]
 [16470 39468 76281]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.51      0.32      0.40    132500
        Good       0.45      0.56      0.50    131527
        Poor       0.51      0.58      0.54    132219

    accuracy                           0.49    396246
   macro avg       0.49      0.49      0.48    396246
weighted avg       0.49      0.49      0.48    396246



## Random Forest Classifier

In [24]:
random_forest(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5036387420240844
Accuracy Score, Test Set: 0.4890875870040329
Confusion Matrix: 
 [[42019 50418 40063]
 [23087 75340 33100]
 [16209 39570 76440]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.52      0.32      0.39    132500
        Good       0.46      0.57      0.51    131527
        Poor       0.51      0.58      0.54    132219

    accuracy                           0.49    396246
   macro avg       0.49      0.49      0.48    396246
weighted avg       0.49      0.49      0.48    396246



## Gaussian Naive Bayes

In [25]:
gaussian(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.370339899136477
Accuracy Score, Test Set: 0.37038354961311915
Confusion Matrix: 
 [[ 11964  10155 110381]
 [ 10461  16665 104401]
 [  8067   6018 118134]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.09      0.15    132500
        Good       0.51      0.13      0.20    131527
        Poor       0.35      0.89      0.51    132219

    accuracy                           0.37    396246
   macro avg       0.42      0.37      0.29    396246
weighted avg       0.42      0.37      0.29    396246



# ADASYN - Adaptive Synthetic

The Adaptive Synthetic method, or ADASYN, is similar to SMOTE over sampling. However, ADASYN selectively generates sample points around minority samples using a density distributor and does not create uniform weights like SMOTE.

In [26]:
# initialize
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_ada)))

# train test split
X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X_ada, y_ada, test_size=0.25, random_state=42)

print(X_train_ada.shape, y_train_ada.shape)
print(X_test_ada.shape, y_test_ada.shape)



Resampled dataset shape: Counter({'Poor': 535394, 'Good': 528327, 'Fair': 526527})
(1192686, 24) (1192686,)
(397562, 24) (397562,)


## Logistic Regression

In [27]:
logreg(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4091043241892669
Accuracy Score, Test Set:  0.40859538889531694
Confusion Matrix: 
 [[19791 53097 59200]
 [16488 64845 50064]
 [14411 41860 77806]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.15      0.22    132088
        Good       0.41      0.49      0.45    131397
        Poor       0.42      0.58      0.48    134077

    accuracy                           0.41    397562
   macro avg       0.40      0.41      0.38    397562
weighted avg       0.40      0.41      0.38    397562



## KNN Classifier

In [28]:
knn(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

KNN Classifier 

Accuracy Score, Training Set:  0.4336757537189168
Accuracy Score, Test Set:  0.42309375644553554
Confusion Matrix: 
 [[69485 37184 25419]
 [60158 49565 21674]
 [55579 29342 49156]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.53      0.44    132088
        Good       0.43      0.38      0.40    131397
        Poor       0.51      0.37      0.43    134077

    accuracy                           0.42    397562
   macro avg       0.44      0.42      0.42    397562
weighted avg       0.44      0.42      0.42    397562



## Decision Tree Classifier

In [29]:
decision_tree(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.492267872684009
Accuracy Score, Test Set: 0.4780361302136522
Confusion Matrix: 
 [[51724 38683 41681]
 [34633 59660 37104]
 [26044 29368 78665]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.46      0.39      0.42    132088
        Good       0.47      0.45      0.46    131397
        Poor       0.50      0.59      0.54    134077

    accuracy                           0.48    397562
   macro avg       0.48      0.48      0.47    397562
weighted avg       0.48      0.48      0.47    397562



## Random Forest Classifier

In [30]:
random_forest(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Random Forest Classifier 

Accuracy Score, Training Set: 0.49226032669118275
Accuracy Score, Test Set: 0.4794547768649921
Confusion Matrix: 
 [[51005 39268 41815]
 [33686 60674 37037]
 [25746 29397 78934]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.46      0.39      0.42    132088
        Good       0.47      0.46      0.47    131397
        Poor       0.50      0.59      0.54    134077

    accuracy                           0.48    397562
   macro avg       0.48      0.48      0.48    397562
weighted avg       0.48      0.48      0.48    397562



## Gaussian Naive Bayes

In [31]:
gaussian(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.36650467935399594
Accuracy Score, Test Set: 0.36756027990602724
Confusion Matrix: 
 [[ 12858   7673 111557]
 [ 12945  12373 106079]
 [  8102   5078 120897]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.10      0.15    132088
        Good       0.49      0.09      0.16    131397
        Poor       0.36      0.90      0.51    134077

    accuracy                           0.37    397562
   macro avg       0.41      0.36      0.27    397562
weighted avg       0.41      0.37      0.28    397562



# SMOTE Extensions - Borderline SMOTE

[Borderline SMOTE](https://link.springer.com/chapter/10.1007/11538059_91) is an over sampling method in which the minority samples on the cusp of being in the majority pile are oversampled. 

In [32]:
# initialize
bsmt = BorderlineSMOTE(random_state=42)
X_bsmt, y_bsmt = bsmt.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_bsmt)))

# train test split
X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt = train_test_split(X_bsmt, y_bsmt, test_size=0.2, random_state=42)

print(X_train_bsmt.shape, y_train_bsmt.shape)
print(X_test_bsmt.shape, y_test_bsmt.shape)



Resampled dataset shape: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})
(1267984, 24) (1267984,)
(316997, 24) (316997,)


## Logistic Regression

In [33]:
logreg(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.473412913727618
Accuracy Score, Test Set:  0.4722284438023073
Confusion Matrix: 
 [[20060 52371 33588]
 [15202 67595 22281]
 [14798 29062 62040]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.19      0.26    106019
        Good       0.45      0.64      0.53    105078
        Poor       0.53      0.59      0.55    105900

    accuracy                           0.47    316997
   macro avg       0.46      0.47      0.45    316997
weighted avg       0.46      0.47      0.45    316997



## KNN Classifier

In [34]:
knn(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

KNN Classifier 

Accuracy Score, Training Set:  0.5668541558884024
Accuracy Score, Test Set:  0.5580210538270078
Confusion Matrix: 
 [[53594 32055 20370]
 [36165 52324 16589]
 [19724 15203 70973]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.49      0.51      0.50    106019
        Good       0.53      0.50      0.51    105078
        Poor       0.66      0.67      0.66    105900

    accuracy                           0.56    316997
   macro avg       0.56      0.56      0.56    316997
weighted avg       0.56      0.56      0.56    316997



## Decision Tree Classifier

In [35]:
decision_tree(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.627391986018751
Accuracy Score, Test Set: 0.6139774193446625
Confusion Matrix: 
 [[53056 22910 30053]
 [31660 49408 24010]
 [ 8214  5521 92165]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.57      0.50      0.53    106019
        Good       0.63      0.47      0.54    105078
        Poor       0.63      0.87      0.73    105900

    accuracy                           0.61    316997
   macro avg       0.61      0.61      0.60    316997
weighted avg       0.61      0.61      0.60    316997



## Random Forest Classifier

In [36]:
random_forest(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Random Forest Classifier 

Accuracy Score, Training Set: 0.6273817335234514
Accuracy Score, Test Set: 0.6151004583639593
Confusion Matrix: 
 [[52565 23291 30163]
 [31005 50126 23947]
 [ 8091  5515 92294]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.57      0.50      0.53    106019
        Good       0.64      0.48      0.54    105078
        Poor       0.63      0.87      0.73    105900

    accuracy                           0.62    316997
   macro avg       0.61      0.61      0.60    316997
weighted avg       0.61      0.62      0.60    316997



## Gaussian Naive Bayes

In [37]:
gaussian(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.4458045211926964
Accuracy Score, Test Set: 0.44354362975043926
Confusion Matrix: 
 [[12467 59414 34138]
 [10998 74866 19214]
 [ 8980 43651 53269]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.12      0.18    106019
        Good       0.42      0.71      0.53    105078
        Poor       0.50      0.50      0.50    105900

    accuracy                           0.44    316997
   macro avg       0.43      0.44      0.40    316997
weighted avg       0.43      0.44      0.40    316997

