# Over Sampling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (classification_report,
                             confusion_matrix)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB

import imblearn
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE,
                                    ADASYN)

In [2]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [3]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [4]:
tree.shape

(651535, 26)

# Target and Response Variable, Train_Test_Split

In [5]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [6]:
# create targe and response variable
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 24) (488651,)
(162884, 24) (162884,)


# Baseline - DummyClassifier

In [7]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X, y)
dc_pred = dummy_clf.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.681154504362774


In [8]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X, y)
dc_pred_freq = dummy_clf_freq.predict(X)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6812957093632729


# Algorithm Functions

In [9]:
# logistic regression

def logreg(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    print('Logistic Regression \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))
    print()
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [10]:
# KNN classifier

def knn(X_train, X_test, y_train, y_test):
    # using 6 neighbors
    knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    print('KNN Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))
    print()
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classificatin report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [11]:
# decision tree classifier

def decision_tree(X_train, X_test, y_train, y_test):
    decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    
    print('Decision Tree Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [12]:
# random forest classifier

def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    print('Random Forest Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', rf.score(X_train, y_train))
    print('Accuracy Score, Test Set:', rf.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [13]:
# Gaussian naive bayes

def gaussian(X_train, X_test, y_train, y_test):
    gaussian = GaussianNB().fit(X_train, y_train)
    y_pred = gaussian.predict(X_test)
    
    print('Gaussian Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
    print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [14]:
# categorical naive bayes

def categorical_naive_bayes(X_train, X_test, y_train, y_test):
    categorical = CategoricalNB().fit(X_train, y_train)
    y_pred = categorical.predict(X_test)
    
    print('Categorical Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', categorical.score(X_train, y_train))
    print('Accuracy Score, Test Set:', categorical.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [15]:
# cross validation - 5-fold - for later
# cv_scores = cross_val_score(logreg, X, y, cv=5)
# print('CV Scores: {}'.format(cv_scores))
# print('Average 5-Fold CV Score: {}'.format(np.mean(cv_scores)))
# print()

# Random Over Sampler

In [16]:
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_ros)))

Resampled dataset shape: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})


In [17]:
# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_ros, y_ros, test_size=0.2, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)

(1267984, 24) (1267984,)
(316997, 24) (316997,)


## Logistic Regression

In [18]:
logreg(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.41830023091774027
Accuracy Score, Test Set:  0.4176190941870112

Confusion Matrix: 
 [[18074 46390 41555]
 [14041 57824 33213]
 [13884 35530 56486]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.17      0.24    106019
        Good       0.41      0.55      0.47    105078
        Poor       0.43      0.53      0.48    105900

    accuracy                           0.42    316997
   macro avg       0.41      0.42      0.40    316997
weighted avg       0.41      0.42      0.40    316997



## KNN Classifier

In [19]:
knn(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

KNN Classifier 

Accuracy Score, Training Set:  0.4503589950661838
Accuracy Score, Test Set:  0.4417896699337849

Confusion Matrix: 
 [[51833 34680 19506]
 [41419 46632 17027]
 [35494 28825 41581]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.49      0.44    106019
        Good       0.42      0.44      0.43    105078
        Poor       0.53      0.39      0.45    105900

    accuracy                           0.44    316997
   macro avg       0.45      0.44      0.44    316997
weighted avg       0.45      0.44      0.44    316997



## Decision Tree Classifier

In [20]:
decision_tree(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5071767467097377
Accuracy Score, Test Set: 0.4939005731915444
Confusion Matrix: 
 [[34284 41223 30512]
 [18217 61063 25798]
 [12706 31976 61218]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.53      0.32      0.40    106019
        Good       0.45      0.58      0.51    105078
        Poor       0.52      0.58      0.55    105900

    accuracy                           0.49    316997
   macro avg       0.50      0.49      0.49    316997
weighted avg       0.50      0.49      0.49    316997



## Random Forest Classifier

In [21]:
random_forest(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5071578190261076
Accuracy Score, Test Set: 0.4944179282453777
Confusion Matrix: 
 [[33521 41769 30729]
 [17586 61741 25751]
 [12310 32123 61467]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.53      0.32      0.40    106019
        Good       0.46      0.59      0.51    105078
        Poor       0.52      0.58      0.55    105900

    accuracy                           0.49    316997
   macro avg       0.50      0.49      0.49    316997
weighted avg       0.50      0.49      0.49    316997



## Gaussian Naive Bayes

In [22]:
gaussian(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.3922092076871632
Accuracy Score, Test Set: 0.391041555598318
Confusion Matrix: 
 [[ 6304 79508 20207]
 [ 4669 91154  9255]
 [ 4744 74655 26501]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.06      0.10    106019
        Good       0.37      0.87      0.52    105078
        Poor       0.47      0.25      0.33    105900

    accuracy                           0.39    316997
   macro avg       0.42      0.39      0.32    316997
weighted avg       0.42      0.39      0.32    316997



## Categorical Naive Bayes

In [23]:
categorical_naive_bayes(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.41472841928604776
Accuracy Score, Test Set: 0.41291557964270953
Confusion Matrix: 
 [[28257 51549 26213]
 [23322 65130 16626]
 [21064 47330 37506]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.27      0.32    106019
        Good       0.40      0.62      0.48    105078
        Poor       0.47      0.35      0.40    105900

    accuracy                           0.41    316997
   macro avg       0.42      0.41      0.40    316997
weighted avg       0.42      0.41      0.40    316997



# SMOTE - Synthetic Minority Over-sampling Technique

Documentation: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html

In [24]:
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_sm)))

Resampled dataset shape: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})


In [25]:
# train test split
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)

print(X_train_sm.shape, y_train_sm.shape)
print(X_test_sm.shape, y_test_sm.shape)

(1267984, 24) (1267984,)
(316997, 24) (316997,)


## Logistic Regression

In [26]:
logreg(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4217253530012997
Accuracy Score, Test Set:  0.42248349353463915

Confusion Matrix: 
 [[20114 43859 42046]
 [15354 55234 34490]
 [13928 33394 58578]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.41      0.19      0.26    106019
        Good       0.42      0.53      0.47    105078
        Poor       0.43      0.55      0.49    105900

    accuracy                           0.42    316997
   macro avg       0.42      0.42      0.40    316997
weighted avg       0.42      0.42      0.40    316997



## KNN Classifier

In [27]:
knn(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

KNN Classifier 

Accuracy Score, Training Set:  0.4516342477507603
Accuracy Score, Test Set:  0.4430357385085663

Confusion Matrix: 
 [[50908 30993 24118]
 [40445 43899 20734]
 [34417 25849 45634]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.48      0.44    106019
        Good       0.44      0.42      0.43    105078
        Poor       0.50      0.43      0.46    105900

    accuracy                           0.44    316997
   macro avg       0.45      0.44      0.44    316997
weighted avg       0.45      0.44      0.44    316997



## Decision Tree Classifier

In [28]:
decision_tree(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5033588751908541
Accuracy Score, Test Set: 0.487537736950192
Confusion Matrix: 
 [[34261 39794 31964]
 [19344 59333 26401]
 [13316 31630 60954]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.51      0.32      0.40    106019
        Good       0.45      0.56      0.50    105078
        Poor       0.51      0.58      0.54    105900

    accuracy                           0.49    316997
   macro avg       0.49      0.49      0.48    316997
weighted avg       0.49      0.49      0.48    316997



## Random Forest Classifier

In [29]:
random_forest(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5033462567351008
Accuracy Score, Test Set: 0.48894784493228644
Confusion Matrix: 
 [[33611 39631 32777]
 [18553 59607 26918]
 [13029 31094 61777]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.52      0.32      0.39    106019
        Good       0.46      0.57      0.51    105078
        Poor       0.51      0.58      0.54    105900

    accuracy                           0.49    316997
   macro avg       0.49      0.49      0.48    316997
weighted avg       0.49      0.49      0.48    316997



## Gaussian Naive Bayes

In [30]:
gaussian(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.37098575376345444
Accuracy Score, Test Set: 0.37140099117657266
Confusion Matrix: 
 [[ 9529  8301 88189]
 [ 8284 13681 83113]
 [ 6373  5004 94523]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.09      0.15    106019
        Good       0.51      0.13      0.21    105078
        Poor       0.36      0.89      0.51    105900

    accuracy                           0.37    316997
   macro avg       0.42      0.37      0.29    316997
weighted avg       0.42      0.37      0.29    316997



## Categorical Naive Bayes

In [31]:
categorical_naive_bayes(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.4184358793170892
Accuracy Score, Test Set: 0.41775474215844316
Confusion Matrix: 
 [[30257 46426 29336]
 [24319 58847 21912]
 [21942 40635 43323]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.29      0.33    106019
        Good       0.40      0.56      0.47    105078
        Poor       0.46      0.41      0.43    105900

    accuracy                           0.42    316997
   macro avg       0.42      0.42      0.41    316997
weighted avg       0.42      0.42      0.41    316997



# ADASYN - Adaptive Synthetic

In [32]:
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_ada)))

Resampled dataset shape: Counter({'Poor': 535394, 'Good': 528327, 'Fair': 526527})


In [33]:
# train test split
X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X_ada, y_ada, test_size=0.25, random_state=42)

print(X_train_ada.shape, y_train_ada.shape)
print(X_test_ada.shape, y_test_ada.shape)

(1192686, 24) (1192686,)
(397562, 24) (397562,)


## Logistic Regression

In [34]:
logreg(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.41042487293386526
Accuracy Score, Test Set:  0.4095361226676594

Confusion Matrix: 
 [[16721 57816 57551]
 [13874 69591 47932]
 [12083 45490 76504]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.13      0.19    132088
        Good       0.40      0.53      0.46    131397
        Poor       0.42      0.57      0.48    134077

    accuracy                           0.41    397562
   macro avg       0.40      0.41      0.38    397562
weighted avg       0.40      0.41      0.38    397562



## KNN Classifier

In [35]:
knn(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

KNN Classifier 

Accuracy Score, Training Set:  0.42628990362928715
Accuracy Score, Test Set:  0.4170594775154567

Confusion Matrix: 
 [[61171 43143 27774]
 [52315 54215 24867]
 [47435 36221 50421]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.46      0.42    132088
        Good       0.41      0.41      0.41    131397
        Poor       0.49      0.38      0.43    134077

    accuracy                           0.42    397562
   macro avg       0.43      0.42      0.42    397562
weighted avg       0.43      0.42      0.42    397562



## Decision Tree Classifier

In [36]:
decision_tree(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.49205826177216805
Accuracy Score, Test Set: 0.47872533089178543
Confusion Matrix: 
 [[51913 38710 41465]
 [34711 59967 36719]
 [26119 29515 78443]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.46      0.39      0.42    132088
        Good       0.47      0.46      0.46    131397
        Poor       0.50      0.59      0.54    134077

    accuracy                           0.48    397562
   macro avg       0.48      0.48      0.48    397562
weighted avg       0.48      0.48      0.48    397562



## Random Forest Classifier

In [37]:
random_forest(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Random Forest Classifier 

Accuracy Score, Training Set: 0.4920448466738102
Accuracy Score, Test Set: 0.48007606360768884
Confusion Matrix: 
 [[51080 39420 41588]
 [33648 61100 36649]
 [25768 29629 78680]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.46      0.39      0.42    132088
        Good       0.47      0.47      0.47    131397
        Poor       0.50      0.59      0.54    134077

    accuracy                           0.48    397562
   macro avg       0.48      0.48      0.48    397562
weighted avg       0.48      0.48      0.48    397562



## Gaussian Naive Bayes

In [38]:
gaussian(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.3664409576367963
Accuracy Score, Test Set: 0.3675200346109538
Confusion Matrix: 
 [[ 14056   7154 110878]
 [ 14328  11597 105472]
 [  8844   4774 120459]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.11      0.17    132088
        Good       0.49      0.09      0.15    131397
        Poor       0.36      0.90      0.51    134077

    accuracy                           0.37    397562
   macro avg       0.41      0.36      0.28    397562
weighted avg       0.41      0.37      0.28    397562



## Categorical Naive Bayes

In [39]:
categorical_naive_bayes(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.40931812731934475
Accuracy Score, Test Set: 0.4105598623610908
Confusion Matrix: 
 [[32808 52308 46972]
 [27107 64274 40016]
 [25192 42744 66141]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.25      0.30    132088
        Good       0.40      0.49      0.44    131397
        Poor       0.43      0.49      0.46    134077

    accuracy                           0.41    397562
   macro avg       0.41      0.41      0.40    397562
weighted avg       0.41      0.41      0.40    397562



# Combination Over-sampling and Under-sampling

In [40]:
from imblearn.combine import SMOTETomek

In [41]:
smt = SMOTETomek(random_state=42)
X_smt, y_smt = smt.fit_sample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_smt)))

Resampled dataset shape: Counter({'Poor': 528308, 'Fair': 528226, 'Good': 528213})


In [42]:
# train test split
X_train_smt, X_test_smt, y_train_smt, y_test_smt = train_test_split(X_smt, y_smt, test_size=0.2, random_state=42)

print(X_train_smt.shape, y_train_smt.shape)
print(X_test_smt.shape, y_test_smt.shape)

(1267797, 24) (1267797,)
(316950, 24) (316950,)


## Logistic Regression

In [43]:
logreg(X_train_smt, X_test_smt, y_train_smt, y_test_smt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4197059939406703
Accuracy Score, Test Set:  0.4183498974601672

Confusion Matrix: 
 [[19058 44960 42043]
 [14390 55919 34807]
 [13711 34443 57619]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.18      0.25    106061
        Good       0.41      0.53      0.47    105116
        Poor       0.43      0.54      0.48    105773

    accuracy                           0.42    316950
   macro avg       0.42      0.42      0.40    316950
weighted avg       0.42      0.42      0.40    316950



## KNN Classifier

In [44]:
knn(X_train_smt, X_test_smt, y_train_smt, y_test_smt)

KNN Classifier 

Accuracy Score, Training Set:  0.44689252301433113
Accuracy Score, Test Set:  0.43645054424988167

Confusion Matrix: 
 [[49599 32498 23964]
 [39594 43996 21526]
 [33289 27746 44738]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.47      0.43    106061
        Good       0.42      0.42      0.42    105116
        Poor       0.50      0.42      0.46    105773

    accuracy                           0.44    316950
   macro avg       0.44      0.44      0.44    316950
weighted avg       0.44      0.44      0.44    316950



## Decision Tree Classifier

In [45]:
decision_tree(X_train_smt, X_test_smt, y_train_smt, y_test_smt)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5032627463229523
Accuracy Score, Test Set: 0.48803912289004575
Confusion Matrix: 
 [[33688 40133 32240]
 [18746 59809 26561]
 [13098 31488 61187]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.51      0.32      0.39    106061
        Good       0.46      0.57      0.51    105116
        Poor       0.51      0.58      0.54    105773

    accuracy                           0.49    316950
   macro avg       0.49      0.49      0.48    316950
weighted avg       0.49      0.49      0.48    316950



## Random Forest Classifier

In [46]:
random_forest(X_train_smt, X_test_smt, y_train_smt, y_test_smt)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5032517035455992
Accuracy Score, Test Set: 0.4897333964347689
Confusion Matrix: 
 [[33297 40520 32244]
 [18070 60660 26386]
 [12876 31633 61264]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.52      0.31      0.39    106061
        Good       0.46      0.58      0.51    105116
        Poor       0.51      0.58      0.54    105773

    accuracy                           0.49    316950
   macro avg       0.50      0.49      0.48    316950
weighted avg       0.50      0.49      0.48    316950



## Gaussian Naive Bayes

In [47]:
gaussian(X_train_smt, X_test_smt, y_train_smt, y_test_smt)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.37264167686151645
Accuracy Score, Test Set: 0.3717053162959457
Confusion Matrix: 
 [[ 9818  9001 87242]
 [ 8765 14567 81784]
 [ 6683  5663 93427]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.09      0.15    106061
        Good       0.50      0.14      0.22    105116
        Poor       0.36      0.88      0.51    105773

    accuracy                           0.37    316950
   macro avg       0.41      0.37      0.29    316950
weighted avg       0.41      0.37      0.29    316950



## Categorical Naive Bayes

In [48]:
categorical_naive_bayes(X_train_smt, X_test_smt, y_train_smt, y_test_smt)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.41895035246179
Accuracy Score, Test Set: 0.41653257611610667
Confusion Matrix: 
 [[30076 46742 29243]
 [24587 59060 21469]
 [22142 40747 42884]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.28      0.33    106061
        Good       0.40      0.56      0.47    105116
        Poor       0.46      0.41      0.43    105773

    accuracy                           0.42    316950
   macro avg       0.42      0.42      0.41    316950
weighted avg       0.42      0.42      0.41    316950



# SMOTE Extensions - Borderline SMOTE

In [49]:
from imblearn.over_sampling import BorderlineSMOTE

In [50]:
bsmt = BorderlineSMOTE(random_state=42)
X_bsmt, y_bsmt = bsmt.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_bsmt)))

Resampled dataset shape: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})


In [51]:
# train test split
X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt = train_test_split(X_bsmt, y_bsmt, test_size=0.2, random_state=42)

print(X_train_bsmt.shape, y_train_bsmt.shape)
print(X_test_bsmt.shape, y_test_bsmt.shape)

(1267984, 24) (1267984,)
(316997, 24) (316997,)


## Logistic Regression

In [52]:
logreg(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4732007659402642
Accuracy Score, Test Set:  0.47196030246343024

Confusion Matrix: 
 [[19335 53031 33653]
 [14761 67937 22380]
 [15051 28511 62338]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.18      0.25    106019
        Good       0.45      0.65      0.53    105078
        Poor       0.53      0.59      0.56    105900

    accuracy                           0.47    316997
   macro avg       0.46      0.47      0.45    316997
weighted avg       0.46      0.47      0.45    316997



## KNN Classifier

In [53]:
knn(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

KNN Classifier 

Accuracy Score, Training Set:  0.5668541558884024
Accuracy Score, Test Set:  0.5580210538270078

Confusion Matrix: 
 [[53594 32055 20370]
 [36165 52324 16589]
 [19724 15203 70973]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.49      0.51      0.50    106019
        Good       0.53      0.50      0.51    105078
        Poor       0.66      0.67      0.66    105900

    accuracy                           0.56    316997
   macro avg       0.56      0.56      0.56    316997
weighted avg       0.56      0.56      0.56    316997



## Decision Tree Classifier

In [54]:
decision_tree(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.627391986018751
Accuracy Score, Test Set: 0.6139774193446625
Confusion Matrix: 
 [[53056 22910 30053]
 [31660 49408 24010]
 [ 8214  5521 92165]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.57      0.50      0.53    106019
        Good       0.63      0.47      0.54    105078
        Poor       0.63      0.87      0.73    105900

    accuracy                           0.61    316997
   macro avg       0.61      0.61      0.60    316997
weighted avg       0.61      0.61      0.60    316997



## Random Forest Classifier

In [55]:
random_forest(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Random Forest Classifier 

Accuracy Score, Training Set: 0.6273817335234514
Accuracy Score, Test Set: 0.6151004583639593
Confusion Matrix: 
 [[52565 23291 30163]
 [31005 50126 23947]
 [ 8091  5515 92294]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.57      0.50      0.53    106019
        Good       0.64      0.48      0.54    105078
        Poor       0.63      0.87      0.73    105900

    accuracy                           0.62    316997
   macro avg       0.61      0.61      0.60    316997
weighted avg       0.61      0.62      0.60    316997



## Gaussian Naive Bayes

In [56]:
gaussian(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.4458045211926964
Accuracy Score, Test Set: 0.44354362975043926
Confusion Matrix: 
 [[12467 59414 34138]
 [10998 74866 19214]
 [ 8980 43651 53269]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.12      0.18    106019
        Good       0.42      0.71      0.53    105078
        Poor       0.50      0.50      0.50    105900

    accuracy                           0.44    316997
   macro avg       0.43      0.44      0.40    316997
weighted avg       0.43      0.44      0.40    316997



## Categorical Naive Bayes

In [57]:
categorical_naive_bayes(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Categorical Naive Bayes 

Accuracy Score, Training Set: 0.4596099004403841
Accuracy Score, Test Set: 0.457256693281009
Confusion Matrix: 
 [[25643 56319 24057]
 [21084 72390 11604]
 [19045 39939 46916]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.24      0.30    106019
        Good       0.43      0.69      0.53    105078
        Poor       0.57      0.44      0.50    105900

    accuracy                           0.46    316997
   macro avg       0.46      0.46      0.44    316997
weighted avg       0.46      0.46      0.44    316997

