# Random Under Sampling Methods

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, 
                                     cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (mean_squared_error,
                             classification_report,
                             mean_absolute_error,
                             accuracy_score,
                             confusion_matrix,
                             average_precision_score,
                             precision_recall_curve,
                             recall_score,
                             f1_score)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB

import imblearn
from imblearn.under_sampling import (RandomUnderSampler,
                                     TomekLinks,
                                     EditedNearestNeighbours,
                                     NeighbourhoodCleaningRule,
                                     NearMiss)

In [2]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [3]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [4]:
tree.shape

(651535, 26)

# Target and Response Variable, Train_Test_Split

In [5]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [6]:
# create targe and response variable
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

In [7]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(521228, 24) (521228,)
(130307, 24) (130307,)


## Baseline - DummyClassifier

In [8]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X_train, y_train)
dc_pred = dummy_clf.predict(X_test)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6808935820792437


In [9]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X_train, y_train)
dc_pred_freq = dummy_clf_freq.predict(X_test)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6803042046858573


# Random Under Sampler

In [10]:
random_under = RandomUnderSampler(random_state=42)
X_rs, y_rs = random_under.fit_sample(X, y)

print('Random undersampling {}'.format(Counter(y_rs)))

Random undersampling Counter({'Fair': 26781, 'Good': 26781, 'Poor': 26781})


In [12]:
# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.2, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)

(64274, 24) (64274,)
(16069, 24) (16069,)


## Logistic Regression

In [14]:
logreg = LogisticRegression(random_state=42).fit(X_train_rs, y_train_rs)
logreg_pred = logreg.predict(X_test_rs)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set: ', logreg.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n')
print(classification_report(y_test_rs, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.4178672558110589
Accuracy Score, Test Set:  0.41284460762959735
Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.16      0.23      5315
        Good       0.41      0.53      0.46      5369
        Poor       0.42      0.54      0.48      5385

    accuracy                           0.41     16069
   macro avg       0.41      0.41      0.39     16069
weighted avg       0.41      0.41      0.39     16069



In [15]:
# confusion matrix
cm = confusion_matrix(y_test_rs, logreg_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[ 865 2264 2186]
 [ 722 2846 1801]
 [ 665 1797 2923]]


## KNN Classifier

In [16]:
# using 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_rs, y_train_rs)
knn_pred = knn.predict(X_test_rs)

# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set: ', knn.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n')
print(classification_report(y_test_rs, knn_pred))

Accuracy Score, Training Set:  0.45264025889162024
Accuracy Score, Test Set:  0.3849648391312465
Classification Report 

              precision    recall  f1-score   support

        Fair       0.35      0.46      0.40      5315
        Good       0.39      0.40      0.40      5369
        Poor       0.44      0.30      0.35      5385

    accuracy                           0.38     16069
   macro avg       0.39      0.39      0.38     16069
weighted avg       0.39      0.38      0.38     16069



In [17]:
# confusion matrix
cm = confusion_matrix(y_test_rs, knn_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[2456 1696 1163]
 [2346 2132  891]
 [2201 1586 1598]]


## Decision Tree Classifier

In [18]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train_rs, y_train_rs)
decision_tree_pred = decision_tree.predict(X_test_rs)

# accuracy score
print('Accuracy Score, Training Set:', decision_tree.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n')
print(classification_report(y_test_rs, decision_tree_pred))

Accuracy Score, Training Set: 0.5395183122257834
Accuracy Score, Test Set: 0.40456780135664944
Classification Report 

              precision    recall  f1-score   support

        Fair       0.37      0.31      0.34      5315
        Good       0.41      0.50      0.45      5369
        Poor       0.43      0.40      0.42      5385

    accuracy                           0.40     16069
   macro avg       0.40      0.40      0.40     16069
weighted avg       0.40      0.40      0.40     16069



In [19]:
# confusion matrix
cm = confusion_matrix(y_test_rs, decision_tree_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[1639 2102 1574]
 [1429 2695 1245]
 [1393 1825 2167]]


## Random Forest Classifier

In [20]:
rand_forest = RandomForestClassifier(random_state=42).fit(X_train_rs, y_train_rs)
rand_forest_pred = rand_forest.predict(X_test_rs)

# accuracy score
print('Accuracy Score, Training Set:', rand_forest.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', rand_forest.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n')
print(classification_report(y_test_rs, rand_forest_pred))

Accuracy Score, Training Set: 0.5394871954445032
Accuracy Score, Test Set: 0.4104798058373265
Classification Report 

              precision    recall  f1-score   support

        Fair       0.37      0.27      0.32      5315
        Good       0.41      0.51      0.45      5369
        Poor       0.43      0.45      0.44      5385

    accuracy                           0.41     16069
   macro avg       0.41      0.41      0.40     16069
weighted avg       0.41      0.41      0.40     16069



In [21]:
# confusion matrix
cm = confusion_matrix(y_test_rs, rand_forest_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[1445 2087 1783]
 [1239 2717 1413]
 [1170 1781 2434]]


## Gaussian Naive Bayes

In [22]:
gaussian = GaussianNB().fit(X_train_rs, y_train_rs)
gaussian_pred = gaussian.predict(X_test_rs)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', gaussian.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n')
print(classification_report(y_test_rs, gaussian_pred))

Accuracy Score, Training Set: 0.3923670535519806
Accuracy Score, Test Set: 0.3933038770303068
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.08      0.13      5315
        Good       0.38      0.85      0.52      5369
        Poor       0.47      0.24      0.32      5385

    accuracy                           0.39     16069
   macro avg       0.41      0.39      0.32     16069
weighted avg       0.41      0.39      0.33     16069



In [23]:
# confusion matrix
cm = confusion_matrix(y_test_rs, gaussian_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[ 420 3859 1036]
 [ 340 4585  444]
 [ 318 3752 1315]]


## Categorical Naive Bayes

In [24]:
categorical = CategoricalNB().fit(X_train_rs, y_train_rs)
categorical_pred = categorical.predict(X_test_rs)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train_rs, y_train_rs))
print('Accuracy Score, Test Set:', categorical.score(X_test_rs, y_test_rs))

# classification report
print('Classification Report \n')
print(classification_report(y_test_rs, categorical_pred))

Accuracy Score, Training Set: 0.4167159349036936
Accuracy Score, Test Set: 0.41085319559400085
Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.26      0.31      5315
        Good       0.40      0.62      0.49      5369
        Poor       0.47      0.35      0.40      5385

    accuracy                           0.41     16069
   macro avg       0.41      0.41      0.40     16069
weighted avg       0.41      0.41      0.40     16069



In [25]:
# confusion matrix
cm = confusion_matrix(y_test_rs, categorical_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[1382 2615 1318]
 [1189 3349  831]
 [1059 2455 1871]]


# Tomek Links

In [26]:
tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(X, y)

print('TomekLinks undersampling {}'.format(Counter(y_tomek)))

TomekLinks undersampling Counter({'Good': 527892, 'Fair': 96026, 'Poor': 26781})


In [28]:
# train test split
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tomek, y_tomek, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(521228, 24) (521228,)
(130307, 24) (130307,)


## Logistic Regression

In [29]:
logreg = LogisticRegression(random_state=42).fit(X_train_tl, y_train_tl)
logreg_pred = logreg.predict(X_test_tl)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_tl, y_train_tl))
print('Accuracy Score, Test Set: ', logreg.score(X_test_tl, y_test_tl))

# classification report
print('Classification Report \n')
print(classification_report(y_test_tl, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.811295165389514
Accuracy Score, Test Set:  0.810258183494698
Classification Report 

              precision    recall  f1-score   support

        Fair       0.32      0.02      0.03     19162
        Good       0.81      1.00      0.90    105492
        Poor       0.50      0.00      0.00      5486

    accuracy                           0.81    130140
   macro avg       0.54      0.34      0.31    130140
weighted avg       0.73      0.81      0.73    130140



In [30]:
# confusion matrix
cm = confusion_matrix(y_test_tl, logreg_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[   304  18858      0]
 [   349 105142      1]
 [   296   5189      1]]


## KNN Classifier

In [31]:
# using 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_tl, y_train_tl)
knn_pred = knn.predict(X_test_tl)

# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_tl, y_train_tl))
print('Accuracy Score, Test Set: ', knn.score(X_test_tl, y_test_tl))

# classification report
print('Classification Report \n')
print(classification_report(y_test_tl, knn_pred))

Accuracy Score, Training Set:  0.8020205202484253
Accuracy Score, Test Set:  0.7914246196403872
Classification Report 

              precision    recall  f1-score   support

        Fair       0.25      0.08      0.13     19162
        Good       0.82      0.96      0.88    105492
        Poor       0.37      0.02      0.04      5486

    accuracy                           0.79    130140
   macro avg       0.48      0.36      0.35    130140
weighted avg       0.72      0.79      0.74    130140



In [32]:
# confusion matrix
cm = confusion_matrix(y_test_tl, knn_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[  1620  17441    101]
 [  4132 101250    110]
 [   607   4753    126]]


## Decision Tree Classifier

In [33]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train_tl, y_train_tl)
decision_tree_pred = decision_tree.predict(X_test_tl)

# accuracy score
print('Accuracy Score, Training Set:', decision_tree.score(X_train_tl, y_train_tl))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_tl, y_test_tl))

# classification report
print('Classification Report \n')
print(classification_report(y_test_tl, decision_tree_pred))

Accuracy Score, Training Set: 0.8250899513791905
Accuracy Score, Test Set: 0.8019594283079761
Classification Report 

              precision    recall  f1-score   support

        Fair       0.31      0.06      0.10     19162
        Good       0.82      0.98      0.89    105492
        Poor       0.27      0.03      0.06      5486

    accuracy                           0.80    130140
   macro avg       0.46      0.36      0.35    130140
weighted avg       0.72      0.80      0.74    130140



In [34]:
# confusion matrix
cm = confusion_matrix(y_test_tl, decision_tree_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[  1140  17836    186]
 [  2156 103055    281]
 [   434   4880    172]]


## Random Forest Classifier

In [35]:
rand_forest = RandomForestClassifier(random_state=42).fit(X_train_tl, y_train_tl)
rand_forest_pred = rand_forest.predict(X_test_tl)

# accuracy score
print('Accuracy Score, Training Set:', rand_forest.score(X_train_tl, y_train_tl))
print('Accuracy Score, Test Set:', rand_forest.score(X_test_tl, y_test_tl))

# classification report
print('Classification Report \n')
print(classification_report(y_test_tl, rand_forest_pred))

Accuracy Score, Training Set: 0.8250861093555197
Accuracy Score, Test Set: 0.8055478715229752
Classification Report 

              precision    recall  f1-score   support

        Fair       0.33      0.04      0.08     19162
        Good       0.82      0.98      0.89    105492
        Poor       0.27      0.03      0.06      5486

    accuracy                           0.81    130140
   macro avg       0.47      0.35      0.34    130140
weighted avg       0.72      0.81      0.74    130140



In [36]:
# confusion matrix
cm = confusion_matrix(y_test_tl, rand_forest_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[   845  18113    204]
 [  1398 103801    293]
 [   339   4959    188]]


## Gaussian Naive Bayes

In [37]:
gaussian = GaussianNB().fit(X_train_tl, y_train_tl)
gaussian_pred = gaussian.predict(X_test_tl)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train_tl, y_train_tl))
print('Accuracy Score, Test Set:', gaussian.score(X_test_tl, y_test_tl))

# classification report
print('Classification Report \n')
print(classification_report(y_test_tl, gaussian_pred))

Accuracy Score, Training Set: 0.7354536181297413
Accuracy Score, Test Set: 0.7340940525587828
Classification Report 

              precision    recall  f1-score   support

        Fair       0.21      0.11      0.15     19162
        Good       0.83      0.87      0.85    105492
        Poor       0.12      0.19      0.15      5486

    accuracy                           0.73    130140
   macro avg       0.39      0.39      0.38    130140
weighted avg       0.71      0.73      0.72    130140



In [38]:
# confusion matrix
cm = confusion_matrix(y_test_tl, gaussian_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[ 2191 14579  2392]
 [ 7803 92300  5389]
 [  554  3888  1044]]


## Categorical Naive Bayes

In [39]:
categorical = CategoricalNB().fit(X_train_tl, y_train_tl)
categorical_pred = categorical.predict(X_test_tl)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train_tl, y_train_tl))
print('Accuracy Score, Test Set:', categorical.score(X_test_tl, y_test_tl))

# classification report
print('Classification Report \n')
print(classification_report(y_test_tl, categorical_pred))

Accuracy Score, Training Set: 0.8015095311002212
Accuracy Score, Test Set: 0.8003150453357922
Classification Report 

              precision    recall  f1-score   support

        Fair       0.28      0.06      0.10     19162
        Good       0.82      0.97      0.89    105492
        Poor       0.34      0.04      0.07      5486

    accuracy                           0.80    130140
   macro avg       0.48      0.36      0.35    130140
weighted avg       0.72      0.80      0.74    130140



In [40]:
# confusion matrix
cm = confusion_matrix(y_test_tl, categorical_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[  1159  17813    190]
 [  2486 102775    231]
 [   480   4787    219]]


# Edited Nearest Neighbors

In [41]:
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X, y)

print('Resampled dataset shape %s' % Counter(y_enn))

Resampled dataset shape Counter({'Good': 312229, 'Poor': 26781, 'Fair': 1553})


In [43]:
# train test split
X_train_enn, X_test_enn, y_train_enn, y_test_enn = train_test_split(X_enn, y_enn, test_size=0.2, random_state=42)

print(X_train_enn.shape, y_train_enn.shape)
print(X_test_enn.shape, y_test_enn.shape)

(272450, 24) (272450,)
(68113, 24) (68113,)


## Logistic Regression

In [44]:
logreg = LogisticRegression(random_state=42).fit(X_train_enn, y_train_enn)
logreg_pred = logreg.predict(X_test_enn)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_enn, y_train_enn))
print('Accuracy Score, Test Set: ', logreg.score(X_test_enn, y_test_enn))

# classification report
print('Classification Report \n')
print(classification_report(y_test_enn, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.921002018719031
Accuracy Score, Test Set:  0.9223643063732327
Classification Report 

              precision    recall  f1-score   support

        Fair       0.12      0.00      0.01       328
        Good       0.93      1.00      0.96     62504
        Poor       0.64      0.10      0.17      5281

    accuracy                           0.92     68113
   macro avg       0.56      0.37      0.38     68113
weighted avg       0.90      0.92      0.89     68113



In [45]:
# confusion matrix
cm = confusion_matrix(y_test_enn, logreg_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[    1   235    92]
 [    0 62295   209]
 [    7  4745   529]]


## KNN Classifier

In [46]:
# using 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_enn, y_train_enn)
knn_pred = knn.predict(X_test_enn)

# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_enn, y_train_enn))
print('Accuracy Score, Test Set: ', knn.score(X_test_enn, y_test_enn))

# classification report
print('Classification Report \n')
print(classification_report(y_test_enn, knn_pred))

Accuracy Score, Training Set:  0.9559258579555882
Accuracy Score, Test Set:  0.9524907139606242
Classification Report 

              precision    recall  f1-score   support

        Fair       0.68      0.70      0.69       328
        Good       0.96      1.00      0.98     62504
        Poor       0.90      0.45      0.60      5281

    accuracy                           0.95     68113
   macro avg       0.84      0.71      0.75     68113
weighted avg       0.95      0.95      0.95     68113



In [47]:
# confusion matrix
cm = confusion_matrix(y_test_enn, knn_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[  228    46    54]
 [    8 62295   201]
 [  101  2826  2354]]


## Decision Tree Classifier

In [48]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train_enn, y_train_enn)
decision_tree_pred = decision_tree.predict(X_test_enn)

# accuracy score
print('Accuracy Score, Training Set:', decision_tree.score(X_train_enn, y_train_enn))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_enn, y_test_enn))

# classification report
print('Classification Report \n')
print(classification_report(y_test_enn, decision_tree_pred))

Accuracy Score, Training Set: 0.9652817030647826
Accuracy Score, Test Set: 0.9564253519886071
Classification Report 

              precision    recall  f1-score   support

        Fair       0.73      0.84      0.78       328
        Good       0.96      1.00      0.98     62504
        Poor       0.91      0.49      0.64      5281

    accuracy                           0.96     68113
   macro avg       0.87      0.78      0.80     68113
weighted avg       0.95      0.96      0.95     68113



In [49]:
# confusion matrix
cm = confusion_matrix(y_test_enn, decision_tree_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[  277    15    36]
 [   13 62274   217]
 [   92  2595  2594]]


## Random Forest Classifier

In [50]:
rand_forest = RandomForestClassifier(random_state=42).fit(X_train_enn, y_train_enn)
rand_forest_pred = rand_forest.predict(X_test_enn)

# accuracy score
print('Accuracy Score, Training Set:', rand_forest.score(X_train_enn, y_train_enn))
print('Accuracy Score, Test Set:', rand_forest.score(X_test_enn, y_test_enn))

# classification report
print('Classification Report \n')
print(classification_report(y_test_enn, rand_forest_pred))

Accuracy Score, Training Set: 0.9652817030647826
Accuracy Score, Test Set: 0.9582018116952711
Classification Report 

              precision    recall  f1-score   support

        Fair       0.77      0.84      0.80       328
        Good       0.96      1.00      0.98     62504
        Poor       0.95      0.49      0.65      5281

    accuracy                           0.96     68113
   macro avg       0.89      0.78      0.81     68113
weighted avg       0.96      0.96      0.95     68113



In [51]:
# confusion matrix
cm = confusion_matrix(y_test_enn, rand_forest_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[  274    18    36]
 [    3 62394   107]
 [   78  2605  2598]]


## Gaussian Naive Bayes

In [52]:
gaussian = GaussianNB().fit(X_train_enn, y_train_enn)
gaussian_pred = gaussian.predict(X_test_enn)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train_enn, y_train_enn))
print('Accuracy Score, Test Set:', gaussian.score(X_test_enn, y_test_enn))

# classification report
print('Classification Report \n')
print(classification_report(y_test_enn, gaussian_pred))

Accuracy Score, Training Set: 0.8437438062029731
Accuracy Score, Test Set: 0.8444496645280637
Classification Report 

              precision    recall  f1-score   support

        Fair       0.03      0.34      0.06       328
        Good       0.94      0.90      0.92     62504
        Poor       0.21      0.19      0.20      5281

    accuracy                           0.84     68113
   macro avg       0.39      0.48      0.39     68113
weighted avg       0.88      0.84      0.86     68113



In [53]:
# confusion matrix
cm = confusion_matrix(y_test_enn, gaussian_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[  111    55   162]
 [ 2523 56408  3573]
 [  602  3680   999]]


## Categorical Naive Bayes

In [77]:
# # IndexError: index 9 is out of bounds for axis 1 with size 9
# categorical = CategoricalNB().fit(X_train_enn, y_train_enn)
# categorical_pred = categorical.predict(X_test_enn)

# # accuracy score
# print('Accuracy Score, Training Set:', categorical.score(X_train_enn, y_train_enn))
# print('Accuracy Score, Test Set:', categorical.score(X_test_enn, y_test_enn))

# # classification report
# print('Classification Report \n')
# print(classification_report(y_test_enn, categorical_pred))

In [78]:
# # confusion matrix
# cm = confusion_matrix(y_test_enn, categorical_pred)

# print ('Confusion Matrix: \n', cm)

# Near Miss

In [63]:
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_nm))

Resampled dataset shape Counter({'Fair': 26781, 'Good': 26781, 'Poor': 26781})


In [64]:
# train test split
X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(X_nm, y_nm, test_size=0.25, random_state=42)

print(X_train_nm.shape, y_train_nm.shape)
print(X_test_nm.shape, y_test_nm.shape)

(60257, 24) (60257,)
(20086, 24) (20086,)


## Logistic Regression

In [65]:
logreg = LogisticRegression(random_state=42).fit(X_train_nm, y_train_nm)
logreg_pred = logreg.predict(X_test_nm)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set: ', logreg.score(X_test_nm, y_test_nm))

# classification report
print('Classification Report \n')
print(classification_report(y_test_nm, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.44751647111538906
Accuracy Score, Test Set:  0.4448869859603704
Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.38      0.38      6648
        Good       0.43      0.60      0.50      6744
        Poor       0.59      0.35      0.44      6694

    accuracy                           0.44     20086
   macro avg       0.47      0.44      0.44     20086
weighted avg       0.47      0.44      0.44     20086



In [66]:
# confusion matrix
cm = confusion_matrix(y_test_nm, logreg_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[2553 3147  948]
 [2036 4019  689]
 [2063 2267 2364]]


## KNN Classifier

In [67]:
# using 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_nm, y_train_nm)
knn_pred = knn.predict(X_test_nm)

# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set: ', knn.score(X_test_nm, y_test_nm))

# classification report
print('Classification Report \n')
print(classification_report(y_test_nm, knn_pred))

Accuracy Score, Training Set:  0.44540883216887667
Accuracy Score, Test Set:  0.4092402668525341
Classification Report 

              precision    recall  f1-score   support

        Fair       0.36      0.49      0.41      6648
        Good       0.40      0.42      0.41      6744
        Poor       0.54      0.32      0.40      6694

    accuracy                           0.41     20086
   macro avg       0.43      0.41      0.41     20086
weighted avg       0.43      0.41      0.41     20086



In [68]:
# confusion matrix
cm = confusion_matrix(y_test_nm, knn_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[3267 2377 1004]
 [3119 2818  807]
 [2794 1765 2135]]


## Decision Tree Classifier

In [69]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train_nm, y_train_nm)
decision_tree_pred = decision_tree.predict(X_test_nm)

# accuracy score
print('Accuracy Score, Training Set:', decision_tree.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_nm, y_test_nm))

# classification report
print('Classification Report \n')
print(classification_report(y_test_nm, decision_tree_pred))

Accuracy Score, Training Set: 0.5063478102129213
Accuracy Score, Test Set: 0.46883401374091405
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.43      0.42      6648
        Good       0.44      0.60      0.51      6744
        Poor       0.66      0.37      0.47      6694

    accuracy                           0.47     20086
   macro avg       0.50      0.47      0.47     20086
weighted avg       0.50      0.47      0.47     20086



In [70]:
# confusion matrix
cm = confusion_matrix(y_test_nm, decision_tree_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[2891 2991  766]
 [2178 4076  490]
 [2118 2126 2450]]


## Random Forest Classifier

In [71]:
rand_forest = RandomForestClassifier(random_state=42).fit(X_train_nm, y_train_nm)
rand_forest_pred = rand_forest.predict(X_test_nm)

# accuracy score
print('Accuracy Score, Training Set:', rand_forest.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set:', rand_forest.score(X_test_nm, y_test_nm))

# classification report
print('Classification Report \n')
print(classification_report(y_test_nm, rand_forest_pred))

Accuracy Score, Training Set: 0.5062648323016413
Accuracy Score, Test Set: 0.47301603106641443
Classification Report 

              precision    recall  f1-score   support

        Fair       0.41      0.42      0.41      6648
        Good       0.44      0.61      0.51      6744
        Poor       0.65      0.39      0.48      6694

    accuracy                           0.47     20086
   macro avg       0.50      0.47      0.47     20086
weighted avg       0.50      0.47      0.47     20086



In [72]:
# confusion matrix
cm = confusion_matrix(y_test_nm, rand_forest_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[2795 3007  846]
 [2076 4124  544]
 [1968 2144 2582]]


## Gaussian Naive Bayes

In [73]:
gaussian = GaussianNB().fit(X_train_nm, y_train_nm)
gaussian_pred = gaussian.predict(X_test_nm)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set:', gaussian.score(X_test_nm, y_test_nm))

# classification report
print('Classification Report \n')
print(classification_report(y_test_nm, gaussian_pred))

Accuracy Score, Training Set: 0.4020279801516836
Accuracy Score, Test Set: 0.39744100368415813
Classification Report 

              precision    recall  f1-score   support

        Fair       0.30      0.09      0.14      6648
        Good       0.37      0.93      0.53      6744
        Poor       0.79      0.17      0.28      6694

    accuracy                           0.40     20086
   macro avg       0.49      0.40      0.32     20086
weighted avg       0.49      0.40      0.32     20086



In [74]:
# confusion matrix
cm = confusion_matrix(y_test_nm, gaussian_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[ 589 5864  195]
 [ 383 6249  112]
 [ 981 4568 1145]]


## Categorical Naive Bayes

In [75]:
categorical = CategoricalNB().fit(X_train_nm, y_train_nm)
categorical_pred = categorical.predict(X_test_nm)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set:', categorical.score(X_test_nm, y_test_nm))

# classification report
print('Classification Report \n')
print(classification_report(y_test_nm, categorical_pred))

Accuracy Score, Training Set: 0.45123388154073385
Accuracy Score, Test Set: 0.4474758538285373
Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.39      0.39      6648
        Good       0.43      0.61      0.50      6744
        Poor       0.62      0.34      0.44      6694

    accuracy                           0.45     20086
   macro avg       0.48      0.45      0.44     20086
weighted avg       0.48      0.45      0.44     20086



In [76]:
# confusion matrix
cm = confusion_matrix(y_test_nm, categorical_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[2596 3215  837]
 [2059 4118  567]
 [2106 2314 2274]]
