# Random Under Sampling Methods

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, 
                                     cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (mean_squared_error,
                             classification_report,
                             mean_absolute_error,
                             accuracy_score,
                             confusion_matrix,
                             average_precision_score,
                             precision_recall_curve,
                             recall_score,
                             f1_score)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB

import imblearn
from imblearn.under_sampling import (RandomUnderSampler,
                                     TomekLinks,
                                     EditedNearestNeighbours,
                                     NeighbourhoodCleaningRule,
                                     NearMiss)

In [2]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [3]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [4]:
tree.shape

(651535, 26)

# Target and Response Variable, Train_Test_Split

In [5]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [37]:
# create targe and response variable
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(521228, 24) (521228,)
(130307, 24) (130307,)


## Baseline - DummyClassifier

In [38]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X_train, y_train)
dc_pred = dummy_clf.predict(X_test)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.6804085735992694


In [39]:
dummy_clf_freq = DummyClassifier(strategy='most_frequent')
dummy_clf_freq.fit(X_train, y_train)
dc_pred_freq = dummy_clf_freq.predict(X_test)

print('Accuracy Score: ', dummy_clf.score(X, y))

Accuracy Score:  0.68208308072475


# Random Under Sampler

In [10]:
random_under = RandomUnderSampler(random_state=42)
X_rs, y_rs = random_under.fit_sample(X_train, y_train)

print('Random undersampling {}'.format(Counter(y_rs)))

Random undersampling Counter({'Fair': 20086, 'Good': 20086, 'Poor': 20086})


In [11]:
# X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.25, random_state=42)

## Logistic Regression

In [12]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_rs, y_rs)
logreg_pred = logreg_rs.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_rs, y_rs))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.4195127617909655
Accuracy Score, Test Set:  0.498133641118833


In [13]:
print(classification_report(y_test, logreg_pred))

              precision    recall  f1-score   support

        Fair       0.18      0.18      0.18     24107
        Good       0.85      0.55      0.67    132082
        Poor       0.07      0.51      0.12      6695

    accuracy                           0.50    162884
   macro avg       0.37      0.42      0.32    162884
weighted avg       0.72      0.50      0.58    162884



## KNN Classifier

In [13]:
# using 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_rs, y_rs)
knn_pred = knn.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_rs, y_rs))
print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))

Accuracy Score, Training Set:  0.44818029440562923
Accuracy Score, Test Set:  0.37284675893657276


In [14]:
print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

        Fair       0.35      0.44      0.39      6648
        Good       0.39      0.35      0.37      6744
        Poor       0.39      0.33      0.36      6694

    accuracy                           0.37     20086
   macro avg       0.38      0.37      0.37     20086
weighted avg       0.38      0.37      0.37     20086



## Decision Tree Classifier

In [34]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_rs, y_rs)
decision_tree_pred = decision_tree.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, decision_tree_pred))

Accuracy Score, Training Set: 0.47542520121722864
Accuracy Score, Test Set: 0.4579885071584686
Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.32      0.22     24107
        Good       0.85      0.49      0.62    132082
        Poor       0.07      0.40      0.11      6695

    accuracy                           0.46    162884
   macro avg       0.36      0.40      0.32    162884
weighted avg       0.72      0.46      0.54    162884



## Random Forest Classifier

In [33]:
rand_forest = RandomForestClassifier(random_state=42)
rand_forest.fit(X_rs, y_rs)
rand_forest_pred = rand_forest.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', rand_forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', rand_forest.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, rand_forest_pred))

Accuracy Score, Training Set: 0.4865537981094892
Accuracy Score, Test Set: 0.47021806930085214
Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.28      0.21     24107
        Good       0.85      0.51      0.64    132082
        Poor       0.07      0.44      0.11      6695

    accuracy                           0.47    162884
   macro avg       0.36      0.41      0.32    162884
weighted avg       0.72      0.47      0.55    162884



In [18]:
# confusion matrix
cm = confusion_matrix(y_test, rand_forest_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[ 6743  9539  7825]
 [31654 66923 33505]
 [ 1627  2143  2925]]


## Gaussian Naive Bayes

In [32]:
gaussian = GaussianNB()
gaussian.fit(X_rs, y_rs)
gaussian_pred = gaussian.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, gaussian_pred))

Accuracy Score, Training Set: 0.7201745212841066
Accuracy Score, Test Set: 0.7218327153065985
Classification Report 

              precision    recall  f1-score   support

        Fair       0.19      0.05      0.08     24107
        Good       0.83      0.87      0.85    132082
        Poor       0.09      0.26      0.13      6695

    accuracy                           0.72    162884
   macro avg       0.37      0.39      0.35    162884
weighted avg       0.71      0.72      0.71    162884



## Categorical Naive Bayes

In [35]:
categorical = CategoricalNB()
categorical.fit(X_rs, y_rs)
categorical_pred = categorical.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train, y_train))
print('Accuracy Score, Test Set:', categorical.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, categorical_pred))

Accuracy Score, Training Set: 0.5487249591221547
Accuracy Score, Test Set: 0.5483841261265686
Classification Report 

              precision    recall  f1-score   support

        Fair       0.17      0.28      0.21     24107
        Good       0.85      0.61      0.71    132082
        Poor       0.08      0.35      0.13      6695

    accuracy                           0.55    162884
   macro avg       0.37      0.41      0.35    162884
weighted avg       0.72      0.55      0.61    162884



In [36]:
# confusion matrix
cm = confusion_matrix(y_test, categorical_pred)

print ('Confusion Matrix: \n', cm)

Confusion Matrix: 
 [[ 6744 11615  5748]
 [31595 80223 20264]
 [ 1453  2886  2356]]


# Tomek Links

In [42]:
tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(X_train, y_train)

print('TomekLinks undersampling {}'.format(Counter(y_tomek)))

TomekLinks undersampling Counter({'Good': 422260, 'Fair': 76775, 'Poor': 21425})


In [27]:
# train test split
# X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tomek, y_tomek, test_size=0.25, random_state=42)

## Logistic Regression

In [43]:
logreg = LogisticRegression(random_state=42).fit(X_tomek, y_tomek)
logreg_pred = logreg.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.8107411727689227
Accuracy Score, Test Set:  0.8105473996024772
Classification Report 

              precision    recall  f1-score   support

        Fair       0.34      0.02      0.03     19285
        Good       0.81      1.00      0.90    105666
        Poor       0.12      0.00      0.00      5356

    accuracy                           0.81    130307
   macro avg       0.42      0.34      0.31    130307
weighted avg       0.72      0.81      0.73    130307



## KNN Classifier

In [31]:
# using 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5).fit(X_tomek, y_tomek)
knn_pred = knn.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, knn_pred))

0.7818964192408175


In [32]:
print(classification_report(y_test_tl, y_pred_knn_tl))

              precision    recall  f1-score   support

        Fair       0.23      0.10      0.13     23893
        Good       0.82      0.95      0.88    131916
        Poor       0.28      0.02      0.04      6866

    accuracy                           0.78    162675
   macro avg       0.44      0.35      0.35    162675
weighted avg       0.71      0.78      0.73    162675



## Decision Tree Classifier

In [45]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_tomek, y_tomek)
decision_tree_pred = decision_tree.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, decision_tree_pred))

Accuracy Score, Training Set: 0.8242880275042783
Accuracy Score, Test Set: 0.8015993001143453
Classification Report 

              precision    recall  f1-score   support

        Fair       0.30      0.06      0.10     19285
        Good       0.82      0.98      0.89    105666
        Poor       0.26      0.03      0.06      5356

    accuracy                           0.80    130307
   macro avg       0.46      0.36      0.35    130307
weighted avg       0.72      0.80      0.74    130307



## Random Forest Classifier

In [46]:
rand_forest = RandomForestClassifier(random_state=42).fit(X_tomek, y_tomek)
rand_forest_pred = rand_forest.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', rand_forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', rand_forest.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, rand_forest_pred))

Accuracy Score, Training Set: 0.8243494209827561
Accuracy Score, Test Set: 0.8051677960508645
Classification Report 

              precision    recall  f1-score   support

        Fair       0.32      0.05      0.08     19285
        Good       0.82      0.98      0.89    105666
        Poor       0.26      0.04      0.07      5356

    accuracy                           0.81    130307
   macro avg       0.47      0.36      0.35    130307
weighted avg       0.72      0.81      0.74    130307



## Gaussian Naive Bayes

In [47]:
gaussian = GaussianNB().fit(X_tomek, y_tomek)
gaussian_pred = gaussian.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, gaussian_pred))

Accuracy Score, Training Set: 0.7359197894203688
Accuracy Score, Test Set: 0.7375889246164826
Classification Report 

              precision    recall  f1-score   support

        Fair       0.21      0.11      0.14     19285
        Good       0.83      0.88      0.86    105666
        Poor       0.12      0.20      0.15      5356

    accuracy                           0.74    130307
   macro avg       0.39      0.39      0.38    130307
weighted avg       0.71      0.74      0.72    130307



## Categorical Naive Bayes

In [48]:
categorical = CategoricalNB().fit(X_tomek, y_tomek)
categorical_pred = categorical.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train, y_train))
print('Accuracy Score, Test Set:', categorical.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, categorical_pred))

Accuracy Score, Training Set: 0.8010103064301994
Accuracy Score, Test Set: 0.8008625783726123
Classification Report 

              precision    recall  f1-score   support

        Fair       0.28      0.06      0.10     19285
        Good       0.82      0.98      0.89    105666
        Poor       0.30      0.04      0.07      5356

    accuracy                           0.80    130307
   macro avg       0.47      0.36      0.35    130307
weighted avg       0.72      0.80      0.74    130307



# Edited Nearest Neighbors

In [50]:
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_enn))

Resampled dataset shape Counter({'Good': 217533, 'Poor': 21425, 'Fair': 969})


In [44]:
# train test split
# X_train_enn, X_test_enn, y_train_enn, y_test_enn = train_test_split(X_enn, y_enn, test_size=0.25, random_state=42)

## Logistic Regression

In [52]:
logreg = LogisticRegression(random_state=42).fit(X_enn, y_enn)
logreg_pred = logreg.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.8058795766919659
Accuracy Score, Test Set:  0.806326597957132
Classification Report 

              precision    recall  f1-score   support

        Fair       0.50      0.00      0.00     19285
        Good       0.82      0.99      0.90    105666
        Poor       0.21      0.11      0.14      5356

    accuracy                           0.81    130307
   macro avg       0.51      0.36      0.35    130307
weighted avg       0.75      0.81      0.73    130307



## KNN Classifier

In [48]:
# using 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5).fit(X_enn, y_enn)
knn_pred = knn.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', knn_enn.score(X_train_enn, y_train_enn))
print('Accuracy Score, Test Set: ', knn_enn.score(X_test_enn, y_test_enn))

# classification report
print('Classification Report \n')
print(classification_report(y_test, logreg_pred))

Accuracy Score, Training Set:  0.9530463311695938
Accuracy Score, Test Set:  0.9505291222795128


In [49]:
print(classification_report(y_test_enn, y_pred_knn_enn))

              precision    recall  f1-score   support

        Fair       0.62      0.63      0.62       393
        Good       0.95      1.00      0.97     78137
        Poor       0.91      0.42      0.57      6611

    accuracy                           0.95     85141
   macro avg       0.83      0.68      0.72     85141
weighted avg       0.95      0.95      0.94     85141



## Decision Tree Classifier

In [53]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_enn, y_enn)
decision_tree_pred = decision_tree.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, decision_tree_pred))

Accuracy Score, Training Set: 0.4798437535972741
Accuracy Score, Test Set: 0.46924570437505275
Classification Report 

              precision    recall  f1-score   support

        Fair       0.30      0.01      0.02     19285
        Good       0.83      0.55      0.66    105666
        Poor       0.05      0.56      0.09      5356

    accuracy                           0.47    130307
   macro avg       0.39      0.37      0.26    130307
weighted avg       0.72      0.47      0.54    130307



In [52]:
print(classification_report(y_test_enn, y_pred_decision_tree_enn))

              precision    recall  f1-score   support

        Fair       0.72      0.85      0.78       393
        Good       0.96      1.00      0.98     78137
        Poor       0.91      0.49      0.64      6611

    accuracy                           0.96     85141
   macro avg       0.86      0.78      0.80     85141
weighted avg       0.96      0.96      0.95     85141



## Random Forest Classifier

In [55]:
rand_forest = RandomForestClassifier(random_state=42).fit(X_enn, y_enn)
rand_forest_pred = rand_forest.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', rand_forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', rand_forest.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, rand_forest_pred))

Accuracy Score, Training Set: 0.4828923235129348
Accuracy Score, Test Set: 0.4738425410760742
Classification Report 

              precision    recall  f1-score   support

        Fair       0.31      0.01      0.02     19285
        Good       0.83      0.55      0.66    105666
        Poor       0.05      0.56      0.09      5356

    accuracy                           0.47    130307
   macro avg       0.40      0.37      0.26    130307
weighted avg       0.72      0.47      0.55    130307



In [55]:
print(classification_report(y_test_enn, y_pred_forest_enn))

              precision    recall  f1-score   support

        Fair       0.76      0.84      0.80       393
        Good       0.96      1.00      0.98     78137
        Poor       0.94      0.49      0.65      6611

    accuracy                           0.96     85141
   macro avg       0.89      0.78      0.81     85141
weighted avg       0.96      0.96      0.95     85141



## Gaussian Naive Bayes

In [56]:
gaussian = GaussianNB().fit(X_enn, y_enn)
gaussian_pred = gaussian.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, gaussian_pred))

Accuracy Score, Training Set: 0.745065499167351
Accuracy Score, Test Set: 0.7462147083426063
Classification Report 

              precision    recall  f1-score   support

        Fair       0.23      0.09      0.13     19285
        Good       0.83      0.90      0.86    105666
        Poor       0.10      0.17      0.12      5356

    accuracy                           0.75    130307
   macro avg       0.39      0.38      0.37    130307
weighted avg       0.71      0.75      0.72    130307



In [58]:
print(classification_report(y_test_enn, y_pred_gaussian))

              precision    recall  f1-score   support

        Fair       0.03      0.33      0.06       393
        Good       0.94      0.90      0.92     78137
        Poor       0.21      0.19      0.20      6611

    accuracy                           0.84     85141
   macro avg       0.39      0.47      0.39     85141
weighted avg       0.88      0.84      0.86     85141



## Categorical Naive Bayes

In [58]:
categorical = CategoricalNB().fit(X_enn, y_enn)
categorical_pred = categorical.predict(X_test)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train, y_train))
print('Accuracy Score, Test Set:', categorical.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, categorical_pred))

Accuracy Score, Training Set: 0.7955328570222628
Accuracy Score, Test Set: 0.7960662128665382
Classification Report 

              precision    recall  f1-score   support

        Fair       0.30      0.03      0.06     19285
        Good       0.82      0.97      0.89    105666
        Poor       0.14      0.10      0.12      5356

    accuracy                           0.80    130307
   macro avg       0.42      0.37      0.35    130307
weighted avg       0.72      0.80      0.74    130307



# Near Miss

In [62]:
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_nm))

Resampled dataset shape Counter({'Fair': 26781, 'Good': 26781, 'Poor': 26781})


In [63]:
# train test split
X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(X_nm, y_nm, test_size=0.25, random_state=42)

## Logistic Regression

In [64]:
logreg_nm = LogisticRegression().fit(X_train_nm, y_train_nm)
y_pred_logreg_nm = logreg_nm.predict(X_test_nm)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg_nm.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set: ', logreg_nm.score(X_test_nm, y_test_nm))

Accuracy Score, Training Set:  0.44751647111538906
Accuracy Score, Test Set:  0.4448869859603704


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [65]:
# cross validation - 5-fold
cv_scores_logreg = cross_val_score(logreg_nm, X_nm, y_nm, cv=5)
print(cv_scores_logreg)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_logreg)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[0.41670297 0.42802912 0.41807206 0.43048295 0.46415235]
Average 5-Fold CV Score: 0.431487891409546


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## KNN Classifier

In [66]:
# using 6 neighbors
knn_nm = KNeighborsClassifier(n_neighbors=6)
knn_nm.fit(X_train_nm, y_train_nm)
y_pred_knn_nm = knn_nm.predict(X_test_nm)

# accuracy scores
print('Accuracy Score, Training Set: ', knn_nm.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set: ', knn_nm.score(X_test_nm, y_test_nm))

Accuracy Score, Training Set:  0.44736711087508507
Accuracy Score, Test Set:  0.41133127551528426


In [67]:
print(classification_report(y_test_nm, y_pred_knn_nm))

              precision    recall  f1-score   support

        Fair       0.36      0.47      0.41      6648
        Good       0.41      0.45      0.43      6744
        Poor       0.54      0.31      0.39      6694

    accuracy                           0.41     20086
   macro avg       0.43      0.41      0.41     20086
weighted avg       0.43      0.41      0.41     20086



## Decision Tree Classifier

In [68]:
decision_tree_nm = DecisionTreeClassifier(random_state=42)
decision_tree_nm.fit(X_train_nm, y_train_nm)
y_pred_decision_tree_nm = decision_tree_nm.predict(X_test_nm)

# accuracy score

print('Accuracy Score, Training Set:', decision_tree_nm.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set:', decision_tree_nm.score(X_test_nm, y_test_nm))

Accuracy Score, Training Set: 0.5063478102129213
Accuracy Score, Test Set: 0.46883401374091405


In [69]:
print(classification_report(y_test_nm, y_pred_decision_tree_nm))

              precision    recall  f1-score   support

        Fair       0.40      0.43      0.42      6648
        Good       0.44      0.60      0.51      6744
        Poor       0.66      0.37      0.47      6694

    accuracy                           0.47     20086
   macro avg       0.50      0.47      0.47     20086
weighted avg       0.50      0.47      0.47     20086



## Random Forest Classifier

In [70]:
forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_nm, y_train_nm)
y_pred_forest_nm = forest.predict(X_test_nm)

# accuracy score
print('Accuracy Score, Training Set:', forest.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set:', forest.score(X_test_nm, y_test_nm))

Accuracy Score, Training Set: 0.5062648323016413
Accuracy Score, Test Set: 0.47301603106641443


In [71]:
# cross validation - 5-fold
cv_scores_rfc = cross_val_score(forest, X_nm, y_nm, cv=5)

print(cv_scores_rfc)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores_rfc)))

[0.43898189 0.45105483 0.43611924 0.44772218 0.45799104]
Average 5-Fold CV Score: 0.4463738342550105


In [72]:
print(classification_report(y_test_nm, y_pred_forest_nm))

              precision    recall  f1-score   support

        Fair       0.41      0.42      0.41      6648
        Good       0.44      0.61      0.51      6744
        Poor       0.65      0.39      0.48      6694

    accuracy                           0.47     20086
   macro avg       0.50      0.47      0.47     20086
weighted avg       0.50      0.47      0.47     20086



## Gaussian Naive Bayes

In [73]:
gaussian = GaussianNB().fit(X_train_nm, y_train_nm)
y_pred_g = gaussian.predict(X_test_nm)

# accuracy score
print('Accuracy Score, Training Set:', gaussian.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set:', gaussian.score(X_test_nm, y_test_nm))

Accuracy Score, Training Set: 0.4020279801516836
Accuracy Score, Test Set: 0.39744100368415813


In [74]:
print(classification_report(y_test_rs, y_pred_g))

              precision    recall  f1-score   support

        Fair       0.30      0.09      0.14      6648
        Good       0.37      0.93      0.53      6744
        Poor       0.79      0.17      0.28      6694

    accuracy                           0.40     20086
   macro avg       0.49      0.40      0.32     20086
weighted avg       0.49      0.40      0.32     20086



## Categorical Naive Bayes

In [75]:
categorical = CategoricalNB()
categorical.fit(X_train_nm, y_train_nm)
y_pred_cnb = categorical.predict(X_test_nm)

# accuracy score
print('Accuracy Score, Training Set:', categorical.score(X_train_nm, y_train_nm))
print('Accuracy Score, Test Set:', categorical.score(X_test_nm, y_test_nm))

Accuracy Score, Training Set: 0.45123388154073385
Accuracy Score, Test Set: 0.4474758538285373


In [76]:
print(classification_report(y_test_rs, y_pred_cnb))

              precision    recall  f1-score   support

        Fair       0.38      0.39      0.39      6648
        Good       0.43      0.61      0.50      6744
        Poor       0.62      0.34      0.44      6694

    accuracy                           0.45     20086
   macro avg       0.48      0.45      0.44     20086
weighted avg       0.48      0.45      0.44     20086

