# General Overview - Combination Under and Over Sampling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import (cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (classification_report,
                             confusion_matrix)
import imblearn
from imblearn.combine import (SMOTETomek, SMOTEENN)



In [2]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [3]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [4]:
tree.shape

(651535, 26)

In [5]:
# for reproducible results
np.random.seed(42)

## separate variables using train test split

In [6]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [7]:
# target variable = health
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 24) (488651,)
(162884, 24) (162884,)


# Baseline - DummyClassifier

This is the control for our models. The accuracy scores show the success rates we should expect based on the strategies used for simple guessing.

In [8]:
strategies = ['most_frequent', 'stratified', 'uniform', 'constant'] # strategies available
  
for s in strategies: 
    if s =='constant': 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42, constant='Good') 
    else: 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42) 
    dummy_classifier.fit(X_train, y_train) 
    score = dummy_classifier.score(X_test, y_test) 
    print(s, score)

most_frequent 0.8108960978364972
stratified 0.6815525159008865
uniform 0.3338879202377152
constant 0.8108960978364972


# SMOTE Tomek

This method combines over sampling using SMOTE and under sampling by Tomek Links.

In [9]:
# initialize
smt = SMOTETomek(random_state=42)
X_smt, y_smt = smt.fit_sample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_smt)))

# train test split
X_train_smt, X_test_smt, y_train_smt, y_test_smt = train_test_split(X_smt, y_smt, test_size=0.25, random_state=42)

print(X_train_smt.shape, y_train_smt.shape)
print(X_test_smt.shape, y_test_smt.shape)



Resampled dataset shape: Counter({'Poor': 528308, 'Fair': 528226, 'Good': 528213})
(1188560, 24) (1188560,)
(396187, 24) (396187,)


In [10]:
# logistic regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_smt, y_train_smt)
y_pred = logreg.predict(X_test_smt)
    
print('Logistic Regression \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set: ', logreg.score(X_test_smt, y_test_smt))
    
# confusion matrix
cm = confusion_matrix(y_test_smt, y_pred)
print('Confusion Matrix: \n', cm)
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.42051894729757017
Accuracy Score, Test Set:  0.41956197452213223
Confusion Matrix: 
 [[23417 55601 53500]
 [17525 69329 44609]
 [16413 42314 73479]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.41      0.18      0.25    132518
        Good       0.41      0.53      0.46    131463
        Poor       0.43      0.56      0.48    132206

    accuracy                           0.42    396187
   macro avg       0.42      0.42      0.40    396187
weighted avg       0.42      0.42      0.40    396187



In [11]:
# k-nearest neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_smt, y_train_smt)
y_pred = knn.predict(X_test_smt)
    
print('KNN Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set: ', knn.score(X_test_smt, y_test_smt))
    
# confusion matrix
cm = confusion_matrix(y_test_smt, y_pred)    
print('Confusion Matrix: \n', cm)

# classificatin report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

KNN Classifier 

Accuracy Score, Training Set:  0.4470291781651747
Accuracy Score, Test Set:  0.43683664532152744
Confusion Matrix: 
 [[66133 41977 24408]
 [52596 57872 20995]
 [48439 34703 49064]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.50      0.44    132518
        Good       0.43      0.44      0.44    131463
        Poor       0.52      0.37      0.43    132206

    accuracy                           0.44    396187
   macro avg       0.45      0.44      0.44    396187
weighted avg       0.45      0.44      0.44    396187



In [13]:
# decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_smt, y_train_smt)
y_pred = decision_tree.predict(X_test_smt)
    
print('Decision Tree Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_smt, y_test_smt))
    
# confusion matrix
cm = confusion_matrix(y_test_smt, y_pred)
print('Confusion Matrix: \n', cm)
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5035967893922056
Accuracy Score, Test Set: 0.48834010202252975
Confusion Matrix: 
 [[42321 50050 40147]
 [23534 74655 33274]
 [16425 39283 76498]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.51      0.32      0.39    132518
        Good       0.46      0.57      0.51    131463
        Poor       0.51      0.58      0.54    132206

    accuracy                           0.49    396187
   macro avg       0.49      0.49      0.48    396187
weighted avg       0.49      0.49      0.48    396187



In [14]:
# random forest classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smt, y_train_smt)
y_pred = rf.predict(X_test_smt)
    
print('Random Forest Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', rf.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set:', rf.score(X_test_smt, y_test_smt))
    
# confusion matrix
cm = confusion_matrix(y_test_smt, y_pred)
print('Confusion Matrix: \n', cm)
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

Random Forest Classifier 

Accuracy Score, Training Set: 0.503583327724305
Accuracy Score, Test Set: 0.4892209991746322
Confusion Matrix: 
 [[41723 50677 40118]
 [22779 75613 33071]
 [16271 39448 76487]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.52      0.31      0.39    132518
        Good       0.46      0.58      0.51    131463
        Poor       0.51      0.58      0.54    132206

    accuracy                           0.49    396187
   macro avg       0.49      0.49      0.48    396187
weighted avg       0.49      0.49      0.48    396187



In [15]:
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train_smt, y_train_smt)
y_pred = gaussian.predict(X_test_smt)
    
print('Gaussian Naive Bayes \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', gaussian.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set:', gaussian.score(X_test_smt, y_test_smt))
    
# confusion matrix
cm = confusion_matrix(y_test_smt, y_pred)
print('Confusion Matrix: \n', cm)
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.37264589082587335
Accuracy Score, Test Set: 0.3720263410965024
Confusion Matrix: 
 [[ 12282  11264 108972]
 [ 10767  18364 102332]
 [  8296   7164 116746]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.09      0.15    132518
        Good       0.50      0.14      0.22    131463
        Poor       0.36      0.88      0.51    132206

    accuracy                           0.37    396187
   macro avg       0.42      0.37      0.29    396187
weighted avg       0.42      0.37      0.29    396187



# SMOTE ENN

This method uses a combination of SMOTE (Synthetic Minority Over-sampling Technique) over sampling and under sampling using Edited Nearest Neighbors.

In [16]:
# initialize
sme = SMOTEENN(random_state=42)
X_senn, y_senn = sme.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_senn)))

# train test split
X_train_senn, X_test_senn, y_train_senn, y_test_senn = train_test_split(X_smt, y_smt, test_size=0.25, random_state=42)

print(X_train_senn.shape, y_train_senn.shape)
print(X_test_senn.shape, y_test_senn.shape)



Resampled dataset shape: Counter({'Good': 82562, 'Poor': 74722, 'Fair': 69648})
(1188560, 24) (1188560,)
(396187, 24) (396187,)


In [17]:
# logistic regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_senn, y_train_senn)
y_pred = logreg.predict(X_test_senn)
    
print('Logistic Regression \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set: ', logreg.score(X_test_senn, y_test_senn))
    
# confusion matrix
cm = confusion_matrix(y_test_senn, y_pred)
print('Confusion Matrix: \n', cm)
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.42051894729757017
Accuracy Score, Test Set:  0.41956197452213223
Confusion Matrix: 
 [[23417 55601 53500]
 [17525 69329 44609]
 [16413 42314 73479]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.41      0.18      0.25    132518
        Good       0.41      0.53      0.46    131463
        Poor       0.43      0.56      0.48    132206

    accuracy                           0.42    396187
   macro avg       0.42      0.42      0.40    396187
weighted avg       0.42      0.42      0.40    396187



In [18]:
# k-nearest neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_senn, y_train_senn)
y_pred = knn.predict(X_test_senn)
    
print('KNN Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set: ', knn.score(X_test_senn, y_test_senn))
    
# confusion matrix
cm = confusion_matrix(y_test_senn, y_pred)    
print('Confusion Matrix: \n', cm)

# classificatin report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

KNN Classifier 

Accuracy Score, Training Set:  0.4470291781651747
Accuracy Score, Test Set:  0.43683664532152744
Confusion Matrix: 
 [[66133 41977 24408]
 [52596 57872 20995]
 [48439 34703 49064]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.50      0.44    132518
        Good       0.43      0.44      0.44    131463
        Poor       0.52      0.37      0.43    132206

    accuracy                           0.44    396187
   macro avg       0.45      0.44      0.44    396187
weighted avg       0.45      0.44      0.44    396187



In [20]:
# decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_senn, y_train_senn)
y_pred = decision_tree.predict(X_test_senn)
    
print('Decision Tree Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_senn, y_test_senn))
    
# confusion matrix
cm = confusion_matrix(y_test_senn, y_pred)
print('Confusion Matrix: \n', cm)
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5035967893922056
Accuracy Score, Test Set: 0.48834010202252975
Confusion Matrix: 
 [[42321 50050 40147]
 [23534 74655 33274]
 [16425 39283 76498]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.51      0.32      0.39    132518
        Good       0.46      0.57      0.51    131463
        Poor       0.51      0.58      0.54    132206

    accuracy                           0.49    396187
   macro avg       0.49      0.49      0.48    396187
weighted avg       0.49      0.49      0.48    396187



In [21]:
# random forest classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_senn, y_train_senn)
y_pred = rf.predict(X_test_senn)
    
print('Random Forest Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', rf.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set:', rf.score(X_test_senn, y_test_senn))
    
# confusion matrix
cm = confusion_matrix(y_test_senn, y_pred)
print('Confusion Matrix: \n', cm)
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

Random Forest Classifier 

Accuracy Score, Training Set: 0.503583327724305
Accuracy Score, Test Set: 0.4892209991746322
Confusion Matrix: 
 [[41723 50677 40118]
 [22779 75613 33071]
 [16271 39448 76487]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.52      0.31      0.39    132518
        Good       0.46      0.58      0.51    131463
        Poor       0.51      0.58      0.54    132206

    accuracy                           0.49    396187
   macro avg       0.49      0.49      0.48    396187
weighted avg       0.49      0.49      0.48    396187



In [22]:
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train_senn, y_train_senn)
y_pred = gaussian.predict(X_test_senn)
    
print('Gaussian Naive Bayes \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', gaussian.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set:', gaussian.score(X_test_senn, y_test_senn))
    
# confusion matrix
cm = confusion_matrix(y_test_senn, y_pred)
print('Confusion Matrix: \n', cm)
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.37264589082587335
Accuracy Score, Test Set: 0.3720263410965024
Confusion Matrix: 
 [[ 12282  11264 108972]
 [ 10767  18364 102332]
 [  8296   7164 116746]]
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.09      0.15    132518
        Good       0.50      0.14      0.22    131463
        Poor       0.36      0.88      0.51    132206

    accuracy                           0.37    396187
   macro avg       0.42      0.37      0.29    396187
weighted avg       0.42      0.37      0.29    396187

