# General Overview - Combination Under and Over Sampling

In [19]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import imblearn
from imblearn.combine import (SMOTETomek, SMOTEENN)



In [3]:
np.random.seed(42)

In [4]:
# import data
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/tree_ml.csv', index_col=0)

tree = data.copy()

In [5]:
tree.head()

Unnamed: 0,tree_dbh,curb_loc,health,sidewalk,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,longitude,latitude,num_problems,1or2,3or4,4orMore,Stew_N,Guard_N,Harmful,Helpful,Unsure,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,3,1,Fair,0,0,0,0,0,0,0,0,0,0,-73.844215,40.723092,0,0,0,0,1,1,0,0,0,0,0,0,1,0
1,21,1,Fair,1,1,0,0,0,0,0,0,0,0,-73.818679,40.794111,1,0,0,0,1,1,0,0,0,0,0,0,1,0
2,3,1,Good,1,0,0,0,0,0,0,0,0,0,-73.936608,40.717581,0,1,0,0,0,1,0,0,0,0,1,0,0,0
3,10,1,Good,1,1,0,0,0,0,0,0,0,0,-73.934456,40.713537,1,0,0,0,1,1,0,0,0,0,1,0,0,0
4,21,1,Good,1,1,0,0,0,0,0,0,0,0,-73.975979,40.666778,1,0,0,0,1,1,0,0,0,0,1,0,0,0


In [6]:
tree.shape

(651535, 29)

## separate variables using train test split

In [7]:
y = tree['health'].values
X = tree.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 28) (488651,)
(162884, 28) (162884,)


# Baseline - DummyClassifier

This is the control for our models. The accuracy scores show the success rates we should expect based on the strategies used for simple guessing.

In [8]:
strategies = ['most_frequent', 'stratified', 'uniform', 'constant']
  
for s in strategies: 
    if s =='constant': 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42, constant='Good') 
    else: 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42) 
    dummy_classifier.fit(X_train, y_train) 
    score = dummy_classifier.score(X_test, y_test) 
    print(s, score.round(2))

most_frequent 0.81
stratified 0.68
uniform 0.33
constant 0.81


# SMOTE Tomek

This method combines over sampling using SMOTE and under sampling by Tomek Links.

In [9]:
smt = SMOTETomek(random_state=42)
X_smt, y_smt = smt.fit_sample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_smt)))

X_train_smt, X_test_smt, y_train_smt, y_test_smt = train_test_split(X_smt, y_smt, test_size=0.25, random_state=42)

print(X_train_smt.shape, y_train_smt.shape)
print(X_test_smt.shape, y_test_smt.shape)



Resampled dataset shape: Counter({'Poor': 489457, 'Fair': 466662, 'Good': 464402})
(1065390, 28) (1065390,)
(355131, 28) (355131,)


In [10]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_smt, y_train_smt)
y_pred = logreg.predict(X_test_smt)
    
print('Logistic Regression \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set: ', logreg.score(X_test_smt, y_test_smt))
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4228282600737758
Accuracy Score, Test Set:  0.42106433963804907
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.17      0.24    117418
        Good       0.42      0.51      0.46    115710
        Poor       0.43      0.58      0.49    122003

    accuracy                           0.42    355131
   macro avg       0.42      0.42      0.40    355131
weighted avg       0.42      0.42      0.40    355131



In [11]:
# k-nearest neighbors
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train_smt, y_train_smt)
y_pred = knn.predict(X_test_smt)
    
print('KNN Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set: ', knn.score(X_test_smt, y_test_smt))

# classificatin report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

KNN Classifier 

Accuracy Score, Training Set:  0.7385079642196755
Accuracy Score, Test Set:  0.6971906141677296
Classification Report 

              precision    recall  f1-score   support

        Fair       0.65      0.71      0.68    117418
        Good       0.70      0.58      0.63    115710
        Poor       0.74      0.80      0.77    122003

    accuracy                           0.70    355131
   macro avg       0.70      0.70      0.69    355131
weighted avg       0.70      0.70      0.69    355131



In [12]:
# decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_smt, y_train_smt)
y_pred = decision_tree.predict(X_test_smt)
    
print('Decision Tree Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_smt, y_test_smt))

# classification report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999774730380424
Accuracy Score, Test Set: 0.754707417826098
Classification Report 

              precision    recall  f1-score   support

        Fair       0.73      0.73      0.73    117418
        Good       0.73      0.71      0.72    115710
        Poor       0.81      0.82      0.81    122003

    accuracy                           0.75    355131
   macro avg       0.75      0.75      0.75    355131
weighted avg       0.75      0.75      0.75    355131



In [13]:
# random forest classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smt, y_train_smt)
y_pred = rf.predict(X_test_smt)
    
print('Random Forest Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', rf.score(X_train_smt, y_train_smt))
print('Accuracy Score, Test Set:', rf.score(X_test_smt, y_test_smt))
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_smt, y_pred))

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999680868038934
Accuracy Score, Test Set: 0.8313354790204178
Classification Report 

              precision    recall  f1-score   support

        Fair       0.81      0.81      0.81    117418
        Good       0.81      0.80      0.80    115710
        Poor       0.87      0.89      0.88    122003

    accuracy                           0.83    355131
   macro avg       0.83      0.83      0.83    355131
weighted avg       0.83      0.83      0.83    355131



# SMOTE ENN

This method uses a combination of SMOTE (Synthetic Minority Over-sampling Technique) over sampling and under sampling using Edited Nearest Neighbors.

In [14]:
sme = SMOTEENN(random_state=42)
X_senn, y_senn = sme.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_senn)))

X_train_senn, X_test_senn, y_train_senn, y_test_senn = train_test_split(X_smt, y_smt, test_size=0.25, random_state=42)

print(X_train_senn.shape, y_train_senn.shape)
print(X_test_senn.shape, y_test_senn.shape)



Resampled dataset shape: Counter({'Poor': 336884, 'Fair': 258967, 'Good': 236591})
(1065390, 28) (1065390,)
(355131, 28) (355131,)


In [15]:
# logistic regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_senn, y_train_senn)
y_pred = logreg.predict(X_test_senn)
    
print('Logistic Regression \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set: ', logreg.score(X_test_senn, y_test_senn))
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4228282600737758
Accuracy Score, Test Set:  0.42106433963804907
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.17      0.24    117418
        Good       0.42      0.51      0.46    115710
        Poor       0.43      0.58      0.49    122003

    accuracy                           0.42    355131
   macro avg       0.42      0.42      0.40    355131
weighted avg       0.42      0.42      0.40    355131



In [16]:
# k-nearest neighbors
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train_senn, y_train_senn)
y_pred = knn.predict(X_test_senn)
    
print('KNN Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set: ', knn.score(X_test_senn, y_test_senn))

# classificatin report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

KNN Classifier 

Accuracy Score, Training Set:  0.7385079642196755
Accuracy Score, Test Set:  0.6971906141677296
Classification Report 

              precision    recall  f1-score   support

        Fair       0.65      0.71      0.68    117418
        Good       0.70      0.58      0.63    115710
        Poor       0.74      0.80      0.77    122003

    accuracy                           0.70    355131
   macro avg       0.70      0.70      0.69    355131
weighted avg       0.70      0.70      0.69    355131



In [17]:
# decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_senn, y_train_senn)
y_pred = decision_tree.predict(X_test_senn)
    
print('Decision Tree Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set:', decision_tree.score(X_test_senn, y_test_senn))
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999774730380424
Accuracy Score, Test Set: 0.754707417826098
Classification Report 

              precision    recall  f1-score   support

        Fair       0.73      0.73      0.73    117418
        Good       0.73      0.71      0.72    115710
        Poor       0.81      0.82      0.81    122003

    accuracy                           0.75    355131
   macro avg       0.75      0.75      0.75    355131
weighted avg       0.75      0.75      0.75    355131



In [18]:
# random forest classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_senn, y_train_senn)
y_pred = rf.predict(X_test_senn)
    
print('Random Forest Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set:', rf.score(X_train_senn, y_train_senn))
print('Accuracy Score, Test Set:', rf.score(X_test_senn, y_test_senn))
    
# classification report
print('Classification Report \n')
print(classification_report(y_test_senn, y_pred))

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999680868038934
Accuracy Score, Test Set: 0.8313354790204178
Classification Report 

              precision    recall  f1-score   support

        Fair       0.81      0.81      0.81    117418
        Good       0.81      0.80      0.80    115710
        Poor       0.87      0.89      0.88    122003

    accuracy                           0.83    355131
   macro avg       0.83      0.83      0.83    355131
weighted avg       0.83      0.83      0.83    355131

