# General Overview - Over Sampling Methods

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (cross_val_score, train_test_split)
from sklearn.metrics import classification_report

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import imblearn
from imblearn.over_sampling import (RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE)



In [2]:
np.random.seed(42)

In [3]:
# import data
data = pd.read_csv('/tree_ml.csv', index_col=0)

tree = data.copy()

In [4]:
tree.head()

Unnamed: 0,tree_dbh,curb_loc,health,sidewalk,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,longitude,latitude,num_problems,1or2,3or4,4orMore,Stew_N,Guard_N,Harmful,Helpful,Unsure,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,3,1,Fair,0,0,0,0,0,0,0,0,0,0,-73.844215,40.723092,0,0,0,0,1,1,0,0,0,0,0,0,1,0
1,21,1,Fair,1,1,0,0,0,0,0,0,0,0,-73.818679,40.794111,1,0,0,0,1,1,0,0,0,0,0,0,1,0
2,3,1,Good,1,0,0,0,0,0,0,0,0,0,-73.936608,40.717581,0,1,0,0,0,1,0,0,0,0,1,0,0,0
3,10,1,Good,1,1,0,0,0,0,0,0,0,0,-73.934456,40.713537,1,0,0,0,1,1,0,0,0,0,1,0,0,0
4,21,1,Good,1,1,0,0,0,0,0,0,0,0,-73.975979,40.666778,1,0,0,0,1,1,0,0,0,0,1,0,0,0


In [5]:
tree.shape

(651535, 29)

## separate variables using train test split

In [6]:
# target variable = health
y = tree['health'].values
X = tree.drop('health', axis=1).values

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 28) (488651,)
(162884, 28) (162884,)


## algorithm functions

In [7]:
# logistic regression

def logreg(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    print('Logistic Regression \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [8]:
# k-nearest neighbors

def knn(X_train, X_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    print('KNN Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))
    
    # classificatin report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [9]:
# decision tree classifier

def decision_tree(X_train, X_test, y_train, y_test):
    decision_tree = DecisionTreeClassifier(random_state=42)
    decision_tree.fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    
    print('Decision Tree Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [10]:
# random forest classifier

def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    print('Random Forest Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', rf.score(X_train, y_train))
    print('Accuracy Score, Test Set:', rf.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

# Baseline - DummyClassifier

This is the control for our models. The accuracy scores show the success rates we should expect based on the strategies used for simple guessing.

In [11]:
strategies = ['most_frequent', 'stratified', 'uniform', 'constant'] # strategies available
  
for s in strategies: 
    if s =='constant': 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42, constant='Good') 
    else: 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42) 
    dummy_classifier.fit(X_train, y_train) 
    score = dummy_classifier.score(X_test, y_test) 
    print(s, score.round(2))

most_frequent 0.81
stratified 0.68
uniform 0.33
constant 0.81


# Random Over Sampler

This standard method of over sampling selects minority samples and replicates them until they match the number of samples of the majority class.

In [12]:
# initialize
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

print('Resampled dataset:', Counter(y_ros))

# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_ros, y_ros, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)



Resampled dataset: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})
(1188735, 28) (1188735,)
(396246, 28) (396246,)


## Logistic Regression

In [13]:
logreg(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4163938977147977
Accuracy Score, Test Set:  0.41573668882461906
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.17      0.24    132500
        Good       0.41      0.55      0.47    131527
        Poor       0.43      0.53      0.47    132219

    accuracy                           0.42    396246
   macro avg       0.41      0.42      0.39    396246
weighted avg       0.41      0.42      0.39    396246



## KNN Classifier

In [14]:
knn(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

KNN Classifier 

Accuracy Score, Training Set:  0.8925647852549138
Accuracy Score, Test Set:  0.8357686891476507
Classification Report 

              precision    recall  f1-score   support

        Fair       0.75      0.90      0.82    132500
        Good       0.88      0.61      0.72    131527
        Poor       0.90      1.00      0.95    132219

    accuracy                           0.84    396246
   macro avg       0.84      0.84      0.83    396246
weighted avg       0.84      0.84      0.83    396246



## Decision Tree Classifier

In [15]:
decision_tree(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999882227746302
Accuracy Score, Test Set: 0.9386214624248573
Classification Report 

              precision    recall  f1-score   support

        Fair       0.88      0.99      0.93    132500
        Good       0.99      0.82      0.90    131527
        Poor       0.97      1.00      0.98    132219

    accuracy                           0.94    396246
   macro avg       0.94      0.94      0.94    396246
weighted avg       0.94      0.94      0.94    396246



## Random Forest Classifier

In [16]:
random_forest(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999747630884932
Accuracy Score, Test Set: 0.9533597815498454
Classification Report 

              precision    recall  f1-score   support

        Fair       0.90      0.99      0.94    132500
        Good       0.99      0.87      0.93    131527
        Poor       0.98      1.00      0.99    132219

    accuracy                           0.95    396246
   macro avg       0.96      0.95      0.95    396246
weighted avg       0.96      0.95      0.95    396246



# SMOTE - Synthetic Minority Over-sampling Technique

The over sampling method SMOTE, or Synthetic Minority Over-sampling Technique, selects minority samples and increases them randomly to match the majority classes.

In [17]:
# initialize
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)

print('Resampled dataset:', Counter(y_sm))

# train test split
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)

print(X_train_sm.shape, y_train_sm.shape)
print(X_test_sm.shape, y_test_sm.shape)



Resampled dataset: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})
(1188735, 28) (1188735,)
(396246, 28) (396246,)


## Logistic Regression

In [18]:
logreg(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.41613143383512724
Accuracy Score, Test Set:  0.4153959913790928
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.16      0.23    132500
        Good       0.41      0.54      0.47    131527
        Poor       0.42      0.54      0.48    132219

    accuracy                           0.42    396246
   macro avg       0.41      0.42      0.39    396246
weighted avg       0.41      0.42      0.39    396246



## KNN Classifier

In [19]:
knn(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

KNN Classifier 

Accuracy Score, Training Set:  0.8157949416817036
Accuracy Score, Test Set:  0.723179539982738
Classification Report 

              precision    recall  f1-score   support

        Fair       0.67      0.74      0.71    132500
        Good       0.71      0.62      0.66    131527
        Poor       0.79      0.80      0.80    132219

    accuracy                           0.72    396246
   macro avg       0.72      0.72      0.72    396246
weighted avg       0.72      0.72      0.72    396246



## Decision Tree Classifier

In [20]:
decision_tree(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9998704505209319
Accuracy Score, Test Set: 0.6994796161980184
Classification Report 

              precision    recall  f1-score   support

        Fair       0.67      0.67      0.67    132500
        Good       0.67      0.66      0.66    131527
        Poor       0.76      0.77      0.76    132219

    accuracy                           0.70    396246
   macro avg       0.70      0.70      0.70    396246
weighted avg       0.70      0.70      0.70    396246



## Random Forest Classifier

In [21]:
random_forest(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9998595145259457
Accuracy Score, Test Set: 0.7749680753875118
Classification Report 

              precision    recall  f1-score   support

        Fair       0.75      0.75      0.75    132500
        Good       0.75      0.73      0.74    131527
        Poor       0.82      0.85      0.84    132219

    accuracy                           0.77    396246
   macro avg       0.77      0.77      0.77    396246
weighted avg       0.77      0.77      0.77    396246



# ADASYN - Adaptive Synthetic

The Adaptive Synthetic method, or ADASYN, is similar to SMOTE over sampling. However, ADASYN selectively generates sample points around minority samples using a density distributor and does not create uniform weights like SMOTE.

In [22]:
# initialize
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_resample(X, y)

print('Resampled dataset:', Counter(y_ada))

# train test split
X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X_ada, y_ada, test_size=0.25, random_state=42)

print(X_train_ada.shape, y_train_ada.shape)
print(X_test_ada.shape, y_test_ada.shape)



Resampled dataset: Counter({'Fair': 534448, 'Good': 528327, 'Poor': 523028})
(1189352, 28) (1189352,)
(396451, 28) (396451,)


## Logistic Regression

In [23]:
logreg(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.40702416105576816
Accuracy Score, Test Set:  0.40713732592426305
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.20      0.26    133988
        Good       0.40      0.52      0.45    131343
        Poor       0.42      0.50      0.46    131120

    accuracy                           0.41    396451
   macro avg       0.40      0.41      0.39    396451
weighted avg       0.40      0.41      0.39    396451



## KNN Classifier

In [24]:
knn(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

KNN Classifier 

Accuracy Score, Training Set:  0.8057925660359591
Accuracy Score, Test Set:  0.7083977591177725
Classification Report 

              precision    recall  f1-score   support

        Fair       0.65      0.74      0.69    133988
        Good       0.70      0.59      0.64    131343
        Poor       0.78      0.80      0.79    131120

    accuracy                           0.71    396451
   macro avg       0.71      0.71      0.71    396451
weighted avg       0.71      0.71      0.71    396451



## Decision Tree Classifier

In [25]:
decision_tree(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999268509238645
Accuracy Score, Test Set: 0.6851842976811763
Classification Report 

              precision    recall  f1-score   support

        Fair       0.65      0.65      0.65    133988
        Good       0.65      0.65      0.65    131343
        Poor       0.75      0.76      0.75    131120

    accuracy                           0.69    396451
   macro avg       0.69      0.69      0.69    396451
weighted avg       0.68      0.69      0.68    396451



## Random Forest Classifier

In [26]:
random_forest(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999066718683788
Accuracy Score, Test Set: 0.7627928798262584
Classification Report 

              precision    recall  f1-score   support

        Fair       0.74      0.74      0.74    133988
        Good       0.74      0.71      0.72    131343
        Poor       0.81      0.84      0.83    131120

    accuracy                           0.76    396451
   macro avg       0.76      0.76      0.76    396451
weighted avg       0.76      0.76      0.76    396451



# SMOTE Extensions - Borderline SMOTE

[Borderline SMOTE](https://link.springer.com/chapter/10.1007/11538059_91) is an over sampling method in which the minority samples on the cusp of being in the majority pile are oversampled. 

In [27]:
# initialize
bsmt = BorderlineSMOTE(random_state=42)
X_bsmt, y_bsmt = bsmt.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_bsmt)))

# train test split
X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt = train_test_split(X_bsmt, y_bsmt, test_size=0.2, random_state=42)

print(X_train_bsmt.shape, y_train_bsmt.shape)
print(X_test_bsmt.shape, y_test_bsmt.shape)



Resampled dataset shape: Counter({'Fair': 528327, 'Good': 528327, 'Poor': 528327})
(1267984, 28) (1267984,)
(316997, 28) (316997,)


## Logistic Regression

In [28]:
logreg(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4567462996378503
Accuracy Score, Test Set:  0.4576131635315161
Classification Report 

              precision    recall  f1-score   support

        Fair       0.42      0.22      0.29    106019
        Good       0.45      0.56      0.50    105078
        Poor       0.49      0.59      0.53    105900

    accuracy                           0.46    316997
   macro avg       0.45      0.46      0.44    316997
weighted avg       0.45      0.46      0.44    316997



## KNN Classifier

In [29]:
knn(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

KNN Classifier 

Accuracy Score, Training Set:  0.8736987217504322
Accuracy Score, Test Set:  0.8221434272248633
Classification Report 

              precision    recall  f1-score   support

        Fair       0.76      0.82      0.79    106019
        Good       0.82      0.72      0.77    105078
        Poor       0.88      0.93      0.90    105900

    accuracy                           0.82    316997
   macro avg       0.82      0.82      0.82    316997
weighted avg       0.82      0.82      0.82    316997



## Decision Tree Classifier

In [30]:
decision_tree(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9997870635591616
Accuracy Score, Test Set: 0.7964649507724048
Classification Report 

              precision    recall  f1-score   support

        Fair       0.75      0.76      0.76    106019
        Good       0.76      0.73      0.74    105078
        Poor       0.88      0.90      0.89    105900

    accuracy                           0.80    316997
   macro avg       0.80      0.80      0.80    316997
weighted avg       0.80      0.80      0.80    316997



## Random Forest Classifier

In [31]:
random_forest(X_train_bsmt, X_test_bsmt, y_train_bsmt, y_test_bsmt)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9997618266476549
Accuracy Score, Test Set: 0.8541090294229914
Classification Report 

              precision    recall  f1-score   support

        Fair       0.82      0.82      0.82    106019
        Good       0.82      0.80      0.81    105078
        Poor       0.91      0.93      0.92    105900

    accuracy                           0.85    316997
   macro avg       0.85      0.85      0.85    316997
weighted avg       0.85      0.85      0.85    316997

