# General Overview - Under Sampling Methods

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, 
                                     cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (classification_report,
                             confusion_matrix)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import imblearn
from imblearn.under_sampling import (RandomUnderSampler,
                                     TomekLinks,
                                     EditedNearestNeighbours,
                                     NearMiss)



In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv('tree_ml.csv', index_col=0) # import data
tree = data.copy() # save a copy of data as tree

In [4]:
tree.head()

Unnamed: 0,health,health_l,num_problems,tree_dbh,root_stone_l,root_grate_l,root_other_l,trunk_wire_l,trnk_light_l,trnk_other_l,...,OnCurb,Harmful,Helpful,Unsure,Damage,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Fair,1,0,3,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,Fair,1,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,Good,2,0,3,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
3,Good,2,1,10,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,Good,2,1,21,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [5]:
tree.shape

(651535, 26)

##  separate variables using train test split

In [6]:
tree_ml = tree.drop(columns='health_l') # keep the categorical column

In [7]:
# target variable = health
y = tree_ml['health'].values
X = tree_ml.drop('health', axis=1).values

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 24) (488651,)
(162884, 24) (162884,)


## baseline - DummyClassifier

The dummy classifier is our baseline, it makes predictions based on simply guessing. We use these results as comparisons to real classifiers.

In [8]:
strategies = ['most_frequent', 'stratified', 'uniform', 'constant'] # strategies available
  
for s in strategies: 
    if s =='constant': 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42, constant='Good') 
    else: 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42) 
    dummy_classifier.fit(X_train, y_train) 
    score = dummy_classifier.score(X_test, y_test) 
    print(s, score)

most_frequent 0.8108960978364972
stratified 0.6815525159008865
uniform 0.3338879202377152
constant 0.8108960978364972


# Algorithm Functions

In [9]:
# logistic regression
def logreg(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    print('Logistic Regression \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))
    print()
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [10]:
# KNN classifier
def knn(X_train, X_test, y_train, y_test):
    # using 6 neighbors
    knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    print('KNN Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))
    print()
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classificatin report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [11]:
# decision tree classifier
def decision_tree(X_train, X_test, y_train, y_test):
    decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    
    print('Decision Tree Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [12]:
# random forest classifier
def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    print('Random Forest Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', rf.score(X_train, y_train))
    print('Accuracy Score, Test Set:', rf.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [13]:
# Gaussian naive bayes
def gaussian(X_train, X_test, y_train, y_test):
    gaussian = GaussianNB().fit(X_train, y_train)
    y_pred = gaussian.predict(X_test)
    
    print('Gaussian Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
    print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

# Random Under Sampler

In [14]:
# initialize
random_under = RandomUnderSampler(random_state=42)
X_rs, y_rs = random_under.fit_sample(X, y)

print('Random undersampling {}'.format(Counter(y_rs)))

# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)

Random undersampling Counter({'Fair': 26781, 'Good': 26781, 'Poor': 26781})
(60257, 24) (60257,)
(20086, 24) (20086,)




## Logistic Regression

In [15]:
logreg(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.41839122425610303
Accuracy Score, Test Set:  0.4137707856218261

Confusion Matrix: 
 [[1141 2785 2722]
 [ 940 3552 2252]
 [ 880 2196 3618]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.17      0.24      6648
        Good       0.42      0.53      0.47      6744
        Poor       0.42      0.54      0.47      6694

    accuracy                           0.41     20086
   macro avg       0.41      0.41      0.39     20086
weighted avg       0.41      0.41      0.39     20086



## KNN Classifier

In [16]:
knn(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

KNN Classifier 

Accuracy Score, Training Set:  0.44910964701196543
Accuracy Score, Test Set:  0.3725480434133227

Confusion Matrix: 
 [[3141 1950 1557]
 [3106 2374 1264]
 [3016 1710 1968]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.34      0.47      0.39      6648
        Good       0.39      0.35      0.37      6744
        Poor       0.41      0.29      0.34      6694

    accuracy                           0.37     20086
   macro avg       0.38      0.37      0.37     20086
weighted avg       0.38      0.37      0.37     20086



## Decision Tree Classifier

In [17]:
decision_tree(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.543107024909969
Accuracy Score, Test Set: 0.4008762322015334
Confusion Matrix: 
 [[2043 2631 1974]
 [1791 3371 1582]
 [1734 2322 2638]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.37      0.31      0.33      6648
        Good       0.40      0.50      0.45      6744
        Poor       0.43      0.39      0.41      6694

    accuracy                           0.40     20086
   macro avg       0.40      0.40      0.40     20086
weighted avg       0.40      0.40      0.40     20086



## Random Forest Classifier

In [18]:
random_forest(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Random Forest Classifier 

Accuracy Score, Training Set: 0.543040642580945
Accuracy Score, Test Set: 0.4056058946529921
Confusion Matrix: 
 [[1801 2573 2274]
 [1532 3372 1840]
 [1500 2220 2974]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.37      0.27      0.31      6648
        Good       0.41      0.50      0.45      6744
        Poor       0.42      0.44      0.43      6694

    accuracy                           0.41     20086
   macro avg       0.40      0.41      0.40     20086
weighted avg       0.40      0.41      0.40     20086



## Gaussian Naive Bayes

In [19]:
gaussian(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.39248552035448164
Accuracy Score, Test Set: 0.3930100567559494
Confusion Matrix: 
 [[ 535 4812 1301]
 [ 446 5728  570]
 [ 409 4654 1631]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.08      0.13      6648
        Good       0.38      0.85      0.52      6744
        Poor       0.47      0.24      0.32      6694

    accuracy                           0.39     20086
   macro avg       0.41      0.39      0.33     20086
weighted avg       0.41      0.39      0.33     20086



# Tomek Links

In [20]:
tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(X, y)

print('TomekLinks undersampling {}'.format(Counter(y_tomek)))

# train test split
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tomek, y_tomek, test_size=0.25, random_state=42)

print(X_train_tl.shape, y_train_tl.shape)
print(X_test_tl.shape, y_test_tl.shape)



TomekLinks undersampling Counter({'Good': 527892, 'Fair': 96026, 'Poor': 26781})
(488024, 24) (488024,)
(162675, 24) (162675,)


## Logistic Regression

In [21]:
logreg(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.8112920676032326
Accuracy Score, Test Set:  0.8104933148916551

Confusion Matrix: 
 [[   394  23498      1]
 [   461 131452      3]
 [   389   6476      1]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.32      0.02      0.03     23893
        Good       0.81      1.00      0.90    131916
        Poor       0.20      0.00      0.00      6866

    accuracy                           0.81    162675
   macro avg       0.44      0.34      0.31    162675
weighted avg       0.72      0.81      0.73    162675



## KNN Classifier

In [22]:
knn(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

KNN Classifier 

Accuracy Score, Training Set:  0.800884792551186
Accuracy Score, Test Set:  0.7901152604887045

Confusion Matrix: 
 [[  2030  21718    145]
 [  5402 126352    162]
 [   735   5981    150]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.25      0.08      0.13     23893
        Good       0.82      0.96      0.88    131916
        Poor       0.33      0.02      0.04      6866

    accuracy                           0.79    162675
   macro avg       0.47      0.35      0.35    162675
weighted avg       0.72      0.79      0.74    162675



## Decision Tree Classifier

In [23]:
decision_tree(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.8255208760224907
Accuracy Score, Test Set: 0.8012417396649761
Confusion Matrix: 
 [[  1413  22231    249]
 [  2833 128726    357]
 [   544   6119    203]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.29      0.06      0.10     23893
        Good       0.82      0.98      0.89    131916
        Poor       0.25      0.03      0.05      6866

    accuracy                           0.80    162675
   macro avg       0.46      0.35      0.35    162675
weighted avg       0.72      0.80      0.74    162675



## Random Forest Classifier

In [24]:
random_forest(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Random Forest Classifier 

Accuracy Score, Training Set: 0.8255167778633837
Accuracy Score, Test Set: 0.805489472875365
Confusion Matrix: 
 [[  1077  22548    268]
 [  1814 129715    387]
 [   409   6216    241]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.33      0.05      0.08     23893
        Good       0.82      0.98      0.89    131916
        Poor       0.27      0.04      0.06      6866

    accuracy                           0.81    162675
   macro avg       0.47      0.35      0.34    162675
weighted avg       0.72      0.81      0.74    162675



## Gaussian Naive Bayes

In [25]:
gaussian(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.734986394111765
Accuracy Score, Test Set: 0.7342800061472261
Confusion Matrix: 
 [[  2926  18145   2822]
 [ 10284 115264   6368]
 [   746   4861   1259]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.21      0.12      0.15     23893
        Good       0.83      0.87      0.85    131916
        Poor       0.12      0.18      0.15      6866

    accuracy                           0.73    162675
   macro avg       0.39      0.39      0.38    162675
weighted avg       0.71      0.73      0.72    162675



# Edited Nearest Neighbors

In [26]:
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_enn)))

# train test split
X_train_enn, X_test_enn, y_train_enn, y_test_enn = train_test_split(X_enn, y_enn, test_size=0.25, random_state=42)

print(X_train_enn.shape, y_train_enn.shape)
print(X_test_enn.shape, y_test_enn.shape)



Resampled dataset shape: Counter({'Good': 312229, 'Poor': 26781, 'Fair': 1553})
(255422, 24) (255422,)
(85141, 24) (85141,)


## Logistic Regression

In [27]:
logreg(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.9208603800768923
Accuracy Score, Test Set:  0.9221174287358617

Confusion Matrix: 
 [[    1   288   104]
 [    2 77852   283]
 [   10  5944   657]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.08      0.00      0.00       393
        Good       0.93      1.00      0.96     78137
        Poor       0.63      0.10      0.17      6611

    accuracy                           0.92     85141
   macro avg       0.54      0.37      0.38     85141
weighted avg       0.90      0.92      0.89     85141



## KNN Classifier

In [28]:
knn(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

KNN Classifier 

Accuracy Score, Training Set:  0.9552231209527762
Accuracy Score, Test Set:  0.9516331732067982

Confusion Matrix: 
 [[  254    57    82]
 [   22 77848   267]
 [  117  3573  2921]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.65      0.65      0.65       393
        Good       0.96      1.00      0.98     78137
        Poor       0.89      0.44      0.59      6611

    accuracy                           0.95     85141
   macro avg       0.83      0.69      0.74     85141
weighted avg       0.95      0.95      0.94     85141



## Decision Tree Classifier

In [29]:
decision_tree(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.965269240707535
Accuracy Score, Test Set: 0.9565896571569514
Confusion Matrix: 
 [[  334    23    36]
 [   19 77849   269]
 [  112  3237  3262]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.72      0.85      0.78       393
        Good       0.96      1.00      0.98     78137
        Poor       0.91      0.49      0.64      6611

    accuracy                           0.96     85141
   macro avg       0.86      0.78      0.80     85141
weighted avg       0.96      0.96      0.95     85141



## Random Forest Classifier

In [30]:
random_forest(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Random Forest Classifier 

Accuracy Score, Training Set: 0.965269240707535
Accuracy Score, Test Set: 0.957799415087913
Confusion Matrix: 
 [[  330    23    40]
 [    6 77980   151]
 [   96  3277  3238]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.76      0.84      0.80       393
        Good       0.96      1.00      0.98     78137
        Poor       0.94      0.49      0.65      6611

    accuracy                           0.96     85141
   macro avg       0.89      0.78      0.81     85141
weighted avg       0.96      0.96      0.95     85141



## Gaussian Naive Bayes

In [31]:
gaussian(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.843854483952048
Accuracy Score, Test Set: 0.84417613135857
Confusion Matrix: 
 [[  128    71   194]
 [ 3137 70512  4488]
 [  750  4627  1234]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.03      0.33      0.06       393
        Good       0.94      0.90      0.92     78137
        Poor       0.21      0.19      0.20      6611

    accuracy                           0.84     85141
   macro avg       0.39      0.47      0.39     85141
weighted avg       0.88      0.84      0.86     85141



# Near Miss

In [32]:
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)
print('Resampled dataset shape: {}'.format(Counter(y_nm)))

# train test split
X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(X_nm, y_nm, test_size=0.25, random_state=42)

print(X_train_nm.shape, y_train_nm.shape)
print(X_test_nm.shape, y_test_nm.shape)



Resampled dataset shape: Counter({'Fair': 26781, 'Good': 26781, 'Poor': 26781})
(60257, 24) (60257,)
(20086, 24) (20086,)


## Logistic Regression

In [33]:
logreg(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.4476492357734371
Accuracy Score, Test Set:  0.44538484516578714

Confusion Matrix: 
 [[2515 3179  954]
 [1979 4064  701]
 [2042 2285 2367]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.38      0.38      6648
        Good       0.43      0.60      0.50      6744
        Poor       0.59      0.35      0.44      6694

    accuracy                           0.45     20086
   macro avg       0.47      0.44      0.44     20086
weighted avg       0.47      0.45      0.44     20086



## KNN Classifier

In [34]:
knn(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

KNN Classifier 

Accuracy Score, Training Set:  0.44540883216887667
Accuracy Score, Test Set:  0.4092402668525341

Confusion Matrix: 
 [[3267 2377 1004]
 [3119 2818  807]
 [2794 1765 2135]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.36      0.49      0.41      6648
        Good       0.40      0.42      0.41      6744
        Poor       0.54      0.32      0.40      6694

    accuracy                           0.41     20086
   macro avg       0.43      0.41      0.41     20086
weighted avg       0.43      0.41      0.41     20086



## Decision Tree Classifier

In [35]:
decision_tree(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.5063478102129213
Accuracy Score, Test Set: 0.46883401374091405
Confusion Matrix: 
 [[2891 2991  766]
 [2178 4076  490]
 [2118 2126 2450]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.43      0.42      6648
        Good       0.44      0.60      0.51      6744
        Poor       0.66      0.37      0.47      6694

    accuracy                           0.47     20086
   macro avg       0.50      0.47      0.47     20086
weighted avg       0.50      0.47      0.47     20086



## Random Forest Classifier

In [36]:
random_forest(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Random Forest Classifier 

Accuracy Score, Training Set: 0.5062648323016413
Accuracy Score, Test Set: 0.47301603106641443
Confusion Matrix: 
 [[2795 3007  846]
 [2076 4124  544]
 [1968 2144 2582]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.41      0.42      0.41      6648
        Good       0.44      0.61      0.51      6744
        Poor       0.65      0.39      0.48      6694

    accuracy                           0.47     20086
   macro avg       0.50      0.47      0.47     20086
weighted avg       0.50      0.47      0.47     20086



## Gaussian Naive Bayes

In [37]:
gaussian(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.4020279801516836
Accuracy Score, Test Set: 0.39744100368415813
Confusion Matrix: 
 [[ 589 5864  195]
 [ 383 6249  112]
 [ 981 4568 1145]]

Classification Report 

              precision    recall  f1-score   support

        Fair       0.30      0.09      0.14      6648
        Good       0.37      0.93      0.53      6744
        Poor       0.79      0.17      0.28      6694

    accuracy                           0.40     20086
   macro avg       0.49      0.40      0.32     20086
weighted avg       0.49      0.40      0.32     20086

