# General Overview - Under Sampling Methods

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, cross_val_score, GridSearchCV, train_test_split)
from sklearn.metrics import classification_report

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import imblearn
from imblearn.under_sampling import (RandomUnderSampler, TomekLinks, EditedNearestNeighbours, NearMiss)



In [3]:
np.random.seed(42)

In [4]:
# import data
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/tree_ml.csv', index_col=0)

tree = data.copy()

In [5]:
tree.head()

Unnamed: 0,tree_dbh,curb_loc,health,sidewalk,root_stone,root_grate,root_other,trunk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,longitude,latitude,num_problems,1or2,3or4,4orMore,Stew_N,Guard_N,Harmful,Helpful,Unsure,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,3,1,Fair,0,0,0,0,0,0,0,0,0,0,-73.844215,40.723092,0,0,0,0,1,1,0,0,0,0,0,0,1,0
1,21,1,Fair,1,1,0,0,0,0,0,0,0,0,-73.818679,40.794111,1,0,0,0,1,1,0,0,0,0,0,0,1,0
2,3,1,Good,1,0,0,0,0,0,0,0,0,0,-73.936608,40.717581,0,1,0,0,0,1,0,0,0,0,1,0,0,0
3,10,1,Good,1,1,0,0,0,0,0,0,0,0,-73.934456,40.713537,1,0,0,0,1,1,0,0,0,0,1,0,0,0
4,21,1,Good,1,1,0,0,0,0,0,0,0,0,-73.975979,40.666778,1,0,0,0,1,1,0,0,0,0,1,0,0,0


In [6]:
tree.shape

(651535, 29)

##  separate variables using train test split

In [7]:
y = tree['health'].values
X = tree.drop('health', axis=1).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(488651, 28) (488651,)
(162884, 28) (162884,)


## baseline - DummyClassifier

The dummy classifier is our baseline, it makes predictions based on simply guessing. We use these results as comparisons to real classifiers.

In [8]:
strategies = ['most_frequent', 'stratified', 'uniform', 'constant']
  
for s in strategies: 
    if s =='constant': 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42, constant='Good') 
    else: 
        dummy_classifier = DummyClassifier(strategy=s, random_state=42) 
    dummy_classifier.fit(X_train, y_train) 
    score = dummy_classifier.score(X_test, y_test) 
    print(s, score.round(2))

most_frequent 0.81
stratified 0.68
uniform 0.33
constant 0.81


# Algorithm Functions

In [9]:
# logistic regression
def logreg(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    print('Logistic Regression \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [10]:
# KNN classifier
def knn(X_train, X_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    print('KNN Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))
    
    # classificatin report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [11]:
# decision tree classifier
def decision_tree(X_train, X_test, y_train, y_test):
    decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    
    print('Decision Tree Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [12]:
# random forest classifier
def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    print('Random Forest Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', rf.score(X_train, y_train))
    print('Accuracy Score, Test Set:', rf.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [13]:
# Gaussian naive bayes
def gaussian(X_train, X_test, y_train, y_test):
    gaussian = GaussianNB().fit(X_train, y_train)
    y_pred = gaussian.predict(X_test)
    
    print('Gaussian Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
    print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

# Random Under Sampler

In [14]:
random_under = RandomUnderSampler(random_state=42)
X_rs, y_rs = random_under.fit_sample(X, y)

print('Random undersampling:', Counter(y_rs))

X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)

Random undersampling: Counter({'Fair': 26781, 'Good': 26781, 'Poor': 26781})
(60257, 28) (60257,)
(20086, 28) (20086,)




## Logistic Regression

In [15]:
logreg(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.41802612144647094
Accuracy Score, Test Set:  0.4101364134222842
Classification Report 

              precision    recall  f1-score   support

        Fair       0.38      0.18      0.25      6648
        Good       0.41      0.52      0.46      6744
        Poor       0.42      0.52      0.46      6694

    accuracy                           0.41     20086
   macro avg       0.40      0.41      0.39     20086
weighted avg       0.40      0.41      0.39     20086



## KNN Classifier

In [16]:
knn(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

KNN Classifier 

Accuracy Score, Training Set:  0.5229931792156928
Accuracy Score, Test Set:  0.4206412426565767
Classification Report 

              precision    recall  f1-score   support

        Fair       0.37      0.39      0.38      6648
        Good       0.45      0.47      0.46      6744
        Poor       0.45      0.40      0.42      6694

    accuracy                           0.42     20086
   macro avg       0.42      0.42      0.42     20086
weighted avg       0.42      0.42      0.42     20086



## Decision Tree Classifier

In [17]:
decision_tree(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.999983404417744
Accuracy Score, Test Set: 0.4332370805536194
Classification Report 

              precision    recall  f1-score   support

        Fair       0.39      0.39      0.39      6648
        Good       0.46      0.45      0.45      6744
        Poor       0.45      0.46      0.45      6694

    accuracy                           0.43     20086
   macro avg       0.43      0.43      0.43     20086
weighted avg       0.43      0.43      0.43     20086



## Random Forest Classifier

In [18]:
random_forest(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Random Forest Classifier 

Accuracy Score, Training Set: 0.999983404417744
Accuracy Score, Test Set: 0.45688539281091306
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.40      0.40      6648
        Good       0.48      0.50      0.49      6744
        Poor       0.49      0.47      0.48      6694

    accuracy                           0.46     20086
   macro avg       0.46      0.46      0.46     20086
weighted avg       0.46      0.46      0.46     20086



# Tomek Links

In [19]:
tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(X, y)

print('TomekLinks undersampling:', Counter(y_tomek))

X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tomek, y_tomek, test_size=0.25, random_state=42)

print(X_train_tl.shape, y_train_tl.shape)
print(X_test_tl.shape, y_test_tl.shape)

TomekLinks undersampling: Counter({'Good': 483315, 'Fair': 55899, 'Poor': 26781})
(424496, 28) (424496,)
(141499, 28) (141499,)




## Logistic Regression

In [20]:
logreg(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.8542341977309562
Accuracy Score, Test Set:  0.852733941582626
Classification Report 

              precision    recall  f1-score   support

        Fair       0.31      0.02      0.03     14059
        Good       0.86      1.00      0.92    120672
        Poor       0.63      0.01      0.02      6768

    accuracy                           0.85    141499
   macro avg       0.60      0.34      0.32    141499
weighted avg       0.79      0.85      0.79    141499



## KNN Classifier

In [21]:
knn(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

KNN Classifier 

Accuracy Score, Training Set:  0.8603025705778147
Accuracy Score, Test Set:  0.8534618619212857
Classification Report 

              precision    recall  f1-score   support

        Fair       0.40      0.06      0.10     14059
        Good       0.86      0.99      0.92    120672
        Poor       0.48      0.05      0.09      6768

    accuracy                           0.85    141499
   macro avg       0.58      0.37      0.37    141499
weighted avg       0.80      0.85      0.80    141499



## Decision Tree Classifier

In [22]:
decision_tree(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999858655911952
Accuracy Score, Test Set: 0.8075251415204348
Classification Report 

              precision    recall  f1-score   support

        Fair       0.32      0.32      0.32     14059
        Good       0.90      0.90      0.90    120672
        Poor       0.23      0.23      0.23      6768

    accuracy                           0.81    141499
   macro avg       0.48      0.48      0.48    141499
weighted avg       0.81      0.81      0.81    141499



## Random Forest Classifier

In [23]:
random_forest(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999246164863744
Accuracy Score, Test Set: 0.8700202828288539
Classification Report 

              precision    recall  f1-score   support

        Fair       0.61      0.24      0.35     14059
        Good       0.89      0.98      0.93    120672
        Poor       0.52      0.16      0.24      6768

    accuracy                           0.87    141499
   macro avg       0.67      0.46      0.51    141499
weighted avg       0.84      0.87      0.84    141499



# Edited Nearest Neighbors

In [24]:
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X, y)

print('Resampled dataset shape:', Counter(y_enn))

X_train_enn, X_test_enn, y_train_enn, y_test_enn = train_test_split(X_enn, y_enn, test_size=0.25, random_state=42)

print(X_train_enn.shape, y_train_enn.shape)
print(X_test_enn.shape, y_test_enn.shape)



Resampled dataset shape: Counter({'Good': 339374, 'Poor': 26781, 'Fair': 3713})
(277401, 28) (277401,)
(92467, 28) (92467,)




## Logistic Regression

In [25]:
logreg(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.9213593317976503
Accuracy Score, Test Set:  0.9212043215417393
Classification Report 



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        Fair       0.00      0.00      0.00       943
        Good       0.92      1.00      0.96     84805
        Poor       0.68      0.09      0.15      6719

    accuracy                           0.92     92467
   macro avg       0.54      0.36      0.37     92467
weighted avg       0.90      0.92      0.89     92467



## KNN Classifier

In [26]:
knn(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

KNN Classifier 

Accuracy Score, Training Set:  0.9271019210457064
Accuracy Score, Test Set:  0.9240485794932246
Classification Report 

              precision    recall  f1-score   support

        Fair       0.50      0.09      0.16       943
        Good       0.93      1.00      0.96     84805
        Poor       0.68      0.13      0.22      6719

    accuracy                           0.92     92467
   macro avg       0.70      0.41      0.45     92467
weighted avg       0.91      0.92      0.90     92467



## Decision Tree Classifier

In [27]:
decision_tree(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.9090270042285356
Classification Report 

              precision    recall  f1-score   support

        Fair       0.56      0.61      0.58       943
        Good       0.95      0.95      0.95     84805
        Poor       0.41      0.40      0.40      6719

    accuracy                           0.91     92467
   macro avg       0.64      0.65      0.64     92467
weighted avg       0.91      0.91      0.91     92467



## Random Forest Classifier

In [28]:
random_forest(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999603462136041
Accuracy Score, Test Set: 0.9420766327446548
Classification Report 

              precision    recall  f1-score   support

        Fair       0.89      0.68      0.77       943
        Good       0.95      1.00      0.97     84805
        Poor       0.82      0.30      0.44      6719

    accuracy                           0.94     92467
   macro avg       0.89      0.66      0.73     92467
weighted avg       0.94      0.94      0.93     92467



# Near Miss

In [29]:
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)
print('Resampled dataset shape:', Counter(y_nm))

X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(X_nm, y_nm, test_size=0.25, random_state=42)

print(X_train_nm.shape, y_train_nm.shape)
print(X_test_nm.shape, y_test_nm.shape)



Resampled dataset shape: Counter({'Fair': 26781, 'Good': 26781, 'Poor': 26781})
(60257, 28) (60257,)
(20086, 28) (20086,)


## Logistic Regression

In [30]:
logreg(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.5622749224156529
Accuracy Score, Test Set:  0.563974907896047
Classification Report 

              precision    recall  f1-score   support

        Fair       0.47      0.34      0.39      6648
        Good       0.51      0.78      0.62      6744
        Poor       0.77      0.57      0.66      6694

    accuracy                           0.56     20086
   macro avg       0.58      0.56      0.55     20086
weighted avg       0.58      0.56      0.56     20086



## KNN Classifier

In [31]:
knn(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

KNN Classifier 

Accuracy Score, Training Set:  0.6927161989478401
Accuracy Score, Test Set:  0.656526934183013
Classification Report 

              precision    recall  f1-score   support

        Fair       0.58      0.58      0.58      6648
        Good       0.62      0.85      0.72      6744
        Poor       0.84      0.54      0.66      6694

    accuracy                           0.66     20086
   macro avg       0.68      0.66      0.65     20086
weighted avg       0.68      0.66      0.65     20086



## Decision Tree Classifier

In [32]:
decision_tree(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999668088354879
Accuracy Score, Test Set: 0.6423877327491785
Classification Report 

              precision    recall  f1-score   support

        Fair       0.58      0.57      0.58      6648
        Good       0.70      0.71      0.70      6744
        Poor       0.65      0.64      0.64      6694

    accuracy                           0.64     20086
   macro avg       0.64      0.64      0.64     20086
weighted avg       0.64      0.64      0.64     20086



## Random Forest Classifier

In [33]:
random_forest(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999004265064639
Accuracy Score, Test Set: 0.6968037439012247
Classification Report 

              precision    recall  f1-score   support

        Fair       0.64      0.64      0.64      6648
        Good       0.72      0.80      0.76      6744
        Poor       0.73      0.64      0.68      6694

    accuracy                           0.70     20086
   macro avg       0.70      0.70      0.70     20086
weighted avg       0.70      0.70      0.70     20086

