# Q2 Imbalanced Data Classification

## Reference

* 机器学习之类别不平衡问题 (3) —— 采样方法: https://www.cnblogs.com/massquantity/p/9382710.html
* 不平衡数据集的处理: https://www.cnblogs.com/kamekin/p/9824294.html
* imblearn document: https://imbalanced-learn.org/stable/index.html
* https://books.google.com.ph/books?id=GvKrDwAAQBAJ&pg=PA452&lpg=PA452&dq=smote+SMOTEENN&source=bl&ots=uD6yhIQ_cZ&sig=ACfU3U01CuiKc1bOPmQgsOdaMUekdiozRA&hl=en&sa=X&ved=2ahUKEwiEsIL_mt3pAhXoy4sBHfVDC38Q6AEwD3oECAoQAg#v=onepage&q=smote%20SMOTEENN&f=false

In [553]:
import numpy as np
import pandas as pd
from collections import Counter

# Imbalanced learn
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedBaggingClassifier 

# Sklearn
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Outliers detection
from sklearn.ensemble import IsolationForest

## Bi-class Datasets

`v_train.csv` and `p_train.csv` are data sets with binary classes (e.g., positive, negative).

### Use SMOTE on `v_train.csv` dataset

In [276]:
Xv = pd.read_csv("v/v_train.csv", names=[0,1,2,3,4,5,6,7,8,9,'label'])
#Xv.label.loc[Xv.label==' negative'] = 0
#Xv.label.loc[Xv.label==' positive'] = 1

In [277]:
X = Xv.iloc[:,0:10]
y = Xv.iloc[:, 10]

In [278]:
print("Before SMOTE")
print(Counter(Xv.label))

smote = SMOTE(random_state=0) 

X_smote, y_smote = smote.fit_sample(X, y)

print("After SMOTE")
print(Counter(y_smote))

X_train, X_val, y_train, y_val = train_test_split(X_smote, y_smote, test_size=0.2)

# Random Forest Model
rf = RandomForestClassifier(n_estimators=5, random_state=0, max_depth=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val) # do predict on validation set
print("Random Forest evalute on validation set")
print(classification_report(y_val, y_pred)) # Show the evaluation result

Before SMOTE
Counter({' negative': 867, ' positive': 88})
After SMOTE
Counter({' positive': 867, ' negative': 867})
Random Forest evalute on validation set
              precision    recall  f1-score   support

    negative       0.94      0.92      0.93       161
    positive       0.93      0.95      0.94       186

    accuracy                           0.94       347
   macro avg       0.94      0.94      0.94       347
weighted avg       0.94      0.94      0.94       347



In [279]:
# Predict on testing set
Xv_test = pd.read_csv("v/v_test.csv")
X_test = Xv_test.iloc[:,:10]
pred = rf.predict(Xv_test.iloc[:, :10])
Xv_test['label'] = pred

In [280]:
# Save to CSV file
Xv_test.to_csv("v_test_pred.csv", index=False)

### Use SMOTEENN on `p_train.csv` dataset

In [281]:
Xp = pd.read_csv("p/p_train.csv", names=[0,1,2,3,4,5,6,7,'label'])

In [282]:
X = Xp.iloc[:,0:8]
y = Xp.iloc[:, 8]

In [283]:
print("Before SMOTEENN")
print(Counter(Xp.label))

sme = SMOTEENN(random_state=27)
X_sme, y_sme = sme.fit_resample(X, y)

print("After SMOTEENN")
print(Counter(y_sme))

X_train, X_val, y_train, y_val = train_test_split(X_sme, y_sme, test_size=0.2)

# Random Forest Model
rf = RandomForestClassifier(n_estimators=5, random_state=0, max_depth=2)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val) # do predict on validation set
print("Random Forest evalute on validation set")
print(classification_report(y_val, y_pred)) # Show the evaluation result

Before SMOTEENN
Counter({' negative': 484, ' positive': 261})
After SMOTEENN
Counter({' positive': 266, ' negative': 213})
Random Forest evalute on validation set
              precision    recall  f1-score   support

    negative       0.90      0.85      0.88        41
    positive       0.89      0.93      0.91        55

    accuracy                           0.90        96
   macro avg       0.90      0.89      0.89        96
weighted avg       0.90      0.90      0.90        96



In [284]:
# Predict on testing set
Xp_test = pd.read_csv("p/p_test.csv")
X_test = Xp_test.iloc[:,:8]
pred = rf.predict(Xp_test.iloc[:,:8])
Xp_test['label'] = pred

In [285]:
# Save to CSV file
Xp_test.to_csv("p_test_pred.csv", index=False)

## Multi-class Datasets

`y_train.csv`, `e_train.csv` and `a_train.csv` are datasets with multi-classes.

In [437]:
def balanced_random_forest(X, y, n=15, depth=2, state=0, divide=0.2):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=divide)

    # Balanced Random Forest Model
    brf = BalancedRandomForestClassifier(n_estimators=n, max_depth=depth, random_state=state)
    brf.fit(X_train, y_train)
    y_pred = brf.predict(X_val) # do predict on validation set
    print("Balanced Random Forest evalute on validation set")
    print(classification_report(y_val, y_pred)) # Show the evaluation result
    
    return brf

def rus_boost(X, y, n=50, lr=1e-3, state=0, divide=0.2):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=divide)

    # Random Forest Model
    rusb = RUSBoostClassifier(n_estimators=n, learning_rate=lr, random_state=state)
    rusb.fit(X_train, y_train)
    y_pred = rusb.predict(X_val) # do predict on validation set
    print("RUSBoost Classifier evalute on validation set")
    print(classification_report(y_val, y_pred)) # Show the evaluation result
    
    return rusb

def easy_ensemble(X, y, n=10, state=0, divide=0.2):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=divide)

    # Random Forest Model
    eec = EasyEnsembleClassifier(n_estimators=n, random_state=state)
    eec.fit(X_train, y_train)
    y_pred = eec.predict(X_val) # do predict on validation set
    print("Easy Ensemble Classifier evalute on validation set")
    print(classification_report(y_val, y_pred)) # Show the evaluation result
    
    return eec

def balanced_bagging(X, y, n=10, state=0, divide=0.2):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=divide)

    # Random Forest Model
    bbc = BalancedBaggingClassifier(n_estimators=n, random_state=state)
    bbc.fit(X_train, y_train)
    y_pred = bbc.predict(X_val) # do predict on validation set
    print("Balanced Bagging Classifier evalute on validation set")
    print(classification_report(y_val, y_pred)) # Show the evaluation result
    
    return bbc

## `y_train.csv`

In [457]:
Xy = pd.read_csv("y/y_train.csv", names=[0,1,2,3,4,5,6,7,'label'])
Counter(Xy.label)

Counter({'MIT': 234,
         'NUC': 410,
         'CYT': 450,
         'ME1': 36,
         'EXC': 33,
         'ME2': 46,
         'ME3': 140,
         'VAC': 30,
         'POX': 18,
         'ERL': 5})

In [458]:
X = Xy.iloc[:,0:8]
y = Xy.iloc[:, 8]

In [321]:
brf = balanced_random_forest(X, y)

Balanced Random Forest evalute on validation set
              precision    recall  f1-score   support

         CYT       0.49      0.57      0.53       109
         ERL       0.50      1.00      0.67         2
         EXC       0.33      0.57      0.42         7
         ME1       0.50      0.44      0.47         9
         ME2       0.21      0.33      0.26         9
         ME3       0.44      0.84      0.58        25
         MIT       0.47      0.16      0.24        44
         NUC       0.42      0.12      0.18        68
         POX       0.14      0.75      0.24         4
         VAC       0.08      0.25      0.12         4

    accuracy                           0.41       281
   macro avg       0.36      0.50      0.37       281
weighted avg       0.44      0.41      0.38       281



In [318]:
rusb = rus_boost(X, y)

RUSBoost Classifier evalute on validation set
              precision    recall  f1-score   support

         CYT       0.44      0.35      0.39        88
         ERL       0.50      1.00      0.67         1
         EXC       0.42      0.56      0.48         9
         ME1       0.50      0.50      0.50         8
         ME2       0.00      0.00      0.00         6
         ME3       0.76      0.76      0.76        29
         MIT       0.31      0.42      0.36        43
         NUC       0.59      0.39      0.47        89
         POX       0.67      0.40      0.50         5
         VAC       0.03      0.33      0.05         3

    accuracy                           0.42       281
   macro avg       0.42      0.47      0.42       281
weighted avg       0.49      0.42      0.45       281



In [462]:
eec = easy_ensemble(X, y, n=20)

Easy Ensemble Classifier evalute on validation set
              precision    recall  f1-score   support

         CYT       0.55      0.43      0.48        92
         ERL       0.33      1.00      0.50         1
         EXC       0.42      1.00      0.59         5
         ME1       0.25      0.33      0.29         3
         ME2       0.43      0.50      0.46        12
         ME3       0.68      0.62      0.65        24
         MIT       0.40      0.44      0.42        54
         NUC       0.61      0.32      0.42        79
         POX       0.33      0.17      0.22         6
         VAC       0.02      0.20      0.04         5

    accuracy                           0.42       281
   macro avg       0.40      0.50      0.41       281
weighted avg       0.52      0.42      0.45       281



In [467]:
bbc = balanced_bagging(X, y, n=20)

Balanced Bagging Classifier evalute on validation set
              precision    recall  f1-score   support

         CYT       0.44      0.36      0.40        83
         ERL       0.67      1.00      0.80         2
         EXC       0.41      0.70      0.52        10
         ME1       0.62      0.83      0.71         6
         ME2       0.17      0.38      0.23         8
         ME3       0.68      0.83      0.75        30
         MIT       0.59      0.47      0.52        47
         NUC       0.47      0.29      0.36        82
         POX       0.20      0.75      0.32         4
         VAC       0.04      0.11      0.06         9

    accuracy                           0.43       281
   macro avg       0.43      0.57      0.47       281
weighted avg       0.48      0.43      0.44       281



In [329]:
# Use RUSBoost Classifier to predect on testing set
Xy_test = pd.read_csv("y/y_test.csv")
X_test = Xy_test.iloc[:,:8]
pred = rusb.predict(Xy_test.iloc[:,:8])
Xy_test['label'] = pred

# Save
Xy_test.to_csv("y_test_pred.csv", index=False)

## `e_train.csv`

In [469]:
Xe = pd.read_csv("e/e_train.csv", names=[0,1,2,3,4,5,6,'label'])
Counter(Xe.label)

Counter({'cp': 143,
         'im': 77,
         'imS': 2,
         'imL': 2,
         'imU': 35,
         'om': 20,
         'omL': 5,
         'pp': 19})

In [470]:
X = Xe.iloc[:,0:7]
y = Xe.iloc[:, 7]

In [434]:
brf = balanced_random_forest(X, y, n=20, depth=2, state=9, divide=0.2)

Balanced Random Forest evalute on validation set
              precision    recall  f1-score   support

          cp       1.00      0.14      0.25        76
          im       0.82      0.84      0.83        38
         imL       0.00      0.00      0.00         0
         imS       0.00      0.00      0.00         1
         imU       0.75      0.35      0.48        17
          om       0.36      1.00      0.53         8
         omL       0.00      0.00      0.00         4
          pp       0.06      0.50      0.11         8

    accuracy                           0.40       152
   macro avg       0.37      0.35      0.28       152
weighted avg       0.81      0.40      0.42       152



  _warn_prf(average, modifier, msg_start, len(result))


In [492]:
rusb = rus_boost(X, y, n=50, lr=1e-2, state=0)

RUSBoost Classifier evalute on validation set
              precision    recall  f1-score   support

          cp       0.66      0.95      0.78        22
          im       1.00      0.11      0.19        19
         imL       0.00      0.00      0.00         0
         imS       0.07      1.00      0.12         1
         imU       0.50      0.12      0.20         8
          om       1.00      0.20      0.33         5
         omL       1.00      1.00      1.00         1
          pp       0.71      1.00      0.83         5

    accuracy                           0.52        61
   macro avg       0.62      0.55      0.43        61
weighted avg       0.77      0.52      0.48        61



  _warn_prf(average, modifier, msg_start, len(result))


In [455]:
eec = easy_ensemble(X, y, n=10)

Easy Ensemble Classifier evalute on validation set
              precision    recall  f1-score   support

          cp       0.96      0.77      0.85        30
          im       0.57      0.53      0.55        15
         imL       0.00      0.00      0.00         0
         imS       0.00      0.00      0.00         0
         imU       0.12      0.10      0.11        10
          om       0.25      0.50      0.33         2
         omL       1.00      0.50      0.67         2
          pp       0.00      0.00      0.00         2

    accuracy                           0.56        61
   macro avg       0.36      0.30      0.31        61
weighted avg       0.67      0.56      0.61        61



  _warn_prf(average, modifier, msg_start, len(result))


In [450]:
bbc = balanced_bagging(X, y, n=10)

Balanced Bagging Classifier evalute on validation set
              precision    recall  f1-score   support

          cp       0.88      0.96      0.92        23
          im       1.00      0.41      0.58        22
         imL       0.00      0.00      0.00         1
         imS       0.25      1.00      0.40         1
         imU       0.40      0.50      0.44         4
          om       1.00      0.75      0.86         4
         omL       0.50      1.00      0.67         1
          pp       0.50      1.00      0.67         5

    accuracy                           0.70        61
   macro avg       0.57      0.70      0.57        61
weighted avg       0.84      0.70      0.71        61



In [456]:
# Use Balanced Bagging Classifier to predict on testing set
Xe_test = pd.read_csv("e/e_test.csv")
X_test = Xe_test.iloc[:,:7]
pred = bbc.predict(Xe_test.iloc[:,:7])
Xe_test['label'] = pred

# Save
Xe_test.to_csv("e_test_pred.csv", index=False)

## `a_train.csv`

In [556]:
Xa = pd.read_csv("a/a_train.csv", names=[0,1,2,3,4,5,6,'label'])
Counter(Xa.label)

Counter({15: 103,
         7: 383,
         9: 669,
         10: 617,
         8: 553,
         20: 26,
         16: 67,
         19: 32,
         14: 126,
         11: 464,
         12: 264,
         18: 42,
         13: 200,
         5: 115,
         4: 56,
         6: 255,
         21: 14,
         17: 58,
         22: 6,
         1: 1,
         3: 15,
         26: 1,
         23: 9,
         29: 1,
         2: 1,
         27: 2,
         25: 1,
         24: 2})

In [494]:
len(Xa.label)

4083

In [549]:
X = Xa.iloc[:,0:7]
y = Xa.iloc[:, 7]

In [550]:
# Random over-sampling
ros = RandomOverSampler(random_state=0)
X_ros, y_ros = ros.fit_sample(X, y)

In [551]:
brf = balanced_random_forest(X_ros, y_ros, n=20, depth=2, state=9, divide=0.2)
rusb = rus_boost(X_ros, y_ros, n=50, lr=1e-2, state=0)
eec = easy_ensemble(X_ros, y_ros, n=10)
bbc = balanced_bagging(X_ros, y_ros, n=10)

Balanced Random Forest evalute on validation set
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       123
           2       0.81      1.00      0.89       141
           3       0.35      0.87      0.50       127
           4       0.18      0.08      0.11       137
           5       0.20      0.02      0.04       135
           6       0.00      0.00      0.00       129
           7       0.00      0.00      0.00       145
           8       0.00      0.00      0.00       142
           9       0.00      0.00      0.00       121
          10       0.05      0.01      0.01       136
          11       0.00      0.00      0.00       125
          12       0.00      0.00      0.00       144
          13       0.00      0.00      0.00       126
          14       0.00      0.00      0.00       143
          15       0.00      0.00      0.00       134
          16       0.00      0.00      0.00       146
          17       0.00      0.0

  _warn_prf(average, modifier, msg_start, len(result))


RUSBoost Classifier evalute on validation set
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       135
           2       0.00      0.00      0.00       147
           3       0.00      0.00      0.00       153
           4       0.00      0.00      0.00       122
           5       0.00      0.00      0.00       121
           6       0.00      0.00      0.00       145
           7       0.06      0.43      0.11       118
           8       0.20      0.25      0.23       130
           9       0.00      0.00      0.00       138
          10       0.00      0.00      0.00       120
          11       0.00      0.00      0.00       133
          12       0.00      0.00      0.00       141
          13       0.00      0.00      0.00       124
          14       0.00      0.00      0.00       131
          15       0.07      0.85      0.14       135
          16       0.00      0.00      0.00       128
          17       0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


Easy Ensemble Classifier evalute on validation set
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       119
           2       0.00      0.00      0.00       144
           3       0.07      0.13      0.09       149
           4       0.34      0.98      0.50       139
           5       0.71      0.11      0.19       139
           6       0.41      0.06      0.10       124
           7       0.00      0.00      0.00       119
           8       0.00      0.00      0.00       151
           9       0.00      0.00      0.00       141
          10       0.00      0.00      0.00       135
          11       0.00      0.00      0.00       139
          12       0.00      0.00      0.00       136
          13       0.00      0.00      0.00       139
          14       0.00      0.00      0.00       118
          15       0.00      0.00      0.00       141
          16       0.00      0.00      0.00       139
          17       0.00      0

  _warn_prf(average, modifier, msg_start, len(result))


Balanced Bagging Classifier evalute on validation set
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       145
           2       1.00      1.00      1.00       143
           3       1.00      1.00      1.00       134
           4       0.99      1.00      1.00       131
           5       0.92      0.99      0.95       131
           6       0.75      0.96      0.84       129
           7       0.78      0.74      0.76       160
           8       0.52      0.46      0.49       134
           9       0.25      0.24      0.25       125
          10       0.45      0.29      0.35       138
          11       0.64      0.51      0.57       150
          12       0.78      0.80      0.79       131
          13       0.84      0.94      0.89       121
          14       0.92      1.00      0.96       139
          15       0.95      0.96      0.96       136
          16       0.95      1.00      0.97       129
          17       0.97    

In [555]:
# Use Balanced Bagging Classifier to predict on testing set
Xa_test = pd.read_csv("a/a_test.csv")
X_test = Xa_test.iloc[:,:7]
pred = bbc.predict(Xa_test.iloc[:,:7])
Xa_test['label'] = pred

# Save
Xa_test.to_csv("a_test_pred.csv", index=False)