In [1]:
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.metrics import classification_report
from SMOTEBoost import SMOTEBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Toy example

In [2]:
X, y = make_moons(n_samples=1000, noise=0.3, random_state=0)
X_train, X_test, y_trian, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [3]:
smote = SMOTEBoostClassifier()
smote.fit(X_train, y_trian)


predicitons = smote.predict(X_test)

print('balanced_accuracy_score: ', balanced_accuracy_score(y_test, predicitons), 'f1_score: ', f1_score(y_test, predicitons))
print()
print(classification_report(y_test, predicitons))

balanced_accuracy_score:  0.8971015025164395 f1_score:  0.896969696969697

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       163
           1       0.91      0.89      0.90       167

    accuracy                           0.90       330
   macro avg       0.90      0.90      0.90       330
weighted avg       0.90      0.90      0.90       330



# Dataset adult.csv with Imbalanced classes

In [4]:
import os, sys

path = sys.path[0].replace('SMOTEBoost', 'datasets')
data_name = path + '\\' +  os.listdir(path)[0]

df = pd.read_csv(data_name)
y = (df[' income'] == ' >50K').astype(int)

X = df.drop(' income', axis=1)
X = pd.get_dummies(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

## Vary the parameter: depth of base estimator

In [27]:
for depth in [3, 5, 7, 20]:

    smote = SMOTEBoostClassifier(DecisionTreeClassifier(max_depth=depth))
    smote.fit(X_train, y_train)
    
    predicitons = smote.predict(X_test)
    
    print("Base estimator's depth: ", depth)
    print('balanced_accuracy_score: ', balanced_accuracy_score(y_test, predicitons), 'f1_score: ', f1_score(y_test, predicitons))
    print()
    print(classification_report(y_test, predicitons))
    print()

Base estimator's depth:  3
balanced_accuracy_score:  0.7418891827166849 f1_score:  0.6266548984995587

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      8151
           1       0.73      0.55      0.63      2595

    accuracy                           0.84     10746
   macro avg       0.80      0.74      0.76     10746
weighted avg       0.83      0.84      0.83     10746


Base estimator's depth:  5
balanced_accuracy_score:  0.738545999178795 f1_score:  0.624180790960452

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      8151
           1       0.75      0.53      0.62      2595

    accuracy                           0.85     10746
   macro avg       0.81      0.74      0.76     10746
weighted avg       0.84      0.85      0.84     10746


Base estimator's depth:  7
balanced_accuracy_score:  0.762060614570502 f1_score:  0.6598253275109169

              precision    recall  f1-

## Vary the parameter: number of syntatic samples

In [22]:
for _n_syn_samples in [3, 10, 20, 100]:

    smote = SMOTEBoostClassifier(n_syn_samples=_n_syn_samples)
    smote.fit(X_train, y_train)
    
    predicitons = smote.predict(X_test)
    
    print("Number of syntatic samples: ", _n_syn_samples)
    print('balanced_accuracy_score: ', balanced_accuracy_score(y_test, predicitons), 'f1_score: ', f1_score(y_test, predicitons))
    print()
    print(classification_report(y_test, predicitons))
    print()

Number of syntatic samples:  3
balanced_accuracy_score:  0.7511306460500254 f1_score:  0.6432049579459939

              precision    recall  f1-score   support

           0       0.87      0.94      0.91      8151
           1       0.76      0.56      0.64      2595

    accuracy                           0.85     10746
   macro avg       0.81      0.75      0.77     10746
weighted avg       0.84      0.85      0.84     10746


Number of syntatic samples:  10
balanced_accuracy_score:  0.7511306460500254 f1_score:  0.6432049579459939

              precision    recall  f1-score   support

           0       0.87      0.94      0.91      8151
           1       0.76      0.56      0.64      2595

    accuracy                           0.85     10746
   macro avg       0.81      0.75      0.77     10746
weighted avg       0.84      0.85      0.84     10746


Number of syntatic samples:  20
balanced_accuracy_score:  0.7511306460500254 f1_score:  0.6432049579459939

              precisi

## Vary the parameter: number of base estimator

In [23]:
for _n_estimators in [3, 10, 20, 100]:

    smote = SMOTEBoostClassifier(n_estimators=_n_estimators)
    smote.fit(X_train, y_train)
    
    predicitons = smote.predict(X_test)
    
    print("Number of base estimators: ", _n_estimators)
    print('balanced_accuracy_score: ', balanced_accuracy_score(y_test, predicitons), 'f1_score: ', f1_score(y_test, predicitons))
    print()
    print(classification_report(y_test, predicitons))
    print()

Number of base estimators:  3
balanced_accuracy_score:  0.7387386774061553 f1_score:  0.6244916403072751

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      8151
           1       0.75      0.53      0.62      2595

    accuracy                           0.85     10746
   macro avg       0.81      0.74      0.76     10746
weighted avg       0.84      0.85      0.84     10746


Number of base estimators:  10
balanced_accuracy_score:  0.7387386774061553 f1_score:  0.6244916403072751

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      8151
           1       0.75      0.53      0.62      2595

    accuracy                           0.85     10746
   macro avg       0.81      0.74      0.76     10746
weighted avg       0.84      0.85      0.84     10746


Number of base estimators:  20
balanced_accuracy_score:  0.7387386774061553 f1_score:  0.6244916403072751

              precision 

## Vary the parameter: number of neighbours

In [25]:
for _k_neighbors in [3, 10, 20]:

    smote = SMOTEBoostClassifier(k_neighbors=_k_neighbors)
    smote.fit(X_train, y_train)
    
    predicitons = smote.predict(X_test)
    
    print("Number of neighbors: ", _n_estimators)
    print('balanced_accuracy_score: ', balanced_accuracy_score(y_test, predicitons), 'f1_score: ', f1_score(y_test, predicitons))
    print()
    print(classification_report(y_test, predicitons))
    print()

Number of neighbors:  100
balanced_accuracy_score:  0.7387386774061553 f1_score:  0.6244916403072751

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      8151
           1       0.75      0.53      0.62      2595

    accuracy                           0.85     10746
   macro avg       0.81      0.74      0.76     10746
weighted avg       0.84      0.85      0.84     10746


Number of neighbors:  100
balanced_accuracy_score:  0.738545999178795 f1_score:  0.624180790960452

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      8151
           1       0.75      0.53      0.62      2595

    accuracy                           0.85     10746
   macro avg       0.81      0.74      0.76     10746
weighted avg       0.84      0.85      0.84     10746


Number of neighbors:  100
balanced_accuracy_score:  0.738545999178795 f1_score:  0.624180790960452

              precision    recall  f1-scor