## Compare different Under Sampling using Different Datasets

- In this notebook, we will try to compare different under sampling methods on different datasets using a Random Forest Model

In [3]:
# import libraries
import numpy as np
import pandas as pd

# for counting
from collections import Counter

# for plotting 
import matplotlib.pyplot as plt
import seaborn as sns

# from sklearn 
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler

# from imblearn
from imblearn.datasets import fetch_datasets
from imblearn.under_sampling import (
    RandomUnderSampler,
    CondensedNearestNeighbour,
    EditedNearestNeighbours,
    RepeatedEditedNearestNeighbours,
    TomekLinks,
    AllKNN,
    NeighbourhoodCleaningRule,
    NearMiss,
    OneSidedSelection,
    InstanceHardnessThreshold
)

## Initialize under sampling methods

- Lets create a dictionary that contains the initialization of all our Under Sampling methods

In [29]:
undersampler_dict = {
    
    'random' : RandomUnderSampler(
        sampling_strategy='auto',
        random_state=0,
        replacement=True),
    
    'cnn' : CondensedNearestNeighbour(
        sampling_strategy='auto',
        n_neighbors=1,
        random_state=0,n_jobs=2),
    
    'enn' : EditedNearestNeighbours(
        sampling_strategy='auto',
        n_neighbors=3,
        kind_sel='all',
        n_jobs=2),
  
    'renn' : RepeatedEditedNearestNeighbours(
        sampling_strategy='auto',
        n_neighbors=3,
        kind_sel='all',
        n_jobs=2,
        max_iter=100),
  
    'tomek' : TomekLinks(
        sampling_strategy='atuo',
        n_jobs=2),
    
    'allknn' : AllKNN(
        sampling_strategy='auto',
        n_neighbors=3,
        kind_sel='all',
        n_jobs=2),
    
    'ncr' : NeighbourhoodCleaningRule(
        sampling_strategy='auto',
        n_neighbors=3,
        kind_sel='all',
        n_jobs=2,
        threshold_cleaning=0.5),
    
    'oss' : OneSidedSelection(
                            sampling_strategy='auto',
                            random_state=0,
                            n_neighbors=1,
                            n_jobs=2),
    
    
    'ncr' : NeighbourhoodCleaningRule(
        sampling_strategy='auto',
        n_neighbors=3,
        kind_sel='all',
        n_jobs=2,
        threshold_cleaning=0.5),
    
    'oss' : OneSidedSelection(
                            sampling_strategy='auto',
                            random_state=0,
                            n_neighbors=1,
                            n_jobs=2),
    
    'nm1' : NearMiss(sampling_strategy='auto',
                     version=1,
                     n_neighbors=3,
                     n_jobs=2),
    
    'nm2' : NearMiss(sampling_strategy='auto',
                     version=2,
                     n_neighbors=3,
                     n_jobs=2),
    

    'iht' : InstanceHardnessThreshold(
        estimator=LogisticRegression(max_iter=200, n_jobs=2,random_state=0), 
                                     sampling_strategy='auto',
                                     cv=3,
                                     random_state=0,
                                     n_jobs=2)  
}

In [30]:
# lets define the datasets to use - as a list of datasets names
datasets_ls = [
    'car_eval_34',
    'ecoli',
    'thyroid_sick',
    'arrhythmia',
    'ozone_level'
]

In [45]:
# getting the datasets

data = fetch_datasets()['ecoli']
data

{'data': array([[0.49, 0.29, 0.48, ..., 0.56, 0.24, 0.35],
        [0.07, 0.4 , 0.48, ..., 0.54, 0.35, 0.44],
        [0.56, 0.4 , 0.48, ..., 0.49, 0.37, 0.46],
        ...,
        [0.61, 0.6 , 0.48, ..., 0.44, 0.39, 0.38],
        [0.59, 0.61, 0.48, ..., 0.42, 0.42, 0.37],
        [0.74, 0.74, 0.48, ..., 0.31, 0.53, 0.52]]),
 'target': array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -

In [51]:
range(len(data.data[0][:]))

range(0, 7)

In [46]:
pd.Series(data.target).value_counts()

-1    301
 1     35
dtype: int64

In [38]:
# lets try to check the data imbalance for all the data in our dataset list

for dataset in datasets_ls:
    data = fetch_datasets()[dataset]
    print(data.DESCR)
    data = pd.Series(data.target)
    print(data.value_counts())

car_eval_34
-1    1594
 1     134
dtype: int64
ecoli
-1    301
 1     35
dtype: int64
thyroid_sick
-1    3541
 1     231
dtype: int64
arrhythmia
-1    427
 1     25
dtype: int64
ozone_level
-1    2463
 1      73
dtype: int64


In [39]:
# can do this way as well
for dataset in datasets_ls:
    data = fetch_datasets()[dataset]
    print(dataset)
    print(Counter(data.target))
    print()

car_eval_34
Counter({-1: 1594, 1: 134})

ecoli
Counter({-1: 301, 1: 35})

thyroid_sick
Counter({-1: 3541, 1: 231})

arrhythmia
Counter({-1: 427, 1: 25})

ozone_level
Counter({-1: 2463, 1: 73})



In [40]:
# lets create a function to train random forest model

def Random_Forest(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=100, n_jobs=2, max_depth=4, random_state=39)
    
    rf.fit(X_train, y_train)
    
    probs_train = rf.predict_proba(X_train)
    probs_test = rf.predict_proba(X_test)
    
    print('ROC score train data: {}'.format(roc_auc_score(y_train,probs_train[:,1])))
    print('ROC score test data: {}'.format(roc_auc_score(y_test,probs_test[:,1])))
    
    return roc_auc_score(y_test, probs_test[:,1])

### train the different datasets and get the roc score using differnt under sampling methods

In [58]:
roc_values = dict()

for dataset in datasets_ls:
    print(dataset)
    dataset = fetch_datasets()[dataset]
    columns = list(range(len(dataset.data[0][:])))
    columns = list(map(str, columns))
    X = pd.DataFrame(data = dataset.data, columns = columns)
    y = pd.Series(data.target)
    
    # split into Train and Test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    
    # call the Random Forest function
    Random_Forest(X_train, X_test, y_train, y_test)


car_eval_34


ValueError: Found input variables with inconsistent numbers of samples: [1728, 336]