### Methods We should be followed:
1. Down Sampling
- Random Over Sampler
- Near Miss
2. Upper Sampling
- Random Over Sampler
- SMOTE
- Borderline SMOTE
- ADASYN
3. Training Importance of Minority Class 
- Logistic Regression 
- Easy Ensemble Classifier
- Random Forest Classifier
- Boosting Algorithms Techniques

In [1]:
###### Import all necessity library #######
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
%matplotlib inline

### Down Sampling - Random Under Sampler

In [2]:
######## Import RandomUnderSampler ########
from imblearn.under_sampling import RandomUnderSampler

In [3]:
#### load the dataset
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [4]:
##### Count value_counts() of the target column
df.iloc[:, -1].value_counts()
##### It's quite Imbalanced DataSet

0    500
1    267
Name: 1, dtype: int64

In [5]:
##### Split the dataset X, y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

##### Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [6]:
random_under_sampler_ = RandomUnderSampler(random_state = 42)
X_train, y_train = random_under_sampler_.fit_resample(X_train, y_train)

In [7]:
X_train.shape, y_train.shape

((398, 8), (398,))

In [8]:
##### Create the model ######
def decision_tree():
    decision_tree_ = DecisionTreeClassifier()
    decision_tree_.fit(X_train, y_train)
    predicted_ = decision_tree_.predict(X_test)
    print("The accuracy is  = ", accuracy_score(predicted_, y_test))
    print("The precision is = ", precision_score(predicted_, y_test))
    print("recall value is  = ", recall_score(predicted_, y_test))
    print("f1 value score   = ", f1_score(predicted_, y_test))
    
def adaboost():
    decision_tree_ = AdaBoostClassifier()
    decision_tree_.fit(X_train, y_train)
    predicted_ = decision_tree_.predict(X_test)
    print("The accuracy is  = ", accuracy_score(predicted_, y_test))
    print("The precision is = ", precision_score(predicted_, y_test))
    print("recall value is  = ", recall_score(predicted_, y_test))
    print("f1 value score   = ", f1_score(predicted_, y_test))
    
def gradient_boosting():
    decision_tree_ = GradientBoostingClassifier()
    decision_tree_.fit(X_train, y_train)
    predicted_ = decision_tree_.predict(X_test)
    print("The accuracy is  = ", accuracy_score(predicted_, y_test))
    print("The precision is = ", precision_score(predicted_, y_test))
    print("recall value is  = ", recall_score(predicted_, y_test))
    print("f1 value score   = ", f1_score(predicted_, y_test))
    
def xgboost():
    decision_tree_ = XGBClassifier()
    decision_tree_.fit(X_train, y_train)
    predicted_ = decision_tree_.predict(X_test)
    print("The accuracy is  = ", accuracy_score(predicted_, y_test))
    print("The precision is = ", precision_score(predicted_, y_test))
    print("recall value is  = ", recall_score(predicted_, y_test))
    print("f1 value score   = ", f1_score(predicted_, y_test))

In [9]:
print("Decision Tree\n\n",decision_tree())
print("Adaboost\n\n", adaboost())
print("GradientBoosting\n\n",gradient_boosting())
print("XGBoost\n\n", xgboost())

The accuracy is  =  0.7135416666666666
The precision is =  0.8088235294117647
recall value is  =  0.5670103092783505
f1 value score   =  0.6666666666666666
Decision Tree

 None
The accuracy is  =  0.7083333333333334
The precision is =  0.7352941176470589
recall value is  =  0.5681818181818182
f1 value score   =  0.641025641025641
Adaboost

 None
The accuracy is  =  0.765625
The precision is =  0.8382352941176471
recall value is  =  0.6263736263736264
f1 value score   =  0.7169811320754716
GradientBoosting

 None
The accuracy is  =  0.7708333333333334
The precision is =  0.8235294117647058
recall value is  =  0.6363636363636364
f1 value score   =  0.717948717948718
XGBoost

 None


### Under Sampling - Near Miss

In [10]:
from imblearn.under_sampling import NearMiss

In [11]:
#### load the dataset
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [12]:
##### Split the dataset X, y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

##### Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [13]:
####### Implementing NearMiss ########
near_miss_ = NearMiss()
X_train, y_train = near_miss_.fit_resample(X_train, y_train)

In [14]:
X_train.shape, y_train.shape

((398, 8), (398,))

In [15]:
print("Decision Tree\n\n",decision_tree())
print("Adaboost\n\n", adaboost())
print("GradientBoosting\n\n",gradient_boosting())
print("XGBoost\n\n", xgboost())

The accuracy is  =  0.7083333333333334
The precision is =  0.8088235294117647
recall value is  =  0.5612244897959183
f1 value score   =  0.6626506024096385
Decision Tree

 None
The accuracy is  =  0.71875
The precision is =  0.7941176470588235
recall value is  =  0.574468085106383
f1 value score   =  0.6666666666666666
Adaboost

 None
The accuracy is  =  0.75
The precision is =  0.8235294117647058
recall value is  =  0.6086956521739131
f1 value score   =  0.7
GradientBoosting

 None
The accuracy is  =  0.7552083333333334
The precision is =  0.8676470588235294
recall value is  =  0.6082474226804123
f1 value score   =  0.7151515151515151
XGBoost

 None


#### Advantage:
1. Easy to implement

#### Disadvantage:
1. In Under Sampling - we are deleting the random values based on `minority`. As a result, the valuable information might be lost

## Upper Sampling - Random Over Sampler

In [16]:
#### load the dataset
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [17]:
##### Scaling the dataset
scaler_ = MinMaxScaler()
df = pd.DataFrame(scaler_.fit_transform(df), columns = df.columns)
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
1,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
2,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
3,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0
4,0.294118,0.582915,0.606557,0.0,0.0,0.38152,0.052519,0.15,0.0


In [18]:
##### Split the dataset X, y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

##### Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [19]:
##### Random Over Sampler ######
from imblearn.over_sampling import RandomOverSampler

random_over_sampler_ = RandomOverSampler(random_state = 42)
X_train, y_train = random_over_sampler_.fit_resample(X_train, y_train)
X_train.shape, y_train.shape

((752, 8), (752,))

In [20]:
print("Decision Tree\n\n",decision_tree())
print("Adaboost\n\n", adaboost())
print("GradientBoosting\n\n",gradient_boosting())
print("XGBoost\n\n", xgboost())

The accuracy is  =  0.703125
The precision is =  0.5588235294117647
recall value is  =  0.5846153846153846
f1 value score   =  0.5714285714285715
Decision Tree

 None
The accuracy is  =  0.6979166666666666
The precision is =  0.75
recall value is  =  0.5543478260869565
f1 value score   =  0.6375000000000001
Adaboost

 None
The accuracy is  =  0.734375
The precision is =  0.7794117647058824
recall value is  =  0.5955056179775281
f1 value score   =  0.6751592356687898
GradientBoosting

 None
The accuracy is  =  0.75
The precision is =  0.7352941176470589
recall value is  =  0.625
f1 value score   =  0.6756756756756757
XGBoost

 None


### SMOTE 

In [21]:
#### load the dataset
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [22]:
##### Scaling the dataset
scaler_ = MinMaxScaler()
df = pd.DataFrame(scaler_.fit_transform(df), columns = df.columns)
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
1,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
2,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
3,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0
4,0.294118,0.582915,0.606557,0.0,0.0,0.38152,0.052519,0.15,0.0


In [23]:
##### Split the dataset X, y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

##### Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [25]:
##### Implementing SMOTE #####
from imblearn.over_sampling import SMOTE
smote_ = SMOTE()
X_train, y_train = smote_.fit_resample(X_train, y_train)
X_train.shape, y_train.shape

((752, 8), (752,))

In [26]:
print("Decision Tree\n\n",decision_tree())
print("Adaboost\n\n", adaboost())
print("GradientBoosting\n\n",gradient_boosting())
print("XGBoost\n\n", xgboost())

The accuracy is  =  0.7239583333333334
The precision is =  0.5882352941176471
recall value is  =  0.6153846153846154
f1 value score   =  0.6015037593984962
Decision Tree

 None
The accuracy is  =  0.71875
The precision is =  0.7352941176470589
recall value is  =  0.5813953488372093
f1 value score   =  0.6493506493506493
Adaboost

 None
The accuracy is  =  0.7395833333333334
The precision is =  0.75
recall value is  =  0.6071428571428571
f1 value score   =  0.6710526315789472
GradientBoosting

 None
The accuracy is  =  0.7604166666666666
The precision is =  0.7647058823529411
recall value is  =  0.6341463414634146
f1 value score   =  0.6933333333333332
XGBoost

 None


### Borderline SMOTE

In [27]:
#### import the dataset
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [29]:
#### drop the Time column
df.drop(['Time'], axis = 1, inplace = True)

In [33]:
##### Split the dataset X, y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

##### Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [34]:
###### Implementing Borderline #######
from imblearn.over_sampling import BorderlineSMOTE

borderline_ = BorderlineSMOTE()
X_train, y_train = borderline_.fit_resample(X_train, y_train)
X_train.shape, y_train.shape

((426452, 29), (426452,))

In [35]:
print("Decision Tree\n\n",decision_tree())
print("Adaboost\n\n", adaboost())
print("GradientBoosting\n\n",gradient_boosting())
print("XGBoost\n\n", xgboost())

The accuracy is  =  0.9988343024072357
The precision is =  0.7433628318584071
recall value is  =  0.6086956521739131
f1 value score   =  0.6693227091633467
Decision Tree

 None
The accuracy is  =  0.9956461897137721
The precision is =  0.8849557522123894
recall value is  =  0.2518891687657431
f1 value score   =  0.3921568627450981
Adaboost

 None
The accuracy is  =  0.9976124266172298
The precision is =  0.8672566371681416
recall value is  =  0.38735177865612647
f1 value score   =  0.53551912568306
GradientBoosting

 None
The accuracy is  =  0.9994241734782731
The precision is =  0.8407079646017699
recall value is  =  0.8050847457627118
f1 value score   =  0.8225108225108225
XGBoost

 None


### ADASYN 

In [42]:
#### import the dataset
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [43]:
#### drop the Time column
df.drop(['Time'], axis = 1, inplace = True)

In [44]:
##### Split the dataset X, y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

##### Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [40]:
######## ADASYN #########
from imblearn.over_sampling import ADASYN
adasyn_ = ADASYN()
X_train, y_train = adasyn_.fit_resample(X_train, y_train)

In [41]:
print("Decision Tree\n\n",decision_tree())
# print("Adaboost\n\n", adaboost())
# print("GradientBoosting\n\n",gradient_boosting())
print("XGBoost\n\n", xgboost())

The accuracy is  =  0.9981882531389568
The precision is =  0.7610619469026548
recall value is  =  0.4574468085106383
f1 value score   =  0.5714285714285714
Decision Tree

 None
The accuracy is  =  0.9993539507317211
The precision is =  0.8672566371681416
recall value is  =  0.7596899224806202
f1 value score   =  0.8099173553719009
XGBoost

 None


#### Disadvantage:
1. Though we are creating synthetic data from the majority and minority class that leads to overfitting sometimes

#### Using Random Forest Classifier

In [55]:
random_forest_ = RandomForestClassifier(class_weight = 'balanced', criterion='entropy', n_estimators = 50)
random_forest_.fit(X_train, y_train)
predicted_ = random_forest_.predict(X_test)

print("The accuracy is  = ", accuracy_score(predicted_, y_test))
print("The precision is = ", precision_score(predicted_, y_test))
print("recall value is  = ", recall_score(predicted_, y_test))
print("f1 value score   = ", f1_score(predicted_, y_test))

The accuracy is  =  0.999522485323446
The precision is =  0.7345132743362832
recall value is  =  0.9540229885057471
f1 value score   =  0.83


In [47]:
##### Random Forest Classifier with class_weight ######
random_forest_ = RandomForestClassifier(class_weight = 'balanced_subsample', criterion='entropy')
random_forest_.fit(X_train, y_train)
predicted_ = random_forest_.predict(X_test)

print("The accuracy is  = ", accuracy_score(predicted_, y_test))
print("The precision is = ", precision_score(predicted_, y_test))
print("recall value is  = ", recall_score(predicted_, y_test))
print("f1 value score   = ", f1_score(predicted_, y_test))

The accuracy is  =  0.9995786635206876
The precision is =  0.7787610619469026
recall value is  =  0.946236559139785
f1 value score   =  0.854368932038835


## Easy Ensemble Classifier

In [54]:
from imblearn.ensemble import EasyEnsembleClassifier
easy_ensemble_ = EasyEnsembleClassifier(n_estimators = 100, replacement = False, base_estimator = RandomForestClassifier(
class_weight = 'balanced_subsample', criterion='entropy'))
easy_ensemble_.fit(X_train, y_train)
predicted_ = easy_ensemble_.predict(X_test)

print("The accuracy is  = ", accuracy_score(predicted_, y_test))
print("The precision is = ", precision_score(predicted_, y_test))
print("recall value is  = ", recall_score(predicted_, y_test))
print("f1 value score   = ", f1_score(predicted_, y_test))

The accuracy is  =  0.9790876660767956
The precision is =  0.9292035398230089
recall value is  =  0.06620428751576292
f1 value score   =  0.12360211889346674


In [58]:
from imblearn.over_sampling import SVMSMOTE
svm_smote_ = SVMSMOTE()
X_train, y_train = svm_smote_.fit_resample(X_train, y_train)

In [60]:
xgboost_ = RandomForestClassifier()
xgboost_.fit(X_train, y_train)
predicted_ = xgboost_.predict(X_test)

print("The accuracy is  = ", accuracy_score(predicted_, y_test))
print("The precision is = ", precision_score(predicted_, y_test))
print("recall value is  = ", recall_score(predicted_, y_test))
print("f1 value score   = ", f1_score(predicted_, y_test))

The accuracy is  =  0.9994382180275835
The precision is =  0.8407079646017699
recall value is  =  0.811965811965812
f1 value score   =  0.8260869565217391
