In [4]:
# Installing imblearn (run once; the recorded run below installed imbalanced-learn 0.7.0).
# Prefer `%pip install imbalanced-learn==<version>` so the package is installed into the
# kernel's own environment with a pinned version.
# !pip install -U imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.7.0


You should consider upgrading via the 'c:\users\anurag.bhardwaj\anaconda3\python.exe -m pip install --upgrade pip' command.


# SMOTE (Synthetic Minority Over-sampling Technique)
SMOTE is an over-sampling method: it creates synthetic (not duplicated) samples of the minority class until the minority class is as large as the majority class. It does this by selecting similar records and altering one record, one column at a time, by a random amount within the difference to its neighbouring records.
We will dive into Python to see how this works. To read more about SMOTE, see the original research paper, “SMOTE: Synthetic Minority Over-sampling Technique” (Chawla et al., 2002).

# NearMiss
NearMiss is an under-sampling technique. Instead of over-sampling the minority class, it uses a distance measure to choose which majority-class samples to keep, shrinking the majority class to the size of the minority class.

In [65]:
# Importing necessary libraries

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [66]:
# Source: Bank Marketing dataset (UCI Machine Learning Repository)
# https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip
# The file is semicolon-delimited; the literal string "unknown" is read in as NaN.

bank = pd.read_csv(
    "bank-full.csv",
    sep=";",
    na_values="unknown",
)

In [67]:
# Preview the first rows of the raw data ("unknown" entries already show up as NaN).
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


In [68]:
# List the column names as loaded from the CSV.
bank.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [69]:
# Encode the yes/no columns as 0/1 and education as an ordinal 0/1/2 code.
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time (when the columns already hold numbers) would
# silently wipe them out. Columns that are already numeric are therefore skipped,
# which makes the cell safe to re-run.
binary_codes = {"no": 0, "yes": 1}
education_codes = {"primary": 0, "secondary": 1, "tertiary": 2}  # ordinal: low -> high
column_codes = {
    "default": binary_codes,
    "housing": binary_codes,
    "loan": binary_codes,
    "y": binary_codes,
    "education": education_codes,
}
for column, codes in column_codes.items():
    if not pd.api.types.is_numeric_dtype(bank[column]):
        # Missing values stay NaN here; they are handled by a later cell.
        bank[column] = bank[column].map(codes)

In [70]:
# Turn three-letter month abbreviations (e.g. "may") into month numbers (5) by
# parsing them as dates and keeping only the month component.
parsed_months = pd.to_datetime(bank["month"], format="%b")
bank["month"] = parsed_months.dt.month

In [71]:
# Preview again to confirm the encoded columns now hold numbers.
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,2.0,0,2143,1,0,,5,5,261,1,-1,0,,0
1,44,technician,single,1.0,0,29,1,0,,5,5,151,1,-1,0,,0
2,33,entrepreneur,married,1.0,0,2,1,1,,5,5,76,1,-1,0,,0
3,47,blue-collar,married,,0,1506,1,0,,5,5,92,1,-1,0,,0
4,33,,single,,0,1,0,0,,5,5,198,1,-1,0,,0


In [72]:
# Frequency of each marital-status category (still a text column at this point).
bank['marital'].value_counts()

married     27214
single      12790
divorced     5207
Name: marital, dtype: int64

In [73]:
# Number of missing values in every column ("unknown" was read in as NaN)

bank.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
y                0
dtype: int64

In [74]:
# (rows, columns) before any columns or rows are dropped.
bank.shape

(45211, 17)

In [75]:
# Drop the two columns with the most missing values
# (poutcome: 36959 nulls, contact: 13020 nulls, out of 45211 rows).
# Reassigning instead of inplace=True, together with errors="ignore", lets this
# cell be re-run without raising KeyError once the columns are already gone.

bank = bank.drop(columns=["poutcome", "contact"], errors="ignore")

In [76]:
# Discard each row that still contains at least one missing value

bank = bank.dropna(axis=0, how="any")

In [77]:
# One-hot encode the remaining text columns across the whole frame.
# drop_first=True omits the first level of each encoded column, so the dummy
# columns are not perfectly redundant with one another.

bank = pd.get_dummies(data=bank, drop_first=True)

In [78]:
# Preview the frame after one-hot encoding.
bank.head()

Unnamed: 0,age,education,default,balance,housing,loan,day,month,duration,campaign,...,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_married,marital_single
0,58,2.0,0,2143,1,0,5,5,261,1,...,0,1,0,0,0,0,0,0,1,0
1,44,1.0,0,29,1,0,5,5,151,1,...,0,0,0,0,0,0,1,0,0,1
2,33,1.0,0,2,1,1,5,5,76,1,...,0,0,0,0,0,0,0,0,1,0
5,35,2.0,0,231,1,0,5,5,139,1,...,0,1,0,0,0,0,0,0,1,0
6,28,2.0,0,447,1,1,5,5,217,1,...,0,1,0,0,0,0,0,0,0,1


In [79]:
# Column names after encoding.
bank.columns

Index(['age', 'education', 'default', 'balance', 'housing', 'loan', 'day',
       'month', 'duration', 'campaign', 'pdays', 'previous', 'y',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'marital_married',
       'marital_single'],
      dtype='object')

In [80]:
# Reference only (nothing is executed here): the column list as originally loaded,
# before columns were dropped and one-hot encoded.
# Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
#        'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
#        'previous', 'poutcome', 'y'],
#       dtype='object')

In [81]:
# Class balance of the target y after cleaning (0 = "no", 1 = "yes").
bank.y.value_counts()

0    38172
1     5021
Name: y, dtype: int64

In [82]:
# Separate the target (y) from the predictor columns (X)

y = bank["y"]
X = bank.drop(columns="y")

In [83]:
# Hold out a test set. stratify=y keeps the class ratio the same in both parts,
# and the fixed random_state makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
y_train.value_counts()

0    28628
1     3766
Name: y, dtype: int64

In [84]:
# Baseline: logistic regression trained on the original, imbalanced training split.
# NOTE(review): max_iter is set far above the scikit-learn default, presumably so the
# solver converges on these unscaled features — confirm.
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [85]:
# Predict class labels for the held-out test split.
y_pred = lr.predict(X_test)

In [86]:
# Confusion matrix on the test split
# (scikit-learn layout: rows = true class, columns = predicted class).

confusion_matrix(y_test, y_pred)

array([[9361,  183],
       [ 986,  269]], dtype=int64)

In [87]:
# Accuracy on the test split. Most rows belong to class 0, so this number is
# dominated by the majority class; read it together with the confusion matrix.

accuracy_score(y_test, y_pred)

0.8917492360403741

# SMOTE

In [88]:
# Re-create the split with the same random_state and stratify settings so this
# section starts from the original, un-resampled training data.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)

In [89]:
# Over-sample the minority class of the TRAINING split only; the test split keeps
# its original class ratio.
# random_state is fixed (same seed as the train/test split) because SMOTE draws
# random synthetic points; without it each run produces different results.
smt = SMOTE(random_state=1)
# fit_resample replaces fit_sample, a deprecated alias that is absent from newer
# imbalanced-learn releases.
X_train, y_train = smt.fit_resample(X_train, y_train)
# Class counts after resampling.
np.bincount(y_train)

array([28628, 28628], dtype=int64)

In [90]:
# Refit logistic regression, this time on the SMOTE-resampled training data.
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [91]:
# Predict on the untouched test split.
y_pred = lr.predict(X_test)

In [92]:
# Confusion matrix on the test split
# (scikit-learn layout: rows = true class, columns = predicted class).

confusion_matrix(y_test, y_pred)

array([[8525, 1019],
       [ 651,  604]], dtype=int64)

In [93]:
# Accuracy on the test split for the model trained on SMOTE-resampled data.

accuracy_score(y_test, y_pred)

0.8453560514862487

In [94]:
# Recall for class 1: the share of actual positives in the test split that the model finds.
recall_score(y_test, y_pred)

0.48127490039840637

# NEAR MISS

In [95]:
# Re-create the split with the same random_state and stratify settings so this
# section starts from the original, un-resampled training data.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)

In [96]:
# NearMiss under-sampler with the library's default settings.
nr = NearMiss()

In [97]:
# Under-sample the majority class of the TRAINING split only; the test split keeps
# its original class ratio.
# fit_resample replaces fit_sample, a deprecated alias that is absent from newer
# imbalanced-learn releases.
X_train, y_train = nr.fit_resample(X_train, y_train)
# Class counts after resampling.
np.bincount(y_train)

array([3766, 3766], dtype=int64)

In [98]:
# Refit logistic regression, this time on the NearMiss under-sampled training data.
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [99]:
# Predict on the untouched test split.
y_pred = lr.predict(X_test)

In [100]:
# Confusion matrix on the test split
# (scikit-learn layout: rows = true class, columns = predicted class).

confusion_matrix(y_test, y_pred)

array([[5078, 4466],
       [ 158, 1097]], dtype=int64)

In [101]:
# Accuracy on the test split for the model trained on NearMiss under-sampled data.

accuracy_score(y_test, y_pred)

0.5718122048337809

In [102]:
# Recall for class 1: the share of actual positives in the test split that the model finds.
recall_score(y_test, y_pred)

0.8741035856573706