### SMOTE and Nearest Neighbor 

### Load libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

To oversample an imbalanced dataset we could use Monte Carlo sampling methods such as Rejection Sampling, in which we define an envelope distribution that totally covers the target distribution, generate samples from the known envelope distribution, and keep only those that fall below the target probability density function. Another option is Sampling Importance Resampling (SIR), in which a sample is first drawn from a proposal distribution and then a smaller sample is drawn from it with probabilities proportional to the importance ratios. Both methods require the target distribution function to be available; since it is not known here, we need a different oversampling technique.
However, the method I used here is an oversampling method called Synthetic Minority Oversampling Technique (SMOTE). SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.


### Load the data

In [2]:
# Load the iris measurements and collapse the three species into a binary task.
iris = load_iris()
X = iris.data
# Build a fresh label array instead of overwriting iris.target in place, so the
# loaded dataset object keeps its original three-class labels.
y = (iris.target != 0).astype(iris.target.dtype)    # Iris-Setosa (0) vs. others (1)
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)


### Nearest Neighbor function

In [3]:
def nearest_neighbour(X):
    """Return a randomly scaled nearest-neighbour distance for every row of X.

    For each row, the distance to its closest other row is looked up with a
    2-neighbour search (each row is its own closest match, so the second
    column of the result is used) and multiplied by a weight drawn uniformly
    from [0.001, 1.0).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points among which neighbours are searched.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        One non-negative offset per row. When X has fewer than two rows no
        neighbour exists, so the offsets are all zero.
    """
    X = np.asarray(X)
    n_samples = X.shape[0]

    # A 2-neighbour query needs at least two rows; without this guard the
    # neighbour search raises for empty or single-row input.
    if n_samples < 2:
        return np.zeros(n_samples)

    nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
    distances, _ = nbrs.kneighbors(X)
    weight = np.random.uniform(low=0.001, high=1.0, size=n_samples)
    return distances[:, 1] * weight

### Synthetic Minority Oversampling Technique (SMOTE) function

In [4]:
def _synthesize_along_neighbours(x_min, n_new, k_neighbors, rng):
    """Create ``n_new`` points on the segments joining samples of ``x_min`` to their neighbours."""
    n_min = x_min.shape[0]
    if n_min < 2:
        # A lone sample has no neighbour: the segment degenerates to the
        # sample itself, so the only possible "synthetic" point is a copy.
        return np.repeat(x_min, n_new, axis=0)

    # Use every sample as a starting point equally often; rows that do not
    # fill a complete pass are drawn at random without replacement.
    full_passes, leftover = divmod(n_new, n_min)
    base = np.concatenate((
        np.tile(np.arange(n_min), full_passes),
        rng.choice(n_min, size=leftover, replace=False),
    )).astype(int)

    # Neighbours are always searched among ALL samples of the class.
    k = max(1, min(int(k_neighbors), n_min - 1))
    nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm='ball_tree').fit(x_min)
    # Column 0 is the query row itself (distance 0), so it is dropped.
    neighbours = nbrs.kneighbors(x_min, return_distance=False)[:, 1:]
    partner = neighbours[base, rng.randint(0, k, size=n_new)]

    # Step a random fraction of the way from each base sample to its partner.
    gap = rng.uniform(low=0.001, high=1.0, size=(n_new, 1))
    return x_min[base] + gap * (x_min[partner] - x_min[base])


def SMOTE(X, y, k_neighbors=1, random_state=None):
    """Balance a dataset with the Synthetic Minority Oversampling Technique.

    Every class that has fewer samples than the largest class receives
    synthetic samples until all classes have the same size. A synthetic
    sample is placed at a random position on the straight line between a
    sample of the class and one of its nearest neighbours of the same class.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels. Any label values are accepted; which class is the
        minority is determined from the counts, not from the label value.
    k_neighbors : int, default=1
        Number of nearest neighbours a synthetic sample may interpolate
        towards. It is capped at the class size minus one.
    random_state : int or None, default=None
        Seed for a private random generator. With ``None`` NumPy's global
        generator is used, so ``np.random.seed`` still controls the result.

    Returns
    -------
    X_new : numpy.ndarray
        The original rows followed by the synthetic rows.
    y_new : numpy.ndarray
        The original labels followed by the labels of the synthetic rows,
        in the dtype of ``y``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of samples.

    Notes
    -----
    If the data is already balanced, copies of the inputs are returned. A
    class with a single sample cannot be interpolated and is oversampled by
    duplicating that sample.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "X and y must contain the same number of samples, got %d and %d"
            % (X.shape[0], y.shape[0])
        )
    if y.size == 0:
        return X.copy(), y.copy()

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Class sizes; the largest class defines the size every class is grown to.
    labels, counts = np.unique(y, return_counts=True)
    target_count = counts.max()

    X_parts, y_parts = [X], [y]
    for label, count in zip(labels, counts):
        deficit = int(target_count - count)
        if deficit == 0:
            continue
        X_parts.append(
            _synthesize_along_neighbours(X[y == label], deficit, k_neighbors, rng)
        )
        # Keep the label dtype of y instead of silently promoting to float.
        y_parts.append(np.full(deficit, label, dtype=y.dtype))

    # Original observations first, synthetic observations appended after them.
    return np.concatenate(X_parts, axis=0), np.concatenate(y_parts, axis=0)

### Creating a balanced dataset

In [5]:
# Oversample the training split only; the test split is left as it is.
X_new, y_new = SMOTE(X=X_train, y=y_train)

### Let's see the counts of the two classes in the new dataset

In [6]:
# Tally how many samples each class has after oversampling.
unique, counts = np.unique(y_new, return_counts=True)
for caption, values in (('unique:', unique), ('counts:', counts)):
    print(caption, values)

unique: [0. 1.]
counts: [71 71]


### We fit logistic regression to the new data and report the result on unbalanced test data

In [7]:
# Learn the scaling statistics from the training data only and apply the same
# transformation to the test data. Fitting a second scaler on the test set
# would standardise it with different means/variances than the model was
# trained on and would leak test-set statistics into the evaluation.
scaler = StandardScaler().fit(X_new)
X_train_std = scaler.transform(X_new)
X_test_std = scaler.transform(X_test)
lg = LogisticRegression().fit(X_train_std, y_new)

# Evaluate on the untouched (still imbalanced) test split.
y_pred = lg.predict(X_test_std)
classificationReport = classification_report(y_test, y_pred)
print(classificationReport)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        29

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

