In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [7]:
# Read the Framingham dataset, then discard every row that has a missing value.
df = pd.read_csv('./data/framingham.csv')
df = df.dropna()
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# Number of rows left once incomplete rows have been dropped.
df.shape[0]

3658

In [4]:
# Feature matrix: every column of the frame except the label column.
target_column = 'TenYearCHD'
X = df.drop(target_column, axis='columns').to_numpy()
X[:5]

array([[  1.  ,  39.  ,   4.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  , 195.  , 106.  ,  70.  ,  26.97,  80.  ,  77.  ],
       [  0.  ,  46.  ,   2.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
          0.  , 250.  , 121.  ,  81.  ,  28.73,  95.  ,  76.  ],
       [  1.  ,  48.  ,   1.  ,   1.  ,  20.  ,   0.  ,   0.  ,   0.  ,
          0.  , 245.  , 127.5 ,  80.  ,  25.34,  75.  ,  70.  ],
       [  0.  ,  61.  ,   3.  ,   1.  ,  30.  ,   0.  ,   0.  ,   1.  ,
          0.  , 225.  , 150.  ,  95.  ,  28.58,  65.  , 103.  ],
       [  0.  ,  46.  ,   3.  ,   1.  ,  23.  ,   0.  ,   0.  ,   0.  ,
          0.  , 285.  , 130.  ,  84.  ,  23.1 ,  85.  ,  85.  ]])

In [5]:
# Label vector taken from the target column, as a NumPy array.
y = df.loc[:, target_column].to_numpy()
y[:5]

array([0, 0, 0, 1, 0])

# Seed for determinism

In [6]:
# Fixed random seed; passed as `random_state` to SMOTE in `oversample_data`.
seed = 42

# Remove outliers

In [7]:
def plot_outliers(x: np.ndarray, pred: np.ndarray, clf: IsolationForest):
    """Plot the inliers and outliers flagged by an IsolationForest.

    Samples are scattered in the plane spanned by the first two columns of
    ``x`` (inliers green, outliers red) on top of a filled contour of the
    forest's decision function. When ``x`` has more than two columns the
    contour is a 2-D slice of the decision surface: the first two features
    sweep their observed range while every other feature is held at its
    column median.

    Args:
        x: Sample matrix of shape (n_samples, n_features), n_features >= 2.
        pred: Per-sample labels; values > 0 are drawn as inliers and values
            < 0 as outliers (the convention of ``IsolationForest.fit_predict``).
        clf: Fitted forest. It is expected to have been fitted on data with
            the same number of columns as ``x``.

    Side effects:
        Prints the number of outliers and shows a matplotlib figure.
    """
    def axis_ticks(col: int) -> np.ndarray:
        # 500 evenly spaced points across the observed range of one feature.
        return np.linspace(x[:, col].min(), x[:, col].max(), 500)

    xx, yy = np.meshgrid(axis_ticks(0), axis_ticks(1))
    pos = pred > 0
    neg = pred < 0

    # The forest can only score rows as wide as its training data, so a bare
    # two-column grid is rejected whenever x has more than two features.
    # Build full-width rows instead: medians everywhere, with the first two
    # columns overwritten by the mesh coordinates.
    grid = np.tile(np.median(x, axis=0), (xx.size, 1))
    grid[:, 0] = xx.ravel()
    grid[:, 1] = yy.ravel()

    Z = clf.decision_function(grid)
    Z = Z.reshape(xx.shape)

    plt.title('Outliers detected using IsolationForest')
    # Pass the colormap by name: plt.cm.get_cmap() was removed in matplotlib 3.9.
    plt.contourf(xx, yy, Z, cmap='Blues_r')

    plt.scatter(x[pos][:, 0], x[pos][:, 1], c='green', edgecolor='k')
    plt.scatter(x[neg][:, 0], x[neg][:, 1], c='red', edgecolor='k')

    plt.axis('tight')

    plt.xlim((xx.min(), xx.max()))
    plt.ylim((yy.min(), yy.max()))

    print(f'Total outliers detected: {neg.sum()}')

    plt.show()

In [8]:
def remove_outlier(x: np.ndarray, y: np.ndarray, plot=False) -> tuple:
    """Drop the samples an IsolationForest flags as outliers.

    Args:
        x: Feature matrix of shape (n_samples, n_features).
        y: Label vector aligned row-for-row with ``x``.
        plot: When true, visualise the detected outliers with
            ``plot_outliers`` before filtering.

    Returns:
        A ``(x_kept, y_kept)`` tuple containing only the rows predicted to be
        inliers, in their original order.
    """
    # Seed the forest with the notebook-wide `seed` (as SMOTE is seeded in
    # `oversample_data`) so the same rows are removed on every run.
    clf = IsolationForest(max_samples=100, contamination=0.1, random_state=seed)
    pred = clf.fit_predict(x)
    if plot:
        plot_outliers(x, pred, clf)
    # fit_predict labels inliers +1 and outliers -1.
    keep_idx = pred > 0
    return x[keep_idx], y[keep_idx]

In [9]:
# NOTE: this rebinds X and y, so re-running the cell filters the
# already-filtered data a second time.
X, y = remove_outlier(X, y, plot=False)
X.shape[0]

3292

# Oversample using SMOTE

In [10]:
def oversample_data(x: np.ndarray, y: np.ndarray):
    """Resample ``x`` and ``y`` with SMOTE, seeded by the notebook-wide ``seed``.

    Returns whatever ``SMOTE.fit_resample`` returns for the given inputs
    (the resampled features together with their labels).
    """
    return SMOTE(random_state=seed).fit_resample(x, y)

In [11]:
# NOTE: this rebinds X and y to the resampled data; re-running the cell
# resamples the already-resampled arrays.
X, y = oversample_data(x=X, y=y)
X.shape[0]

5710

In [12]:
# Compare the per-class sample counts. len() of a boolean mask is always
# len(y), so comparing mask lengths is True even for imbalanced data.
np.count_nonzero(y == 0) == np.count_nonzero(y == 1)

True