### Import libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# **Missing Data Imputation**

### Define a function for drawing scatter plots of data

In [0]:
def DrawScatterPlot2D(data, label, fig, title):
    """Show a 2-D scatter plot of ``data`` coloured by ``label``.

    Parameters
    ----------
    data : array indexable as ``data[:, 0]`` / ``data[:, 1]``
        Column 0 is drawn on the x-axis, column 1 on the y-axis.
    label : indexable sequence, one entry per row of ``data``
        Entries equal to 0 are drawn in red, everything else in black.
    fig : figure identifier
        Passed to ``plt.figure`` before drawing and ``plt.close`` afterwards.
    title : str
        Title of the plot.
    """
    pointColors = []
    for idx in range(len(label)):
        pointColors.append('red' if label[idx] == 0 else 'black')
    pointColors = np.array(pointColors)

    xValues, yValues = data[:, 0], data[:, 1]

    plt.figure(fig)
    plt.scatter(xValues, yValues, s=50, c=pointColors, alpha=1)
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()
    # Release the figure once it has been rendered.
    plt.close(fig)

### Generate data with missing values

In [0]:
fig = 0
N = 6  # Number of samples per class
randomSeed = 1
# Every feature gets its own seeded RandomState so the toy data is reproducible.
# Class 1 is centred near (2, 5), class 2 near (5, 2).
class1 = np.array([np.random.RandomState(seed=randomSeed).normal(0, 1, N)*0.4 + 2, 
                   np.random.RandomState(seed=randomSeed+1).normal(0, 1, N)*0.4 + 5]).transpose()
class2 = np.array([np.random.RandomState(seed=randomSeed+2).normal(0, 1, N)*0.4 + 5, 
                   np.random.RandomState(seed=randomSeed+3).normal(0, 1, N)*0.4 + 2]).transpose()
data = np.vstack((class1, class2))
label = np.vstack((np.zeros((N, 1)), np.ones((N, 1))))

# Rows that receive one missing value each (feature 0 for the first, feature 1 for the second).
nanID = [3, 10]
# Work on a copy: a plain assignment would alias `data`, so injecting NaNs
# would silently corrupt the complete data set as well.
dataWithNan = data.copy()
dataWithNan[nanID[0], 0] = np.nan
dataWithNan[nanID[1], 1] = np.nan
# Rows that stay fully observed.
nonNanID = [i for i in range(2 * N) if i not in nanID]
print(dataWithNan)
fig = fig + 1
DrawScatterPlot2D(dataWithNan, label, fig, 'Data with missing values')

### Impute by mean

In [0]:
from sklearn.impute import SimpleImputer
# Mean imputation: fit the imputer on the incomplete data and fill the NaNs in one step.
imp = SimpleImputer(strategy='mean')
dataImpute = imp.fit_transform(dataWithNan)
# Advance the running figure counter before plotting the imputed data.
fig = fig + 1
DrawScatterPlot2D(dataImpute, label, fig, 'Imputation by Mean')

### Impute by median

In [0]:
# Median imputation; relies on SimpleImputer having been imported by the mean-imputation cell.
imp = SimpleImputer(strategy='median')
dataImpute = imp.fit_transform(dataWithNan)
# Advance the running figure counter before plotting the imputed data.
fig = fig + 1
DrawScatterPlot2D(dataImpute, label, fig, 'Imputation by Median')

### Define a function for imputation using K nearest neighbors (KNN)

In [0]:
def KNN_Impute(data, k = 3):
    """Fill NaN entries with the mean of the k nearest samples.

    The distance between two samples is the root-mean-square difference over
    the features that are observed in both of them. Pairs that share no
    observed feature have no distance and are never used as neighbours.

    Parameters
    ----------
    data : 2-D float array (samples x features); NaN marks a missing value.
    k : int
        Maximum number of neighbours averaged for each missing entry.

    Returns
    -------
    A copy of ``data`` in which every missing entry that has at least one
    usable neighbour is replaced; ``data`` itself is left untouched.
    """
    from copy import deepcopy
    imputed = deepcopy(data)

    numSamples = data.shape[0]
    observed = ~np.isnan(data)

    # Symmetric pairwise distance matrix; NaN means "no distance available"
    # (this includes the diagonal, so a sample is never its own neighbour).
    dist = np.full((numSamples, numSamples), np.nan)
    for a, b in zip(*np.triu_indices(numSamples, k=1)):
        shared = np.where(observed[a] & observed[b])[0]
        if len(shared) == 0:
            continue
        rms = np.sqrt(np.mean((data[a, shared] - data[b, shared]) ** 2))
        dist[a, b] = rms
        dist[b, a] = rms

    for row in range(numSamples):
        missingCols = np.where(~observed[row])[0]
        if len(missingCols) == 0:
            continue

        # Candidate neighbours of this row, closest first.
        reachable = np.where(~np.isnan(dist[row, :]))[0]
        byDistance = reachable[np.argsort(dist[row, reachable])]

        for col in missingCols:
            # Only neighbours that actually observed this feature can donate a value.
            donors = byDistance[observed[byDistance, col]]
            if len(donors) > k:
                donors = donors[:k]
            if len(donors) > 0:
                imputed[row, col] = np.mean(data[donors, col])

    return imputed

### Impute using KNN approach

In [0]:
# Fill each missing entry with the mean of (at most) its 3 nearest neighbours.
dataImpute = KNN_Impute(dataWithNan, k=3)
# Advance the running figure counter before plotting the imputed data.
fig = fig + 1
DrawScatterPlot2D(dataImpute, label, fig, 'Imputation using KNN')

# Imbalanced Data Classification

### Define a function for logistic regression classification and visualization in 2D space

In [0]:
def LogisticRegression2D(trainData, trainLabel, testData, testLabel, fig=1, 
                         randomSeed=1, classWeight=None):
    """Train a logistic regression and plot its decision line on the test set.

    The model is fitted on the training data and evaluated on the test data.
    Sensitivity (recall on label 1), specificity (recall on label 0) and
    accuracy are rounded to three decimals and shown in the plot title.

    Parameters
    ----------
    trainData, trainLabel : training samples and their 0/1 labels.
    testData, testLabel : test samples and their 0/1 labels; the first two
        columns of ``testData`` are the plotted coordinates.
    fig : figure identifier used for ``plt.figure`` / ``plt.close``.
    randomSeed : forwarded as ``random_state`` to the classifier.
    classWeight : forwarded as ``class_weight`` to the classifier.

    Returns
    -------
    None; the function only produces a plot.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(class_weight=classWeight, solver='lbfgs',
                                    random_state=randomSeed)
    classifier.fit(trainData, trainLabel)
    predicted = classifier.predict(testData)

    positiveRows = np.where(testLabel == 1)[0]
    negativeRows = np.where(testLabel == 0)[0]
    sensitivity = np.round(np.sum(predicted[positiveRows] == 1) / np.sum(testLabel == 1), 3)
    specificity = np.round(np.sum(predicted[negativeRows] == 0) / np.sum(testLabel == 0), 3)
    accuracy = np.round(np.sum(predicted == testLabel) / len(testLabel), 3)

    # Decision line w0*x + w1*y + b = 0, solved for y over the x-range of the test data.
    weightX = classifier.coef_[0, 0]
    weightY = classifier.coef_[0, 1]
    lineX = np.linspace(np.min(testData[:, 0]), np.max(testData[:, 0]), 100)
    lineY = -classifier.intercept_ / weightY - weightX / weightY * lineX

    # Keep only the part of the line that lies within the y-range of the test data.
    insideRange = (lineY >= np.min(testData[:, 1])) & (lineY <= np.max(testData[:, 1]))
    lineX = lineX[insideRange]
    lineY = lineY[insideRange]

    pointColors = []
    for idx in range(len(testLabel)):
        pointColors.append('red' if testLabel[idx] == 0 else 'blue')
    pointColors = np.array(pointColors)

    heading = ''.join(['Sensitivity = ', str(sensitivity), ', Specificity = ',
                       str(specificity), ', Accuracy = ', str(accuracy)])

    plt.figure(fig)
    plt.scatter(testData[:, 0], testData[:, 1], s=2, c=pointColors, alpha=1)
    plt.plot(lineX, lineY, '-g', linewidth=2)
    plt.title(heading)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()
    plt.close(fig)

    return None


### Generate 2D toy data

In [0]:
# Experiment settings
randomSeed = 1
fig = 0
numFold = 3  # Number of data folds for cross-validation
N1 = 3000  # Number of samples in majority class
N2 = 300  # Number of samples in minority class
numS = N1 + N2  # Total number of samples

# Linear map that is right-multiplied onto the samples of both classes.
transM = np.array([[-0.3, 1.3], [1.3, -0.3]])

# Majority class (label 0): two independently seeded normal features, then mapped by transM.
class1 = np.array([
    np.random.RandomState(seed=randomSeed).normal(3, 3, N1),
    np.random.RandomState(seed=randomSeed+1).normal(3, 3, N1),
]).T
class1 = class1 @ transM

# Minority class (label 1): same construction with a different mean, spread and seeds.
class2 = np.array([
    np.random.RandomState(seed=randomSeed+2).normal(0, 2, N2),
    np.random.RandomState(seed=randomSeed+3).normal(0, 2, N2),
]).T
class2 = class2 @ transM

data = np.vstack((class1, class2))
label = np.repeat([0.0, 1.0], [N1, N2])
color = np.where(label == 0, 'red', 'blue')

fig += 1
plt.figure(fig)
plt.scatter(data[:, 0], data[:, 1], s=2, c=color, alpha=1)
plt.title('Whole dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
plt.close(fig)

### Partition data into training set and testing set, and do classification

In [0]:
# Shuffle the sample indices reproducibly, then cut once:
# the first (numFold-1)/numFold of them train the model, the rest test it.
randID = np.random.RandomState(seed=randomSeed).permutation(numS)
trainID, testID = np.split(randID, [int(numS*(numFold-1)/numFold)])
trainData, trainLabel = data[trainID, :], label[trainID]
testData, testLabel = data[testID, :], label[testID]

# Baseline: logistic regression on the imbalanced training set as-is.
fig += 1
LogisticRegression2D(trainData, trainLabel, testData, testLabel, fig, randomSeed)

### Random under-sampling of majority class for classification

In [0]:
from imblearn.under_sampling import RandomUnderSampler
# Resample the training set only; the test set is passed through unchanged.
sampler = RandomUnderSampler(random_state=randomSeed)
trainData2, trainLabel2 = sampler.fit_resample(trainData, trainLabel)
# Advance the running figure counter, then train on the resampled data.
fig = fig + 1
LogisticRegression2D(trainData2, trainLabel2, testData, testLabel, fig, randomSeed)

### Under-sampling of majority class using K-means clustering for classification

In [0]:
from sklearn.cluster import KMeans
# Split the training set by class (0 = majority, 1 = minority).
class1ID = np.where(trainLabel == 0)[0]
class2ID = np.where(trainLabel == 1)[0]
trainDataClass1 = trainData[class1ID, :]
trainLabelClass1 = trainLabel[class1ID]
trainDataClass2 = trainData[class2ID, :]
trainLabelClass2 = trainLabel[class2ID]
# Cluster the majority class into as many clusters as there are minority samples.
kmeans = KMeans(n_clusters=len(trainLabelClass2), random_state=randomSeed).fit(trainDataClass1)
# `index` holds the position of the first majority sample assigned to each cluster,
# so one real sample per cluster is kept (not the centroid, nor the sample closest to it).
uniqueLabel, index = np.unique(kmeans.labels_, return_index=True)
# Balanced training set: one majority representative per cluster plus all minority samples.
trainData2 = np.vstack((trainDataClass1[index, :], trainDataClass2))
trainLabel2 = np.concatenate((trainLabelClass1[index], trainLabelClass2))
fig = fig + 1
LogisticRegression2D(trainData2, trainLabel2, testData, testLabel, fig, randomSeed)

### Random over-sampling of minority class for classification

In [0]:
from imblearn.over_sampling import RandomOverSampler
# Resample the training set only; the test set is passed through unchanged.
sampler = RandomOverSampler(random_state=randomSeed)
trainData2, trainLabel2 = sampler.fit_resample(trainData, trainLabel)
# Advance the running figure counter, then train on the resampled data.
fig = fig + 1
LogisticRegression2D(trainData2, trainLabel2, testData, testLabel, fig, randomSeed)

### Over-sampling of minority class using SMOTE for classification

In [0]:
from imblearn.over_sampling import SMOTE
# Resample the training set only; the test set is passed through unchanged.
sampler = SMOTE(random_state=randomSeed)
trainData2, trainLabel2 = sampler.fit_resample(trainData, trainLabel)
# Advance the running figure counter, then train on the resampled data.
fig = fig + 1
LogisticRegression2D(trainData2, trainLabel2, testData, testLabel, fig, randomSeed)

### Classification with weights on classes

In [0]:
# Weight class 1 by (number of class-0 samples) / (number of class-1 samples) in the
# training set, keeping class 0 at weight 1; the training data itself is not resampled.
classWeight = {0: 1, 1: np.sum(trainLabel == 0)/np.sum(trainLabel == 1)}
fig = fig + 1
LogisticRegression2D(trainData, trainLabel, testData, testLabel, fig, randomSeed, 
                     classWeight=classWeight)

# Ensemble Learning

### Generate toy data and visualization

In [0]:
# Experiment settings
randomSeed = 1
fig = 0
N = 3000  # Number of samples
numFold = 3  # Number of data folds for cross-validation
numF = 20  # Total number of features
numInfo = 2  # Number of informative features
numEstimators = 100

from sklearn.datasets import make_classification
# shuffle=False keeps the generated column order, and the plot below uses columns 0 and 1.
data, label = make_classification(
    n_samples=N,
    n_features=numF,
    n_informative=numInfo,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    flip_y=0.1,
    class_sep=0.3,
    shift=0.0,
    scale=1.0,
    shuffle=False,
    random_state=randomSeed,
)

fig += 1
plt.figure(fig)
plt.scatter(data[:, 0], data[:, 1], s=2, alpha=1,
            c=['red' if value == 0 else 'blue' for value in label])
plt.title('Whole dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
plt.close(fig)

### Partition data into training set and testing set

In [0]:
# Shuffle the sample indices reproducibly, then cut once:
# the first (numFold-1)/numFold of them train the models, the rest test them.
randID = np.random.RandomState(seed=randomSeed).permutation(N)
trainID, testID = np.split(randID, [int(N*(numFold-1)/numFold)])
trainData, trainLabel = data[trainID, :], label[trainID]
testData, testLabel = data[testID, :], label[testID]

# Metrics table: one row per method, NaN until that method has been evaluated.
result = pd.DataFrame(
    np.full((5, 3), np.nan),
    index=['RandomForests', 'ExtraTrees', 'AdaBoost', 'lightGBM', 'Stacking'],
    columns=['Sensitivity', 'Specificity', 'Accuracy'],
)

### Define a wrapper function for classification and visualization

In [0]:
def Classification(trainData, trainLabel, testData, testLabel, numEstimators, clf, fig, randomSeed=1):
    """Train one ensemble classifier, evaluate it on the test set and plot its predictions.

    Parameters
    ----------
    trainData, trainLabel : training samples and their 0/1 labels.
    testData, testLabel : test samples and their 0/1 labels; the first two
        columns of ``testData`` are the plotted coordinates.
    numEstimators : forwarded as ``n_estimators`` to the classifier.
    clf : str
        One of 'RandomForests', 'lightGBM', 'AdaBoost' or 'ExtraTrees'.
    fig : figure identifier used for ``plt.figure`` / ``plt.close``.
    randomSeed : forwarded as ``random_state`` to the classifier.

    Returns
    -------
    (Sensitivity, Specificity, Accuracy, predProba)
        The three metrics rounded to three decimals, and the output of
        ``predict_proba`` on the test data.

    Raises
    ------
    ValueError
        If ``clf`` is not one of the supported classifier names.
    """
    # Import lazily so that only the library of the requested classifier is required.
    if clf == 'RandomForests':
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=numEstimators, random_state=randomSeed)
    elif clf == 'lightGBM':
        import lightgbm as lgb
        model = lgb.LGBMClassifier(n_estimators=numEstimators, random_state=randomSeed)
    elif clf == 'AdaBoost':
        from sklearn.ensemble import AdaBoostClassifier
        model = AdaBoostClassifier(n_estimators=numEstimators, random_state=randomSeed)
    elif clf == 'ExtraTrees':
        from sklearn.ensemble import ExtraTreesClassifier
        model = ExtraTreesClassifier(n_estimators=numEstimators, random_state=randomSeed)
    else:
        # Without this branch an unknown name would reach model.fit with `model` unbound.
        raise ValueError("Unknown classifier '" + str(clf) + "'; expected one of "
                         "'RandomForests', 'lightGBM', 'AdaBoost', 'ExtraTrees'")

    model.fit(trainData, trainLabel)
    pred = model.predict(testData)
    predProba = model.predict_proba(testData)

    # Sensitivity = recall on label 1, specificity = recall on label 0.
    Sensitivity = np.round(np.sum(pred[np.where(testLabel == 1)[0]] == 1) / 
                           np.sum(testLabel == 1), 3)
    Specificity = np.round(np.sum(pred[np.where(testLabel == 0)[0]] == 0) / 
                           np.sum(testLabel == 0), 3)
    Accuracy = np.round(np.sum(pred == testLabel) / len(testLabel), 3)

    # Points are coloured by the PREDICTED label, not the true one.
    plt.figure(fig)
    plt.scatter(testData[:, 0], testData[:, 1], s=2, c=['red' if pred[i] == 0 else 
        'blue' for i in range(len(pred))], alpha=1)
    plt.title('Sensitivity = ' + str(Sensitivity) + ', Specificity = ' + 
              str(Specificity) + ', Accuracy = ' + str(Accuracy))
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()
    plt.close(fig)

    return Sensitivity, Specificity, Accuracy, predProba

### Random forests classification and visualization

In [0]:
fig += 1
Sensitivity, Specificity, Accuracy, predRF = Classification(
    trainData, trainLabel, testData, testLabel, numEstimators, 'RandomForests', fig, randomSeed)
# Store the three metrics in the summary row; predRF is kept for the stacking step.
result.loc['RandomForests', ['Sensitivity', 'Specificity', 'Accuracy']] = [
    Sensitivity, Specificity, Accuracy]

### Extra trees classification and visualization

In [0]:
fig += 1
Sensitivity, Specificity, Accuracy, predET = Classification(
    trainData, trainLabel, testData, testLabel, numEstimators, 'ExtraTrees', fig, randomSeed)
# Store the three metrics in the summary row; predET is kept for the stacking step.
result.loc['ExtraTrees', ['Sensitivity', 'Specificity', 'Accuracy']] = [
    Sensitivity, Specificity, Accuracy]

### AdaBoost classification and visualization

In [0]:
fig += 1
Sensitivity, Specificity, Accuracy, predAB = Classification(
    trainData, trainLabel, testData, testLabel, numEstimators, 'AdaBoost', fig, randomSeed)
# Store the three metrics in the summary row; predAB is kept for the stacking step.
result.loc['AdaBoost', ['Sensitivity', 'Specificity', 'Accuracy']] = [
    Sensitivity, Specificity, Accuracy]

### LightGBM classification and visualization

In [0]:
fig += 1
Sensitivity, Specificity, Accuracy, predGBM = Classification(
    trainData, trainLabel, testData, testLabel, numEstimators, 'lightGBM', fig, randomSeed)
# Store the three metrics in the summary row; predGBM is kept for the stacking step.
result.loc['lightGBM', ['Sensitivity', 'Specificity', 'Accuracy']] = [
    Sensitivity, Specificity, Accuracy]

### Define a function for generating meta training data

In [0]:
def GenerateMetaTrainingData(data, label, numFold, numEstimators, 
    clf=['RandomForests', 'lightGBM', 'AdaBoost', 'ExtraTrees'], randomSeed=1):
    """Build out-of-fold meta features for stacking.

    The samples are shuffled and split into ``numFold`` folds. For every fold,
    each base classifier is trained on the remaining folds and its predicted
    probability of class 0 (column 0 of ``predict_proba``) for the held-out
    samples is stored, so no sample is scored by a model that saw it in training.

    Parameters
    ----------
    data : 2-D array (samples x features).
    label : 1-D array of class labels, one per row of ``data``.
    numFold : int, at least 2
        Number of cross-validation folds.
    numEstimators : forwarded as ``n_estimators`` to every base classifier.
    clf : sequence of str
        Base classifier names; each must be one of 'RandomForests',
        'lightGBM', 'AdaBoost', 'ExtraTrees'. The sequence is only read.
    randomSeed : seed for the shuffle and for every base classifier.

    Returns
    -------
    Array of shape (number of samples, len(clf)); column c holds the
    out-of-fold class-0 probabilities produced by ``clf[c]``.

    Raises
    ------
    ValueError
        If ``clf`` contains an unknown name or ``numFold`` is smaller than 2.
    """
    knownClf = ('RandomForests', 'lightGBM', 'AdaBoost', 'ExtraTrees')
    unknownClf = [name for name in clf if name not in knownClf]
    if unknownClf:
        # Fail before any training instead of silently reusing the previous model.
        raise ValueError('Unknown classifier name(s): ' + ', '.join(str(name) for name in unknownClf))
    if numFold < 2:
        raise ValueError('numFold must be at least 2, got ' + str(numFold))

    numS = data.shape[0]
    numC = len(clf)
    trainMeta = np.empty((numS, numC))
    randID = np.random.RandomState(seed=randomSeed).permutation(numS)

    # Generate sample IDs for cross-validation.
    # Built-in int replaces np.int, which was removed from NumPy.
    sFold = []
    foldSize = int(np.ceil(numS / numFold))
    for n in range(numFold):
        startID = min(foldSize * n, numS)
        endID = min(foldSize * (n + 1), numS)
        sFold.append({'trainID': np.concatenate((randID[:startID], randID[endID:])),
                      'testID': randID[startID:endID]})

    for n in range(numFold):
        trainID = sFold[n]['trainID']
        testID = sFold[n]['testID']
        if len(testID) == 0:
            # Happens when there are fewer samples than folds; nothing to score here.
            continue
        for c in range(numC):
            model = _BuildBaseClassifier(clf[c], numEstimators, randomSeed)
            model.fit(data[trainID, :], label[trainID])
            trainMeta[testID, c] = model.predict_proba(data[testID, :])[:, 0]

    return trainMeta


def _BuildBaseClassifier(name, numEstimators, randomSeed):
    """Create the base classifier called ``name``, importing its library only when needed."""
    if name == 'RandomForests':
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(n_estimators=numEstimators, random_state=randomSeed)
    if name == 'lightGBM':
        import lightgbm as lgb
        return lgb.LGBMClassifier(n_estimators=numEstimators, random_state=randomSeed)
    if name == 'AdaBoost':
        from sklearn.ensemble import AdaBoostClassifier
        return AdaBoostClassifier(n_estimators=numEstimators, random_state=randomSeed)
    if name == 'ExtraTrees':
        from sklearn.ensemble import ExtraTreesClassifier
        return ExtraTreesClassifier(n_estimators=numEstimators, random_state=randomSeed)
    raise ValueError('Unknown classifier name(s): ' + str(name))

### Generate and standardize meta training data

In [0]:
# Out-of-fold meta features for the training samples (one column per base classifier).
trainMeta = GenerateMetaTrainingData(trainData, trainLabel, numFold, numEstimators,
    clf=['RandomForests', 'lightGBM', 'AdaBoost', 'ExtraTrees'], randomSeed=randomSeed)

# Meta features for the test samples: class-0 probabilities of the models trained on the
# full training set. The column order must match the `clf` order used above.
testMeta = np.empty((predRF.shape[0], 4))
testMeta[:, 0] = predRF[:, 0]
testMeta[:, 1] = predGBM[:, 0]
testMeta[:, 2] = predAB[:, 0]
testMeta[:, 3] = predET[:, 0]

from sklearn.preprocessing import StandardScaler
# Learn the scaling from the training meta features only and apply it to the test set;
# fitting on train and test stacked together would leak test statistics into the model input.
scaler = StandardScaler()
trainMeta = scaler.fit_transform(trainMeta)
testMeta = scaler.transform(testMeta)


### Train logistic regression as the meta classifier to produce final prediction outcome

In [0]:
from sklearn.linear_model import LogisticRegression
# The meta classifier learns from the base classifiers' outputs instead of the raw features.
LR = LogisticRegression(solver='lbfgs', random_state=randomSeed)
LR.fit(trainMeta, trainLabel)
pred = LR.predict(testMeta)

# Sensitivity = recall on label 1, specificity = recall on label 0.
result.loc['Stacking', ['Sensitivity', 'Specificity', 'Accuracy']] = [
    np.round(np.sum(pred[np.where(testLabel == 1)[0]] == 1) / np.sum(testLabel == 1), 3),
    np.round(np.sum(pred[np.where(testLabel == 0)[0]] == 0) / np.sum(testLabel == 0), 3),
    np.round(np.sum(pred == testLabel) / len(testLabel), 3),
]

# Points are coloured by the predicted label.
fig += 1
plt.figure(fig)
plt.scatter(testData[:, 0], testData[:, 1], s=2, alpha=1,
            c=['red' if value == 0 else 'blue' for value in pred])
plt.title(''.join([
    'Sensitivity = ', str(result.loc['Stacking', 'Sensitivity']),
    ', Specificity = ', str(result.loc['Stacking', 'Specificity']),
    ', Accuracy = ', str(result.loc['Stacking', 'Accuracy']),
]))
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
plt.close(fig)

### Summary of prediction performance

In [0]:
# Show the collected metrics: one row per method, one column per metric.
print(result)