In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold

from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Replace warnings.warn with a no-op so calls made through it after this point
# produce no output.
# NOTE(review): this patches the function for the whole process, not only for
# this notebook's own code - confirm that hiding every warning is intended.
def warn(*args, **kwargs):
    """Accept any arguments and do nothing; installed below as warnings.warn."""
    pass
import warnings
warnings.warn = warn

from sklearn.feature_selection import SelectKBest

from keras.layers import Lambda, Input, Dense, MaxPooling1D, BatchNormalization
from keras.models import Model
from keras.losses import mse, binary_crossentropy
from keras import utils
from keras import backend as K
from keras import layers
from keras import objectives
from keras import optimizers
from keras import regularizers
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

import itertools as it
from sklearn.preprocessing import OneHotEncoder

from IPython.display import clear_output

Using TensorFlow backend.


In [2]:
def gaussian_expansion(x_data, y, magnitude, sigma, k, encoder=None, verbose=True):
    """Augment a sample set with Gaussian-noised copies and encode the labels.

    Parameters
    ----------
    x_data : numpy array whose first axis indexes samples. The noised copies
        are reshaped to ``(magnitude * n_samples, k, 1)`` and concatenated
        with ``x_data``, so ``x_data`` has to be compatible with that shape.
    y : per-sample labels, in the same order as ``x_data``.
    magnitude : number of noised copies of the whole set to generate.
    sigma : standard deviation of the zero-mean Gaussian noise.
    k : size of the second axis of the reshaped noised copies.
    encoder : object with a ``transform`` method applied to the labels shaped
        as a single column. When omitted, the notebook-level ``enc`` is used,
        which keeps existing five-argument calls working.
    verbose : when true (default), print the intermediate array shapes as the
        original implementation did.

    Returns
    -------
    (new_x, y_new) : the original samples followed by the noised copies, and
        the encoded labels repeated ``magnitude + 1`` times in matching order.
    """
    if encoder is None:
        # Fallback to the global encoder for callers that do not pass one.
        encoder = enc

    mu = 0.0
    n_samples = x_data.shape[0]

    # One independently noised copy of the full set per unit of magnitude.
    noised_copies = [
        x_data + np.random.normal(mu, sigma, x_data.shape)
        for _ in range(magnitude)
    ]
    x_noised = np.asarray(noised_copies).reshape(magnitude * n_samples, k, 1)

    # Labels: one block for the clean samples plus one block per noised copy,
    # which matches the order used in the concatenation below.
    y_flat = np.asarray([y] * (magnitude + 1)).flatten()
    y_new = encoder.transform(np.reshape(y_flat, (-1, 1)))
    new_x = np.concatenate((x_data, x_noised))

    if verbose:
        print(y_flat.shape)
        print(x_noised.shape)
        print(x_data.shape)
        print(y_new.shape)
        print(new_x.shape)
    return (new_x, y_new)

In [12]:
def create_model(k):
    """Build and compile the 1-D convolutional classifier.

    The network takes inputs of shape ``(k, 1)`` and applies, in order:
    Conv1D(32 filters, kernel size 2), max pooling, dropout (0.2), batch
    normalisation, flatten, a 30-unit ReLU dense layer and a 2-unit softmax
    output. The model summary is printed, the model is compiled with the
    'binary_crossentropy' loss, the 'Adam' optimizer and the 'accuracy'
    metric, and the compiled model is returned.
    """
    net_input = layers.Input(shape=(k, 1,))

    # Layers listed in the order they are applied to the input tensor.
    pipeline = (
        layers.Conv1D(32, 2),
        layers.MaxPooling1D(),
        layers.Dropout(0.2),
        BatchNormalization(),
        layers.Flatten(),
        layers.Dense(30, activation='relu'),
        layers.Dense(2, activation='softmax'),
    )

    tensor = net_input
    for layer in pipeline:
        tensor = layer(tensor)

    fancy_model = Model(inputs=net_input, outputs=tensor)
    fancy_model.summary()
    fancy_model.compile(
        loss='binary_crossentropy',
        optimizer='Adam',
        metrics=['accuracy'],
    )
    return fancy_model

In [11]:
# Load both sample tables as plain value matrices; the first CSV column is
# used as the row index and is therefore not part of the values.
sub, sup = (
    pd.read_csv(csv_path, index_col=0).values
    for csv_path in ('all_subg_samples.csv', 'all_supg_samples.csv')
)

# Show the dimensions of each matrix.
for sample_matrix in (sub, sup):
    print(sample_matrix.shape)

(105, 1998)
(105, 1998)


In [7]:
def kfold_cnn(X, y, n_folds, sigma, noise_size, k, enc):
    """Stratified k-fold evaluation of the CNN with Gaussian-noise augmentation.

    Parameters
    ----------
    X : numpy array of samples, indexed along the first axis.
    y : numpy array of labels aligned with ``X``.
    n_folds : number of stratified folds.
    sigma : standard deviation passed to ``gaussian_expansion``.
    noise_size : number of noised copies passed to ``gaussian_expansion``.
    k : feature count passed to ``gaussian_expansion`` and ``create_model``.
    enc : fitted encoder used to transform the test labels of each fold.

    Returns
    -------
    dict with the parameters under 'Params' and, under 'Result', the mean
    ('mean_acc') and standard deviation ('mean_std') of the per-fold value
    at index 1 of ``model.evaluate``.
    """
    skf = StratifiedKFold(n_splits=n_folds)
    scores = []
    # Split and index the X argument; the previous version read the global
    # X_new here and silently ignored whatever the caller passed in.
    for train, test in skf.split(X, y):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        y_test = enc.transform(np.reshape(y_test, (-1, 1)))
        # Only the training part of the fold is augmented.
        x_tr, y_tr = gaussian_expansion(X_train, y_train, noise_size, sigma, k)
        model = create_model(k)
        # NOTE(review): the held-out fold doubles as the early-stopping
        # validation set, so the accuracy reported below may be optimistic.
        model.fit(x_tr, y_tr, batch_size=6, epochs=30, validation_data=(X_test, y_test),
                  callbacks=[EarlyStopping('val_loss', patience=4, restore_best_weights=True)])
        # Drop the training log from the cell output before evaluating.
        clear_output()
        evals = model.evaluate(X_test, y_test)
        scores.append(evals[1])
    return {'Params': {'noise_sigma': sigma, 'noise_size': noise_size, 'k_best': k},
            'Result': {'mean_acc': np.mean(scores), 'mean_std': np.std(scores)}}

In [6]:
# Hyper-parameter grid for the augmentation: every noise standard deviation
# is paired with every number of noised copies.
noise_sigma = [0.0, 0.01, 0.005]
noise_size = [10, 25, 40]

# itertools.product is a single-pass iterator; materialising it leaves `gen`
# exhausted and `noise_params` holding the (sigma, size) tuples.
gen = it.product(noise_sigma, noise_size)
noise_params = list(gen)


In [13]:
# Stack both groups into one matrix; the rows of `sub` come first.
X = np.concatenate((sub, sup))

# Class labels: 0 for every `sub` row, 1 for every `sup` row. Sizes are taken
# from the data instead of being hard-coded.
y = np.asarray([0] * len(sub) + [1] * len(sup))

# Number of features kept by the mutual-information filter; also the input
# length handed to the CNN.
K_BEST = 200
# Number of stratified folds used for each parameter combination.
N_FOLDS = 15

# NOTE(review): the feature filter is fit on all rows before the folds are
# formed, so the test rows of every fold influence which features are kept.
X_new = SelectKBest(mutual_info_classif, k=K_BEST).fit_transform(X, y)
# Add a trailing channel axis: (samples, features) -> (samples, features, 1).
X_new = np.reshape(X_new, (X_new.shape[0], X_new.shape[1], 1))


# One-hot encoder fitted on the two class labels as a single column.
enc = OneHotEncoder(sparse=False)
labels = np.arange(2)
labels = np.reshape(labels, (len(labels), 1))
enc.fit(labels)

# One cross-validated summary per (sigma, number-of-copies) combination.
results = []

for params in noise_params:
    results.append(kfold_cnn(X_new, y, N_FOLDS, params[0], params[1], K_BEST, enc))



In [14]:
# Display the list of per-configuration summaries held in `results`.
results

[{'Params': {'noise_sigma': 0.0, 'noise_size': 10, 'k_best': 200},
  'Result': {'mean_acc': 0.6238095303376515, 'mean_std': 0.10900497954220405}},
 {'Params': {'noise_sigma': 0.0, 'noise_size': 25, 'k_best': 200},
  'Result': {'mean_acc': 0.6666666666666666, 'mean_std': 0.13468699510253443}},
 {'Params': {'noise_sigma': 0.0, 'noise_size': 40, 'k_best': 200},
  'Result': {'mean_acc': 0.6476190487543741, 'mean_std': 0.1341809802504021}},
 {'Params': {'noise_sigma': 0.01, 'noise_size': 10, 'k_best': 200},
  'Result': {'mean_acc': 0.680952384074529, 'mean_std': 0.09712418019542482}},
 {'Params': {'noise_sigma': 0.01, 'noise_size': 25, 'k_best': 200},
  'Result': {'mean_acc': 0.6857142905394237, 'mean_std': 0.09689042724229598}},
 {'Params': {'noise_sigma': 0.01, 'noise_size': 40, 'k_best': 200},
  'Result': {'mean_acc': 0.6619047681490581, 'mean_std': 0.10900498605180613}},
 {'Params': {'noise_sigma': 0.005, 'noise_size': 10, 'k_best': 200},
  'Result': {'mean_acc': 0.6523809572060902, 'me

In [8]:
# Display `results` again.
# NOTE(review): the output recorded under this cell lists k_best=100, sigma 0.1
# and noise_size 100, none of which appear in the grid defined in the visible
# cells - it looks like a leftover from an earlier run; confirm or remove.
results

[{'Params': {'noise_sigma': 0.0, 'noise_size': 10, 'k_best': 100},
  'Result': {'mean_acc': 0.8875, 'mean_std': 0.1849831073368593}},
 {'Params': {'noise_sigma': 0.0, 'noise_size': 25, 'k_best': 100},
  'Result': {'mean_acc': 0.9125, 'mean_std': 0.16345871038277526}},
 {'Params': {'noise_sigma': 0.0, 'noise_size': 40, 'k_best': 100},
  'Result': {'mean_acc': 0.8875, 'mean_std': 0.1849831073368593}},
 {'Params': {'noise_sigma': 0.0, 'noise_size': 100, 'k_best': 100},
  'Result': {'mean_acc': 0.8875, 'mean_std': 0.20116846174288852}},
 {'Params': {'noise_sigma': 0.1, 'noise_size': 10, 'k_best': 100},
  'Result': {'mean_acc': 0.675, 'mean_std': 0.225}},
 {'Params': {'noise_sigma': 0.1, 'noise_size': 25, 'k_best': 100},
  'Result': {'mean_acc': 0.625, 'mean_std': 0.20155644370746376}},
 {'Params': {'noise_sigma': 0.1, 'noise_size': 40, 'k_best': 100},
  'Result': {'mean_acc': 0.6625, 'mean_std': 0.21323402636539976}},
 {'Params': {'noise_sigma': 0.1, 'noise_size': 100, 'k_best': 100},
  'R

In [13]:
def partition(data, sample_ids=range(11, 26)):
    """Build one train/test split per sample identifier from the row labels.

    For every identifier, the rows whose index label contains the
    identifier's decimal text form the test set and all other rows form the
    training set. Matching is by substring, so an identifier such as 11 also
    matches a label containing "111".

    Parameters
    ----------
    data : object with an ``index`` of row labels (for example a DataFrame).
    sample_ids : identifiers to hold out, one split each. Defaults to
        11..25 inclusive, the range that was previously hard-coded.

    Returns
    -------
    list of ``(train, test)`` tuples of positional row indices, in the order
    of ``sample_ids``.
    """
    # Coerce once so non-string labels can be substring-matched as well.
    row_labels = [str(label) for label in data.index]
    indices = []
    for sample_id in sample_ids:
        token = "{0}".format(sample_id)
        train, test = [], []
        for position, label in enumerate(row_labels):
            if token in label:
                test.append(position)
            else:
                train.append(position)
        indices.append((train, test))
    return indices

In [14]:
# X must be a DataFrame here: its index labels decide the folds.
def kfold_partitioned_cnn(X, y, sigma, noise_size, k, enc):
    """Evaluate the CNN on the folds produced by ``partition(X)``.

    For each (train, test) pair of row positions, the training rows are
    augmented with ``gaussian_expansion``, a fresh model from
    ``create_model(k)`` is fitted for up to 10 epochs with early stopping on
    'val_loss', and the value at index 1 of ``model.evaluate`` on the test
    rows is recorded.

    Returns a dict with the parameters under 'Params' and the mean
    ('mean_acc') and standard deviation ('mean_std') of the recorded values
    under 'Result'.
    """
    fold_accuracies = []
    for train_rows, test_rows in partition(X):
        # Select the rows, then append a channel axis: (rows, features, 1).
        X_train = X.values[train_rows]
        X_test = X.values[test_rows]
        X_train = X_train[:, :, np.newaxis]
        X_test = X_test[:, :, np.newaxis]

        y_train = y[train_rows]
        y_test = enc.transform(np.reshape(y[test_rows], (-1, 1)))

        x_tr, y_tr = gaussian_expansion(X_train, y_train, noise_size, sigma, k)

        model = create_model(k)
        stopper = EarlyStopping('val_loss', patience=4, restore_best_weights=True)
        model.fit(
            x_tr,
            y_tr,
            batch_size=6,
            epochs=10,
            validation_data=(X_test, y_test),
            callbacks=[stopper],
        )
        # Remove the training log from the cell output before evaluating.
        clear_output()
        fold_accuracies.append(model.evaluate(X_test, y_test)[1])

    params_used = {'noise_sigma': sigma, 'noise_size': noise_size, 'k_best': k}
    summary = {'mean_acc': np.mean(fold_accuracies), 'mean_std': np.std(fold_accuracies)}
    return {'Params': params_used, 'Result': summary}

In [15]:
# Load both tables as DataFrames; the first CSV column becomes the row index.
sub = pd.read_csv('sub_best.csv', index_col=0)
sup = pd.read_csv('sup_best.csv', index_col=0)

# Augmentation grid: every noise standard deviation paired with every number
# of noised copies.
noise_sigma = [0.0, 0.01, 0.005]
noise_size = [10, 25, 40]
gen = it.product(noise_sigma, noise_size)
noise_params = list(gen)

# Stack the rows of `sub` above those of `sup`. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
X = pd.concat([sub, sup])

In [16]:
# Load both tables as DataFrames; the first CSV column becomes the row index.
sub = pd.read_csv('sub_best.csv', index_col=0)
sup = pd.read_csv('sup_best.csv', index_col=0)
# Stack the rows of `sub` above those of `sup`. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data = pd.concat([sub, sup])

# Keep the row labels so they can be re-attached after feature selection,
# which works on (and returns) a bare value matrix.
data_index = data.index

X_vals = data.values

# NOTE(review): `y` is not defined in this cell; it has to come from an
# earlier cell and must line up row-for-row with `data` - confirm.
X_new = SelectKBest(mutual_info_classif, k=200).fit_transform(X_vals, y)

# Selected features with the original row labels as index.
X = pd.DataFrame(data=X_new, index=data_index)

In [17]:
# One summary from kfold_partitioned_cnn per (sigma, number-of-copies) pair,
# always with k = 200.
results2 = []
for sigma_value, n_copies in noise_params:
    fold_summary = kfold_partitioned_cnn(X, y, sigma_value, n_copies, 200, enc)
    results2.append(fold_summary)



In [18]:
# Display the list of per-configuration summaries held in `results2`.
results2

[{'Params': {'noise_sigma': 0.0, 'noise_size': 10, 'k_best': 200},
  'Result': {'mean_acc': 0.8979166666666667, 'mean_std': 0.14722549786712294}},
 {'Params': {'noise_sigma': 0.0, 'noise_size': 25, 'k_best': 200},
  'Result': {'mean_acc': 0.9166666666666666, 'mean_std': 0.11102427162061856}},
 {'Params': {'noise_sigma': 0.0, 'noise_size': 40, 'k_best': 200},
  'Result': {'mean_acc': 0.88125, 'mean_std': 0.1746275799141323}},
 {'Params': {'noise_sigma': 0.01, 'noise_size': 10, 'k_best': 200},
  'Result': {'mean_acc': 0.8916666666666667, 'mean_std': 0.1259684704819248}},
 {'Params': {'noise_sigma': 0.01, 'noise_size': 25, 'k_best': 200},
  'Result': {'mean_acc': 0.9333333333333333, 'mean_std': 0.10324593077803224}},
 {'Params': {'noise_sigma': 0.01, 'noise_size': 40, 'k_best': 200},
  'Result': {'mean_acc': 0.925, 'mean_std': 0.10813282726967482}},
 {'Params': {'noise_sigma': 0.005, 'noise_size': 10, 'k_best': 200},
  'Result': {'mean_acc': 0.9104166666666667, 'mean_std': 0.1190602671852

In [21]:
# Print every entry of `scores`, then their mean and standard deviation.
# NOTE(review): `scores` is not assigned at notebook level in any visible
# cell - confirm which run it comes from.
for fold_score in scores:
    print(fold_score)
score_mean = np.mean(scores)
score_std = np.std(scores)
print(score_mean, " ", score_std) #no noise

1.0
1.0
0.75
0.75
0.75
0.75
1.0
0.75
1.0
0.75
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.5
0.9   0.14577379737113252


In [10]:
#params: 40fold noise generation with std=0.001
#20 fold cv with batch size 6, epochs=10, patience=4, early stopping
# Print every entry of `scores`, then their mean and standard deviation.
# NOTE(review): `scores` is not assigned at notebook level in any visible
# cell - confirm which run it comes from.
for fold_score in scores:
    print(fold_score)
score_mean = np.mean(scores)
score_std = np.std(scores)
print(score_mean, " ", score_std)


0.75
1.0
1.0
0.75
1.0
1.0
1.0
1.0
1.0
0.75
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.5
1.0
1.0
0.9375   0.1340475661845451


In [24]:
# Print every entry of `scores`, then their mean and standard deviation.
# NOTE(review): `scores` is not assigned at notebook level in any visible
# cell - confirm which run it comes from.
for fold_score in scores:
    print(fold_score)
score_mean = np.mean(scores)
score_std = np.std(scores)
print(score_mean, " ", score_std) #sigma = 0.005 for noise

0.75
1.0
1.0
0.5
0.5
1.0
1.0
1.0
1.0
0.75
1.0
0.5
1.0
0.5
1.0
1.0
1.0
1.0
1.0
1.0
0.875   0.20155644370746376
