In [1]:
import numpy as np
import pandas as pd
from time import time
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from random import randint
import os
import matplotlib.pyplot as plt
import tensorflow_addons as tfa


import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import plotting_funcs as pf
import preprocessing as prep
import clustering_model as cm
import config as conf
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [2]:
from sklearn import preprocessing

# Load the pre-built sample: X holds the input arrays, y the per-sample label
# that is looked up in conf.wells_to_genetype_dict below.
X = np.load('x_balanced_sample_16.npy')
y = np.load('y_balanced_sample_16.npy')

# Two coarser labelings derived from y: the mapped gene type, and the first
# character of each label.
types = np.array([conf.wells_to_genetype_dict[label] for label in y])
y_letter = np.array([label[:1] for label in y])

# Integer-encode the gene types, then expand them to one-hot rows.
# (The same encoder could be fit on `y` or `y_letter` for the other labelings.)
le = preprocessing.LabelEncoder()
labs_by_type = to_categorical(le.fit_transform(types))

# First two channels only, plus the augmented version produced by the
# project helper.
X_XY = X[:, :, 0:2]
X_added = prep.add_transformations(X)

print(X_XY.shape, labs_by_type.shape)

(10000, 16, 2) (10000, 5)


In [3]:
def preprocess_x(train,test):
    """Standardize `train` and `test` with statistics learned from `train` only.

    Each array is flattened to 2-D so that its last axis is treated as the
    feature axis, scaled, and then reshaped back to its original shape.

    Returns:
        (train_scaled, test_scaled) with the same shapes as the inputs.
    """
    train_2d = train.reshape(-1, train.shape[-1])
    test_2d = test.reshape(-1, test.shape[-1])

    # Fit on the training rows only so no validation statistics leak in.
    scaler = StandardScaler().fit(train_2d)

    train_scaled = scaler.transform(train_2d).reshape(train.shape)
    test_scaled = scaler.transform(test_2d).reshape(test.shape)
    return train_scaled, test_scaled

In [4]:
# ---- Fixed model dimensions, read off the data ----
# Last two axes of X_XY are (timesteps, input_dim).
timesteps, input_dim = X_XY.shape[-2:]
num_labels = np.unique(types).size

# ---- Architecture hyper-parameters (candidates for tuning) ----
n_filters_ = [64, 64, 32]  # list length may vary (>1)
n_units_ = [64, 8]
kernel_size_ = 12
pool_size_ = 8  # original note: 8 equals flatten of 16

# ---- Fixed training settings ----
optimizer = 'adam'
# Used with the one-hot targets produced by to_categorical.
loss = 'categorical_crossentropy'
epochs = 100
batch_size = 256  # not really interesting to optimize
save_dir = 'results/tmp'
# Stop a fit once val_loss has not improved for 3 consecutive epochs.
es = EarlyStopping(patience=3, verbose=1, mode='min', monitor='val_loss')

In [None]:
# Grid search over architecture hyper-parameters.
# Every combination is scored with stratified k-fold cross-validation; one row
# per combination is appended to `results` in the order
#   [mean_acc, mean_auc, pool_size, kernel_size, n_units[0], n_units[1],
#    n_filters[0], n_filters[1], n_filters[2], n_filters[3]].
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True,random_state=42)
results = []
# Second iteration over the best options that are different enough to try again.
# Every n_filters entry must have exactly 4 values and every n_units entry
# exactly 2, because the result row below indexes them positionally.
n_filters_vec = [[128,64,64,32],[64,64,0,0],[64,64,32,0]]
n_units_vec = [[128,16],[64,8]]
kernel_size_vec = [8,12,16]
pool_size_vec = [8,16] # 8 equals latent of 16, 16 equals latent of 8
# total = 3*2*3*2 = 36 combinations, each trained N_SPLITS times

iii=0
for pool_size in pool_size_vec:
    for kernel_size in kernel_size_vec:
        for n_units in n_units_vec:
            for n_filters in n_filters_vec:
                val_accs = []
                val_aucs = []
                # NOTE(review): folds are stratified on `y`, while the model is
                # trained on `labs_by_type` -- confirm that is intended.
                for index, (train_indices, val_indices) in enumerate(skf.split(X_XY, y)):
                    print("fold " + str(index+1) + "/" + str(N_SPLITS) + "...")
                    X_train, X_val = preprocess_x(X_XY[train_indices], X_XY[val_indices])
                    y_train, y_val = labs_by_type[train_indices], labs_by_type[val_indices]

                    # Many models are built in this one kernel; release the
                    # global Keras state left behind by the previous model so
                    # memory use does not keep growing across folds.
                    tf.keras.backend.clear_session()
                    classifier_model = cm.temporal_classifier(input_dim,num_labels,timesteps,n_filters,kernel_size,pool_size,n_units)
                    classifier_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy',tf.keras.metrics.AUC(multi_label = True,name = 'auc')])

                    t0 = time()
                    # `callbacks` is documented as a list of callback instances.
                    history = classifier_model.fit(X_train, y_train,validation_data=(X_val,y_val) , batch_size=batch_size, epochs=epochs, verbose=0, callbacks=[es])
                    print('Pretraining time: ', time() - t0)

                    # evaluate() returns [loss, accuracy, auc] in compile order.
                    _,train_acc,train_auc = classifier_model.evaluate(X_train, y_train, verbose=0)
                    _,test_acc,test_auc = classifier_model.evaluate(X_val, y_val, verbose=0)
                    print('accuracy: Train: %.3f, Test: %.3f' % (train_acc, test_acc))
                    print('auc: Train: %.3f, Test: %.3f' % (train_auc, test_auc))
                    val_aucs.append(test_auc)
                    val_accs.append(test_acc)
                iii+=1
                print(iii)
                # Average the validation metrics over the folds once and reuse.
                mean_acc = np.mean(val_accs)
                mean_auc = np.mean(val_aucs)
                print('mean validation: Accuracy: %.3f, Auc: %.3f' % (mean_acc, mean_auc))
                results.append([mean_acc,mean_auc,pool_size,kernel_size,n_units[0],n_units[1],n_filters[0],n_filters[1],n_filters[2],n_filters[3]])

In [None]:
# Persist the collected grid-search rows to disk as a single array.
np.save(file='kfold_cv_results.npy', arr=np.array(results))

# explore results

In [15]:
# Column labels, in the same order as the values stored in each `results` row.
col_names = (
    ['accuracy', 'AUC']
    + ['pool_size', 'kernel_size']
    + ['lstm1', 'lstm2']
    + ['conv1', 'conv2', 'conv3', 'conv4']
)
results_df = pd.DataFrame(data=results, columns=col_names)
# Combined ranking score: accuracy counts twice as much as AUC.
results_df['auc2acc'] = results_df['accuracy'] * 2 + results_df['AUC']

In [17]:
# Rank the configurations by the combined score, best first.
results_df.sort_values('auc2acc', ascending=False)

Unnamed: 0,accuracy,AUC,pool_size,kernel_size,lstm1,lstm2,conv1,conv2,conv3,conv4,auc2acc
12,0.4584,0.759117,8,16,128,16,128,64,64,32,1.675917
18,0.4503,0.759338,16,8,128,16,128,64,64,32,1.659938
6,0.4501,0.756715,8,12,128,16,128,64,64,32,1.656915
24,0.4493,0.757122,16,12,128,16,128,64,64,32,1.655722
10,0.4487,0.756221,8,12,64,8,64,64,0,0,1.653621
13,0.4497,0.752484,8,16,128,16,64,64,0,0,1.651884
7,0.4459,0.754627,8,12,128,16,64,64,0,0,1.646427
14,0.4468,0.752243,8,16,128,16,64,64,32,0,1.645843
0,0.4435,0.755277,8,8,128,16,128,64,64,32,1.642277
1,0.4432,0.754475,8,8,128,16,64,64,0,0,1.640875


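The best architecture is #12: pool_size=8, kernel_size=16, LSTM units 128/16, conv filters 128/64/64/32 (accuracy 0.458, AUC 0.759).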