In [1]:
#Import packages
from keras import backend as K
import numpy as np
from random import randint, random,sample
from numpy import array
from numpy import argmax
import pandas as pd
from Bio import SeqIO
from Bio.Seq import translate
from sklearn.cross_validation import train_test_split
from matplotlib import pyplot
import math
from time import time
from numpy import unique
from pandas import DataFrame
import pickle
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

ModuleNotFoundError: No module named 'keras'

In [3]:
#Reading the High variable region data used in the RandomForest classifier (16S Classifier)
#to exclude the conserved region sequences
HVR  = pd.read_csv('HVR.csv')
HVR.columns = ['S.No','forward_primer','For_seq','reverse_primer','Rev_seq','Region']
HVR.head()

Unnamed: 0,S.No,forward_primer,For_seq,reverse_primer,Rev_seq,Region
0,1,119,AGYGGCGNACGGGTGAGTAA,338,TGCTGCCTCCCGTAGGAGT,V2
1,2,357,CCTACGGGAGGCAGCAG,518,ATTACCGCGGCTGCTGG,V3
2,3,577,AYTGGGYDTAAAGNG,785,TACNVGGGTATCTAATCC,V4
3,4,785,AGGATTAGATACCCT,907,CCGTCAATTCCTTTGAGTTT,V5
4,5,978,TCGAtGCAACGCGAAGAA,1062,ACATtTCACaACACGAGCTGACGA,V6


In [4]:
from sklearn.metrics import r2_score, accuracy_score

In [5]:
#Reading a small subset of the SILVA databae
#For full training download this 
# https://www.arb-silva.de/fileadmin/silva_databases/release_132/Exports/SILVA_132_SSURef_Nr99_tax_silva.fasta.gz
path = 'SILVA_132_dump.fasta'

In [14]:
#Determine the maximum length of any sequence to add zeros in the end (zero padding)
#max_len = max(df['len'])
max_len=4000

In [13]:
def zero_pad(list1):
    x = np.pad(list1, (0,max_len-len(list1)), 'constant', constant_values=0)
    return x.astype('uint8')

In [12]:
##Function for Nucleotide sequence one hot encoding
#1 Declare the alphabet
alphabet = 'ACGTN'
integer = [1,2,3,4,5]
#2 Declare mapping functions
char_to_int = {'A':1,'C':2,'G':3,'U':4,'R':0,'Y':0,'S':0,
               'W':0,'K':0,'M':0,'B':0,'D':0,'H':0,'V':0,'N':5}
int_to_char = {1:'A',2:'C',3:'G',4:'U',0:'R',0:'Y',0:'S',
               0:'W',0:'K',0:'M',0:'B',0:'D',0:'H',0:'V',5:'N'}

def encode_nu(sequence,n_features=4):
    #3 convert char to number
    encoded = [char_to_int[char] for char in sequence]
    return array(encoded)#.astype('uint8')

# Decode a encoded string
def decode_nu(encoded):
    decoded =  ''
    decoded = [int_to_char[integ] for integ in encoded]
    return decoded

In [10]:
def encode_pad(seq):
    return zero_pad(encode_nu(str(record.seq)))

In [9]:
#Reading the database as a panda dataframe
reads=[]
for record in SeqIO.parse(path, "fasta"):
    str_ = str(record.description).split(" ", 1)[1]
    encoded = record.seq 
    #encoded = zero_pad(encode_nu(str(record.seq)))
    
    if str_.count(';') == 6:
        [kingdom, phylum, class_,order,family,genus,species] = str_.split(';')
    elif str_.count(';') > 6:    
        list =  str_.split(';')
        [kingdom,phylum, class_,order,family,genus,species] = list[-7:]
    else:
        list =  str_.split(';')
        num = len(list)
        [kingdom,phylum, class_,order,family,genus,species][:num] = list
        [kingdom,phylum, class_,order,family,genus,species][num:] = '_'*(7-num)
    reads.append([record.name,
                  encoded,
                  kingdom ,phylum, class_,order,family,genus,species
                  ,len(record.seq)])    
    
df=pd.DataFrame(reads,columns=['id','seq','kingdom','phylum','class_','order',
                               'family','genus','species','len'])

In [15]:
# Applying zero padding and encoding
df['seq'] = df['seq'].apply(encode_pad)

In [20]:
print("The number of unique Genus classes : "  ,len(np.unique(df['genus'])))

The number of unique Genus classes :  557


In [22]:
# Converting classes into vectors
df['kingdom'] = pd.factorize(df['kingdom'].values)[0]
df['phylum'] = pd.factorize(df['phylum'].values)[0]
df['class_'] = pd.factorize(df['class_'].values)[0]
df['order'] = pd.factorize(df['order'].values)[0]
df['family'] = pd.factorize(df['family'].values)[0]
df['genus'] = pd.factorize(df['genus'].values)[0]

In [23]:
len(unique(df['genus'])),len(unique(df['family'])),len(unique(df['order'])),len(unique(df['class_'])), len(unique(df['phylum']))

(557, 176, 86, 34, 22)

In [15]:
counts = df['phylum'].value_counts()
df = df[df['phylum'].isin(counts[counts > 2].index)]
counts = df['class_'].value_counts()
df = df[df['class_'].isin(counts[counts > 2].index)]
counts = df['order'].value_counts()
df = df[df['order'].isin(counts[counts > 2].index)]
counts = df['family'].value_counts()
df = df[df['family'].isin(counts[counts > 2].index)]
counts = df['genus'].value_counts()
df = df[df['genus'].isin(counts[counts > 2].index)]

In [16]:
len(unique(df['genus'])),len(unique(df['family'])),len(unique(df['order'])),len(unique(df['class_'])), len(unique(df['phylum']))

(148, 68, 37, 13, 10)

In [17]:
df.shape

(2330, 10)

# Apply multi outupt stratified sampling to ensure homogenous distribution to all features

In [29]:
def get_all_labels(instances):
    all_labels = set()
    for x in instances:
        all_labels |= set(x)
    return all_labels

In [30]:
def stratified_cross_validation_split(instances, k=10, r=None):
    """Creates a stratified cross-validation split of the given dataset as described in [1,2].
    [1] http://lpis.csd.auth.gr/publications/sechidis-ecmlpkdd-2011.pdf
    [2] https://de.slideshare.net/tsoumakas/on-the-stratification-of-multilabel-data
    Args:
        instances (list): Is a list of lists. For example [[1, 2, 3], [4,5]] means that the first instance has labels [1,2,3] attached, and the second item has the labels [4,5].
        k (int, optional): The number of folds
        r (None, optional): Weighting of sets. See paper
    Returns:
        list: A list with k lists. The items in one of the k lists are the indices of the instances. So when the list [[1, 3], [2,5]] is returned, the first set has the items [1, 3] and the second set has [2, 5] as instances. The number in the set are the indices of the instance in the "instances" parameter.
    """
    labels = get_all_labels(instances)

    num_labels = len(labels)
    num_instances = len(instances)

    assert(num_labels > 0)
    assert(num_instances > 0)
    assert(k > 0 and k <= num_instances)

    if not r:
        r = [1 / k] * k

    # Calculate the desired number of examples at each subset
    c_1 = [math.floor(num_instances * x) for x in r]
    c_2 = {}
    # Calculate the desired number of examples of each label at each subset
    def get_D(used_instances):
        D = {}
        for idx, instance in enumerate(instances):
            for label in instance:
                if label not in D:
                    D[label] = []
                D[label].append(idx)
        # for label in labels:
        #    # Find the examples of each label in the initial set
        #    D_2[label] = [idx for idx, x in enumerate(instances) if label in x and idx not in used_instances]
        return D

    # Most expensive
    D = get_D(set())
    for label, vals in D.items():
        c_2[label] = [len(vals) * x for x in r]

    used_instances = set()
    S = [[] for x in range(k)]
    counter = 0
    checkpoints = [x * num_instances / k for x in range(k)]
    print('Starting sorting elements into sets')
    while len(used_instances) < len(instances):
        if len(checkpoints) and len(used_instances) > checkpoints[0]:
            print('{:>2.0f}%'.format(len(used_instances) / len(instances) * 100))
            checkpoints = checkpoints[1:] if len(checkpoints) else []
        counter += 1
        # Find the label with the fewest (but at least one) remaining examples, breaking ties randomly
        def sort_fn(x):
            return len(x[1])
        l = min(D.items(), key=sort_fn)
        l = l[0]
        for inst in D[l]:
            M = sorted(enumerate(c_2[l]), key=lambda x: x[1])
            M = [(idx, examples_wished) for idx, examples_wished in M if examples_wished == M[-1][1]]
            if len(M) == 1:
                m = M[0][0]
            else:
                idxs = [x[0] for x in M]
                np.random.shuffle(idxs)
                s = sorted([(idx, x) for idx, x in enumerate(c_1) if idx in idxs], key=lambda x: [1])
                s = [idx for idx, x in s if x == s[-1][1]]
                if len(s) == 1:
                    m = s[0]
                else:
                    m = s[np.random.randint(0, len(s))]
            # Find the subset(s) with the largest number of desired examples for this
            # label, breaking ties by considering the largest number of desired
            # examples, breaking further ties randomly
            Y = instances[inst]
            S[m].append(inst)
            used_instances.add(inst)
            # Update desired number of examples
            for label in Y:
                if inst in D[label]:
                    # Instead of updating D globally on each iteration, update it on each instance
                    D[label].remove(inst)
                    if len(D[label]) == 0:
                        # Delete label from D so that the sorting gets faster
                        del D[label]
                c_2[label][m] -= 1
            c_1[m] -= 1
    print("StratifyIterations: {}".format(counter + 1))
    return S

In [31]:
#x = [[1,2,3,12,11,11,13], [4,5,15,15,15,15],[13,15,72,72,15,13]]
x= df.iloc[:,2:7]
y = np.array(x).tolist()
#print(x)
np.random.seed(2)
split = stratified_cross_validation_split(y,k=3,r=(0.7,0.1,0.2))

splitting_index = open('split_test.txt', 'w')
for item in split:
    splitting_index.write("%s\n" % item)

Starting sorting elements into sets
 0%
33%
69%
StratifyIterations: 547


## Saving the splitting index

In [32]:
for i in range(3):
    with open('split__'+str(i), 'wb') as fp:
        pickle.dump(split[i], fp)

In [33]:
split_list = []
for i in range(3):
    with open ('split__'+str(i), 'rb') as fp:
        split_list.append(pickle.load(fp))

In [34]:
split_list = split

In [35]:
# Build Train , test , valid data from the slitting

In [36]:
train = df.iloc[split_list[0],:]
valid = df.iloc[split_list[1],:]
test = df.iloc[split_list[2],:]

In [37]:
train.shape, test.shape, valid.shape

((3632, 10), (1033, 10), (519, 10))

In [38]:
#Ensuring all data have the same labels

In [39]:
train = train[train['family'].isin(unique(valid['family']))]
test = test[test['family'].isin(unique(valid['family']))]

In [40]:
train = train[train['genus'].isin(unique(valid['genus']))]
test = test[test['genus'].isin(unique(valid['genus']))]

In [41]:
train = train[train['family'].isin(unique(test['family']))]
valid = valid[valid['family'].isin(unique(test['family']))]

In [42]:
train = train[train['genus'].isin(unique(test['genus']))]
valid = valid[valid['genus'].isin(unique(test['genus']))]

In [43]:
test = test[test['genus'].isin(unique(train['genus']))]
valid = valid[valid['genus'].isin(unique(train['genus']))]

In [44]:
len(np.unique(test['genus'])),len(np.unique(train['genus'])),len(np.unique(valid['genus']))

(114, 114, 114)

In [45]:
len(np.unique(test['family'])),len(np.unique(train['family'])),len(np.unique(valid['family']))

(64, 63, 63)

In [46]:
len(np.unique(test['order'])),len(np.unique(train['order'])),len(np.unique(valid['order']))

(31, 31, 31)

In [47]:
len(np.unique(test['class_'])),len(np.unique(train['class_'])),len(np.unique(valid['class_']))

(10, 10, 10)

In [48]:
len(np.unique(test['phylum'])),len(np.unique(train['phylum'])),len(np.unique(valid['phylum']))

(7, 7, 7)

In [26]:
#Data normaliztion

In [49]:
x_train = train['seq']/5
x_test  = test['seq']/5
x_valid = valid['seq']/5

In [50]:
x_train = np.concatenate(x_train.values).reshape(x_train.shape[0],max_len).tolist()
x_test = np.concatenate(x_test.values).reshape(x_test.shape[0],max_len).tolist()
x_valid = np.concatenate(x_valid.values).reshape(x_valid.shape[0],max_len).tolist()

In [51]:
#Convering each label to categorical
from keras.utils import to_categorical

y1_train = to_categorical(train['phylum'])
y1_test = to_categorical(test['phylum'])
y1_valid = to_categorical(valid['phylum'])

y2_train = to_categorical(train['class_'])
y2_test = to_categorical(test['class_'])
y2_valid = to_categorical(valid['class_'])

y3_train = to_categorical(train['order'])
y3_test = to_categorical(test['order'])
y3_valid = to_categorical(valid['order'])

y4_train = to_categorical(train['family'])
y4_test = to_categorical(test['family'])
y4_valid = to_categorical(valid['family'])

y5_train = to_categorical(train['genus'])
y5_test = to_categorical(test['genus'])
y5_valid = to_categorical(valid['genus'])

In [52]:
y_train = [y1_train,y2_train,y3_train,y4_train,y5_train]
y_test = [y1_test,y2_test,y3_test,y4_test,y5_test]
y_valid= [y1_valid,y2_valid,y3_valid,y4_valid,y5_valid]

In [53]:
y_train[0].shape

(2870, 20)

In [54]:
y_valid[0].shape

(439, 20)

In [55]:
y_test[0].shape

(848, 20)

In [56]:
from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Input, Masking, Dropout
from keras.layers import LSTM, TimeDistributed, AveragePooling1D, Flatten, TimeDistributed
from keras.optimizers import Adam
from keras.utils.training_utils import multi_gpu_model
from keras.callbacks import ReduceLROnPlateau, EarlyStopping,ModelCheckpoint, CSVLogger
from keras.layers import Conv1D, GlobalMaxPooling1D, ConvLSTM2D, Bidirectional

In [43]:
#Uncomment for parallel GPU training
import os
##os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [57]:
# Model1
#LSTM_Masked_no_dropout

print('Build LSTM_no_dropout model...')

input1 = Input(shape=(max_len,))
m = Embedding(6, 50)(input1)
m = Masking(mask_value=0)(m)
m = LSTM(32,return_sequences=False)(m)
output1 = Dense(y1_train.shape[1],activation='softmax', name='output1')(m)
output2 = Dense(y2_train.shape[1],activation='softmax', name='output2')(m)
output3 = Dense(y3_train.shape[1],activation='softmax', name='output3')(m)
output4 = Dense(y4_train.shape[1],activation='softmax', name='output4')(m)
output5 = Dense(y5_train.shape[1],activation='softmax', name='output5')(m)

m = Model(inputs=[input1], outputs=[output1, output2, output3,output4, output5])

#m = multi_gpu_model(m, gpus=4)

optimizer = Adam(lr=0.0001)
m.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['acc']) 

Build LSTM_no_dropout model...


In [None]:
m.fit(np.array(x_train),
      [y1_train,y2_train,y3_train,y4_train,y5_train],
      batch_size=10,
      validation_data=(np.array(x_valid),y_valid),
      epochs=95)

In [80]:
# Model2
#CNN_no_dropout

print('Build CNN_no_dropout model...')

input1 = Input(shape=(max_len,), dtype='uint8')
m = Embedding(5, 32)(input1)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Flatten()(m)
#m = Dense(32)(m)
output1 = Dense(y1_train.shape[1],activation='softmax', name='output1')(m)
output2 = Dense(y2_train.shape[1],activation='softmax', name='output2')(m)
output3 = Dense(y3_train.shape[1],activation='softmax', name='output3')(m)
output4 = Dense(y4_train.shape[1],activation='softmax', name='output4')(m)
output5 = Dense(y5_train.shape[1],activation='softmax', name='output5')(m)

m = Model(inputs=[input1], outputs=[output1, output2, output3,output4, output5])

#m = multi_gpu_model(m, gpus=4)
optimizer = Adam(lr=0.00001)
m.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['acc']) 

Build CNN_no_dropout model...


In [136]:
filepath = ''.join('./smallRun/CNN.hdfs')
Checkpoint = ModelCheckpoint(filepath,monitor='val_loss',
                             verbose=0, save_best_only=True,
                             save_weights_only=True, mode='auto', period=1)
m.fit(np.array(x_train),y_train,callbacks=[Checkpoint],verbose=0,
      batch_size=10,validation_data=(np.array(x_valid),y_valid),
      epochs=60)

<keras.callbacks.History at 0x7f7dbe6eeac8>

In [None]:
m.load_weights('./smallRun/CNN.hdfs')
pred = m.predict(np.array(x_test))
accuracy_score(y_test[4].argmax(axis=1),pred[4].argmax(axis=1))

In [74]:
# Model3
#BILSTM_no_dropout

print('Build Bi-LSTM model...')

input1 = Input(shape=(max_len,), dtype='uint8')
m = Embedding(6, 32, name='embedding')(input1)
m = Masking(mask_value=0)(m)
m = Bidirectional(LSTM(32,return_sequences=True))(m)
#m = Conv1D(32,(2))(m)
#m = Flatten()(m)
#m = GlobalMaxPooling1D()(m)
#m = TimeDistributed(layer=)(m)
output1 = Dense(y1_train.shape[1],activation='softmax', name='output1')(m)
output2 = Dense(y2_train.shape[1],activation='softmax', name='output2')(m)
output3 = Dense(y3_train.shape[1],activation='softmax', name='output3')(m)
output4 = Dense(y4_train.shape[1],activation='softmax', name='output4')(m)
output5 = Dense(y5_train.shape[1],activation='softmax', name='output5')(m)

m = Model(inputs=[input1], outputs=[output1, output2, output3,output4, output5])

#m = multi_gpu_model(m, gpus=4)
optimizer = Adam(lr=0.0001)
m.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['acc']) 

Build Bi-LSTM model...


In [None]:
m.fit(np.array(x_train),
      [y1_train,y2_train,y3_train,y4_train,y5_train],
      batch_size=10,
      validation_data=(np.array(x_valid),y_valid),
      epochs=5)

In [308]:
filepath = ''.join('./bestModel/All_128_batch.hdfs')
Checkpoint = ModelCheckpoint(filepath,#monitor='val_loss',
                             verbose=0, save_best_only=True,
                             save_weights_only=True, mode='auto', period=1)

In [58]:
# Model4
#CNN_no_dropout_genus_only
input1 = Input(shape=(max_len,), dtype='uint8')
m = Embedding(5, 32)(input1)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Flatten()(m)
output5 = Dense(y5_train.shape[1],activation='softmax', name='output5')(m)
m = Model(inputs=[input1], outputs=[output5])
optimizer = Adam(lr=0.00001)
m.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['acc'])
filepath = ''.join('./smallRun/CNN_genus.hdfs')
Checkpoint = ModelCheckpoint(filepath,monitor='val_loss',
                             verbose=0, save_best_only=True,
                             save_weights_only=True, mode='auto', period=1)
csv_logger = CSVLogger('smallRun/CNN_genus.log')
m.fit(np.array(x_train),y5_train,callbacks=[Checkpoint,csv_logger],verbose=0,
      batch_size=10,validation_data=(np.array(x_valid),y5_valid),
      epochs=100)
m.load_weights('./smallRun/CNN_genus.hdfs')
pred = m.predict(np.array(x_test))
accuracy_score(y5_test.argmax(axis=1),pred.argmax(axis=1))

Build CNN_no_dropout model...


0.14356435643564355

In [None]:
# Model5
#CNN_dropout_genus_only
input1 = Input(shape=(max_len,), dtype='uint8')
m = Embedding(5, 32)(input1)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Flatten()(m)
m = Dropout(0.2)(m)
output5 = Dense(y5_train.shape[1],activation='softmax', name='output5')(m)
m = Model(inputs=[input1], outputs=[output5])
optimizer = Adam(lr=0.00001)
m.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['acc'])
filepath = ''.join('./smallRun/CNN_genus.hdfs')
Checkpoint = ModelCheckpoint(filepath,monitor='val_loss',
                             verbose=0, save_best_only=True,
                             save_weights_only=True, mode='auto', period=1)
csv_logger = CSVLogger('smallRun/CNN_drop_genus.log')
m.fit(np.array(x_train),y5_train,callbacks=[Checkpoint,csv_logger],verbose=0,
      batch_size=10,validation_data=(np.array(x_valid),y5_valid),
      epochs=100)

In [65]:
m.load_weights('./smallRun/CNN_genus.hdfs')
pred = m.predict(np.array(x_test))
accuracy_score(y5_test.argmax(axis=1),pred.argmax(axis=1))

0.14356435643564355

In [66]:
# Model6
#Stacked_CNN_genus
input1 = Input(shape=(max_len,), dtype='uint8')
m = Embedding(5, 32)(input1)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Flatten()(m)
output5 = Dense(y5_train.shape[1],activation='softmax', name='output5')(m)
m = Model(inputs=[input1], outputs=[output5])
optimizer = Adam(lr=0.00001)
m.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['acc'])
filepath = ''.join('./smallRun/Stacked_CNN_genus.hdfs')
Checkpoint = ModelCheckpoint(filepath,monitor='val_loss',
                             verbose=0, save_best_only=True,
                             save_weights_only=True, mode='auto', period=1)
csv_logger = CSVLogger('smallRun/Stacked_CNN_genus.log')
m.fit(np.array(x_train),y5_train,callbacks=[Checkpoint,csv_logger],verbose=0,
      batch_size=10,validation_data=(np.array(x_valid),y5_valid),
      epochs=100)
m.load_weights('./smallRun/Stacked_CNN_genus.hdfs')
pred = m.predict(np.array(x_test))
accuracy_score(y5_test.argmax(axis=1),pred.argmax(axis=1))

0.14356435643564355

In [67]:
# Model7
#3_Stacked_CNN_genus

input1 = Input(shape=(max_len,), dtype='uint8')
m = Embedding(5, 32)(input1)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Flatten()(m)
#m = Dense(32)(m)

output5 = Dense(y5_train.shape[1],activation='softmax', name='output5')(m)

m = Model(inputs=[input1], outputs=[output5])

optimizer = Adam(lr=0.00001)
m.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['acc'])
filepath = ''.join('./smallRun/3Stacked_CNN_genus.hdfs')
Checkpoint = ModelCheckpoint(filepath,monitor='val_loss',
                             verbose=0, save_best_only=True,
                             save_weights_only=True, mode='auto', period=1)
csv_logger = CSVLogger('smallRun/3Stacked_CNN_genus.log')
m.fit(np.array(x_train),y5_train,callbacks=[Checkpoint,csv_logger],verbose=0,
      batch_size=10,validation_data=(np.array(x_valid),y5_valid),
      epochs=100)
m.load_weights('./smallRun/3Stacked_CNN_genus.hdfs')
pred = m.predict(np.array(x_test))
accuracy_score(y5_test.argmax(axis=1),pred.argmax(axis=1))

0.14356435643564355

In [68]:
# Model8
#Stacked_drop_CNN_genus

input1 = Input(shape=(max_len,), dtype='uint8')
m = Embedding(5, 32)(input1)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Dropout(0.2)(m)
m = Conv1D(16,(3))(m)
m = AveragePooling1D(8,2)(m)
m = Dropout(0.2)(m)
m = Flatten()(m)
output5 = Dense(y5_train.shape[1],activation='softmax', name='output5')(m)

m = Model(inputs=[input1], outputs=[output5])

optimizer = Adam(lr=0.00001)
m.compile(loss='categorical_crossentropy',optimizer=optimizer,metrics=['acc'])
filepath = ''.join('./smallRun/Stacked_drop_CNN_genus.hdfs')
Checkpoint = ModelCheckpoint(filepath,monitor='val_loss',
                             verbose=0, save_best_only=True,
                             save_weights_only=True, mode='auto', period=1)
csv_logger = CSVLogger('smallRun/Stacked_drop_CNN_genus.log')
m.fit(np.array(x_train),y5_train,callbacks=[Checkpoint,csv_logger],verbose=0,
      batch_size=10,validation_data=(np.array(x_valid),y5_valid),
      epochs=100)
m.load_weights('./smallRun/Stacked_drop_CNN_genus.hdfs')
pred = m.predict(np.array(x_test))
accuracy_score(y5_test.argmax(axis=1),pred.argmax(axis=1))

0.14356435643564355