In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import load_model

# load and prepare sequence/expression data

In [2]:
# Load the source table from DF_prest.csv, using the first CSV column as the index.
data = pd.read_csv(
    '../../dataframes/DF_prest.csv',
    index_col=0,
)

# setup 'docs' for use with Tokenizer
def nt_seq_doc(nt_sequence, tag='GACAAGCTTGCGGCCGCA'):
    """Turn a tagged nucleotide sequence into a space-separated codon 'document'.

    Everything after the first occurrence of ``tag`` is cut into consecutive
    3-letter chunks joined by single spaces, so that a whitespace tokenizer
    sees one token per chunk.

    Parameters
    ----------
    nt_sequence : str
        Full nucleotide string, expected to contain ``tag``.
    tag : str, optional
        Marker that precedes the region of interest. Defaults to the marker
        that was originally hard-coded in this function.

    Returns
    -------
    str or None
        The codon document, or ``None`` when the input is not a string
        (e.g. a NaN standing in for a missing value), when ``tag`` is absent,
        or when the remaining length is not a multiple of 3.
    """
    # A non-string value (NaN, None) would make the `in`/split logic raise
    # TypeError; treat it like any other unusable sequence.
    if not isinstance(nt_sequence, str):
        return None
    # partition() splits on the FIRST tag only, so a sequence that contains
    # the tag again further downstream is kept whole instead of being cut off
    # at the second occurrence.
    _, found, true_nt = nt_sequence.partition(tag)
    if not found:
        return None
    if len(true_nt) % 3 != 0:
        return None
    return ' '.join(true_nt[i:i + 3] for i in range(0, len(true_nt), 3))
# split quantiles
def assign_class(conc, low=None, high=None):
    """Label a concentration as low (0) or high (1) relative to two cut-offs.

    Parameters
    ----------
    conc : float
        Concentration value to classify.
    low, high : float, optional
        Lower and upper cut-offs. When omitted, the module-level ``low_cut``
        and ``high_cut`` are read at call time, so ``.apply(assign_class)``
        keeps working unchanged.

    Returns
    -------
    int or None
        0 if ``conc <= low``, 1 if ``conc >= high``, otherwise ``None``.
        A NaN compares false in both tests and therefore also yields ``None``.
    """
    # Fall back to the notebook-level thresholds only when none were passed in.
    if low is None:
        low = low_cut
    if high is None:
        high = high_cut
    if conc <= low:
        return 0
    elif conc >= high:
        return 1
    return None

# Build the codon documents and drop the rows for which nt_seq_doc
# returned None.
data['nt_seq_doc'] = data['nt_seq'].apply(nt_seq_doc)
# .copy() makes the filtered frame independent, so the 'class' assignment
# below does not write into a slice of the original frame
# (which pandas reports as SettingWithCopyWarning).
data = data[pd.notnull(data['nt_seq_doc'])].copy()

# identify high and low classes by conc_cf quantiles
low_cut = data['conc_cf'].quantile(0.25)
high_cut = data['conc_cf'].quantile(0.75)

# assign_class yields 0 at or below low_cut, 1 at or above high_cut and
# None in between; the unlabelled rows are dropped.
data['class'] = data['conc_cf'].apply(assign_class)
data = data[pd.notnull(data['class'])].copy()
# check shape
print('data shape: ', data.shape)

data shape:  (22364, 8)


In [3]:
# Codon documents, one string per retained row
docs = data['nt_seq_doc'].tolist()

# Fit a fresh tokenizer so every distinct token in the documents gets an
# integer index
t = Tokenizer()
t.fit_on_texts(docs)

# Integer-encode each document, bring every sequence to length 200, then
# expand each index into a 65-wide one-hot vector
# (presumably 64 codons plus index 0 used for padding -- TODO confirm)
X = sequence.pad_sequences(t.texts_to_sequences(docs), maxlen=200)
X = np.array([to_categorical(row, num_classes=65) for row in X])
y = data['class'].values

# Hold out 30% of the samples for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Sanity check: dimensions of the one-hot training tensor
X_train.shape

(15654, 200, 65)

# build and train autoencoder

In [20]:
from keras.layers import Input, LSTM, RepeatVector
from keras.models import Model

# Sizes shared by encoder and decoder. latent_dim/input_dim used to be
# declared but never used (latent_dim said 4 while the layers hard-coded 10);
# they now drive the layer shapes and match the architecture that is built.
latent_dim = 10   # width of the encoded vector
input_dim = 65    # one-hot width of each timestep
timesteps = 200   # padded sequence length

# ENCODER: (timesteps, input_dim) sequence -> latent_dim vector
inputs = Input(shape=(timesteps, input_dim))
# a few many to many layers:
outputs = LSTM(100, return_sequences=True)(inputs)
outputs = LSTM(50, return_sequences=True)(outputs)
# many to one layer:
outputs = LSTM(latent_dim)(outputs)

encoder = Model(inputs, outputs)

# DECODER: latent_dim vector -> (timesteps, input_dim) sequence
inputs = Input((latent_dim,))
# repeat to make one to many:
outputs = RepeatVector(timesteps)(inputs)
# a few many to many layers:
outputs = LSTM(50, return_sequences=True)(outputs)

# last layer: the LSTM default activation is tanh, whose outputs lie in
# (-1, 1). The reconstruction is compared against one-hot targets with a
# cross-entropy loss that expects values in [0, 1], so use sigmoid here.
# NOTE(review): a TimeDistributed(Dense(input_dim, activation='softmax'))
# head would model "exactly one class per timestep" more directly.
outputs = LSTM(input_dim, activation='sigmoid', return_sequences=True)(outputs)

decoder = Model(inputs, outputs)

# AUTOENCODER: encoder and decoder chained end to end
inputs = Input((timesteps, input_dim))
outputs = encoder(inputs)
outputs = decoder(outputs)

autoencoder = Model(inputs, outputs)

In [21]:
# Configure training: Adam optimizer, binary cross-entropy between the
# input tensor and its reconstruction.
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_24 (InputLayer)        (None, 200, 65)           0         
_________________________________________________________________
model_15 (Model)             (None, 10)                99040     
_________________________________________________________________
model_16 (Model)             (None, 200, 65)           42360     
Total params: 141,400
Trainable params: 141,400
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
# Train the autoencoder to reproduce its own input: the one-hot training
# tensor serves as both the features and the target.
autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=100,
    epochs=3,
    shuffle=True,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1bb83c5c0>

In [None]:
# Write the autoencoder model to 'autoencoder.h5' (path is relative to the
# working directory).
autoencoder.save('autoencoder.h5')

# load and play with autoencoder

In [5]:
# Rebind `autoencoder` to the model stored in 'autoencoder.h5'.
autoencoder = load_model('autoencoder.h5')

In [6]:
# NOTE(review): this runs the full autoencoder (encoder followed by decoder),
# so despite its name X_encoded holds the reconstructed sequences -- its shape
# equals the input shape -- not the latent codes. If the encoding is what is
# wanted, predict with the encoder sub-model instead. TODO confirm intent.
X_encoded = autoencoder.predict(X_train)

In [8]:
# Shape check on the prediction result
X_encoded.shape

(15654, 200, 65)