# Vanilla LSTM for Gene/No gene classification
The Milestone 1 corresponds to the classication task of, given a sequence, predict if it contains a gene, a partial sequence of a gene or just intergenic code.

In [0]:
!pip3 install pyfastx

Collecting pyfastx
[?25l  Downloading https://files.pythonhosted.org/packages/f5/15/5e891f5cf52383fe5dc13c83a4642347472e0ab4f5a09b7e4fc847f7f599/pyfastx-0.5.9-cp36-cp36m-manylinux2010_x86_64.whl (764kB)
[K     |████████████████████████████████| 768kB 3.5MB/s 
[?25hInstalling collected packages: pyfastx
Successfully installed pyfastx-0.5.9


In [0]:
import numpy as np
import pickle
import torch
import torch.nn as nn
import pandas as pd

from tqdm import tqdm # progress bar
from preproc_pipeline import pre_process
from warnings import simplefilter

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## 1. Dataset for training
The genome of E. coli will be used for this purpose.

In [0]:
simplefilter("ignore")
genome = "GCF_000008865.2_ASM886v2_genomic.fna"
feature_table = "GCA_000008865.2_ASM886v2_feature_table.tsv"
df = pre_process(genome, feature_table)

In [0]:
print(df.sequence.apply(lambda x: len(x)).max())
len(df)

1999


12328

In [0]:
print(
    f"columns -> {list(df.columns)}\n"
    f"labels in dataframe -> {list(df.label.unique())}"
)

columns -> ['sequence', 'label', 'seq_length', 'padded_sequences']
labels in dataframe -> ['gene', 'intergenic', 'partial']


Let's get a one hot mapping for the labels.

In [0]:
labels = list(df.label.unique())
lab0 = np.zeros(len(labels))
lab2vec = {}
vec2lab = {}
for i, label in enumerate(list(df.label.unique())):
    labv = lab0.copy()
    labv[i] = 1
    lab2vec[label] = labv
    vec2lab[tuple(labv)] = label

print(f"lab2vec -> {lab2vec}\nvec2lab -> {vec2lab}")

lab2vec -> {'gene': array([1., 0., 0.]), 'intergenic': array([0., 1., 0.]), 'partial': array([0., 0., 1.])}
vec2lab -> {(1.0, 0.0, 0.0): 'gene', (0.0, 1.0, 0.0): 'intergenic', (0.0, 0.0, 1.0): 'partial'}


In [0]:
print(df[df.sequence.apply(lambda x: len(x)==0)].count())
df = df[~df.sequence.apply(lambda x: len(x)==0)]

sequence            1
label               1
seq_length          1
padded_sequences    1
dtype: int64


Need to check why it always generate a 0 length row. I think is the last one, but I am not sure.

In [0]:
df["label_onehot"] = df.label.apply(lambda x: lab2vec[x])
# toy = pd.concat([df[df.label=="gene"].sample(frac=1/30),
#                 df[df.label=="intergenic"].sample(frac=1/30),
#                 df[df.label=="partial"].sample(frac=1/30)]).reset_index(drop=True)
df_train = df.sample(frac=8/10) # shuffle
df_test = df[~df.index.isin(df_train.index)].dropna().reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_train

Unnamed: 0,sequence,label,seq_length,padded_sequences,label_onehot
0,TGGCTGGCTAAATATTGGTGGATTCTGGTGATTGTCTTTTTGGTAG...,partial,117,TGGCTGGCTAAATATTGGTGGATTCTGGTGATTGTCTTTTTGGTAG...,"[0.0, 0.0, 1.0]"
1,TAGTTTATCCAGCTCAGGGTTACTGACCGCCAGATTGCTCCCGCCA...,partial,424,TAGTTTATCCAGCTCAGGGTTACTGACCGCCAGATTGCTCCCGCCA...,"[0.0, 0.0, 1.0]"
2,GGACAATGAATTACAGCCCGCAGTTTAAACATCTTCGCGCGCACAG...,intergenic,54,GGACAATGAATTACAGCCCGCAGTTTAAACATCTTCGCGCGCACAG...,"[0.0, 1.0, 0.0]"
3,GTCACGCGCATCAGCCTGAAGGAGAGAACACGATGGCTATTCCAAA...,partial,502,GTCACGCGCATCAGCCTGAAGGAGAGAACACGATGGCTATTCCAAA...,"[0.0, 0.0, 1.0]"
4,GGTCTGAAATTCAAAACCCGACTCGGTATTGGTCGCCCAGGCCATC...,partial,155,GGTCTGAAATTCAAAACCCGACTCGGTATTGGTCGCCCAGGCCATC...,"[0.0, 0.0, 1.0]"
...,...,...,...,...,...
9857,GATGTATAGTCTCATCCTGCGGCAGAACAAGACGGATAAAAAATCC...,gene,1359,GATGTATAGTCTCATCCTGCGGCAGAACAAGACGGATAAAAAATCC...,"[1.0, 0.0, 0.0]"
9858,GGCACAACTGGCAGACGCCATCGCCAGAGCGGTATACGCTTGCGAT...,partial,425,GGCACAACTGGCAGACGCCATCGCCAGAGCGGTATACGCTTGCGAT...,"[0.0, 0.0, 1.0]"
9859,GTAACAATATCCTACGCACTTTTTTAACAAAAACTGAGACTAGTAC...,gene,1415,GTAACAATATCCTACGCACTTTTTTAACAAAAACTGAGACTAGTAC...,"[1.0, 0.0, 0.0]"
9860,ACGTTCAGATAATGTCTAATATTTGGTCTAAAGAAGAAACTCTGTG...,gene,1213,ACGTTCAGATAATGTCTAATATTTGGTCTAAAGAAGAAACTCTGTG...,"[1.0, 0.0, 0.0]"


## 2. Embeddings
The next step is to use the whole sequence to compute the embeddings. First, get a set of k-mers, that will be our words for this NLP problem.

In [0]:
def window(fseq, window_size, slide = 1):
    # create a window of size k
    N = len(fseq)
    for i in range(0, N - window_size + 1, slide):
      if i+window_size+slide < N:
        yield fseq[i:i+window_size]


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

def get_index_of_max(input):
    index = 0
    for i in range(1, len(input)):
        if input[i] > input[index]:
            index = i 
    return index

def get_max_prob_result(input, ix_to_word):
    return ix_to_word[get_index_of_max(input)]

class CBOW(torch.nn.Module):

    def __init__(self, vocab_size, embedding_dim, padding_idx):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, 
                                       padding_idx=padding_idx) #used predefined nn.Embedding
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
        #out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = torch.LongTensor([word_to_ix[word]])
        return self.embeddings(word).view(1,-1)
    
k = 3                     # so our words has length 3
SLIDE = 1                 # sampling slide
CONTEXT_SIZE = 2          # 2 words to the left, 2 to the right
EMDEDDING_DIM = 25        # embedding dimension
EPOCHS = 10               # number of epochs for training
model = None              # CBOW model

Finally, gather all the kmers and apply the CBOW algorithm.

In [0]:
# Since we have the embeddings stored, we are going to ignore the following next
# two cells and use this one
stored = True

if stored:
  with open("wti.p", "rb") as f:
    word_to_ix = pickle.load(f)
  ix_to_word = {v: k for k,v in word_to_ix.items()}
  model_save_name = 'ma_model.pt'
  path = model_save_name
  #path = F"/content/drive/My Drive/{model_save_name}" 
  model = CBOW(len(word_to_ix), EMDEDDING_DIM, padding_idx=word_to_ix["X"]).cpu()
  model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

In [0]:
# kmer in all of the 1/4 sequences of the dataset
kmers = [ kmer for kmer in window("".join([seq for seq in df_train.loc[:,"sequence"]]), k, SLIDE) ]  
vocab_size = len(kmers)

data = []
print("Filling context data...")
pbar = tqdm(total=vocab_size - 4-1) # just to output something in screen
for i in range(2, vocab_size - 2): # first word to have 2 words before is the "third" one (0,1,2)
    context = (kmers[i - 2], kmers[i - 1],
               kmers[i + 1], kmers[i + 2])
    target = kmers[i]
    data.append((context, target))
    pbar.update(1)

word_to_ix, ix_to_word = {},{}
for i, word in enumerate(set(kmers)):
    word_to_ix[word] = i
    ix_to_word[i] = word

ix = len(ix_to_word)
ix_to_word[ix] = "X"  # our padding character
word_to_ix["X"] = ix

with open("/content/drive/My Drive/wti.p", "wb") as f:
  pickle.dump(word_to_ix, f)

with open("/content/drive/My Drive/itw.p", "wb") as f:
  pickle.dump(ix_to_word, f)

  0%|          | 0/71818 [00:00<?, ?it/s]

Filling context data...


 37%|███▋      | 26782/71818 [00:00<00:00, 267819.62it/s]

In [0]:
import sys
from time import time 
model = CBOW(len(word_to_ix), EMDEDDING_DIM, padding_idx=word_to_ix["X"])

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
print("Training embeddings...")

model_save_name = 'ma_model.pt'
path = F"/content/drive/My Drive/{model_save_name}" 

start = time()
tot = len(data)
print()
for epoch in range(EPOCHS):
  start = time()
  total_loss = 0
  i = 1
  for context, target in data:
    if not i%100:
      sys.stdout.write(f"\r{i}/{tot} in {round(time()-start,2)}s        ")
      sys.stdout.flush()
    context_vector = make_context_vector(context, word_to_ix)  
    model.zero_grad()
    log_probs = model(context_vector)
    loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
    loss.backward()
    optimizer.step()
    total_loss += loss.data
    i+=1
  torch.save(model.state_dict(), path)
  print(f"\nEpoch {epoch+1}/{EPOCHS} in {time()-start}\n\n")

Training embeddings...

71800/71819 in 51.19s        
Epoch 1/10 in 51.209683656692505


71800/71819 in 50.83s        
Epoch 2/10 in 50.84529900550842


71800/71819 in 49.03s        
Epoch 3/10 in 49.05419707298279


71800/71819 in 48.94s        
Epoch 4/10 in 48.96987223625183


71800/71819 in 48.67s        
Epoch 5/10 in 48.692935943603516


71800/71819 in 49.58s        
Epoch 6/10 in 49.599257946014404


71800/71819 in 50.32s        
Epoch 7/10 in 50.34233236312866


71800/71819 in 51.9s        
Epoch 8/10 in 51.92097878456116


71800/71819 in 50.9s        
Epoch 9/10 in 50.92261862754822


71800/71819 in 50.0s        
Epoch 10/10 in 50.01446485519409




## 4. Vanilla RNN model

In [0]:
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, hidden_out, output_dim,
                 padding_idx, max_seq_length):

        super().__init__()
        self.nb_tags = output_dim

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=padding_idx)
        
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        
        self.max_seq_length = max_seq_length
        self.lhid = nn.Linear(hidden_dim*max_seq_length, hidden_out)

        self.fc = nn.Linear(hidden_out, output_dim)


    def forward(self, text, X_lengths):

        self.hidden = self.init_hidden()
        print(f"hidden -> {self.hidden.size()}") #1,50,34,1
        # self.hidden = self.hidden.permute(0,3,1,2)
        #text = [sent len, batch size]
        # 1. embedding
        embedded = self.embedding(text)
        # 2. pack so padded won't be shown
        # print('Embedded: ')
        # print(embedded.size())

        embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, X_lengths, batch_first=True)

        # print('padded_sequence:')
        # print(embedded.data.size())

        #embedded = [sent len, batch size, emb dim]
        output, (hidden, lt) = self.rnn(embedded, self.hidden)
        # print('output rnn:')
        # print(output.data.size())


        #print(output.size())
        # 3. get that so it's correctly packed for the hidden layer
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True, total_length=self.max_seq_length)
        #print('Padding off')
        #print(output.size())
        # print('packed:')
        # print(output.size())

        output = output.contiguous()
        output = output.permute(2,1,0)
        output = output.reshape(-1, output.shape[2])
        # print('view:')
        # print(output.size())
        #print(output.size())
        #output = output.reshape(output.size(0),output.size(2)*100)
        #print(output.size())
        #output = output.permute(1,0)
        #print('contigous')
        #print(output.size())
        output = output.permute(1,0)
        output = nn.functional.relu(self.lhid(output))
        #print('after linear layer')
        #print(output.size())

        # 4. classification
        output = self.fc(output)
        #print(output.size())
        output = nn.functional.log_softmax(output, dim=1)
        #print(output.size())
        # output = output.view(text.size()[0], -1)
        return output

    def init_hidden(self):
        # the weights are of the form (nb_layers, batch_size, nb_lstm_units)
        # hidden_a = torch.randn(1, 50, 34)
        hidden_a = torch.zeros((50, 34),
                         dtype=torch.double)
        hidden_a = Variable(hidden_a)
        #hidden_a = Variable(hidden_a)
        # print(f"hidden_a -> {hidden_a.size()}") #1,50,34
        #hidden_a = torch.unsqueeze(hidden_a, 3)
        print(f"hidden_a_uns -> {hidden_a.size()}") #1,50,34,1
        # hidden_a
        return hidden_a



## 4. Tweak the embeddings to accomodate varying sizes of the sequences
Once we have the model and the embeddings, we would need to tweak the embeddings so that they are adjusted for the padded sequences.

### 4.1. Add the padding char to the embeddings

Now, get sequences as indexes.

In [0]:
def to_ix(kmer, word_to_ix):
  if "X" in kmer:
    return word_to_ix["X"]
  elif kmer in word_to_ix:
    return word_to_ix[kmer]
  else:
    return len(word_to_ix)

def wind_idx(seq):
  return [to_ix(kmer, word_to_ix)  for kmer in window(seq, k, SLIDE)]

df_train["seq_idx"] = df_train["padded_sequences"].apply(wind_idx)
df_test["seq_idx"] = df_test["padded_sequences"].apply(wind_idx)

In [0]:
np.unique(df_train.seq_idx.apply(lambda x: len(x)))

array([1995])

Finally, instantiante the model and initialize the weigths of the embeddings.

In [0]:
print(word_to_ix["X"])

64


In [0]:
X_lengths = df_train.seq_length
max_seq_length = df_train.seq_idx.apply(lambda x: len(x)).max() # Find maximum sequence length

rnn = RNN(input_dim=len(word_to_ix), embedding_dim=EMDEDDING_DIM,
          hidden_dim = 34, hidden_out = 90, output_dim=3, 
          padding_idx = word_to_ix["X"], max_seq_length = max_seq_length)

In [0]:
rnn.embedding.weight.data.copy_(model.embeddings.weight)

tensor([[ 1.0206,  0.0405,  0.5029,  ..., -0.6731, -0.3842,  0.0184],
        [-0.7707, -0.6785,  0.0301,  ..., -1.0804, -0.8689,  1.6514],
        [-0.9782,  1.6782, -0.1411,  ...,  1.4668,  0.4697,  1.3576],
        ...,
        [ 0.4343,  1.8272,  1.5139,  ..., -0.0838, -0.5038,  0.9401],
        [ 0.0956, -1.3569, -0.2694,  ..., -1.5897,  0.7561, -0.0719],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       grad_fn=<CopyBackwards>)

## 5. Training loop


All that is left is to split our training and testing and train the model.

In [0]:
from torch.utils.data import DataLoader, Dataset

class oversampdata(Dataset):
  def __init__(self, data):
    # first column is list of index sentence
    self.data = torch.LongTensor(data.iloc[:,0])
    # second column is the label
    self.targets = torch.LongTensor(data.iloc[:,1])
    # third column in lengths (to pack sequences)
    self.lengths = torch.LongTensor(data.iloc[:,2])
  def __len__(self):
    return len(self.data)
  def __getitem__(self, index):
    data_val = self.data[index]
    target = self.targets[index]
    length = self.lengths[index]
    return data_val, target, length

train_dataset = oversampdata(df_train.loc[:,["seq_idx", "label_onehot", "seq_length"]])
valid_dataset = oversampdata(df_test.loc[:,["seq_idx", "label_onehot", "seq_length"]])

In [0]:
BATCH_SIZE = 50

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, 
                                          shuffle=True)
testloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, 
                                         shuffle=False)

In [0]:
def binary_accuracy(preds, y):
   """
   Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
   """
   #round predictions to the closest integer
   rounded_preds = torch.round(torch.sigmoid(preds))
   correct = (rounded_preds == y).float() #convert into float for division
   acc = correct.sum() / len(correct)
   return acc

In [0]:
import sys

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    print("Training...")

    for i, batch in enumerate(iterator):
        if i%50:
          sys.stdout.write(f"\rIteration {i}        ")
          sys.stdout.flush()

        inputs, labels, lengths = batch

        # lengths must be sorted
        indices = list(np.array(lengths.sort().indices)[::-1])
        all_sorted = [inputs[indices], labels[indices], lengths[indices]]
        inputs, labels_onehot, lengths = all_sorted
         
        optimizer.zero_grad()
                
        predictions = model(inputs, lengths)
        
        labels_idx = torch.LongTensor([np.where(label==1)[0][0] for label in labels_onehot])


        loss = criterion(predictions, labels_idx)
        
        acc = binary_accuracy(predictions, labels_onehot)
        
        loss.backward()
        
        optimizer.step()
        
        print(loss.item())
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    print()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)



def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    print("Evaluating...")
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            inputs, labels, lengths = batch
            # lengths must be sorted
            indices = list(np.array(lengths.sort().indices)[::-1])
            all_sorted = [inputs[indices], labels[indices], lengths[indices]]
            inputs, labels_onehot, lengths = all_sorted

            predictions = model(inputs, lengths)
            labels_idx = torch.LongTensor([np.where(label==1)[0][0] for label in labels_onehot])

            loss = criterion(predictions, labels_idx)
            acc = binary_accuracy(predictions, labels_onehot)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
#criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()

In [0]:
len(train_dataset)

9862

In [0]:
%%time
N_EPOCHS = 5
model_save_name="ma_rnn.pt"
path = F"/content/drive/My Drive/{model_save_name}"

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    print("Epoch: " + str(epoch))
    train_loss, train_acc = train(rnn, trainloader, optimizer, criterion)
    valid_loss, valid_acc = evaluate(rnn, testloader, criterion)

    #train_loss  = train(rnn, trainloader, optimizer, criterion)
    #valid_loss  = evaluate(rnn, testloader, criterion)    
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(rnn.state_dict(), path)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
    #print(f'\tTrain Loss: {train_loss:.3f}')
    #print(f'\t Val. Loss: {valid_loss:.3f}')

Epoch: 0
Training...
hidden_a_uns -> torch.Size([50, 34])
hidden -> torch.Size([50, 34])


RuntimeError: ignored

In [0]:
train_dataset[0]