# Vanilla LSTM for Gene/No gene classification
The Milestone 1 corresponds to the classication task of, given a sequence, predict if it contains a gene, a partial sequence of a gene or just intergenic code.

In [1]:
!pip3 install pyfastx

Collecting pyfastx
[?25l  Downloading https://files.pythonhosted.org/packages/f5/15/5e891f5cf52383fe5dc13c83a4642347472e0ab4f5a09b7e4fc847f7f599/pyfastx-0.5.9-cp36-cp36m-manylinux2010_x86_64.whl (764kB)
[K     |▍                               | 10kB 17.4MB/s eta 0:00:01[K     |▉                               | 20kB 2.2MB/s eta 0:00:01[K     |█▎                              | 30kB 3.3MB/s eta 0:00:01[K     |█▊                              | 40kB 2.1MB/s eta 0:00:01[K     |██▏                             | 51kB 2.6MB/s eta 0:00:01[K     |██▋                             | 61kB 3.1MB/s eta 0:00:01[K     |███                             | 71kB 3.6MB/s eta 0:00:01[K     |███▍                            | 81kB 4.1MB/s eta 0:00:01[K     |███▉                            | 92kB 4.6MB/s eta 0:00:01[K     |████▎                           | 102kB 3.5MB/s eta 0:00:01[K     |████▊                           | 112kB 3.5MB/s eta 0:00:01[K     |█████▏                          | 1

In [0]:
import numpy as np
import pickle
import torch
import torch.nn as nn
import pandas as pd

from tqdm import tqdm # progress bar
from preproc_pipeline import window_pipeline
from warnings import simplefilter

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## 1. Dataset for training
The genome of E. coli will be used for this purpose.

In [0]:
simplefilter("ignore")
genome = "GCF_000008865.2_ASM886v2_genomic.fna"
feature_table = "GCA_000008865.2_ASM886v2_feature_table.tsv"
df = window_pipeline(genome, feature_table)

In [459]:
print(df.sequence.apply(lambda x: len(x)).max())
len(df)

50


36494

In [460]:
print(
    f"columns -> {list(df.columns)}\n"
    f"labels in dataframe -> {list(df.label.unique())}"
)

columns -> ['sequence', 'label']
labels in dataframe -> ['gene', 'intergenic', 'partial']


Let's get a one hot mapping for the labels.

In [461]:
labels = list(df.label.unique())
lab0 = np.zeros(len(labels))
lab2vec = {}
vec2lab = {}
for i, label in enumerate(list(df.label.unique())):
    labv = lab0.copy()
    labv[i] = 1
    lab2vec[label] = labv
    vec2lab[tuple(labv)] = label

print(f"lab2vec -> {lab2vec}\nvec2lab -> {vec2lab}")

lab2vec -> {'gene': array([1., 0., 0.]), 'intergenic': array([0., 1., 0.]), 'partial': array([0., 0., 1.])}
vec2lab -> {(1.0, 0.0, 0.0): 'gene', (0.0, 1.0, 0.0): 'intergenic', (0.0, 0.0, 1.0): 'partial'}


In [462]:
print(df[df.sequence.apply(lambda x: len(x)==0)].count())
df = df[~df.sequence.apply(lambda x: len(x)==0)]

sequence    0
label       0
dtype: int64


Need to check why it always generate a 0 length row. I think is the last one, but I am not sure.

In [463]:
df["label_onehot"] = df.label.apply(lambda x: lab2vec[x])
toy = pd.concat([df[df.label=="gene"].sample(n=5000),
                df[df.label=="intergenic"].sample(n=5000)])
                # df[df.label=="partial"].sample(n=100)]).reset_index(drop=True)
toy["label_onehot"] = toy.label_onehot.apply(lambda x: x[0:2])
df_train = toy.sample(frac=8/10) # shuffle
#df_test = df[~df.index.isin(df_train.index)].dropna().reset_index(drop=True)
df_test = toy[~toy.sequence.isin(df_train.sequence)].reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_train

Unnamed: 0,sequence,label,label_onehot
0,CACAGTTTAAATTTCCCCACGCGCATAGCCTTAATATCAGCACGCA...,intergenic,"[0.0, 1.0]"
1,TAACGGCTGACGGCGCGTTGCCATTCGCTAATCCATGCCTGACGTT...,gene,"[1.0, 0.0]"
2,ATTTTCACTAACATGACTGATTAACTGCTCTTTTGTAAAAGTGGTC...,intergenic,"[0.0, 1.0]"
3,TGTTGAATGATTGTCAAATTAGATGATTAAAAATTAAATAAATGTT...,intergenic,"[0.0, 1.0]"
4,AAGAAAGCAGGGGAGAAGCAGGCGGCAGTACGCCTTTGGTTTATCC...,intergenic,"[0.0, 1.0]"
...,...,...,...
7995,ATCCGGTATCAGGAAGAGTGCGACGATGTTCTTTACGAGTACTGGA...,gene,"[1.0, 0.0]"
7996,AATGATTTGCTTGCTGTTTATTAAAGATATTTAGCTCACCAGTCTG...,gene,"[1.0, 0.0]"
7997,AAACGTCCGGTGAGTTCCGATGACGTCGAAATGGCAATCAATCATA...,gene,"[1.0, 0.0]"
7998,CCGGCAGCAGTACCTGGACCGGGGTATCATCGGCATGGAGTTTGCC...,gene,"[1.0, 0.0]"


In [464]:
df_test

Unnamed: 0,sequence,label,label_onehot
0,GTATTTCAAGATTAAAGAAGACCGGCGTAAGGCGGCACGGGGAGAG...,gene,"[1.0, 0.0]"
1,GGTGGGGAGTATTAACCGCGTTATCCGTCCGCGTTTGAAGTTGCAT...,gene,"[1.0, 0.0]"
2,AAATATGCTGATAAAATTGCTCGCGGCTATTTTGATTGTTATCAGT...,gene,"[1.0, 0.0]"
3,TGGGATCTGAAATCAGTAAAAAAGATATCACCCGTCTGGGCTTTCG...,gene,"[1.0, 0.0]"
4,TGGCAGATAATCCAAACCCTTCATCGCCCCTGCCGGACGTGTTTTC...,gene,"[1.0, 0.0]"
...,...,...,...
1953,CCTATGCATACGCCACCTTCGGGTGGCGTTGTTTTTTGCGAGACGA...,intergenic,"[0.0, 1.0]"
1954,GTCATTTCTCCTTCTAAGAAGCGAGTAAGTACCTGCAAATCCGAAG...,intergenic,"[0.0, 1.0]"
1955,TCGTTTCCAGTTTATAGGGGTGTCATTTTCACAGGGTGACATAGCA...,intergenic,"[0.0, 1.0]"
1956,GTACGGCAGCCAATACAGTCAGCCGGATTAACGATAATTAACGATT...,intergenic,"[0.0, 1.0]"


## 2. Embeddings
The next step is to use the whole sequence to compute the embeddings. First, get a set of k-mers, that will be our words for this NLP problem.

Finally, gather all the kmers and apply the CBOW algorithm.

In [0]:
# Since we have the embeddings stored, we are going to ignore the following next
# two cells and use this one
stored = False

if stored:
  with open("/content/drive/My Drive/wti.p", "rb") as f:
    word_to_ix = pickle.load(f)
  ix_to_word = {v: k for k,v in word_to_ix.items()}
  model_save_name = 'ma_model.pt'
  path = model_save_name
  path = F"/content/drive/My Drive/{model_save_name}" 
  model = CBOW(len(word_to_ix), EMDEDDING_DIM, padding_idx=word_to_ix["X"]).cpu()
  model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

## 4. Vanilla RNN model

In [0]:
from torch.autograd import Variable

class RNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, hidden_out, output_dim, t):

        super().__init__()
        self.nb_tags = output_dim

        #, in_channels, out_channels, kernel_size, stride
        in_channels = 1
        out_channels = 1
        kernel_size = 8
        stride = 2

        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding = 0, bias = False)
        #self.maxpool = nn.maxpool()
        
        #out_seq_len = seq_len - kernel_size + 1

        self.rnn = nn.LSTM(input_dim, hidden_dim, bidirectional=False, batch_first=True)
        
        self.lhid = nn.Linear(22*hidden_dim, hidden_out)

        self.fc = nn.Linear(hidden_out, output_dim)

        self.drop = nn.Dropout(0.25)


    def forward(self, text):
        
        text = text.permute(0,2,1)
        output = self.conv(text)
        #print(output.data.size())


        output = output.permute(0,2,1)  
        #print(output.data.size())    
        #text = [sent len, batch size]
        # 1. LSTM
        output, hidden = self.rnn(output)
        #print('rnn')
        #print(output.data.size())


        # 2. get that so it's correctly packed for the hidden layer
        output = output.contiguous()
        output = output.reshape(output.shape[0], -1)
        #print('reshape')
        #print(output.size())

        output = self.drop(nn.functional.relu(self.lhid(output)))
        #print('ff')
        #print(output.size())
        #print('after linear layer')
        #print(output.size())

        # 3. classification
        output = self.fc(output)
        #print('fff')
        #print(output.size())
        #print(output.size())
        #output = nn.functional.log_softmax(output, dim=1)
        #print(output.size())
        # output = output.view(text.size()[0], -1)
        return output

## 4. Tweak the embeddings to accomodate varying sizes of the sequences
Once we have the model and the embeddings, we would need to tweak the embeddings so that they are adjusted for the padded sequences.

### 4.1. Add the padding char to the embeddings

Now, get sequences as indexes.

The following code is to try and entirely different way with enco

In [0]:
def code_one_hot(seq,vocab):
  encoding = np.array([vocab[ch] for ch in seq], dtype="int64")

  encoding.reshape(encoding.shape[0], encoding.shape[1], 1)
  return encoding

nuc_to_ix = {"A": [1,0,0,0], "G": [0,1,0,0], "T": [0,0,1,0], "C" : [0,0,0,1]}
nuc_to_class = {"A": [0], "G": [1], "T": [2], "C" : [3]}

nuc_to_ix = {"A": [1,0,0,0], "G": [0,1,0,0], "T": [0,0,1,0], "C" : [0,0,0,1]}
nuc_to_class = {k: np.array(v, "int64") for k,v in nuc_to_class.items()}

In [468]:
df_train["onehot"] = df_train.sequence.apply(lambda x: code_one_hot(x, nuc_to_ix))
df_test["onehot"] = df_test.sequence.apply(lambda x: code_one_hot(x, nuc_to_ix))

df_train["c_class"] = df_train.sequence.apply(lambda x: code_one_hot(x, nuc_to_class))
df_test["c_class"] = df_test.sequence.apply(lambda x: code_one_hot(x, nuc_to_class))

df_train

Unnamed: 0,sequence,label,label_onehot,onehot,c_class
0,CACAGTTTAAATTTCCCCACGCGCATAGCCTTAATATCAGCACGCA...,intergenic,"[0.0, 1.0]","[[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 0, 1], [1,...","[[3], [0], [3], [0], [1], [2], [2], [2], [0], ..."
1,TAACGGCTGACGGCGCGTTGCCATTCGCTAATCCATGCCTGACGTT...,gene,"[1.0, 0.0]","[[0, 0, 1, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0,...","[[2], [0], [0], [3], [1], [1], [3], [2], [1], ..."
2,ATTTTCACTAACATGACTGATTAACTGCTCTTTTGTAAAAGTGGTC...,intergenic,"[0.0, 1.0]","[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0,...","[[0], [2], [2], [2], [2], [3], [0], [3], [2], ..."
3,TGTTGAATGATTGTCAAATTAGATGATTAAAAATTAAATAAATGTT...,intergenic,"[0.0, 1.0]","[[0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0,...","[[2], [1], [2], [2], [1], [0], [0], [2], [1], ..."
4,AAGAAAGCAGGGGAGAAGCAGGCGGCAGTACGCCTTTGGTTTATCC...,intergenic,"[0.0, 1.0]","[[1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1,...","[[0], [0], [1], [0], [0], [0], [1], [3], [0], ..."
...,...,...,...,...,...
7995,ATCCGGTATCAGGAAGAGTGCGACGATGTTCTTTACGAGTACTGGA...,gene,"[1.0, 0.0]","[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0,...","[[0], [2], [3], [3], [1], [1], [2], [0], [2], ..."
7996,AATGATTTGCTTGCTGTTTATTAAAGATATTTAGCTCACCAGTCTG...,gene,"[1.0, 0.0]","[[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0,...","[[0], [0], [2], [1], [0], [2], [2], [2], [1], ..."
7997,AAACGTCCGGTGAGTTCCGATGACGTCGAAATGGCAATCAATCATA...,gene,"[1.0, 0.0]","[[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0,...","[[0], [0], [0], [3], [1], [2], [3], [3], [1], ..."
7998,CCGGCAGCAGTACCTGGACCGGGGTATCATCGGCATGGAGTTTGCC...,gene,"[1.0, 0.0]","[[0, 0, 0, 1], [0, 0, 0, 1], [0, 1, 0, 0], [0,...","[[3], [3], [1], [1], [3], [0], [1], [3], [0], ..."


In [469]:
print(f"number of rows -> {len(df_train)}")
print(f"length of index seq -> {np.unique(df_train.c_class.apply(lambda x: len(x)))}")

number of rows -> 8000
length of index seq -> [50]


Finally, instantiante the model and initialize the weigths of the embeddings.

In [470]:
print(word_to_ix["X"])
print(len(set(word_to_ix.keys()))-1)

64
64


In [0]:
t = len(df_train.iloc[0,-1]) # Find length of seqs (fixed)

rnn = RNN(input_dim=1, hidden_dim = 34, hidden_out = 90, output_dim=2, t = t)


## 5. Training loop


All that is left is to split our training and testing and train the model.

In [0]:
from torch.utils.data import DataLoader, Dataset

class oversampdata(Dataset):
  def __init__(self, data):
    # first column is list of index sentence
    self.data = torch.LongTensor(np.ndarray.astype(np.array([
                    np.array(r) 
                    for r in data.iloc[:,0].to_numpy()]), "int64"))
    # second column is the label
    self.targets = torch.LongTensor(np.ndarray.astype(np.array([
                    np.array(r) 
                    for r in data.iloc[:,1].to_numpy()]), "int64"))

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    data_val = self.data[index]
    target = self.targets[index]
    return data_val, target

train_dataset = oversampdata(df_train.loc[:,["c_class", "label_onehot"]])
valid_dataset = oversampdata(df_test.loc[:,["c_class", "label_onehot"]])

In [0]:
BATCH_SIZE = 30

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, 
                                          shuffle=True, drop_last = False)
testloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, 
                                         shuffle=True, drop_last = False)

In [0]:
def binary_accuracy(preds, y):
   """
   Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
   """
   rounded_preds = torch.zeros(preds.size())
   for i in range(len(preds)):
     idx_max = torch.where(preds[i] == preds[i].max())
     rounded_preds[i][idx_max] = 1
   rounded_preds = torch.FloatTensor(rounded_preds).int()
   correct = (rounded_preds == y).float() #convert into float for division
   acc = (correct.sum()/len(preds[0])) / len(correct)
   return acc

In [0]:
from torch.nn import Parameter
import sys

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    print("Training...")

    for i, batch in enumerate(iterator):
        if i%50:
          sys.stdout.write(f"\rIteration {i}        ")
          sys.stdout.flush()

        inputs, labels_onehot = batch

        inputs = Parameter(inputs.float(), requires_grad=True)

        predictions = model(inputs)
        
        labels_idx = torch.LongTensor([np.where(label==1)[0][0] for label in labels_onehot])
        loss = criterion(predictions, labels_idx)

        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        
        acc = binary_accuracy(predictions, labels_onehot)
        # print(loss.item())
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    print()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)



def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    print("Evaluating...")

    for i, batch in enumerate(iterator):

        inputs, labels_onehot = batch
        
        inputs = Parameter(inputs.float(), requires_grad=False)

        predictions = model(inputs)
        labels_idx = torch.LongTensor([np.where(label==1)[0][0] for label in labels_onehot])

        loss = criterion(predictions, labels_idx)
        acc = binary_accuracy(predictions, labels_onehot)

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

optimizer = torch.optim.Adam(rnn.parameters(), lr=1e-3)
#criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()

In [476]:
len(train_dataset)

8000

In [477]:
%%time
N_EPOCHS = 50
model_save_name="ma_rnn.pt"
path = F"/content/drive/My Drive/{model_save_name}"

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    print("Epoch: " + str(epoch))
    train_loss, train_acc = train(rnn, trainloader, optimizer, criterion)
        
    print()
    valid_loss, valid_acc = evaluate(rnn, testloader, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(rnn.state_dict(), path)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 0
Training...
Iteration 266        

Evaluating...
	Train Loss: 0.682 | Train Acc: 54.34%
	 Val. Loss: 0.655 |  Val. Acc: 60.53%
Epoch: 1
Training...
Iteration 266        

Evaluating...
	Train Loss: 0.638 | Train Acc: 63.70%
	 Val. Loss: 0.627 |  Val. Acc: 64.66%
Epoch: 2
Training...
Iteration 266        

Evaluating...
	Train Loss: 0.620 | Train Acc: 65.95%
	 Val. Loss: 0.620 |  Val. Acc: 66.96%
Epoch: 3
Training...
Iteration 266        

Evaluating...
	Train Loss: 0.612 | Train Acc: 66.52%
	 Val. Loss: 0.605 |  Val. Acc: 67.54%
Epoch: 4
Training...
Iteration 266        

Evaluating...
	Train Loss: 0.607 | Train Acc: 66.85%
	 Val. Loss: 0.602 |  Val. Acc: 68.27%
Epoch: 5
Training...
Iteration 266        

Evaluating...
	Train Loss: 0.602 | Train Acc: 67.37%
	 Val. Loss: 0.602 |  Val. Acc: 69.23%
Epoch: 6
Training...
Iteration 266        

Evaluating...
	Train Loss: 0.598 | Train Acc: 67.92%
	 Val. Loss: 0.597 |  Val. Acc: 69.49%
Epoch: 7
Training...
Iteration 266        

Eval

KeyboardInterrupt: ignored