In [1]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

import os
import re
import random

In [2]:
sos = 0
eos = 1
max_length = 1275
lang1 = 'seq1'
lang2 = 'seq2'

#class Amino_Acids:
 #   def __init__(self):
        

In [3]:
def read_sequence(df, lang1, lang2):
    sequence1 = df[lang1]
    sequence2 = df[lang2]
    sequence1 = str(sequence1)
    sequence2 = str(sequence2)
    return sequence1, sequence2

def read_file(loc, lang1, lang2):
    df = pd.read_csv(loc, delimiter='\t', header=None, names=[lang1, lang2])
    return df

def process_data(lang1,lang2):
    df = read_file('data/seq2seq/biological_sequences/%s-%s.txt' % (lang1, lang2), lang1, lang2)
    print("Read %s sequence pairs" % len(df))
    sequence1, sequence2 = read_sequence(df, lang1, lang2)
    #source = Lang()
    #target = Lang()
    pairs = []
    for i in range(len(df)):
        if len(sequence1[i].split(',')) <= max_length and len(sequence2[i].split(' ')) <= max_length:
            full = [sequence1[i], sequence2[i]]
            #source.addSentence(sentence1[i])
            #target.addSentence(sentence2[i])
            pairs.append(full)

    return  pairs
    
   

In [4]:
class Encoder(nn.Module):
    def __init__(self, ip_dim, hid_dim, embed_dim, num_layers):
        super(Encoder, self).__init__()
        self.ip_dim = ip_dim
        self.op_dim = op_dim
        self.hid_dim = hid_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(ip_dim, embed_dim)
        self.gru = nn.Gru(embed_dim, hid_dim, num_layers)
        
    def forward(self, src):

        embedded = self.embedding(src).view(1,1,-1)
        outputs, hidden = self.gru(embedded)
        return outputs, hidden

In [5]:
class Decoder(nn.Module):
    def __init__(self, op_dim,hid_dim, embed_dim, num_layers):
        super(Decoder, self).__init__()
        self.op_dim = op_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(op_dim, embed_dim)
        self.gru = nn.GRU(embed_dim, hid_dim, num_layers=self.num_layers)
        self.out = nn.Linear(hid_dim, op_dim)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):

        # reshape the input to (1, batch_size)
        input = input.view(1, -1)
        embedded = F.relu(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)       
        prediction = self.softmax(self.out(output[0]))

        return prediction, hidden

In [6]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, max_length = max_length):
        super().__init__()
      
        #initialize the encoder and decoder
        self.encoder = encoder
        self.decoder = decoder
        
     
    def forward(self, source, target, teacher_forcing_ratio=0.5):

        input_length = source.size(0) #get the input length (number of words in sequence)
        batch_size = target.shape[1] 
        target_length = target.shape[0]
        vocab_size = self.decoder.output_dim
#initialize a variable to hold the predicted outputs
        outputs = torch.zeros(target_length, batch_size, vocab_size)

#encode every word in a sequence
        for i in range(input_length):
            encoder_output, encoder_hidden = self.encoder(source[i])

#use the encoder’s hidden layer as the decoder hidden
        decoder_hidden = encoder_hidden
  
 #add a token before the first predicted word
        decoder_input = torch.tensor([SOS_token])  # SOS

#topk is used to get the top K value over a list
#predict the output word from the current target word. If we enable the teaching force,  then the #next decoder input is the next word, else, use the decoder output highest value. 

        for t in range(target_length):   
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            outputs[t] = decoder_output
            teacher_force = random.random() < teacher_forcing_ratio
            topv, topi = decoder_output.topk(1)
            input = (target[t] if teacher_force else topi)
            if(teacher_force == False and input.item() == EOS_token):
                break

        return outputs

In [7]:
teacher_forcing_ratio = 0.5

def clacModel(model, input_tensor, target_tensor, model_optimizer, criterion):
    model_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    loss = 0
    epoch_loss = 0
   # print(input_tensor.shape)

    output = model(input_tensor, target_tensor)

    num_iter = output.size(0)
    print(num_iter)

#calculate the loss from a predicted sequence with the expected result
    for ot in range(num_iter):
        loss += criterion(output[ot], target_tensor[ot])

    loss.backward()
    model_optimizer.step()
    epoch_loss = loss.item() / num_iter

    return epoch_loss

def trainModel(model, source, target, pairs, num_iteration=20000):
    model.train()

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.NLLLoss()
    total_loss_iterations = 0

    training_pairs = [tensorsFromPair(source, target, random.choice(pairs))
            for i in range(num_iteration)]
  
    for iter in range(1, num_iteration+1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = clacModel(model, input_tensor, target_tensor, optimizer, criterion)

        total_loss_iterations += loss

        if iter % 5000 == 0:
            avarage_loss= total_loss_iterations / 5000
            total_loss_iterations = 0
            print('%d %.4f' % (iter, avarage_loss))
          
    torch.save(model.state_dict(), 'mytraining.pt')
    return model

In [8]:
lang1 = 'seq1'
lang2 = 'seq2'
pairs = process_data(lang1, lang2)

randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

#print number of words
input_size = df.length()
output_size = df.length()
print('Input : {} Output : {}'.format(input_size, output_size))

embed_size = 256
hidden_size = 512
num_layers = 1
num_iteration = 100000


Read 72814 sequence pairs


IndexError: string index out of range