In [1]:
%autosave 10

Autosaving every 10 seconds


In [2]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import random
import numpy as np
import re
import string
from string import digits
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from nltk.translate.bleu_score import sentence_bleu

In [4]:
os.chdir("/content/drive/My Drive/NeuralMT")
os.listdir()

['en_hi.pkl',
 'data',
 'cc.en.300.vec.gz',
 'cc.hi.300.vec.gz',
 'seq2seq_0.02.params',
 'seq2seq_0.01.params',
 'seq2seq_0.04.params',
 'seq2seq_0.0.params',
 'seq2seq_0.19.params',
 'seq2seq_0.25.params',
 'seq2seq_0.24.params',
 'seq2seq_0.31.params',
 'seq2seq_0.23.params',
 'seq2seq_0.11.params',
 'data_small']

#try cuda

In [None]:
data_folder = "data"
files = os.listdir(data_folder)
source_files = [os.path.join(data_folder, file) for file in files if ".en" in file]
target_files = [os.path.join(data_folder, file) for file in files if ".hi" in file]
print(source_files)

def generate_vocab(filenames):
    vocab = set()
    # no_words = 0
    for file in filenames:
        content = open(file).read()
        sentences = [sentence.split() for sentence in content.split('\n')]
        for sentence in sentences:
            for word in sentence:
                if word.lower() not in vocab:
                    vocab.add(word.lower())
                    # no_words += 1
    vocab = {word: num for num, word in enumerate(vocab)}
    return vocab

def parse_files_to_indices(filename, vocab):
    content = open(filename).read()
    return [[vocab[word] for word in sentence.split() if word in vocab] for sentence in content.split('\n')]

class Dataset:
    def __init__(self, source_files, target_files, batch_size=10):
        self.source_vocab = generate_vocab(source_files) #for entire lang
        self.target_vocab = generate_vocab(target_files) #for entire lang
        
        self.len_source = len(self.source_vocab.keys())
        self.source_pad, self.source_start, self.source_end = self.len_source + 2, self.len_source + 1, self.len_source
        self.len_target = len(self.target_vocab.keys())
        self.target_pad, self.target_start, self.target_end = self.len_target + 2, self.len_target + 1, self.len_target
        
        self.source_vocab['<pad>'], self.source_vocab['<start>'], self.source_vocab['<end>'] = [self.source_pad, 
                                                                                                self.source_start, 
                                                                                                self.source_end]
        self.target_vocab['<pad>'], self.target_vocab['<start>'], self.target_vocab['<end>'] = [self.target_pad,
                                                                                                self.target_start, 
                                                                                                self.target_end]
                                                                                              
        ##created indices for <pad>, <start> and <end> tokens
        self.len_source = len(self.source_vocab.keys())
        self.len_target = len(self.target_vocab.keys())
        
        ##inverse vocab for entire lang
        self.source_vocab_inv = {value:key for key, value in self.source_vocab.items()}
        self.target_vocab_inv = {value:key for key, value in self.target_vocab.items()}
        
        ##parsing files into indices
        for filename in source_files:
            if 'train' in filename:
                self.source_train = parse_files_to_indices(filename, self.source_vocab)
            if 'test' in filename:
                self.source_test = parse_files_to_indices(filename, self.source_vocab)
            if 'dev' in filename:
                self.source_dev = parse_files_to_indices(filename, self.source_vocab)
                
        for filename in target_files:
            if 'train' in filename:
                self.target_train = parse_files_to_indices(filename, self.target_vocab)
            if 'test' in filename:
                self.target_test = parse_files_to_indices(filename, self.target_vocab)
            if 'dev' in filename:
                self.target_dev = parse_files_to_indices(filename, self.target_vocab)
                
        ##random shuffling of training data before splitting into batches    
        self.indices = list(range(len(self.source_train)))
        random.shuffle(self.indices)
        self.current = -batch_size
        self.batch_size = batch_size
        
    def __get_batch_input(self, indices):
        source, target_input, target_target = [], [], []
        
        # padding length of sentence
        source_max_len = max(len(self.source_train[i]) for i in indices)
        target_max_len = max(len(self.target_train[i]) for i in indices)
        
        for i in indices:
            length = len(self.source_train[i])
            # reverse_source_sentences and pad at beginning
            sentence = [self.source_pad for _ in range(source_max_len - length)] + [self.source_end] + self.source_train[i][::-1] + [self.source_start]
            source.append(sentence)
            
            length = len(self.target_train[i])
            # padding at end for target
            sentence = [self.target_start] + self.target_train[i] + [self.target_end] + [self.target_pad for _ in range(target_max_len - length)]
            target_input.append(sentence)
            #removed <sos> and put <pad> at end
            target_target.append(sentence[1:] + [self.target_pad])
            
        return np.array(source), np.array(target_input), np.array(target_target)
    
    def get_batch_input(self):
        if self.current > len(self.indices) - self.batch_size:
            self.current = 0
            return None, None, None
        self.current += self.batch_size
        return self.__get_batch_input(self.indices[self.current: self.current + self.batch_size])
    
    def convert_indices_to_words(self, indices):
        output = []
        for sentence in indices:
            s = []
            for word in sentence:
                s.append(self.target_vocab_inv[word])
            output.append(s)
        return output


# ```
# %run seq2seq.ipynb
# 
# d = Dataset(source_files, target_files, 10)
# m = Seq2Seq(d.len_source, d.len_target, 9, 7)
# m.cuda()
# 
# a, b = d.get_batch_input()
# source, target = torch.cuda.LongTensor(a), torch.cuda.LongTensor(b)
# 
# # For teacher forcing:
# m(source, target)
# 
# # Using model's own predictions:
# m(source)
# ```

# In[37]:


def train(model, dataset, coverage=False, coverage_type="linguistic", iterations=1, use_teacher_forcing=True, log=True):
    # TODO: attention error
    loss_func = nn.NLLLoss()
    attn_loss_func = nn.MSELoss()
    
    optimizer = optim.Adagrad(model.parameters()) 
    
    for i in range(iterations):
        while True:
            optimizer.zero_grad() #set the gradients to zero before starting to do backpropragation because PyTorch accumulates the gradients on subsequent backward passes.
            source, target_input, target_output = d.get_batch_input() #sentences made into indices and padded to sentence lenght 

            if source is None: # end of current iteration
                break

            source, target_input, target_output = [torch.LongTensor(source), 
                                                   torch.LongTensor(target_input).cuda, 
                                                   torch.LongTensor(target_output).cuda]
            
            source_mask = torch.ones(source.shape).cuda()
            source_mask[source == dataset.source_pad] = 0
            
            if use_teacher_forcing:
                pred, attn = model(source, target_input, source_mask=source_mask)
                # mask whatevers after <stop> 
                target_mask = torch.ones(target_output.shape).cuda()
                target_mask[target_output == dataset.target_pad] = 0
                pred = pred * target_mask.unsqueeze(-1)
                target_output = target_output * target_mask.long()
            else:
                pred, attn_weights = model(source)
                
            no_words = pred.shape[0] * pred.shape[1]
            pred = pred.reshape(no_words, -1)
            target_output = target_output.reshape(no_words)

            pred_error = loss_func(pred, target_output)
            attn_error = None
            
            if coverage:
                # if coverage type is linguistic, ignore fertility
                attn_weights, fertility = attn
                if coverage_type == "linguistic":
                    fertility = torch.ones(fertility.shape).cuda()
                attn_error = attn_loss_func(torch.sum(attn_weights, dim=-1) * source_mask, fertility * source_mask)
                pred_error += attn_error
                
            pred_error.backward() #backward prop
            optimizer.step()
            
            if log:
                print(d.current/d.batch_size, pred_error, end='\r')


# In[38]:


def convert_pred_to_indices(pred):
    return torch.max(pred, dim=-1)[1]


# In[39]:


def eval(model, dataset, log=False, source_test=None, target_test=None):
    scores = []
    if source_test is None:
        source_test = dataset.source_test
        target_test = dataset.target_test
        
    for sentence, target in zip(source_test, target_test):
        input = torch.LongTensor([[dataset.source_end] + sentence[::-1] + [dataset.source_start]]).cuda
        pred, _ = model(input).cuda()
        pred_words = convert_pred_to_indices(pred).cpu().numpy()
        predicted_target = dataset.convert_indices_to_words(pred_words)
        target = dataset.convert_indices_to_words([target])
        if log:
            print(target, predicted_target[0])
        scores.append(sentence_bleu(target, predicted_target[0], weights=(1, 0, 0, 0)))
    return sum(scores)/len(scores)

['data/dev.en', 'data/test.en', 'data/train.en']


In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np


# In[2]:


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f


# In[3]:


class Encoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        
    def forward(self, input):
        '''
        input dim => 2d no_sentences * no_words
        make sure that the input is in reverse
        
        Outputs:
        output, h, c
        
        output => dim: (num_sentences * num_words * hidden_dim)
        h => (num_layers * num_sentences * hidden_dim)
        c => (num_layers * num_sentences * hidden_dim)
        '''
        embeddings = self.embedding(input)
        output, (h, c) = self.encoder(embeddings)
        
        return output, h, c


# In[4]:


class Decoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0, start_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.word_predictor = nn.Linear(hidden_dim, num_embeddings)
        self.start_dim = start_dim
        
    def forward(self, output, h, c, input=None, max_sen_len=20):
        '''
        Output:
        pred: (no_sentences * no_words * no_embeddings)
        **Note: if input is not None, log_softmax is returned
        else, input is returned without softmax**
        '''
        
        if input is not None: # training with teacher forcing
            
            embeddings = self.embedding(input)
            yts, _ = self.decoder(embeddings, (h, c))
            orig_shape = yts.shape
            pred = self.word_predictor(yts.reshape(orig_shape[0]*orig_shape[1], -1)).reshape(*orig_shape[:-1], -1)
            return torch.log_softmax(pred, dim=-1), None
        
        else: # training using the decoders' own predictions
            
            num_sentences = output.shape[0]
            words_selected = torch.LongTensor([[self.start_dim] for _ in range(num_sentences)]).cuda
            pred = []
            for i in range(max_sen_len):
                embeddings = self.embedding(words_selected)
                y_t, (h, c) = self.decoder(embeddings, (h, c))
                orig_shape = y_t.shape
                pred_t = self.word_predictor(y_t.reshape(orig_shape[0]*orig_shape[1], 
                                                         -1)).reshape(*orig_shape[:-1], -1).squeeze(1)
                pred.append(pred_t)
                words_selected = torch.max(pred_t, dim=1)[1].unsqueeze(1)
            
            pred = torch.stack(pred, dim=1)
            return torch.log_softmax(pred, dim=-1), None


# In[5]:


class Seq2Seq(nn.Module):
    def __init__(self, source_vocab_len, target_vocab_len, embedding_dim, hidden_dim, num_layers=1,
                       padding_dim_source=0, start_dim_target=1, padding_dim_target=0):
        super().__init__()
        self.encoder = Encoder(source_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_source)
        self.decoder = Decoder(target_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_target, start_dim_target)
        
    def forward(self, source_input, target_input=None, source_mask=None):
        output, h, c = self.encoder(source_input)
        return self.decoder(output, h, c, target_input)


# **Without teacher forcing**
# ```
# input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# s = Seq2Seq(10, 12, 9, 7)
# s.cuda()
# pred = s(input)
# ```

# **With teacher forcing**
# 
# ```
# tar_input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# s(input, tar_input).shape
# ```

# In[ ]:

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[30]:


# get_ipython().run_line_magic('run', 'utils.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq_with_attention.ipynb')
# get_ipython().run_line_magic('run', 'effective_approaches.ipynb')
# get_ipython().run_line_magic('run', 'coverage.ipynb')


# In[4]:

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# m.to(device)
# torch.cuda.set_device(0)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# In[24]:


d = Dataset(source_files, target_files, batch_size=10)
embedding_dim = 50
hidden_dim = 100
num_layers = 2


# In[25]:


def train_and_save(m, d, str_, coverage=False, coverage_type="linguistic", log=False):
    prev_score = -1
    for i in range(20):
        train(m, d, coverage, coverage_type, log=log)
        score = eval(m, d)
        best_state = None
        print()
        print(score)
        if score > prev_score:
            #torch.save(m.state_dict(), str_ + str(score)[:4] + ".params_")
            best_state = m.state_dict()
            prev_score = score
            
    torch.save(best_state, str_ + str(prev_score)[:4] + ".params")
    print(prev_score)


# In[26]:

m = Seq2Seq(d.len_source, d.len_target, embedding_dim, hidden_dim, num_layers=2, 
            padding_dim_source=d.source_pad, padding_dim_target=d.target_pad, start_dim_target=d.target_start)
#print(m.cuda())
train_and_save(m, d, "seq2seq_")


# In[ ]:


# m = Seq2SeqWithAttention(d.len_source, d.len_target, embedding_dim, hidden_dim, num_layers=2, 
#             padding_dim_source=d.source_pad, padding_dim_target=d.target_pad, start_dim_target=d.target_start)
# print(m.cuda())
# train_and_save(m, d, "attention_")


RuntimeError: ignored

#try cpu


In [None]:
import re
import string
from string import digits

In [5]:
# import pandas as pd
df = pd.read_pickle('en_hi.pkl')[:100]

In [None]:
df.head(20)

Unnamed: 0,english,hindi
57382,Who is this man?\n,-ये आदमी कौन है ?\n
28149,And the other brother?\n,और दूसरे भाई?\n
80284,"- Greetings, Mustakeen bhai.\n","नमस्ते, Mustakeen भाई।\n"
63596,"Please, reason with them.\n","कृपया, उन लोगों के साथ कारण.\n"
9459,There's a howling gale.\n,तेज़ हवा आ रही है।\n
67410,"- No, you don't.\n","- नहीं , कभी नहीं .\n"
89506,"the violence, the rage.\n","हिंसा, स्पूर्ति।\n"
71637,Signal decoy in place.\n,सिग्नल जगह में लूभाव।\n
66488,You guys come with?\n,तुम लोग इसे ले तो आये हो ?\n
52809,How long you staying?\n,तुम कब तक रह?\n


In [6]:
lines = df.copy()
# general
lines['english']=lines['english'].apply(lambda x: re.sub(r"n\'t", " not", x))
lines['english']=lines['english'].apply(lambda x: re.sub(r"\'re", " are", x))
lines['english']=lines['english'].apply(lambda x: re.sub(r"\'s", " is", x))
lines['english']=lines['english'].apply(lambda x: re.sub(r"n\'d", " would", x))
lines['english']=lines['english'].apply(lambda x: re.sub(r"n\'ll", " will", x))
lines['english']=lines['english'].apply(lambda x: re.sub(r"n\'ve", " have", x))
lines['english']=lines['english'].apply(lambda x: re.sub(r"n\'m", " am", x))

lines['hindi'] = lines['hindi'].apply(lambda x: x.replace('.',' ।'))
# lines['hindi'] = lines['hindi'].apply(lambda x: x.replace('।',' ।'))

# lines['english']=lines['english'].apply(lambda x: re.sub(r"n\in'", "ing", x))

# Lowercase all characters
lines['english']=lines['english'].apply(lambda x: x.lower())
lines['hindi']=lines['hindi'].apply(lambda x: x.lower())


exclude = set(['$','&','+',':',';','=','@','|','<','>','^','*','%','-','#','\'']) # Set of all special characters
# Remove all the special characters
lines['english']=lines['english'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi']=lines['hindi'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines['english']=lines['english'].apply(lambda x: x.translate(remove_digits))
lines['hindi']=lines['hindi'].apply(lambda x: x.translate(remove_digits))

lines['hindi'] = lines['hindi'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines['english']=lines['english'].apply(lambda x: x.strip())
lines['hindi']=lines['hindi'].apply(lambda x: x.strip())
lines['english']=lines['english'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi']=lines['hindi'].apply(lambda x: re.sub(" +", " ", x))

# Add start and end tokens to target sequences
lines['hindi'] = lines['hindi'].apply(lambda x :  x + '\n')

lines['english'] = lines['english'].apply(lambda x : x + '\n')
lines.head(20)

Unnamed: 0,english,hindi
57382,who is this man?\n,ये आदमी कौन है ?\n
28149,and the other brother?\n,और दूसरे भाई?\n
80284,"greetings, mustakeen bhai.\n","नमस्ते, mustakeen भाई।\n"
63596,"please, reason with them.\n","कृपया, उन लोगों के साथ कारण ।\n"
9459,there is a howling gale.\n,तेज़ हवा आ रही है।\n
67410,"no, you do not.\n","नहीं , कभी नहीं ।\n"
89506,"the violence, the rage.\n","हिंसा, स्पूर्ति।\n"
71637,signal decoy in place.\n,सिग्नल जगह में लूभाव।\n
66488,you guys come with?\n,तुम लोग इसे ले तो आये हो ?\n
52809,how long you staying?\n,तुम कब तक रह?\n


In [None]:
# for index,row in lines[:10].iterrows():
#   # for lan in ['english','hindi']:
  
#     ret = [re.sub(fr'([{string.punctuation}])\B', r' \1', row[lan]).split() for lan in ['english','hindi']]
#     print(ret)

In [7]:
train, validate, test = np.split(lines.sample(frac=1), [int(.7*len(df)), int(.8*len(df))])
print(train.shape, validate.shape, test.shape)

(70, 2) (10, 2) (20, 2)


In [None]:
# for index,row in train[:10].iterrows():
#   # for lan in ['english','hindi']:
  
#     ret = [re.sub(fr'([{string.punctuation  }])\B', r' \1', str(row)).split()]
#     print(ret)
#     print(index,row)

In [8]:
# with open('/content/drive/My Drive/NeuralMT/data/dev.en', 'w') as writefile:
#     writefile.write(('').join(validate['english']))
# with open('/content/drive/My Drive/NeuralMT/data/dev.hi', 'w') as writefile:
#     writefile.write(('').join(validate['hindi']))
# with open('/content/drive/My Drive/NeuralMT/data/test.en', 'w') as writefile:
#     writefile.write(('').join(test['english']))
# with open('/content/drive/My Drive/NeuralMT/data/test.hi', 'w') as writefile:
#     writefile.write(('').join(test['hindi']))
# with open('/content/drive/My Drive/NeuralMT/data/train.en', 'w') as writefile:
#     writefile.write(('').join(train['english']))
# with open('/content/drive/My Drive/NeuralMT/data/train.hi', 'w') as writefile:
#     writefile.write(('').join(train['hindi']))

with open('/content/drive/My Drive/NeuralMT/data_small/dev.en', 'w') as writefile:
    writefile.write(('').join(validate['english']))
with open('/content/drive/My Drive/NeuralMT/data_small/dev.hi', 'w') as writefile:
    writefile.write(('').join(validate['hindi']))
with open('/content/drive/My Drive/NeuralMT/data_small/test.en', 'w') as writefile:
    writefile.write(('').join(test['english']))
with open('/content/drive/My Drive/NeuralMT/data_small/test.hi', 'w') as writefile:
    writefile.write(('').join(test['hindi']))
with open('/content/drive/My Drive/NeuralMT/data_small/train.en', 'w') as writefile:
    writefile.write(('').join(train['english']))
with open('/content/drive/My Drive/NeuralMT/data_small/train.hi', 'w') as writefile:
    writefile.write(('').join(train['hindi']))

In [11]:
data_folder = "data"
files = os.listdir(data_folder)
source_files = [os.path.join(data_folder, file) for file in files if ".en" in file]
target_files = [os.path.join(data_folder, file) for file in files if ".hi" in file]
print(source_files)

def generate_vocab(filenames):
    vocab = set()
    # no_words = 0
    for file in filenames:
        content = open(file).read()
        sentences = [re.sub(fr'([{string.punctuation}])\B', r' \1', sentence).split() for sentence in content.split('\n')]
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab.add(word)
                    # no_words += 1

    vocab = {word: num for num, word in enumerate(vocab)}
    return vocab

def parse_files_to_indices(filename, vocab):
    content = open(filename).read()
    return [[vocab[word] for word in re.sub(fr'([{string.punctuation}])\B', r' \1', sentence).split() if word in vocab] for sentence in content.split('\n')]

class Dataset:
    def __init__(self, source_files, target_files, batch_size=10):
        self.source_vocab = generate_vocab(source_files) #for entire lang
        self.target_vocab = generate_vocab(target_files) #for entire lang
        
        self.len_source = len(self.source_vocab.keys())
        self.source_pad, self.source_start, self.source_end = self.len_source + 2, self.len_source + 1, self.len_source
        self.len_target = len(self.target_vocab.keys())
        self.target_pad, self.target_start, self.target_end = self.len_target + 2, self.len_target + 1, self.len_target
        
        self.source_vocab['<pad>'], self.source_vocab['<start>'], self.source_vocab['<end>'] = [self.source_pad, 
                                                                                                self.source_start, 
                                                                                                self.source_end]
        self.target_vocab['<pad>'], self.target_vocab['<start>'], self.target_vocab['<end>'] = [self.target_pad,
                                                                                                self.target_start, 
                                                                                                self.target_end]
                                                                                              
        ##created indices for <pad>, <start> and <end> tokens
        self.len_source = len(self.source_vocab.keys())
        self.len_target = len(self.target_vocab.keys())
        
        ##inverse vocab for entire lang
        self.source_vocab_inv = {value:key for key, value in self.source_vocab.items()}
        self.target_vocab_inv = {value:key for key, value in self.target_vocab.items()}
        
        ##parsing files into indices
        for filename in source_files:
            if 'train' in filename:
                self.source_train = parse_files_to_indices(filename, self.source_vocab)
            if 'test' in filename:
                self.source_test = parse_files_to_indices(filename, self.source_vocab)
            if 'dev' in filename:
                self.source_dev = parse_files_to_indices(filename, self.source_vocab)
                
        for filename in target_files:
            if 'train' in filename:
                self.target_train = parse_files_to_indices(filename, self.target_vocab)
            if 'test' in filename:
                self.target_test = parse_files_to_indices(filename, self.target_vocab)
            if 'dev' in filename:
                self.target_dev = parse_files_to_indices(filename, self.target_vocab)
                
        ##random shuffling of training data before splitting into batches    
        self.indices = list(range(len(self.source_train)))
        random.shuffle(self.indices)
        self.current = -batch_size
        self.batch_size = batch_size
        
    def __get_batch_input(self, indices):
        source, target_input, target_target = [], [], []
        
        # padding length of sentence
        source_max_len = max(len(self.source_train[i]) for i in indices)
        target_max_len = max(len(self.target_train[i]) for i in indices)
        
        for i in indices:
            length = len(self.source_train[i])
            # reverse_source_sentences and pad at beginning
            sentence = [self.source_pad for _ in range(source_max_len - length)] + [self.source_end] + self.source_train[i][::-1] + [self.source_start]
            source.append(sentence)
            
            length = len(self.target_train[i])
            # padding at end for target
            sentence = [self.target_start] + self.target_train[i] + [self.target_end] + [self.target_pad for _ in range(target_max_len - length)]
            target_input.append(sentence)
            #removed <sos> and put <pad> at end
            target_target.append(sentence[1:] + [self.target_pad])
            
        return np.array(source), np.array(target_input), np.array(target_target)
    
    def get_batch_input(self):
        if self.current > len(self.indices) - self.batch_size:
            self.current = 0
            return None, None, None
        self.current += self.batch_size
        return self.__get_batch_input(self.indices[self.current: self.current + self.batch_size])
    
    def convert_indices_to_words(self, indices):
        output = []
        for sentence in indices:
            s = []
            for word in sentence:
                if word == self.target_end:
                  break
                s.append(self.target_vocab_inv[word])
            output.append(s)
        return output

    def convert_indices_to_words_eng(self, indices):
        output = []
        for sentence in indices:
            s = []
            for word in sentence:
                s.append(self.source_vocab_inv[word])
            output.append(s)
        return output


# ```
# %run seq2seq.ipynb
# 
# d = Dataset(source_files, target_files, 10)
# m = Seq2Seq(d.len_source, d.len_target, 9, 7)
# m.cuda()
# 
# a, b = d.get_batch_input()
# source, target = torch.cuda.LongTensor(a), torch.cuda.LongTensor(b)
# 
# # For teacher forcing:
# m(source, target)
# 
# # Using model's own predictions:
# m(source)
# ```

# In[37]:


def train(model, dataset, coverage=False, coverage_type="linguistic", iterations=1, use_teacher_forcing=True, log=True):
    # TODO: attention error
    loss_func = nn.NLLLoss()
    attn_loss_func = nn.MSELoss()
    
    optimizer = optim.Adagrad(model.parameters()) 
    
    for i in range(iterations):
        iter_loss = 0
        while True:
            
            optimizer.zero_grad() #set the gradients to zero before starting to do backpropragation because PyTorch accumulates the gradients on subsequent backward passes.
            source, target_input, target_output = d.get_batch_input() #sentences made into indices and padded to sentence lenght 

            if source is None: # end of current iteration
                break

            source, target_input, target_output = [torch.LongTensor(source), 
                                                   torch.LongTensor(target_input), 
                                                   torch.LongTensor(target_output)]
            
            source_mask = torch.ones(source.shape)
            source_mask[source == dataset.source_pad] = 0
            
            if use_teacher_forcing:
                pred, attn = model(source, target_input, source_mask=source_mask)
                # mask whatevers after <stop> 
                target_mask = torch.ones(target_output.shape)
                target_mask[target_output == dataset.target_pad] = 0
                pred = pred * target_mask.unsqueeze(-1)
                target_output = target_output * target_mask.long()
            else:
                pred, attn_weights = model(source)
                
            no_words = pred.shape[0] * pred.shape[1]
            pred = pred.reshape(no_words, -1)
            target_output = target_output.reshape(no_words)

            pred_error = loss_func(pred, target_output)
            iter_loss += pred_error
            # print("loss ", pred_error)
            attn_error = None

                
            pred_error.backward() #backward prop
            optimizer.step()
            
            if log:
                print(d.current/d.batch_size, pred_error, end='\r')
        print("loss ", iter_loss)


# In[38]:


def convert_pred_to_indices(pred):
    # print('pred',pred.shape)
    # print('pred_ideal',torch.max(pred, dim=-1)[1].shape)
    # print(torch.max(pred, dim=-1)[1])
    return torch.max(pred, dim=-1)[1]


# In[39]:


def eval(model, dataset, log=False, source_test=None, target_test=None):
    scores = []
    if source_test is None:
        source_test = dataset.source_test
        target_test = dataset.target_test
        
    for sentence, target in zip(source_test, target_test):
        input_text = dataset.convert_indices_to_words_eng([sentence[::-1]] )
        input = torch.LongTensor([[dataset.source_end] + sentence[::-1] + [dataset.source_start]])

        pred, _ = model(input)
        pred_words = convert_pred_to_indices(pred).cpu().numpy()
        
        predicted_target = dataset.convert_indices_to_words(pred_words)
        target = dataset.convert_indices_to_words([target])
        if log:
            print(input_text,target, predicted_target[0])
        scores.append(sentence_bleu(target, predicted_target[0], weights=(1, 0, 0, 0)))
    return sum(scores)/len(scores)

['data/dev.en', 'data/test.en', 'data/train.en']


In [12]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np


# In[2]:


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f


# In[3]:


class Encoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        
    def forward(self, input):
        '''
        input dim => 2d no_sentences * no_words
        make sure that the input is in reverse
        
        Outputs:
        output, h, c
        
        output => dim: (num_sentences * num_words * hidden_dim)
        h => (num_layers * num_sentences * hidden_dim)
        c => (num_layers * num_sentences * hidden_dim)
        '''
        embeddings = self.embedding(input)
        output, (h, c) = self.encoder(embeddings)
        
        return output, h, c


# In[4]:


class Decoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0, start_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.word_predictor = nn.Linear(hidden_dim, num_embeddings)
        self.start_dim = start_dim
        
    def forward(self, output, h, c, input=None, max_sen_len=20):
        '''
        Output:
        pred: (no_sentences * no_words * no_embeddings)
        **Note: if input is not None, log_softmax is returned
        else, input is returned without softmax**
        '''
        
        if input is not None: # training with teacher forcing
            
            embeddings = self.embedding(input)
            yts, _ = self.decoder(embeddings, (h, c))
            orig_shape = yts.shape
            # print("orig_shape",orig_shape, orig_shape[:-1],"nfk ", *orig_shape[:-1])
            pred = self.word_predictor(yts.reshape(orig_shape[0]*orig_shape[1], -1)).reshape(*orig_shape[:-1], -1)
            return torch.log_softmax(pred, dim=-1), None
        
        else: # training using the decoders' own predictions
            
            num_sentences = output.shape[0]
            words_selected = torch.LongTensor([[self.start_dim] for _ in range(num_sentences)])
            pred = []
            for i in range(max_sen_len):
                embeddings = self.embedding(words_selected)
                y_t, (h, c) = self.decoder(embeddings, (h, c))
                orig_shape = y_t.shape
                pred_t = self.word_predictor(y_t.reshape(orig_shape[0]*orig_shape[1], 
                                                         -1)).reshape(*orig_shape[:-1], -1).squeeze(1)

                pred.append(pred_t)
                words_selected = torch.max(pred_t, dim=1)[1].unsqueeze(1)
            
            pred = torch.stack(pred, dim=1)
            return torch.log_softmax(pred, dim=-1), None


# In[5]:


class Seq2Seq(nn.Module):
    def __init__(self, source_vocab_len, target_vocab_len, embedding_dim, hidden_dim, num_layers=1,
                       padding_dim_source=0, start_dim_target=1, padding_dim_target=0):
        super().__init__()
        self.encoder = Encoder(source_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_source)
        self.decoder = Decoder(target_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_target, start_dim_target)
        
    def forward(self, source_input, target_input=None, source_mask=None):
        output, h, c = self.encoder(source_input)
        return self.decoder(output, h, c, target_input)


# **Without teacher forcing**
# ```
# input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# s = Seq2Seq(10, 12, 9, 7)
# s.cuda()
# pred = s(input)
# ```

# **With teacher forcing**
# 
# ```
# tar_input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# s(input, tar_input).shape
# ```

# In[ ]:

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cpu


In [13]:
#!/usr/bin/env python
# coding: utf-8

# In[30]:


# get_ipython().run_line_magic('run', 'utils.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq_with_attention.ipynb')
# get_ipython().run_line_magic('run', 'effective_approaches.ipynb')
# get_ipython().run_line_magic('run', 'coverage.ipynb')


# In[4]:

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# m.to(device)
# torch.cuda.set_dezce(0)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# In[24]:


d = Dataset(source_files, target_files, batch_size=10)
embedding_dim = 50
hidden_dim = 100
num_layers = 2


# In[25]:


def train_and_save(m, d, str_, coverage=False, coverage_type="linguistic", log=False):
    prev_score = -1
    for i in range(20):
        train(m, d, coverage, coverage_type, log=log)
        score = eval(m, d, log=True)
        best_state = None
        print()
        print(score)
        if score > prev_score:
            #torch.save(m.state_dict(), str_ + str(score)[:4] + ".params_")
            best_state = m.state_dict()
            prev_score = score
            
    torch.save(best_state, str_ + str(prev_score)[:4] + ".params")
    print(prev_score)


# In[26]:

m = Seq2Seq(d.len_source, d.len_target, embedding_dim, hidden_dim, num_layers=2, 
            padding_dim_source=d.source_pad, padding_dim_target=d.target_pad, start_dim_target=d.target_start)
#print(m.cuda())
train_and_save(m, d, "seq2seq_")


# In[ ]:


# m = Seq2SeqWithAttention(d.len_source, d.len_target, embedding_dim, hidden_dim, num_layers=2, 
#             padding_dim_source=d.source_pad, padding_dim_target=d.target_pad, start_dim_target=d.target_start)
# print(m.cuda())
# train_and_save(m, d, "attention_")


loss  tensor(2243.2874, grad_fn=<AddBackward0>)
[['.', 'speak', 'to', 'not', 'try']] [['बोलो', 'मत।']] ['मैं', 'एक', 'को', 'में']
[['?', 'po', 'seen', 'you', 'have']] [['तुमने', 'पो', 'को', 'देखा', '?']] ['आप', 'क्या', 'क्या', 'है', '?']
[['.', 'now', 'stop', 'can', 'you', ',', 'uh']] [['उह', ',', 'तुम', 'अब', 'बंद', 'कर', 'सकते', 'हैं', '।']] ['मैं', ',', 'मैं', 'नहीं', 'है', '।']
[['!', 'him', 'for', 'going', 'im']] [['मैं', 'उसके', 'लिए', 'जा', 'रहा', 'हूँ', '!']] ['मैं', 'एक', 'एक', 'है', '?']
[['.', 'down', 'is', 'oxygen', 'is', 'mako']] [['mako', 'की', 'ऑक्सीजन', 'नीचे', 'है', '।']] ['हम', 'एक', 'एक', 'में', 'है', '।']
[['!', 'red', 'code', 'a', 'got', 'we']] [['हमें', 'लाल', 'कोड', 'मिला', 'है', '!']] ['हम', 'एक', 'एक', 'एक', 'है', '!']
[['.', 'it', 'made', 'i']] [['~', '~', 'मैं', 'इसे', 'बनाया', 'है', '।']] ['मैं', 'मैं', 'नहीं', 'है', '।']
[['?', 'mean', 'that', 'does', 'what']] [['कि', 'क्या', 'मतलब', 'है', '?']] ['क्या', 'क्या', 'क्या', 'है', '?']
[['?', 'at', 'guy', 'this'

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


[['?', 'them', 'tell', 'you', 'would', 'what']] [['क्या', 'आप', 'उन्हें', 'बताना', 'होगा', '?']] ['आप', 'क्या', 'क्या', 'है', '?']
[['.', 'type', 'her', 'for', ',', 'pretty']] [['ठीक', 'ठाक', 'सुंदर', 'है']] ['मैं', ',', 'एक', 'नहीं', 'है', '।']
[['.', 'lebanon', 'west', ',', 'africa', 'central', ',', 'exmercenary']] [['पूर्व', 'भाड़े', ',', 'मध्य', 'अफ्रीका', ',', 'पश्चिम', 'लेबनान', '।']] [',', ',', ',', 'यह', 'एक', 'है', '।']
[['?', 'available', 'trixie', 'mistress', 'is']] [['ट्रिक्सी', 'से', 'बात', 'हो', 'सकती', 'है', '?']] ['यह', 'क्या', 'है', '?']
[['!', 'pressure', 'to', 'due', 'pressed', 'was']] [['दबाव', 'की', 'वजह', 'से', 'दबाया', 'गया', 'था', '!']] ['मैं', 'एक', 'एक', 'में', 'है', '।']
[['?', 'bourbon', 'bulleit', 'carry', 'you']] [['आप', 'बुलेट', 'बोरबॉन', 'ले', '?']] ['आप', 'क्या', 'क्या', 'है', '?']
[['.', 'shame', 'damn', 'a', 'is', 'that']] [['एक', 'बहुत', 'शर्म', 'की', 'बात', 'है', '।']] ['यह', 'एक', 'एक', 'है', '।']
[['.', 'me', 'excuse', ',', 'sorry', 'so', 'im']] [

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[['?', 'king', 'the', 'or', 'jack', 'the']] [['जैक', 'या', 'राजा', '?']] ['क्या', 'या', 'की', 'तरह', '?']
[['.', 'brother', 'your', 'about', 'is', '"it']] [['"', 'यह', 'अपने', 'भाई', 'के', 'बारे', 'में', 'है', '।']] ['हमारे', 'बारे', 'के', 'बारे', 'में', 'क्या', 'अपनी', 'हैं', '।']
[['?', 'sir', ',', 'do', 'we', 'can', 'what']] [['हम', 'श्रीमान', ',', 'क्या', 'कर', 'सकते', 'हैं', '?']] ['हम', 'सही', 'हो', 'सकता', 'है', '?']
[['.', 'moment', 'my', 'is', 'this']] [['यह', 'मेरा', 'समय', 'है।']] ['यह', 'मेरा', 'मेरा', 'है', '।']
[['?', 'me', 'kidding', 'fucking', 'you']] [['आप', 'मज़ाक', 'कर', 'रहे', 'हैं', '?']] ['मुझे', 'मुझे', 'खाता', 'है', '?']
[['.', 'coming', 'not', 'were', 'you', 'thought', 'she']] [['उसे', 'लग', 'रहा', 'था', 'तुम', 'आओगे', 'ही', 'नहीं।']] ['वह', 'तुम्हें', 'नहीं', 'करना', 'चाहिए', '।']
[['.', 'andrew', ',', 'this', 'about', 'sorry']] [['ऐंड्रऊ', 'इसके', 'लिए', 'माफ़', 'करना', '।']] ['इस', 'बारे', 'में',

##seq_attention

In [34]:
#!/usr/bin/env python
# coding: utf-8

# In[5]:


# get_ipython().run_line_magic('run', 'seq2seq.ipynb')


# In[2]:


class Attention(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layer1 = nn.Linear(dim * 2, dim)
        self.layer2 = nn.Linear(dim, 1)
        
    def forward(self, source_h, target_h, source_mask=None):
        
        num_sentences, num_words = source_h.shape[:-1]
        target_h = target_h.repeat(1, num_words).reshape(num_words * num_sentences, -1)
        mlp_input = torch.cat((target_h, source_h.reshape(num_words * num_sentences, -1)
                              ), dim=1).reshape(num_words * num_sentences, -1)
        
        h1 = torch.tanh(self.layer1(mlp_input))
        h2 = self.layer2(h1)
        
        raw_weights = h2.reshape(num_sentences, num_words)
        
        if source_mask is not None:
            raw_weights = source_mask * raw_weights
        weights = torch.softmax(f.relu(raw_weights), dim=1)
        
        return torch.sum(weights.unsqueeze(-1) * source_h, dim=1)


# ```
# source_h = torch.Tensor(np.random.randint(0, 10, (2, 4, 10)))
# target_h = torch.Tensor(np.random.randint(0, 10, (2, 10)))
# a = Attention(10)
# a(source_h, target_h)
# ```

# In[11]:


class DecoderAttention(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0, start_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.decoder = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.word_predictor = nn.Linear(hidden_dim, num_embeddings)
        self.start_dim = start_dim
        
        self.attention = Attention(hidden_dim)
        
    def forward(self, output, h, c, input=None, max_sen_len=20, source_mask=None):
        
        source_h = output
        num_sentences, num_words, hidden_dim = output.shape
        words_selected = torch.LongTensor([[self.start_dim] for _ in range(num_sentences)])

        decoder_context = torch.zeros(num_sentences, hidden_dim).cuda()
        pred = []
        
        if input is not None:
            max_sen_len = input.shape[1]
            teacher_words = input.t()
        
        for i in range(max_sen_len):
            embeddings = self.embedding(words_selected)
            decoder_input = torch.cat((embeddings, decoder_context.unsqueeze(1)), dim=-1)
            y_t, (h, c) = self.decoder(decoder_input, (h, c))
            orig_shape = y_t.shape
            pred_t = self.word_predictor(y_t.reshape(orig_shape[0]*orig_shape[1], 
                                                     -1)).reshape(*orig_shape[:-1], -1).squeeze(1)
            pred.append(pred_t)
            decoder_context = self.attention(source_h, h[-1], source_mask)
            
            if input is None:
                words_selected = torch.max(pred_t, dim=1)[1].unsqueeze(1)
            else:
                words_selected = teacher_words[i].unsqueeze(1)


        pred = torch.stack(pred, dim=1)
        return torch.log_softmax(pred, dim=-1), None


# In[28]:


class Seq2SeqWithAttention(nn.Module):
    def __init__(self, source_vocab_len, target_vocab_len, embedding_dim, hidden_dim, num_layers=1,
                       padding_dim_source=0, start_dim_target=1, padding_dim_target=0):
        super().__init__()
        self.encoder = Encoder(source_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_source)
        self.decoder = DecoderAttention(target_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_target, start_dim_target)
        
    def forward(self, source_input, target_input=None, source_mask=None):
        output, h, c = self.encoder(source_input)
        return self.decoder(output, h, c, target_input, source_mask=source_mask)


# ```
# s = Seq2SeqWithAttention(10, 12, 9, 8)
# s.cuda()
# input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# target = torch.cuda.LongTensor(np.random.randint(0, 12, (3, 10))
# 
# # without teacher forcing
# s(input)
# 
# # with teacher forcing
# ```

In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[30]:


# get_ipython().run_line_magic('run', 'utils.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq_with_attention.ipynb')
# get_ipython().run_line_magic('run', 'effective_approaches.ipynb')
# get_ipython().run_line_magic('run', 'coverage.ipynb')


# In[4]:

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# m.to(device)
# torch.cuda.set_dezce(0)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# In[24]:


d = Dataset(source_files, target_files, batch_size=10)
embedding_dim = 50
hidden_dim = 100
num_layers = 2


# In[25]:


def train_and_save(m, d, str_, coverage=False, coverage_type="linguistic", log=False):
    prev_score = -1
    for i in range(20):
        train(m, d, coverage, coverage_type, log=log)
        score = eval(m, d, log=True)
        best_state = None
        print()
        print(score)
        if score > prev_score:
            #torch.save(m.state_dict(), str_ + str(score)[:4] + ".params_")
            best_state = m.state_dict()
            prev_score = score
            
    torch.save(best_state, str_ + str(prev_score)[:4] + ".params")
    print(prev_score)


m = Seq2SeqWithAttention(d.len_source, d.len_target, embedding_dim, hidden_dim, num_layers=2, 
            padding_dim_source=d.source_pad, padding_dim_target=d.target_pad, start_dim_target=d.target_start)
# print(m.cuda())
train_and_save(m, d, "attention_")


#try


In [None]:
import pandas as pd
df = pd.read_pickle('en_hi.pkl')
train, validate, test = np.split(df.sample(frac=1), [int(.7*len(df)), int(.8*len(df))])
print(train.shape, validate.shape, test.shape)

(7000, 2) (1000, 2) (2000, 2)


In [None]:
with open('/content/drive/My Drive/NeuralMT/data/dev.en', 'w') as writefile:
    writefile.write(('').join(validate['english']))
with open('/content/drive/My Drive/NeuralMT/data/dev.hi', 'w') as writefile:
    writefile.write(('').join(validate['hindi']))
with open('/content/drive/My Drive/NeuralMT/data/test.en', 'w') as writefile:
    writefile.write(('').join(test['english']))
with open('/content/drive/My Drive/NeuralMT/data/test.hi', 'w') as writefile:
    writefile.write(('').join(test['hindi']))
with open('/content/drive/My Drive/NeuralMT/data/train.en', 'w') as writefile:
    writefile.write(('').join(train['english']))
with open('/content/drive/My Drive/NeuralMT/data/train.hi', 'w') as writefile:
    writefile.write(('').join(train['hindi']))

In [None]:
data_folder = "data"
files = os.listdir(data_folder)
source_files = [os.path.join(data_folder, file) for file in files if ".en" in file]
target_files = [os.path.join(data_folder, file) for file in files if ".hi" in file]
print(source_files)

def generate_vocab(filenames):
    vocab = set()
    # no_words = 0
    for file in filenames:
        content = open(file).read()
        sentences = [sentence.split() for sentence in content.split('\n')]
        for sentence in sentences:
            for word in sentence:
                if word.lower() not in vocab:
                    vocab.add(word.lower())
                    # no_words += 1
    vocab = {word: num for num, word in enumerate(vocab)}
    return vocab

def parse_files_to_indices(filename, vocab):
    content = open(filename).read()
    return [[vocab[word] for word in sentence.split() if word in vocab] for sentence in content.split('\n')]

class Dataset:
    def __init__(self, source_files, target_files, batch_size=10):
        self.source_vocab = generate_vocab(source_files) #for entire lang
        self.target_vocab = generate_vocab(target_files) #for entire lang
        
        self.len_source = len(self.source_vocab.keys())
        self.source_pad, self.source_start, self.source_end = self.len_source + 2, self.len_source + 1, self.len_source
        self.len_target = len(self.target_vocab.keys())
        self.target_pad, self.target_start, self.target_end = self.len_target + 2, self.len_target + 1, self.len_target
        
        self.source_vocab['<pad>'], self.source_vocab['<start>'], self.source_vocab['<end>'] = [self.source_pad, 
                                                                                                self.source_start, 
                                                                                                self.source_end]
        self.target_vocab['<pad>'], self.target_vocab['<start>'], self.target_vocab['<end>'] = [self.target_pad,
                                                                                                self.target_start, 
                                                                                                self.target_end]
                                                                                              
        ##created indices for <pad>, <start> and <end> tokens
        self.len_source = len(self.source_vocab.keys())
        self.len_target = len(self.target_vocab.keys())
        
        ##inverse vocab for entire lang
        self.source_vocab_inv = {value:key for key, value in self.source_vocab.items()}
        self.target_vocab_inv = {value:key for key, value in self.target_vocab.items()}
        
        ##parsing files into indices
        for filename in source_files:
            if 'train' in filename:
                self.source_train = parse_files_to_indices(filename, self.source_vocab)
            if 'test' in filename:
                self.source_test = parse_files_to_indices(filename, self.source_vocab)
            if 'dev' in filename:
                self.source_dev = parse_files_to_indices(filename, self.source_vocab)
                
        for filename in target_files:
            if 'train' in filename:
                self.target_train = parse_files_to_indices(filename, self.target_vocab)
            if 'test' in filename:
                self.target_test = parse_files_to_indices(filename, self.target_vocab)
            if 'dev' in filename:
                self.target_dev = parse_files_to_indices(filename, self.target_vocab)
                
        ##random shuffling of training data before splitting into batches    
        self.indices = list(range(len(self.source_train)))
        random.shuffle(self.indices)
        self.current = -batch_size
        self.batch_size = batch_size
        
    def __get_batch_input(self, indices):
        source, target_input, target_target = [], [], []
        
        # padding length of sentence
        source_max_len = max(len(self.source_train[i]) for i in indices)
        target_max_len = max(len(self.target_train[i]) for i in indices)
        
        for i in indices:
            length = len(self.source_train[i])
            # reverse_source_sentences and pad at beginning
            sentence = [self.source_pad for _ in range(source_max_len - length)] + [self.source_end] + self.source_train[i][::-1] + [self.source_start]
            source.append(sentence)
            
            length = len(self.target_train[i])
            # padding at end for target
            sentence = [self.target_start] + self.target_train[i] + [self.target_end] + [self.target_pad for _ in range(target_max_len - length)]
            target_input.append(sentence)
            #removed <sos> and put <pad> at end
            target_target.append(sentence[1:] + [self.target_pad])
            
        return np.array(source), np.array(target_input), np.array(target_target)
    
    def get_batch_input(self):
        if self.current > len(self.indices) - self.batch_size:
            self.current = 0
            return None, None, None
        self.current += self.batch_size
        return self.__get_batch_input(self.indices[self.current: self.current + self.batch_size])
    
    def convert_indices_to_words(self, indices):
        output = []
        for sentence in indices:
            s = []
            for word in sentence:
                s.append(self.target_vocab_inv[word])
            output.append(s)
        return output


# ```
# %run seq2seq.ipynb
# 
# d = Dataset(source_files, target_files, 10)
# m = Seq2Seq(d.len_source, d.len_target, 9, 7)
# m.cuda()
# 
# a, b = d.get_batch_input()
# source, target = torch.cuda.LongTensor(a), torch.cuda.LongTensor(b)
# 
# # For teacher forcing:
# m(source, target)
# 
# # Using model's own predictions:
# m(source)
# ```

# In[37]:


def train(model, dataset, coverage=False, coverage_type="linguistic", iterations=1, use_teacher_forcing=True, log=True):
    # TODO: attention error
    loss_func = nn.NLLLoss()
    attn_loss_func = nn.MSELoss()
    
    optimizer = optim.Adagrad(model.parameters()) 
    
    for i in range(iterations):
        while True:
            optimizer.zero_grad() #set the gradients to zero before starting to do backpropragation because PyTorch accumulates the gradients on subsequent backward passes.
            source, target_input, target_output = d.get_batch_input() #sentences made into indices and padded to sentence lenght 

            if source is None: # end of current iteration
                break

            source, target_input, target_output = [torch.cuda.LongTensor(source), 
                                                   torch.cuda.LongTensor(target_input), 
                                                   torch.cuda.LongTensor(target_output)]
            
            source_mask = torch.ones(source.shape).cuda()
            source_mask[source == dataset.source_pad] = 0
            
            if use_teacher_forcing:
                pred, attn = model(source, target_input, source_mask=source_mask)
                # mask whatevers after <stop> 
                target_mask = torch.ones(target_output.shape).cuda()
                target_mask[target_output == dataset.target_pad] = 0
                pred = pred * target_mask.unsqueeze(-1)
                target_output = target_output * target_mask.long()
            else:
                pred, attn_weights = model(source)
                
            no_words = pred.shape[0] * pred.shape[1]
            pred = pred.reshape(no_words, -1)
            target_output = target_output.reshape(no_words)

            pred_error = loss_func(pred, target_output)
            attn_error = None
            
            if coverage:
                # if coverage type is linguistic, ignore fertility
                attn_weights, fertility = attn
                if coverage_type == "linguistic":
                    fertility = torch.ones(fertility.shape).cuda()
                attn_error = attn_loss_func(torch.sum(attn_weights, dim=-1) * source_mask, fertility * source_mask)
                pred_error += attn_error
                
            pred_error.backward() #backward prop
            optimizer.step()
            
            if log:
                print(d.current/d.batch_size, pred_error, end='\r')


# In[38]:


def convert_pred_to_indices(pred):
    return torch.max(pred, dim=-1)[1]


# In[39]:


def eval(model, dataset, log=False, source_test=None, target_test=None):
    scores = []
    if source_test is None:
        source_test = dataset.source_test
        target_test = dataset.target_test
        
    for sentence, target in zip(source_test, target_test):
        input = torch.cuda.LongTensor([[dataset.source_end] + sentence[::-1] + [dataset.source_start]])
        pred, _ = model(input)
        pred_words = convert_pred_to_indices(pred).cpu().numpy()
        predicted_target = dataset.convert_indices_to_words(pred_words)
        target = dataset.convert_indices_to_words([target])
        if log:
            print(target, predicted_target[0])
        scores.append(sentence_bleu(target, predicted_target[0], weights=(1, 0, 0, 0)))
    return sum(scores)/len(scores)

['data/dev.en', 'data/test.en', 'data/train.en']


In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np


# In[2]:


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f


# In[3]:


class Encoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        
    def forward(self, input):
        '''
        input dim => 2d no_sentences * no_words
        make sure that the input is in reverse
        
        Outputs:
        output, h, c
        
        output => dim: (num_sentences * num_words * hidden_dim)
        h => (num_layers * num_sentences * hidden_dim)
        c => (num_layers * num_sentences * hidden_dim)
        '''
        embeddings = self.embedding(input)
        output, (h, c) = self.encoder(embeddings)
        
        return output, h, c


# In[4]:


class Decoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0, start_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.word_predictor = nn.Linear(hidden_dim, num_embeddings)
        self.start_dim = start_dim
        
    def forward(self, output, h, c, input=None, max_sen_len=20):
        '''
        Output:
        pred: (no_sentences * no_words * no_embeddings)
        **Note: if input is not None, log_softmax is returned
        else, input is returned without softmax**
        '''
        
        if input is not None: # training with teacher forcing
            
            embeddings = self.embedding(input)
            yts, _ = self.decoder(embeddings, (h, c))
            orig_shape = yts.shape
            pred = self.word_predictor(yts.reshape(orig_shape[0]*orig_shape[1], -1)).reshape(*orig_shape[:-1], -1)
            return torch.log_softmax(pred, dim=-1), None
        
        else: # training using the decoders' own predictions
            
            num_sentences = output.shape[0]
            words_selected = torch.cuda.LongTensor([[self.start_dim] for _ in range(num_sentences)])
            pred = []
            for i in range(max_sen_len):
                embeddings = self.embedding(words_selected)
                y_t, (h, c) = self.decoder(embeddings, (h, c))
                orig_shape = y_t.shape
                pred_t = self.word_predictor(y_t.reshape(orig_shape[0]*orig_shape[1], 
                                                         -1)).reshape(*orig_shape[:-1], -1).squeeze(1)
                pred.append(pred_t)
                words_selected = torch.max(pred_t, dim=1)[1].unsqueeze(1)
            
            pred = torch.stack(pred, dim=1)
            return torch.log_softmax(pred, dim=-1), None


# In[5]:


class Seq2Seq(nn.Module):
    def __init__(self, source_vocab_len, target_vocab_len, embedding_dim, hidden_dim, num_layers=1,
                       padding_dim_source=0, start_dim_target=1, padding_dim_target=0):
        super().__init__()
        self.encoder = Encoder(source_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_source)
        self.decoder = Decoder(target_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_target, start_dim_target)
        
    def forward(self, source_input, target_input=None, source_mask=None):
        output, h, c = self.encoder(source_input)
        return self.decoder(output, h, c, target_input)


# **Without teacher forcing**
# ```
# input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# s = Seq2Seq(10, 12, 9, 7)
# s.cuda()
# pred = s(input)
# ```

# **With teacher forcing**
# 
# ```
# tar_input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# s(input, tar_input).shape
# ```

# In[ ]:

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cpu


In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[30]:


# get_ipython().run_line_magic('run', 'utils.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq_with_attention.ipynb')
# get_ipython().run_line_magic('run', 'effective_approaches.ipynb')
# get_ipython().run_line_magic('run', 'coverage.ipynb')


# In[4]:

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# m.to(device)
torch.cuda.set_device(0)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# In[24]:


d = Dataset(source_files, target_files, batch_size=10)
embedding_dim = 50
hidden_dim = 100
num_layers = 2


# In[25]:


def train_and_save(m, d, str_, coverage=False, coverage_type="linguistic", log=False):
    prev_score = -1
    for i in range(20):
        train(m, d, coverage, coverage_type, log=log)
        score = eval(m, d)
        best_state = None
        print()
        print(score)
        if score > prev_score:
            #torch.save(m.state_dict(), str_ + str(score)[:4] + ".params_")
            best_state = m.state_dict()
            prev_score = score
            
    torch.save(best_state, str_ + str(prev_score)[:4] + ".params")
    print(prev_score)


# In[26]:

m = Seq2Seq(d.len_source, d.len_target, embedding_dim, hidden_dim, num_layers=2, 
            padding_dim_source=d.source_pad, padding_dim_target=d.target_pad, start_dim_target=d.target_start)
#print(m.cuda())
train_and_save(m, d, "seq2seq_")


# In[ ]:


# m = Seq2SeqWithAttention(d.len_source, d.len_target, embedding_dim, hidden_dim, num_layers=2, 
#             padding_dim_source=d.source_pad, padding_dim_target=d.target_pad, start_dim_target=d.target_start)
# print(m.cuda())
# train_and_save(m, d, "attention_")


RuntimeError: ignored

#utils

In [None]:
data_folder = "data"
files = os.listdir(data_folder)
source_files = [os.path.join(data_folder, file) for file in files if ".en" in file]
target_files = [os.path.join(data_folder, file) for file in files if ".hi" in file]
#print(source_files)

def generate_vocab(filenames):
    vocab = set()
    no_words = 0
    for file in filenames:
        content = open(file).read()
        sentences = [sentence.split() for sentence in content.split('\n')]
        for sentence in sentences:
            for word in sentence:
                if word.lower() not in vocab:
                    vocab.add(word.lower())
                    no_words += 1
    vocab = {word: num for num, word in enumerate(vocab)}
    return vocab

def parse_files_to_indices(filename, vocab):
    content = open(filename).read()
    return [[vocab[word] for word in sentence.split() if word in vocab] for sentence in content.split('\n')]

class Dataset:
    def __init__(self, source_files, target_files, batch_size=10):
        self.source_vocab = generate_vocab(source_files)
        self.target_vocab = generate_vocab(target_files)
        
        self.len_source = len(self.source_vocab.keys())
        self.source_pad, self.source_start, self.source_end = self.len_source + 2, self.len_source + 1, self.len_source
        self.len_target = len(self.target_vocab.keys())
        self.target_pad, self.target_start, self.target_end = self.len_target + 2, self.len_target + 1, self.len_target
        
        self.source_vocab['<pad>'], self.source_vocab['<start>'], self.source_vocab['<end>'] = [self.source_pad, 
                                                                                                self.source_start, 
                                                                                                self.source_end]
        self.target_vocab['<pad>'], self.target_vocab['<start>'], self.target_vocab['<end>'] = [self.target_pad,
                                                                                                self.target_start, 
                                                                                                self.target_end]
        self.len_source = len(self.source_vocab.keys())
        self.len_target = len(self.target_vocab.keys())
        
        self.source_vocab_inv = {value:key for key, value in self.source_vocab.items()}
        self.target_vocab_inv = {value:key for key, value in self.target_vocab.items()}


        for filename in source_files:
            if 'train' in filename:
                self.source_train = parse_files_to_indices(filename, self.source_vocab)
            if 'test' in filename:
                self.source_test = parse_files_to_indices(filename, self.source_vocab)
            if 'dev' in filename:
                self.source_dev = parse_files_to_indices(filename, self.source_vocab)
                
        for filename in target_files:
            if 'train' in filename:
                self.target_train = parse_files_to_indices(filename, self.target_vocab)
            if 'test' in filename:
                self.target_test = parse_files_to_indices(filename, self.target_vocab)
            if 'dev' in filename:
                self.target_dev = parse_files_to_indices(filename, self.target_vocab)
                
            
        self.indices = list(range(len(self.source_train)))
        random.shuffle(self.indices)
        self.current = -batch_size
        self.batch_size = batch_size
        
    def __get_batch_input(self, indices):
        source, target_input, target_target = [], [], []
        
        # padding length
        source_max_len = max(len(self.source_train[i]) for i in indices)
        target_max_len = max(len(self.target_train[i]) for i in indices)
        
        for i in indices:
            length = len(self.source_train[i])
            # reverse_source_sentences and pad at beginning
            sentence = [self.source_pad for _ in range(source_max_len - length)] + [self.source_end] + self.source_train[i][::-1] + [self.source_start]
            source.append(sentence)
            
            length = len(self.target_train[i])
            # padding at end for target
            sentence = [self.target_start] + self.target_train[i] + [self.target_end] + [self.target_pad for _ in range(target_max_len - length)]
            target_input.append(sentence)
            target_target.append(sentence[1:] + [self.target_pad])
            
        return np.array(source), np.array(target_input), np.array(target_target)
    
    def get_batch_input(self):
        if self.current > len(self.indices) - self.batch_size:
            self.current = 0
            return None, None, None
        self.current += self.batch_size
        return self.__get_batch_input(self.indices[self.current: self.current + self.batch_size])
    
    def convert_indices_to_words(self, indices):
        output = []
        for sentence in indices:
            s = []
            for word in sentence:
                s.append(self.target_vocab_inv[word])
            output.append(s)
        return output


# ```
# %run seq2seq.ipynb
# 
# d = Dataset(source_files, target_files, 10)
# m = Seq2Seq(d.len_source, d.len_target, 9, 7)
# m.cuda()
# 
# a, b = d.get_batch_input()
# source, target = torch.cuda.LongTensor(a), torch.cuda.LongTensor(b)
# 
# # For teacher forcing:
# m(source, target)
# 
# # Using model's own predictions:
# m(source)
# ```

# In[37]:


def train(model, dataset, coverage=False, coverage_type="linguistic", iterations=1, use_teacher_forcing=False, log=True):
    # TODO: attention error
    loss_func = nn.NLLLoss()
    attn_loss_func = nn.MSELoss()
    
    optimizer = optim.Adagrad(model.parameters())
    
    for i in range(iterations):
        while True:
            optimizer.zero_grad()
            source, target_input, target_output = d.get_batch_input()

            if source is None: # end of current iteration
                break

            source, target_input, target_output = [torch.cuda.LongTensor(source), 
                                                   torch.cuda.LongTensor(target_input), 
                                                   torch.cuda.LongTensor(target_output)]
            
            source_mask = torch.ones(source.shape).cuda()
            source_mask[source == dataset.source_pad] = 0
            
            if use_teacher_forcing:
                pred, attn = model(source, target_input, source_mask=source_mask)
                # mask whatevers after <stop> 
                target_mask = torch.ones(target_output.shape).cuda()
                target_mask[target_output == dataset.target_pad] = 0
                pred = pred * target_mask.unsqueeze(-1)
                target_output = target_output * target_mask.long()
            else:
                pred, attn_weights = model(source.cuda())
                
            no_words = pred.shape[0] * pred.shape[1]
            pred = pred.reshape(no_words, -1)
            target_output = target_output.reshape(no_words)

            pred_error = loss_func(pred, target_output)
            attn_error = None
            
            if coverage:
                # if coverage type is linguistic, ignore fertility
                attn_weights, fertility = attn
                if coverage_type == "linguistic":
                    fertility = torch.ones(fertility.shape).cuda()
                attn_error = attn_loss_func(torch.sum(attn_weights, dim=-1) * source_mask, fertility * source_mask)
                pred_error += attn_error
                
            pred_error.backward()
            optimizer.step()
            
            if log:
                print(d.current/d.batch_size, pred_error, end='\r')


# In[38]:


def convert_pred_to_indices(pred):
    return torch.max(pred, dim=-1)[1]


# In[39]:


def eval(model, dataset, log=False, source_test=None, target_test=None):
    scores = []
    if source_test is None:
        source_test = dataset.source_test
        target_test = dataset.target_test
        
    for sentence, target in zip(source_test, target_test):
        input = torch.cuda.LongTensor([[dataset.source_end] + sentence[::-1] + [dataset.source_start]])
        pred, _ = model(input)
        pred_words = convert_pred_to_indices(pred).cpu().numpy()
        predicted_target = dataset.convert_indices_to_words(pred_words)
        target = dataset.convert_indices_to_words([target])
        if log:
            print(target, predicted_target[0])
        scores.append(sentence_bleu(target, predicted_target[0], weights=(1, 0, 0, 0)))
    return sum(scores)/len(scores)


# In[ ]:





# In[ ]:





#seq2seq

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f

class Encoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        
    def forward(self, input):
        '''
        input dim => 2d no_sentences * no_words
        make sure that the input is in reverse
        
        Outputs:
        output, h, c
        
        output => dim: (num_sentences * num_words * hidden_dim)
        h => (num_layers * num_sentences * hidden_dim)
        c => (num_layers * num_sentences * hidden_dim)
        '''
        embeddings = self.embedding(input)
        output, (h, c) = self.encoder(embeddings)
        
        return output, h, c

class Decoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0, start_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.word_predictor = nn.Linear(hidden_dim, num_embeddings)
        self.start_dim = start_dim
        
    def forward(self, output, h, c, input=None, max_sen_len=20):
        '''
        Output:
        pred: (no_sentences * no_words * no_embeddings)
        **Note: if input is not None, log_softmax is returned
        else, input is returned without softmax**
        '''
        
        if input is not None: # training with teacher forcing
            
            embeddings = self.embedding(input)
            yts, _ = self.decoder(embeddings, (h, c))
            orig_shape = yts.shape
            pred = self.word_predictor(yts.reshape(orig_shape[0]*orig_shape[1], -1)).reshape(*orig_shape[:-1], -1)
            return torch.log_softmax(pred, dim=-1), None
        
        else: # training using the decoders' own predictions
            
            num_sentences = output.shape[0]
            words_selected = torch.cuda.LongTensor([[self.start_dim] for _ in range(num_sentences)])
            pred = []
            for i in range(max_sen_len):
                embeddings = self.embedding(words_selected)
                y_t, (h, c) = self.decoder(embeddings, (h, c))
                orig_shape = y_t.shape
                pred_t = self.word_predictor(y_t.reshape(orig_shape[0]*orig_shape[1], 
                                                         -1)).reshape(*orig_shape[:-1], -1).squeeze(1)
                pred.append(pred_t)
                words_selected = torch.max(pred_t, dim=1)[1].unsqueeze(1)
            
            pred = torch.stack(pred, dim=1)
            return torch.log_softmax(pred, dim=-1), None


# In[5]:


class Seq2Seq(nn.Module):
    def __init__(self, source_vocab_len, target_vocab_len, embedding_dim, hidden_dim, num_layers=1,
                       padding_dim_source=0, start_dim_target=1, padding_dim_target=0):
        super().__init__()
        self.encoder = Encoder(source_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_source)
        self.decoder = Decoder(target_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_target, start_dim_target)
        
    def forward(self, source_input, target_input=None, source_mask=None):
        output, h, c = self.encoder(source_input)
        return self.decoder(output, h, c, target_input)


# **Without teacher forcing**
# ```
# input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# s = Seq2Seq(10, 12, 9, 7)
# s.cuda()
# pred = s(input)
# ```

# **With teacher forcing**
# 
# ```
# tar_input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# s(input, tar_input).shape
# ```

# In[ ]:

#seg2seq - attention

In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[5]:


# get_ipython().run_line_magic('run', 'seq2seq.ipynb')


# In[2]:


class Attention(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layer1 = nn.Linear(dim * 2, dim)
        self.layer2 = nn.Linear(dim, 1)
        
    def forward(self, source_h, target_h, source_mask=None):
        
        num_sentences, num_words = source_h.shape[:-1]
        target_h = target_h.repeat(1, num_words).reshape(num_words * num_sentences, -1)
        mlp_input = torch.cat((target_h, source_h.reshape(num_words * num_sentences, -1)
                              ), dim=1).reshape(num_words * num_sentences, -1)
        
        h1 = torch.tanh(self.layer1(mlp_input))
        h2 = self.layer2(h1)
        
        raw_weights = h2.reshape(num_sentences, num_words)
        
        if source_mask is not None:
            raw_weights = source_mask * raw_weights
        weights = torch.softmax(f.relu(raw_weights), dim=1)
        
        return torch.sum(weights.unsqueeze(-1) * source_h, dim=1)


# ```
# source_h = torch.Tensor(np.random.randint(0, 10, (2, 4, 10)))
# target_h = torch.Tensor(np.random.randint(0, 10, (2, 10)))
# a = Attention(10)
# a(source_h, target_h)
# ```

# In[11]:


class DecoderAttention(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_dim, num_layers, padding_dim=0, start_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_dim)
        self.decoder = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.word_predictor = nn.Linear(hidden_dim, num_embeddings)
        self.start_dim = start_dim
        
        self.attention = Attention(hidden_dim)
        
    def forward(self, output, h, c, input=None, max_sen_len=20, source_mask=None):
        
        source_h = output
        num_sentences, num_words, hidden_dim = output.shape
        words_selected = torch.cuda.LongTensor([[self.start_dim] for _ in range(num_sentences)])

        decoder_context = torch.zeros(num_sentences, hidden_dim).cuda()
        pred = []
        
        if input is not None:
            max_sen_len = input.shape[1]
            teacher_words = input.t()
        
        for i in range(max_sen_len):
            embeddings = self.embedding(words_selected)
            decoder_input = torch.cat((embeddings, decoder_context.unsqueeze(1)), dim=-1)
            y_t, (h, c) = self.decoder(decoder_input, (h, c))
            orig_shape = y_t.shape
            pred_t = self.word_predictor(y_t.reshape(orig_shape[0]*orig_shape[1], 
                                                     -1)).reshape(*orig_shape[:-1], -1).squeeze(1)
            pred.append(pred_t)
            decoder_context = self.attention(source_h, h[-1], source_mask)
            
            if input is None:
                words_selected = torch.max(pred_t, dim=1)[1].unsqueeze(1)
            else:
                words_selected = teacher_words[i].unsqueeze(1)


        pred = torch.stack(pred, dim=1)
        return torch.log_softmax(pred, dim=-1), None


# In[28]:


class Seq2SeqWithAttention(nn.Module):
    def __init__(self, source_vocab_len, target_vocab_len, embedding_dim, hidden_dim, num_layers=1,
                       padding_dim_source=0, start_dim_target=1, padding_dim_target=0):
        super().__init__()
        self.encoder = Encoder(source_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_source)
        self.decoder = DecoderAttention(target_vocab_len, embedding_dim, hidden_dim, num_layers, padding_dim_target, start_dim_target)
        
    def forward(self, source_input, target_input=None, source_mask=None):
        output, h, c = self.encoder(source_input)
        return self.decoder(output, h, c, target_input, source_mask=source_mask)


# ```
# s = Seq2SeqWithAttention(10, 12, 9, 8)
# s.cuda()
# input = torch.cuda.LongTensor(np.random.randint(0, 10, (3, 6)))
# target = torch.cuda.LongTensor(np.random.randint(0, 12, (3, 10))
# 
# # without teacher forcing
# s(input)
# 
# # with teacher forcing
# ```

ERROR:root:File `'seq2seq.ipynb.py'` not found.


#train&test

In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[30]:


# get_ipython().run_line_magic('run', 'utils.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq.ipynb')
# get_ipython().run_line_magic('run', 'seq2seq_with_attention.ipynb')
# get_ipython().run_line_magic('run', 'effective_approaches.ipynb')
# get_ipython().run_line_magic('run', 'coverage.ipynb')


# In[4]:


torch.cuda.set_device(0)

# In[24]:


d = Dataset(source_files, target_files, batch_size=10)
embedding_dim = 50
hidden_dim = 100
num_layers = 2


# In[25]:


def train_and_save(m, d, str_, coverage=False, coverage_type="linguistic", log=False):
    prev_score = -1
    for i in range(20):
        train(m, d, coverage, coverage_type, log=log).cuda()
        score = eval(m, d)
        best_state = None
        print()
        print(score)
        if score > prev_score:
            #torch.save(m.state_dict(), str_ + str(score)[:4] + ".params_")
            best_state = m.state_dict()
            prev_score = score
            
    torch.save(best_state, str_ + str(prev_score)[:4] + ".params")
    print(prev_score)


# In[26]:


m = Seq2Seq(d.len_source, d.len_target, embedding_dim, hidden_dim, num_layers=2, 
            padding_dim_source=d.source_pad, padding_dim_target=d.target_pad, start_dim_target=d.target_start)
#print(m.cuda())
train_and_save(m, d, "seq2seq_")

RuntimeError: ignored