In [None]:
!pip install torchtext==0.10.0 pydantic

In [None]:
!pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

In [1]:
import spacy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext
from torch.utils.data import Dataset
from torchtext.legacy.data import Iterator
from torchtext.legacy.data import Example
from torchtext.legacy.data import Field
from torchtext.datasets import SQuAD2
import random
import string

In [2]:
train_data, val_data = SQuAD2(split=('train', 'dev'))

In [3]:
train_dictionary = {"Questions" : [], "Answers": []}
for _, q, a, _ in train_data:
    train_dictionary["Questions"].append(q)
    train_dictionary["Answers"].append(a[0])
train_df = pd.DataFrame(train_dictionary)

In [4]:
val_dictionary = {"Questions" : [], "Answers": []}
for _, q, a, _ in val_data:
    val_dictionary["Questions"].append(q)
    val_dictionary["Answers"].append(a[0])
val_df = pd.DataFrame(val_dictionary)

In [5]:
train_df.head()

Unnamed: 0,Questions,Answers
0,When did Beyonce start becoming popular?,in the late 1990s
1,What areas did Beyonce compete in when she was...,singing and dancing
2,When did Beyonce leave Destiny's Child and bec...,2003
3,In what city and state did Beyonce grow up?,"Houston, Texas"
4,In which decade did Beyonce become famous?,late 1990s


In [6]:
val_df.head()

Unnamed: 0,Questions,Answers
0,In what country is Normandy located?,France
1,When were the Normans in Normandy?,10th and 11th centuries
2,From which countries did the Norse originate?,"Denmark, Iceland and Norway"
3,Who was the Norse leader?,Rollo
4,What century did the Normans first gain their ...,10th century


In [7]:
class Vocab:
    def __init__(self):
        self.word2index = {"SOS" : 0, "EOS": 1}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [8]:
nlp = spacy.load('en_core_web_sm')

def prepare_text(sentence):
    sentence = ''.join([s.lower() for s in sentence if s not in string.punctuation])
    tokens = ' '.join([token.text for token in nlp(sentence)])

    return tokens

def prepare_data(data_df):
    data_df['Questions'] = data_df['Questions'].apply(prepare_text)
    data_df['Answers'] = data_df['Answers'].apply(prepare_text)

    return data_df

In [9]:
train_df = prepare_data(train_df.iloc[:100, :])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df['Questions'] = data_df['Questions'].apply(prepare_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df['Answers'] = data_df['Answers'].apply(prepare_text)


In [10]:
def toTensor(vocab, sentence):
    indices = [vocab.word2index[word] for word in sentence.split(' ')]
    indices.append(vocab.word2index['EOS'])
    return torch.Tensor(indices).long().to(device).view(-1, 1)

def getPairs(df):
    temp1 = df["Questions"].apply(lambda x: "".join(x) ).to_list()
    temp2 = df["Answers"].apply(lambda x: "".join(x) ).to_list()
    return [list(i) for i in zip(temp1, temp2)]

def getMaxLen(pairs):
    max_src = 0 
    max_trg = 0
    
    for p in pairs:
        max_src = len(p[0].split()) if len(p[0].split()) > max_src else max_src
        max_trg = len(p[1].split()) if len(p[1].split()) > max_trg else max_trg
        
    return max_src, max_trg

In [11]:
train_pairs = getPairs(train_df)

In [12]:
train_pairs[:5]

[['when did beyonce start becoming popular', 'in the late 1990s'],
 ['what areas did beyonce compete in when she was growing up',
  'singing and dancing'],
 ['when did beyonce leave destinys child and become a solo singer', '2003'],
 ['in what city and state did beyonce   grow up', 'houston texas'],
 ['in which decade did beyonce become famous', 'late 1990s']]

In [13]:
Q_vocab = Vocab()
A_vocab = Vocab()

for pair in train_pairs:
    Q_vocab.addSentence(pair[0])
    A_vocab.addSentence(pair[1])

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
source_data = [toTensor(Q_vocab, pair[0]) for pair in train_pairs]
target_data = [toTensor(A_vocab, pair[1]) for pair in train_pairs]

In [16]:
max_src, max_trg = getMaxLen(train_pairs)

In [17]:
source_data[0]

tensor([[2],
        [3],
        [4],
        [5],
        [6],
        [7],
        [1]], device='cuda:0')

In [18]:
target_data[0]

tensor([[2],
        [3],
        [4],
        [5],
        [1]], device='cuda:0')

In [19]:
Q_vocab.n_words

298

In [20]:
def evaluate(tensor, model):
    model.eval()
    outputs = model(tensor)
    output_text = [A_vocab.index2word[idx] for idx in outputs]
    output_sentence = ' '.join(output_text)
    return output_sentence, outputs

In [21]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, num_layers, dropout=0.2):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(self.emb_dim, self.emb_dim, self.num_layers, dropout=self.dropout_prob, bidirectional=True)
        self.dropout_layer = nn.Dropout(self.dropout_prob)
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers*2, batch_size, self.emb_dim).to(device)
        cell = torch.zeros(self.num_layers*2, batch_size, self.emb_dim).to(device)
        return hidden, cell

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0)
        x = self.dropout_layer(self.embedding(x))
        x, (hidden, cell) = self.lstm(x, (hidden, cell))
        return x, hidden, cell
    
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, num_layers, dropout=0.2):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout

        self.embedding = nn.Embedding(output_dim, hidden_dim)
        self.lstm = nn.LSTM(self.hidden_dim, self.hidden_dim, self.num_layers, dropout=self.dropout_prob, bidirectional=True)
        self.fc = nn.Linear(self.hidden_dim*2, self.output_dim)
        self.dropout_layer = nn.Dropout(self.dropout_prob)
        
    def forward(self, x, hidden, cell):
        x = self.dropout_layer(self.embedding(x))
        x, (hidden, cell) = self.lstm(x, (hidden, cell))
        x = self.dropout_layer(self.fc(x.squeeze(0)))
        return x, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, num_layers=2, dropout=0.2):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout

        self.encoder = Encoder(self.input_dim, self.hidden_dim, self.num_layers, self.dropout_prob).to(device)
        self.decoder = Decoder(self.output_dim, self.hidden_dim, self.num_layers, self.dropout_prob).to(device)


    def forward(self, src, trg=None, teacher_forcing_ratio=0.5):
        self.batch_size = src.shape[1]
        self.trg_vocab_size = self.output_dim
        if (trg != None):
          self.trg_len = trg.shape[0]

          outputs = torch.zeros(self.trg_len, self.batch_size, self.trg_vocab_size).to(device)

          hidden, cell = self.encoder.init_hidden(self.batch_size)
        
          for i in range(src.shape[0]):
              _, hidden, cell = self.encoder(src[i], hidden, cell)
            
          x = torch.zeros(1, self.batch_size, dtype=torch.long, device=device)
        
          for t in range(1, self.trg_len):
              output, hidden, cell = self.decoder(x, hidden, cell)
              outputs[t] = output
              best_guess = output.argmax(1)
              x = trg[t].unsqueeze(0) if random.random() < teacher_forcing_ratio else best_guess.unsqueeze(0)
        
          return outputs

        else:
          outputs = []
          hidden, cell = model.encoder.init_hidden(self.batch_size)

          for i in range(src.shape[0]):
            _, hidden, cell = self.encoder(src[i], hidden, cell)
          
          x = torch.zeros(1, self.batch_size, dtype=torch.long, device=device)

          counts = 0
          while x != A_vocab.word2index["EOS"] and counts < max_trg + 20:
            output, hidden, cell = model.decoder(x, hidden, cell)
            best_guess = output.argmax(1)
            outputs.append(best_guess.item())
            x = best_guess.unsqueeze(0)
            counts += 1
          
          return outputs


In [22]:
input_dim = Q_vocab.n_words
output_dim = A_vocab.n_words
trg_vocab_size = A_vocab.n_words
hidden_dim = 512
dropout = 0.5
batch_size = 16

In [23]:
output_dim

122

In [24]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7fec23dfb310>

In [25]:
model = Seq2Seq(input_dim, output_dim, hidden_dim, 2, dropout).to(device)

In [26]:
learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [27]:
sentence, output_tensor = evaluate(source_data[0], model)

In [28]:
print(sentence)
print(output_tensor)

forbes name name name no no no forbes name name men name name no no lead no june no june no forbes forbes forbes dreamgirls
[38, 51, 51, 51, 101, 101, 101, 38, 51, 51, 91, 51, 51, 101, 101, 18, 101, 30, 101, 30, 101, 38, 38, 38, 25]


In [29]:
num_epochs = 10
total = 0
loss = 0
for epoch in range(num_epochs):
    model.train()
    loss = 0
    for i in range(len(source_data)):
        src = source_data[i].to(device)
        trg = target_data[i].to(device)
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        current = criterion(output, trg)
        loss += current
        total += current
        if (i + 1) % batch_size == 0 or i == (len(source_data)-1):
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss = 0
            
    print_loss = total / len(source_data)
    print(f"Epoch {epoch + 1}/{num_epochs} | Training Loss: {print_loss:.3f}")
    total = 0
    if (epoch + 1) % 5 == 0:
      learning_rate = learning_rate / 10
      optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    model.eval()
    sentence, output_tensor = evaluate(source_data[0], model)
    print(output_tensor)
    print(sentence)
    print("\n\n\n")

        


Epoch 1/10 | Training Loss: 4.674
[94, 1]
my EOS




Epoch 2/10 | Training Loss: 3.723
[1]
EOS




Epoch 3/10 | Training Loss: 3.484
[1]
EOS




Epoch 4/10 | Training Loss: 3.545
[1]
EOS




Epoch 5/10 | Training Loss: 3.595
[1]
EOS




Epoch 6/10 | Training Loss: 3.335
[1]
EOS




Epoch 7/10 | Training Loss: 3.238
[1]
EOS




Epoch 8/10 | Training Loss: 3.236
[1]
EOS




Epoch 9/10 | Training Loss: 3.186
[1]
EOS




Epoch 10/10 | Training Loss: 3.249
[1]
EOS




