In [None]:
!pip install torchtext==0.10.0 pydantic

In [None]:
!pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

In [1]:
import spacy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext
from torch.utils.data import Dataset
#from torchtext.legacy.data import Iterator
#from torchtext.legacy.data import Example
#from torchtext.legacy.data import Field
from torchtext.datasets import SQuAD2
import random
import string

In [2]:
train_data, val_data = SQuAD2(split=('train', 'dev'))

In [3]:
train_dictionary = {"Questions" : [], "Answers": []}
for _, q, a, _ in train_data:
    train_dictionary["Questions"].append(q)
    train_dictionary["Answers"].append(a[0])
train_df = pd.DataFrame(train_dictionary)

In [4]:
val_dictionary = {"Questions" : [], "Answers": []}
for _, q, a, _ in val_data:
    val_dictionary["Questions"].append(q)
    val_dictionary["Answers"].append(a[0])
val_df = pd.DataFrame(val_dictionary)

In [5]:
train_df.head()

Unnamed: 0,Questions,Answers
0,When did Beyonce start becoming popular?,in the late 1990s
1,What areas did Beyonce compete in when she was...,singing and dancing
2,When did Beyonce leave Destiny's Child and bec...,2003
3,In what city and state did Beyonce grow up?,"Houston, Texas"
4,In which decade did Beyonce become famous?,late 1990s


In [6]:
val_df.head()

Unnamed: 0,Questions,Answers
0,In what country is Normandy located?,France
1,When were the Normans in Normandy?,10th and 11th centuries
2,From which countries did the Norse originate?,"Denmark, Iceland and Norway"
3,Who was the Norse leader?,Rollo
4,What century did the Normans first gain their ...,10th century


In [7]:
class Vocab:
    def __init__(self):
        self.word2index = {"EOW" : 0, "EOS": 1}
        self.word2count = {}
        self.index2word = {0: "EOW", 1: "EOS"}
        self.n_words = 2  

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            if (word != "EOW"):
              self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [8]:
nlp = spacy.load('en_core_web_sm')

def prepare_text(sentence):
    sentence = ''.join([s.lower() for s in sentence if s not in string.punctuation])
    tokens = ' '.join([token.text for token in nlp(sentence)])

    return tokens

def prepare_data(data_df):
    data_df['Questions'] = data_df['Questions'].apply(prepare_text)
    data_df['Answers'] = data_df['Answers'].apply(prepare_text)

    return data_df

In [9]:
train_df = prepare_data(train_df.iloc[:100, :])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [10]:
len(train_df)

100

In [11]:
train_df.iloc[0, 1]

'in the late 1990s'

In [12]:
def toTensor(vocab, sentence):
    indices = [vocab.word2index[word] for word in sentence.split(' ')]
    indices.append(vocab.word2index['EOS'])
    return torch.Tensor(indices).long().to(device).view(-1, 1)

def getPairs(df):
    pairs = []
    for i in range(len(df)):
      question, answer = getSentence(df.iloc[i, 0]), getSentence(df.iloc[i, 1])
      pairs.append([question, answer])
    return pairs
      
    #temp1 = df["Questions"].apply(lambda x: " ".join(x) ).to_list()
    #temp2 = df["Answers"].apply(lambda x: " ".join(x) ).to_list()
    #return [list(i) for i in zip(temp1, temp2)]

def getSentence(sentence):
    split = sentence.split()
    newSentence = ""
    for i in range(len(split)):
      for x in range(len(split[i])):
        if (i == len(split) - 1 and x == len(split[i]) - 1):
          newSentence += split[i][x]
        else:
          newSentence += split[i][x] + " "
      if (i != len(split) - 1):
        newSentence += "EOW "
    return newSentence

def getMaxLen(pairs):
    max_src = 0 
    max_trg = 0
    
    for p in pairs:
        max_src = len(p[0].split()) if len(p[0].split()) > max_src else max_src
        max_trg = len(p[1].split()) if len(p[1].split()) > max_trg else max_trg
        
    return max_src, max_trg

In [13]:
train_pairs = getPairs(train_df)

In [14]:
train_pairs[:5]

[['w h e n EOW d i d EOW b e y o n c e EOW s t a r t EOW b e c o m i n g EOW p o p u l a r',
  'i n EOW t h e EOW l a t e EOW 1 9 9 0 s'],
 ['w h a t EOW a r e a s EOW d i d EOW b e y o n c e EOW c o m p e t e EOW i n EOW w h e n EOW s h e EOW w a s EOW g r o w i n g EOW u p',
  's i n g i n g EOW a n d EOW d a n c i n g'],
 ['w h e n EOW d i d EOW b e y o n c e EOW l e a v e EOW d e s t i n y s EOW c h i l d EOW a n d EOW b e c o m e EOW a EOW s o l o EOW s i n g e r',
  '2 0 0 3'],
 ['i n EOW w h a t EOW c i t y EOW a n d EOW s t a t e EOW d i d EOW b e y o n c e EOW g r o w EOW u p',
  'h o u s t o n EOW t e x a s'],
 ['i n EOW w h i c h EOW d e c a d e EOW d i d EOW b e y o n c e EOW b e c o m e EOW f a m o u s',
  'l a t e EOW 1 9 9 0 s']]

In [15]:
Q_vocab = Vocab()
A_vocab = Vocab()

for pair in train_pairs:
    Q_vocab.addSentence(pair[0])
    A_vocab.addSentence(pair[1])

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
source_data = [toTensor(Q_vocab, pair[0]) for pair in train_pairs]
target_data = [toTensor(A_vocab, pair[1]) for pair in train_pairs]

In [18]:
max_src, max_trg = getMaxLen(train_pairs)

In [19]:
max_trg

41

In [20]:
source_data[0]

tensor([[ 2],
        [ 3],
        [ 4],
        [ 5],
        [ 0],
        [ 6],
        [ 7],
        [ 6],
        [ 0],
        [ 8],
        [ 4],
        [ 9],
        [10],
        [ 5],
        [11],
        [ 4],
        [ 0],
        [12],
        [13],
        [14],
        [15],
        [13],
        [ 0],
        [ 8],
        [ 4],
        [11],
        [10],
        [16],
        [ 7],
        [ 5],
        [17],
        [ 0],
        [18],
        [10],
        [18],
        [19],
        [20],
        [14],
        [15],
        [ 1]], device='cuda:0')

In [21]:
target_data[0]

tensor([[ 2],
        [ 3],
        [ 0],
        [ 4],
        [ 5],
        [ 6],
        [ 0],
        [ 7],
        [ 8],
        [ 4],
        [ 6],
        [ 0],
        [ 9],
        [10],
        [10],
        [11],
        [12],
        [ 1]], device='cuda:0')

In [22]:
Q_vocab.n_words

38

In [34]:
def evaluate(tensor, model):
    model.eval()
    outputs = model(tensor)
    output_text = [A_vocab.index2word[idx] for idx in outputs]
    output_sentence = ""
    for string in output_text:
      if (string == "EOW"):
        output_sentence += " "
      elif (string == "EOS"):
        output_sentence += "."
      else:
        output_sentence += string
    #output_sentence = ' '.join(output_text)
    return output_sentence, outputs

In [24]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, num_layers, dropout=0.2):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(self.emb_dim, self.emb_dim, self.num_layers, dropout=self.dropout_prob, bidirectional=True)
        self.dropout_layer = nn.Dropout(self.dropout_prob)
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers*2, batch_size, self.emb_dim).to(device)
        cell = torch.zeros(self.num_layers*2, batch_size, self.emb_dim).to(device)
        return hidden, cell

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(0)
        x = self.dropout_layer(self.embedding(x))
        x, (hidden, cell) = self.lstm(x, (hidden, cell))
        return x, hidden, cell
    
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, num_layers, dropout=0.2):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout

        self.embedding = nn.Embedding(self.output_dim, self.hidden_dim)
        self.lstm = nn.LSTM(self.hidden_dim, self.hidden_dim, self.num_layers, dropout=self.dropout_prob, bidirectional=True)
        self.fc = nn.Linear(self.hidden_dim*2, self.output_dim)
        self.dropout_layer = nn.Dropout(self.dropout_prob)
        
    def forward(self, x, hidden, cell):
        x = self.dropout_layer(self.embedding(x))
        x, (hidden, cell) = self.lstm(x, (hidden, cell))
        x = self.dropout_layer(self.fc(x.squeeze(0)))
        return x, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, num_layers=2, dropout=0.2):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_prob = dropout

        self.encoder = Encoder(self.input_dim, self.hidden_dim, self.num_layers, self.dropout_prob).to(device)
        self.decoder = Decoder(self.output_dim, self.hidden_dim, self.num_layers, self.dropout_prob).to(device)


    def forward(self, src, trg=None, teacher_forcing_ratio=0.5):
        self.batch_size = src.shape[1]
        self.trg_vocab_size = self.output_dim
        if (trg != None):
          self.trg_len = trg.shape[0]

          outputs = torch.zeros(self.trg_len, self.batch_size, self.trg_vocab_size).to(device)
          best_guesses = []

          hidden, cell = self.encoder.init_hidden(self.batch_size)
        
          for i in range(src.shape[0]):
              _, hidden, cell = self.encoder(src[i], hidden, cell)
            
          x = torch.zeros(1, self.batch_size, dtype=torch.long, device=device)
        
          for t in range(0, self.trg_len):
              output, hidden, cell = self.decoder(x, hidden, cell)
              outputs[t] = output
              best_guess = output.argmax(1)
              best_guesses.append(best_guess.item())
              x = trg[t].unsqueeze(0) if random.random() < teacher_forcing_ratio else best_guess.unsqueeze(0)
        
          return outputs, best_guesses

        else:
          outputs = []
          hidden, cell = self.encoder.init_hidden(self.batch_size)

          for i in range(src.shape[0]):
            _, hidden, cell = self.encoder(src[i], hidden, cell)
          
          x = torch.zeros(1, self.batch_size, dtype=torch.long, device=device)

          counts = 0
          while x != A_vocab.word2index["EOS"] and counts < max_trg:
            output, hidden, cell = self.decoder(x, hidden, cell)
            best_guess = output.argmax(1)
            outputs.append(best_guess.item())
            x = best_guess.unsqueeze(0)
            counts += 1
          
          return outputs


In [25]:
input_dim = Q_vocab.n_words
output_dim = A_vocab.n_words
trg_vocab_size = A_vocab.n_words
hidden_dim = 1024
dropout = 0.0
batch_size = 64

In [26]:
output_dim

37

In [27]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f6842031fd0>

In [28]:
model = Seq2Seq(input_dim, output_dim, hidden_dim, 2, dropout).to(device)

In [29]:
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [30]:
sentence, output_tensor = evaluate(source_data[0], model)

In [31]:
print(sentence)
print(output_tensor)

pp22yyyyyyyyy2yyyyyyyyy2yyyyyyyyy2yyyyyyy.
[33, 33, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 21, 21, 21, 21, 21, 21, 21]


In [32]:
num_epochs = 60
total = 0
loss = 0
for epoch in range(num_epochs):
    model.train()
    loss = 0
    for i in range(len(source_data)):
        src = source_data[i].to(device)
        trg = target_data[i].to(device)
        output, lists = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        current = criterion(output, trg)
        loss += current
        total += current
        if (i + 1) % batch_size == 0 or i == (len(source_data)-1):
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss = 0
            
    print_loss = total / len(source_data)
    print(f"Epoch {epoch + 1}/{num_epochs} | Training Loss: {print_loss:.3f}")
    total = 0
    if (epoch + 1) % 20 == 0:
      learning_rate = learning_rate / 10
      optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        


Epoch 1/60 | Training Loss: 3.535
Epoch 2/60 | Training Loss: 3.249
Epoch 3/60 | Training Loss: 3.109
Epoch 4/60 | Training Loss: 2.911
Epoch 5/60 | Training Loss: 2.705
Epoch 6/60 | Training Loss: 2.609
Epoch 7/60 | Training Loss: 2.474
Epoch 8/60 | Training Loss: 2.389
Epoch 9/60 | Training Loss: 2.163
Epoch 10/60 | Training Loss: 2.058
Epoch 11/60 | Training Loss: 1.865
Epoch 12/60 | Training Loss: 1.736
Epoch 13/60 | Training Loss: 1.543
Epoch 14/60 | Training Loss: 1.401
Epoch 15/60 | Training Loss: 1.291
Epoch 16/60 | Training Loss: 1.093
Epoch 17/60 | Training Loss: 0.974
Epoch 18/60 | Training Loss: 0.851
Epoch 19/60 | Training Loss: 0.752
Epoch 20/60 | Training Loss: 0.634
Epoch 21/60 | Training Loss: 0.586
Epoch 22/60 | Training Loss: 0.531
Epoch 23/60 | Training Loss: 0.446
Epoch 24/60 | Training Loss: 0.427
Epoch 25/60 | Training Loss: 0.409
Epoch 26/60 | Training Loss: 0.387
Epoch 27/60 | Training Loss: 0.355
Epoch 28/60 | Training Loss: 0.371
Epoch 29/60 | Training Loss: 

In [44]:
sentence = input("Ethan: Hi, I'm Ethan, a Q&A chatbot created by Arnav Balaji. Enter 'exit' to quit. What can I answer for you?\nYou: ")
print()
while True:
    if (sentence.lower() == 'exit'):
            break
    try:
        tokens = prepare_text(sentence)
        tokens = getSentence(tokens)
        tensor = toTensor(Q_vocab, tokens)
        output, output_tensor = evaluate(tensor, model)
        sentence = input("Ethan: " + output + "\nYou: ")
        print()
    except: 
        sentence = input("Ethan: Sorry! I don't understand what you are saying! Please ask me something different.\nYou: ")
        print()

Ethan: Hi, I'm Ethan, a Q&A chatbot created by Arnav Balaji. Enter 'exit' to quit. What can I answer for you?
You: when did beyonce start becoming popular

Ethan: nn the late 1990s.
You: what album made beyonce a worldwide known artist

Ethan: aangerously in love.
You: how did beyonce describe herself as a feminist

Ethan: oodernday feminist.
You: charlies angels featured which single from the band members

Ethan: nndependent women part me.
You: who was blamed for luckett and roberson leaving destinys child

Ethan: eeyoncé.
You: 12ur93e48rf8 &&AWE**(

Ethan: Sorry! I don't understand what you are saying! Please ask me something different.
You: who is beyonce

Ethan: 000.
You: exit

