In [21]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import nltk

In [2]:
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')

In [6]:
import string

punc = string.punctuation

In [9]:
import re

In [19]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [62]:
def preprocessing(text):
  text = text.lower()
  text = text.translate(str.maketrans('', '', punc))
  return ' '.join(word_tokenize(text))


In [64]:
df['question'] = df['question'].apply(preprocessing)

In [195]:
cv = CountVectorizer()

In [196]:
corpus = df['question'].values + ' ' + df['answer'].values

In [197]:
cv.fit(corpus)

In [198]:
min(list(cv.vocabulary_.values()))

0

In [199]:
len(cv.vocabulary_)

328

In [200]:
cv.vocabulary_['<UNK>'] = len(cv.vocabulary_)

In [201]:
len(cv.vocabulary_)

329

In [202]:
def text_to_indices(text, vocab):
  indexed_text = []

  for token in preprocessing(text).split():

    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])

  return indexed_text

In [203]:
class QADataset(Dataset):

  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    numerical_question = text_to_indices(self.df.iloc[index, 0], self.vocab)
    numerical_answer = text_to_indices(self.df.iloc[index, 1], self.vocab)
    return torch.tensor(numerical_question), torch.tensor(numerical_answer)

In [204]:
dataset = QADataset(df, cv.vocabulary_)

In [205]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)

In [206]:
class MyRNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, x):
    embedding = self.embedding(x)
    hidden, final = self.rnn(embedding)
    final = self.fc(final.squeeze(0))
    return final

In [255]:
learning_rate = 0.001
epochs = 20

In [256]:
model = MyRNN(len(cv.vocabulary_))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [257]:
for epoch in range(epochs):

  epoch_loss = []

  for question, answer in dataloader:

    pred = model(question)

    loss = criterion(pred, answer[0])

    with torch.no_grad():
      epoch_loss.append(loss.item())

    optimizer.zero_grad()

    loss.backward()

    optimizer.step()

  print(f'loss at epoch {epoch} is {np.mean(epoch_loss)}')


loss at epoch 0 is 5.790180471208361
loss at epoch 1 is 4.7393211126327515
loss at epoch 2 is 3.7332786626285976
loss at epoch 3 is 3.182670089933607
loss at epoch 4 is 2.7066050757964453
loss at epoch 5 is 2.2349811100297505
loss at epoch 6 is 1.8006823738416036
loss at epoch 7 is 1.4211129426956177
loss at epoch 8 is 1.0939734280937248
loss at epoch 9 is 0.8408939237395923
loss at epoch 10 is 0.6434878641532527
loss at epoch 11 is 0.5008492721865575
loss at epoch 12 is 0.3920136799207992
loss at epoch 13 is 0.3150832389584846
loss at epoch 14 is 0.2562840404609839
loss at epoch 15 is 0.21371243306332163
loss at epoch 16 is 0.18059042659070756
loss at epoch 17 is 0.15132238397167788
loss at epoch 18 is 0.13025409209852418
loss at epoch 19 is 0.11343858069222834


In [258]:
def predict(model, question, threshold=0.5, vocab = cv.vocabulary_):
    numerical_question = text_to_indices(question, vocab)

    if not numerical_question:
        print("Input question could not be tokenized meaningfully or is empty.")
        return

    question_tensor = torch.tensor(numerical_question)
    question_tensor = question_tensor.unsqueeze(0) # Add batch dimension (batch_size=1)

    output = model(question_tensor)

    probs = torch.nn.functional.softmax(output, dim=1)

    val, index = torch.max(probs, dim=1)

    predicted_prob = val.item()
    predicted_idx = index.item()

    if predicted_prob < 0.1:
      print('I don"t know')
    else:
      idx_to_word = {idx: word for word, idx in vocab.items()}
      print(idx_to_word.get(predicted_idx, f"Unknown index {predicted_idx}"))

In [262]:
predict(model, 'What is capital of france?')

paris
