In [2]:
import pandas as pd


In [4]:
# Load the question/answer pairs and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='100_Unique_QA_Dataset.csv')
df.head(n=5)

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [5]:
# tokenize
def tokenize(text):
  """Lower-case `text`, strip '?' and apostrophes, and split on whitespace.

  Returns the list of resulting word tokens (empty for blank input).
  """
  cleaned = text.lower()
  for unwanted in ('?', "'"):
    cleaned = cleaned.replace(unwanted, '')
  return cleaned.split()

In [8]:
# Quick check: case is folded and both the apostrophe and '?' disappear.
tokenize(text="what is adition's kjfds ?")

['what', 'is', 'aditions', 'kjfds']

In [6]:
# vocabulary: token -> integer index, with index 0 reserved for unknown words
vocab = {}
vocab['<UNK>'] = 0

In [7]:
def build_vocab(row):
  """Register every unseen token of one row in the global `vocab`.

  `row` is indexed by 'question' and 'answer'; both texts are tokenized and
  each token not yet present in `vocab` receives the next free index (the
  current size of `vocab`). The dict is updated in place and nothing is
  returned.
  """
  merged_tokens = [
      token
      for column in ('question', 'answer')
      for token in tokenize(row[column])
  ]

  for token in merged_tokens:
    # setdefault leaves existing entries alone, so known tokens keep their index.
    vocab.setdefault(token, len(vocab))

In [8]:
# Call build_vocab once per row. It fills `vocab` as a side effect and returns
# None, so the displayed result is only a column of empty values.
df.apply(build_vocab, axis='columns')

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [9]:
# convert words into numerical indices

def text_to_indices(text, vocab):
  """Translate `text` into a list of vocabulary indices.

  The text is split with `tokenize`; each token is replaced by its entry in
  `vocab`, and tokens that are not in `vocab` fall back to the index stored
  under '<UNK>'. The result has one index per token, in order.
  """
  return [
      vocab[token] if token in vocab else vocab['<UNK>']
      for token in tokenize(text)
  ]

In [28]:
# Words missing from the vocabulary come back as the '<UNK>' index.
text_to_indices(text='What is my name', vocab=vocab)


[1, 2, 0, 0]

In [10]:
import torch
from torch.utils.data import DataLoader, Dataset

In [11]:
class QADataset(Dataset):
  """Map-style dataset that serves (question, answer) pairs as index tensors.

  Each item is produced on demand by converting the 'question' and 'answer'
  text of one row with `text_to_indices`.
  """

  def __init__(self, do, vocab):
    """Keep references to the data frame and the vocabulary.

    Args:
      do: DataFrame holding 'question' and 'answer' columns. The parameter
        name is kept unchanged so existing callers keep working.
      vocab: token -> index mapping handed to `text_to_indices`.
    """
    # Store the frame that was passed in. Reading a module-level `df` here
    # would ignore the argument and fail wherever no such global exists.
    self.df = do
    self.vocab = vocab

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return self.df.shape[0]

  def __getitem__(self, index):
    """Return row `index` as a (question, answer) pair of 1-D long tensors."""
    # Fetch the row once instead of once per column.
    row = self.df.iloc[index]

    numerical_question = text_to_indices(row['question'], self.vocab)
    numerical_answer = text_to_indices(row['answer'], self.vocab)

    # dtype is pinned so a text without any tokens still yields an integer
    # tensor (an empty list would otherwise default to float).
    return (
        torch.tensor(numerical_question, dtype=torch.long),
        torch.tensor(numerical_answer, dtype=torch.long),
    )

In [12]:
# Wrap the loaded frame so rows can be fetched as index tensors.
dataset = QADataset(df, vocab=vocab)

In [13]:
# Spot-check one sample: the (question, answer) tensor pair built from row 55.
dataset[55]

(tensor([ 10,  75, 208]), tensor([209]))

In [14]:
# Serve one (question, answer) pair per batch, in shuffled order.
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)

In [62]:
# Walk through the loader once to eyeball each batch and its dimensions.
for question, answer in dataloader:
  print(question, answer)
  print(question.size(), answer.size())

tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([[162]])
torch.Size([1, 7]) torch.Size([1, 1])
tensor([[10, 75, 76]]) tensor([[77]])
torch.Size([1, 3]) torch.Size([1, 1])
tensor([[ 1,  2,  3, 92, 93, 94]]) tensor([[95]])
torch.Size([1, 6]) torch.Size([1, 1])
tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([[54]])
torch.Size([1, 6]) torch.Size([1, 1])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([[65]])
torch.Size([1, 8]) torch.Size([1, 1])
tensor([[ 10,  96,   3, 104, 239]]) tensor([[240]])
torch.Size([1, 5]) torch.Size([1, 1])
tensor([[ 10,  75, 208]]) tensor([[209]])
torch.Size([1, 3]) torch.Size([1, 1])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([[317]])
torch.Size([1, 6]) torch.Size([1, 1])
tensor([[  1,   2,   3,   4,   5, 113]]) tensor([[114]])
torch.Size([1, 6]) torch.Size([1, 1])
tensor([[1, 2, 3, 4, 5, 8]]) tensor([[9]])
torch.Size([1, 6]) torch.Size([1, 1])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([[61]])
torch.Size([1, 9]) torch.Size([1, 1])
tensor([[10

In [16]:
 import torch.nn as nn

In [64]:
class SimpleRNN(nn.Module):
  """Embedding -> RNN -> linear layer that scores every vocabulary entry.

  Args:
    vocab_size: number of rows in the embedding table and number of output
      scores.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()

    self.embeddings = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Score the vocabulary for a batch of index sequences.

    Args:
      question: integer tensor of token indices; the RNN is built with
        batch_first=True, so the batch dimension comes first.

    Returns:
      The linear layer's output computed from the RNN's final hidden state,
      with `vocab_size` scores per sequence.
    """
    embedded_question = self.embeddings(question)

    # nn.RNN returns (per-step outputs, final hidden state); only the final
    # state is needed to predict the answer token.
    _, final = self.rnn(embedded_question)

    # The final state has a leading dimension of size 1 here (one layer, one
    # direction); drop it before the linear layer.
    output = self.fc(final.squeeze(0))

    return output

In [65]:
# Optimiser step size and number of passes over the dataset.
learning_rate, epochs = 1e-3, 20

In [66]:
# Build the network before displaying it, so this cell does not depend on a
# later cell having been executed first (a fresh kernel would otherwise raise
# NameError here).
model = SimpleRNN(len(vocab))
model

SimpleRNN(
  (embeddings): Embedding(324, 50)
  (rnn): RNN(50, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=324, bias=True)
)

In [67]:
# Size the network by the vocabulary: one embedding row and one output score per token.
model = SimpleRNN(vocab_size=len(vocab))

In [68]:
# Adam updates every model parameter; cross-entropy compares the output
# scores with the target token index.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()


In [69]:
# Training loop: `epochs` passes over `dataloader`, one optimiser step per
# batch, reporting the mean batch loss after each pass.

for epoch in range(epochs):

  total_loss = 0

  for question, answer in dataloader:

    # Reset gradients accumulated by the previous step.
    optimizer.zero_grad()

    # Forward pass: scores over the vocabulary for this question.
    output = model(question)

    # Loss against the target token. With batch_size=1 the answer batch has
    # shape (1, 1), so answer[0] is the shape-(1,) vector of class indices
    # that matches the (1, vocab_size) output.
    loss = criterion(output, answer[0])

    # Backpropagate to compute gradients.
    loss.backward()

    # Apply the gradient update to the parameters.
    optimizer.step()

    total_loss = total_loss + loss.item()

  # len(dataloader) is the number of batches, so this is the mean loss per batch.
  average_loss = total_loss/len(dataloader)
  print(f'epoch: {epoch+1} loss: {average_loss}')



epoch: 1 loss: 5.860182740953234
epoch: 2 loss: 5.078627051247491
epoch: 3 loss: 4.190953969955444
epoch: 4 loss: 3.569341288672553
epoch: 5 loss: 3.0095577107535467
epoch: 6 loss: 2.483989812268151
epoch: 7 loss: 2.000699996948242
epoch: 8 loss: 1.5617680311203004
epoch: 9 loss: 1.2101337095101674
epoch: 10 loss: 0.9273232185178333
epoch: 11 loss: 0.7179942097928789
epoch: 12 loss: 0.5592170594467057
epoch: 13 loss: 0.44258189929856195
epoch: 14 loss: 0.35868818925486673
epoch: 15 loss: 0.2962232795026567
epoch: 16 loss: 0.24167701370186276
epoch: 17 loss: 0.20507945194840432
epoch: 18 loss: 0.17495566349890496
epoch: 19 loss: 0.15033345793684325
epoch: 20 loss: 0.1297742058419519


In [78]:
def predict(model, question, threshold=0.5):
  """Print the model's answer token for `question`, or "I don't know".

  Args:
    model: callable that maps a (1, seq_len) tensor of token indices to a
      (1, vocab_size) tensor of scores.
    question: raw question text; converted to indices with `text_to_indices`
      and the module-level `vocab`.
    threshold: minimum softmax probability the best token must reach to be
      printed; below it "I don't know" is printed instead.

  Returns:
    None. The result is printed.
  """
  # convert question into numbers
  numerical_question = text_to_indices(text=question, vocab=vocab)

  # A question without any tokens (e.g. "" or "?") would hand the model an
  # empty sequence; report it as unknown instead of failing inside the model.
  if not numerical_question:
    print("I don't know")
    return

  question_tensor = torch.tensor(numerical_question, dtype=torch.long).unsqueeze(0)

  # Inference only: no gradients are needed, so skip autograd bookkeeping.
  with torch.no_grad():
    output = model(question_tensor)
    probability_output = torch.nn.functional.softmax(output, dim=1)
    value, index = torch.max(probability_output, dim=1)

  if value.item() < threshold:
    print("I don't know")
  else:
    # The n-th key of `vocab` is read as the token for index n; this relies on
    # tokens having been inserted in index order.
    print(list(vocab.keys())[index.item()])


In [86]:
# The misspelled first word is not in the vocabulary, yet the rest of the question still yields an answer.
predict(model, question='whtat is the largest planet in our solar system')

jupiter


In [80]:
# A question the model is not confident about falls below the threshold.
predict(model, question='What is campusx')

I don't know
