In [171]:
import pandas as pd

# Load the question/answer pairs from the CSV file into a DataFrame.
df = pd.read_csv("100_Unique_QA_Dataset.csv")
df.head()  # preview the first rows

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [172]:
# tokenize, i.e. convert a sentence into a list of word tokens, removing unwanted characters along the way

def tokenize(text):
  """Split a sentence into a list of lowercase word tokens.

  Question marks are turned into spaces (so they act as separators) and
  apostrophes are dropped entirely; every other character is kept as is.
  The cleaned text is then split on whitespace.

  Args:
    text: the sentence to tokenize (a str).

  Returns:
    A list of token strings; empty when the text has no non-blank content.
  """
  # One translation table does both substitutions in a single pass.
  cleanup_table = {ord('?'): ' ', ord("'"): None}
  return text.lower().translate(cleanup_table).split()


In [173]:
# Vocabulary: a dictionary that maps every unique token in the dataset to a unique integer index.
vocab = {'<UNK>': 0}    # '<UNK>' is reserved at index 0 and stands in for any unknown token

In [174]:
def build_vocab(row):
  """Add every not-yet-seen token of one dataset row to the global `vocab`.

  The row's question is tokenized first, then its answer. Each new token is
  stored with the vocabulary's size at insertion time as its index, so
  indices are handed out consecutively in order of first appearance.

  Args:
    row: a mapping-like object with "question" and "answer" entries.

  Returns:
    None; the module-level `vocab` dictionary is updated in place.
  """
  question_tokens = tokenize(row["question"])
  answer_tokens = tokenize(row["answer"])

  for word in [*question_tokens, *answer_tokens]:
    # len(vocab) is evaluated before the insert, and setdefault leaves
    # tokens that are already present untouched.
    vocab.setdefault(word, len(vocab))




In [175]:
df.apply(build_vocab, axis=1)    # run build_vocab on each row of the dataset; it fills `vocab` in place and returns None for every row

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [176]:
# now converting text/words to numerical indices
def text_to_indices(text, vocab):
  """Convert a sentence into the list of its tokens' vocabulary indices.

  Args:
    text: the sentence to encode; it is split with `tokenize`.
    vocab: dictionary mapping token -> integer index. It must contain
      '<UNK>' if the text can hold tokens that are missing from it.

  Returns:
    A list with one integer per token, in token order. Tokens that are
    not in `vocab` are encoded with the index stored under '<UNK>'.
  """
  # '<UNK>' is looked up only for unknown tokens, never up front.
  return [
    vocab[word] if word in vocab else vocab['<UNK>']
    for word in tokenize(text)
  ]



In [177]:
text_to_indices("what is this", vocab) # sanity check: per the output below, "this" maps to index 0, i.e. '<UNK>'

[1, 2, 0]

In [178]:
import torch
from torch.utils.data import Dataset, DataLoader

In [179]:
class QADataset(Dataset):
  """Map-style dataset that serves (question, answer) pairs as index tensors.

  Each item is produced on demand by encoding the row's 'question' and
  'answer' text with `text_to_indices` and the vocabulary given at
  construction time.
  """

  def __init__(self, df, vocab):
    """Keep references to the source DataFrame and the token -> index vocabulary."""
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Return the number of rows in the underlying DataFrame."""
    return len(self.df)

  def __getitem__(self, index):
    """Return the row at positional `index` as (question_tensor, answer_tensor).

    Both tensors are 1-D and hold one vocabulary index per token, so their
    lengths vary from row to row.
    """
    row = self.df.iloc[index]
    question_ids = text_to_indices(row['question'], self.vocab)
    answer_ids = text_to_indices(row['answer'], self.vocab)
    return torch.tensor(question_ids), torch.tensor(answer_ids)



In [180]:
dataset = QADataset(df, vocab)  # wrap the DataFrame so each row is served as (question, answer) index tensors

In [181]:
dataset[4]  # sanity check: the item at positional index 4 (the fifth row) comes back as a (question, answer) pair of index tensors

(tensor([ 1,  2,  3, 24, 25,  5, 26, 19, 27]), tensor([28]))

In [182]:
# batch_size=1: questions have different token counts and no padding / collate_fn is set up,
# so samples cannot be stacked into larger batches.
# shuffle expects a bool, so pass False rather than 0 (same sequential iteration order).
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

In [183]:
# now creating the neural network
import torch.nn as nn

In [184]:
class SimpleRNN(nn.Module):
  """Single-layer RNN that maps a tokenized question to logits over the vocabulary.

  Pipeline: token indices -> embedding -> RNN -> linear layer applied to the
  final hidden state. One logit is produced per vocabulary entry.
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    """Build the layers.

    Args:
      vocab_size: number of entries in the vocabulary; used both as the
        embedding table size and as the number of output logits.
      embedding_dim: size of each token embedding (default 50).
      hidden_size: size of the RNN hidden state (default 64).
    """
    super().__init__()
    # Layers are created in the same order as before so that parameter
    # initialisation under a fixed random seed is unchanged.
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
    self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)  # fc = "fully connected" layer

  def forward(self, question):
    """Compute answer logits for a batch of questions.

    Args:
      question: integer tensor of token indices shaped (batch, seq_len),
        as required by the embedding layer and batch_first=True.

    Returns:
      Tensor of shape (batch, vocab_size) holding unnormalised logits.
    """
    embedded_question = self.embedding(question)   # (batch, seq_len, embedding_dim)
    hidden, final = self.rnn(embedded_question)    # final: (1, batch, hidden_size) for a single-layer RNN
    output = self.fc(final.squeeze(0))             # drop the layer dimension -> (batch, hidden_size)

    return output



In [185]:
learning_rate = 0.001  # step size passed to the optimizer
epochs = 20  # number of full passes over the dataloader during training

In [186]:
model = SimpleRNN(len(vocab))  # vocabulary size sets both the embedding table size and the number of output logits


In [187]:
# Cross-entropy loss over the model's vocabulary-sized logits; Adam updates all model parameters.
critrerion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [188]:
# Training loop: one optimizer step per (question, answer) sample, repeated for `epochs` passes.
for epoch in range(epochs):
  total_loss=0  # running sum (not mean) of the per-sample losses in this epoch

  for question, answer in dataloader:

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass
    output = model(question)

    # answer[0] drops the batch dimension added by the dataloader.
    # NOTE(review): output has one row per batch item, so this presumably relies on
    # batch_size=1 and on every answer tokenizing to exactly one token — confirm for the full dataset.
    loss = critrerion(output, answer[0])

    # gradients
    loss.backward()

    # optimizer
    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch: {epoch+1} , Loss: {total_loss :.4f}")

Epoch: 1 , Loss: 520.1616
Epoch: 2 , Loss: 450.0840
Epoch: 3 , Loss: 369.8123
Epoch: 4 , Loss: 307.4976
Epoch: 5 , Loss: 255.5261
Epoch: 6 , Loss: 207.0652
Epoch: 7 , Loss: 163.3311
Epoch: 8 , Loss: 125.9490
Epoch: 9 , Loss: 95.8189
Epoch: 10 , Loss: 72.7748
Epoch: 11 , Loss: 55.8050
Epoch: 12 , Loss: 43.5362
Epoch: 13 , Loss: 34.6604
Epoch: 14 , Loss: 28.1451
Epoch: 15 , Loss: 23.2538
Epoch: 16 , Loss: 19.4895
Epoch: 17 , Loss: 16.5251
Epoch: 18 , Loss: 14.1442
Epoch: 19 , Loss: 12.2008
Epoch: 20 , Loss: 10.5956


In [194]:
def predict(model, question, threshold=0.5):
  """Print the model's answer to `question`, or a fallback when it is not confident.

  The question is encoded with the global `vocab`, run through the model,
  and the most probable vocabulary token is printed. If the question has no
  tokens, or the top probability is below `threshold`, only
  "Don't have answer" is printed.

  Args:
    model: callable mapping a (1, seq_len) tensor of token indices to
      (1, vocab_size) logits.
    question: the question text.
    threshold: minimum softmax probability required to print an answer.

  Returns:
    None; the result is printed.
  """
  # convert the question into vocabulary indices
  numerical_question = text_to_indices(question, vocab)

  # An empty token list would become a float tensor that the embedding layer rejects.
  if not numerical_question:
    print("Don't have answer")
    return

  # add the batch dimension -> shape (1, seq_len)
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # inference only: no autograd graph is needed
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probabilities
  probability = torch.nn.functional.softmax(output, dim=1)

  # find the highest probability and its vocabulary index
  value, index = torch.max(probability, dim=1)

  if value.item() < threshold:
    print("Don't have answer")
    return  # do not fall through and print a low-confidence answer

  # vocab values are consecutive insertion-order indices, so position == index
  print(list(vocab.keys())[index.item()])


In [212]:
predict(model, "Who directed the movie 'Dark knight'?")  # NOTE(review): presumably not a question from the dataset (only part of df is shown) — verify before reading much into the answer

batman


In [209]:
df  # display the dataset for reference

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango
