<a href="https://colab.research.google.com/github/analyst-ujjwal/ML_project_using_python/blob/main/Answer_Prediction_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [195]:
import pandas as pd

# Load the question/answer CSV into a DataFrame.
# NOTE(review): the absolute /content path presumably assumes a Colab runtime with the file uploaded there — confirm.
df = pd.read_csv("/content/100_Unique_QA_Dataset.csv")

In [196]:
# Preview the first rows to check the `question` and `answer` columns.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [197]:
def tokenize(text):
  """Split ``text`` into lower-cased, whitespace-separated tokens.

  Question marks and apostrophes are removed before splitting. Other
  punctuation (for example commas or hyphens) is left attached to its token.

  Args:
    text: The string to tokenize.

  Returns:
    A list of token strings; empty when ``text`` has no non-whitespace content.
  """
  text = text.lower()
  text = text.replace("?","")
  # str.replace returns a new string, so the result has to be assigned back;
  # without the assignment the apostrophes would silently stay in the tokens.
  text = text.replace("'","")
  return text.split()

In [198]:
# Quick check of the tokenizer on a sample sentence.
tokenize("Hello, how are you?")


['hello,', 'how', 'are', 'you']

In [199]:
# Token -> integer index mapping; index 0 is reserved for the unknown-token placeholder.
vocab = {'<UNK>':0}


In [200]:
def build_vocab(row):
  """Register every token of one dataset row in the global ``vocab``.

  The row's ``'question'`` and ``'answer'`` fields are tokenized (question
  first) and each token not yet present is assigned the next free index,
  i.e. the current size of ``vocab``.

  Args:
    row: A mapping-like object supporting ``row['question']`` and
      ``row['answer']``.

  Returns:
    None. The function works by mutating the module-level ``vocab``.
  """
  tokens = tokenize(row['question']) + tokenize(row['answer'])
  for tok in tokens:
    if tok in vocab:
      continue
    # The next free index equals the number of entries already stored.
    vocab[tok] = len(vocab)

In [201]:
# Called for its side effect of filling `vocab`; build_vocab returns nothing,
# so the Series produced by apply holds no useful values.
df.apply(build_vocab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [202]:
def text_to_indices(text, vocab):
  """Convert ``text`` into a list of vocabulary indices.

  Args:
    text: The string to encode; it is split with ``tokenize``.
    vocab: Mapping from token to integer index. Tokens missing from it are
      encoded as ``vocab['<UNK>']``.

  Returns:
    A list with one index per token, in token order.
  """
  return [
    vocab[tok] if tok in vocab else vocab['<UNK>']
    for tok in tokenize(text)
  ]

In [203]:
# Sanity check: a token that is not in the vocabulary should map to the <UNK> index.
text_to_indices("What is campusx", vocab)


[1, 2, 0]

In [204]:
import torch
from torch.utils.data import Dataset, DataLoader
class QADataset(Dataset):
  """Dataset that serves (question, answer) pairs encoded as vocabulary indices.

  Args:
    df: Table whose rows expose ``'question'`` and ``'answer'`` text fields
      and which supports ``.shape`` and ``.iloc``.
    vocab: Token -> index mapping handed to ``text_to_indices``.
  """

  def __init__(self, df,vocab):
    super().__init__()
    self.vocab = vocab
    self.df = df

  def __len__(self):
    """Return the number of rows in the underlying table."""
    n_rows = self.df.shape[0]
    return n_rows

  def __getitem__(self,index):
    """Return the row at position ``index`` as a pair of index tensors."""
    row = self.df.iloc[index]
    # Encode the question first, then the answer, with the same vocabulary.
    question_ids, answer_ids = (
      text_to_indices(row[column], self.vocab)
      for column in ('question', 'answer')
    )
    return torch.tensor(question_ids), torch.tensor(answer_ids)


In [205]:
# Wrap the DataFrame and the vocabulary in the index-encoding dataset.
dataset = QADataset(df,vocab)

In [206]:
# One sample per batch, reshuffled on every pass.
# NOTE(review): batch_size=1 presumably avoids padding, since questions differ in length — confirm.
dataloader = DataLoader(dataset, batch_size = 1, shuffle= True)

In [207]:
# Inspect one pass over the loader: print the batched question tensor and
# the first answer of each batch.
for question, answer in dataloader:
  print(question, answer[0])

tensor([[1, 2, 3, 4, 5, 8]]) tensor([9])
tensor([[ 10,  75,   3, 298,  19, 299]]) tensor([300])
tensor([[  1,   2,   3, 147,  86,  19, 193, 194]]) tensor([195])
tensor([[  1,   2,   3, 164, 165, 166,  83,  84]]) tensor([167])
tensor([[  1,   2,   3,  69,   5, 156]]) tensor([157])
tensor([[ 10, 140,   3, 141, 172,   5,   3,  70, 173]]) tensor([174])
tensor([[ 10, 140,   3, 141, 142, 143, 144,  83,   3, 145]]) tensor([146])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([91])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([319])
tensor([[ 42, 320,   2,  62,  63,   3, 321,   5, 322]]) tensor([323])
tensor([[  1,   2,   3, 147, 148,  19, 149]]) tensor([150])
tensor([[ 42, 101,   2,   3,  17]]) tensor([102])
tensor([[10, 75, 76]]) tensor([77])
tensor([[ 42, 301, 302, 118,  14, 303, 304, 159, 305, 306, 307, 308]]) tensor([309])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([58])
tensor([[  1,   2,   3,  37, 133,   5,  26]]) tensor([134])
tensor([[ 10,  96,   3, 104, 241]]) tensor([242])
tensor([

In [208]:
import torch.nn as nn
class MysimpleNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: Number of rows in the embedding table and number of output
      scores produced per question.
    embedding_dim: Size of each token embedding (default 50).
    hidden_size: Size of the RNN hidden state (default 64).
  """
  def __init__(self,vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.RNN = nn.RNN(embedding_dim, hidden_size, batch_first = True)
    self.fc = nn.Linear(hidden_size,vocab_size)
  def forward(self,question):
    """Score every vocabulary entry for a batch of encoded questions.

    Args:
      question: Integer index tensor; with ``batch_first=True`` the RNN
        reads it as (batch, sequence).

    Returns:
      Tensor of unnormalised scores with one row per question and
      ``vocab_size`` columns.
    """
    embedded_question = self.embedding(question)
    # Only the final hidden state is needed; the per-step outputs are discarded.
    _, final = self.RNN(embedded_question)
    # Drop the leading layer dimension (size 1 for this single-layer RNN).
    output = self.fc(final.squeeze(0))
    return output

In [209]:
# Training hyperparameters: optimizer step size and number of passes over the data.
learning_rate = 0.001
epochs = 20

In [210]:
# One output score per vocabulary entry, so the model is sized by len(vocab).
model = MysimpleNN(len(vocab))
# NOTE(review): `criterian` is a misspelling of "criterion"; the name is kept
# because the training cell refers to it.
criterian = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [211]:
# Train for `epochs` passes over the dataloader and report the summed
# per-batch loss after every pass.
for epoch in range(epochs):
  total_loss = 0
  for question, answer in dataloader:
    optimizer.zero_grad()
    output = model(question)
    # Only the first index of each answer sequence is used as the class target.
    loss = criterian(output, answer[:, 0])
    loss.backward()
    optimizer.step()
    total_loss +=loss.item()
  # `.4f` prints four decimal places; the previous `4f` only set a minimum
  # field width of 4 and fell back to the default precision.
  print(f"Epoch:{epoch+1}, Loss: {total_loss:.4f}")

Epoch:1, Loss: 527.529337
Epoch:2, Loss: 460.428238
Epoch:3, Loss: 381.423977
Epoch:4, Loss: 320.994557
Epoch:5, Loss: 269.337564
Epoch:6, Loss: 221.676753
Epoch:7, Loss: 178.073724
Epoch:8, Loss: 139.357476
Epoch:9, Loss: 107.917079
Epoch:10, Loss: 83.194409
Epoch:11, Loss: 64.581208
Epoch:12, Loss: 51.176055
Epoch:13, Loss: 40.608397
Epoch:14, Loss: 32.787470
Epoch:15, Loss: 27.166195
Epoch:16, Loss: 22.659887
Epoch:17, Loss: 18.980316
Epoch:18, Loss: 16.269013
Epoch:19, Loss: 14.008970
Epoch:20, Loss: 12.085532


In [212]:
def predict(model,question,threshold= 0.5):
  """Print the model's most likely answer token for ``question``.

  The question is encoded with the global ``vocab``, scored by ``model``,
  and the scores are turned into probabilities with a softmax. If the best
  probability is below ``threshold``, or the question contains no tokens,
  "I Don't Know" is printed instead of a token.

  Args:
    model: Callable module mapping a (1, sequence) index tensor to a
      (1, vocabulary size) score tensor.
    question: The question text.
    threshold: Minimum probability required to print a prediction.

  Returns:
    None. The result is printed.
  """
  numerical_question = text_to_indices(question,vocab)
  if not numerical_question:
    # torch.tensor([]) defaults to a float dtype, which is not valid index
    # input for an embedding layer, so answer directly instead of crashing.
    print("I Don't Know")
    return
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    output = model(question_tensor)
    prob = torch.nn.functional.softmax(output, dim=1)
    value,index = torch.max(prob, dim=1)
  # .item() converts the one-element tensors to plain Python numbers before
  # comparing and indexing.
  if value.item() < threshold:
    print("I Don't Know")
  else:
    # Vocabulary indices were assigned in insertion order, so the position in
    # the key list matches the predicted index.
    print(list(vocab.keys())[index.item()])

In [215]:
# Try the trained model on a question whose answer appears in the dataset preview above.
predict(model, "what is the capital of france")

paris
