In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Load the question/answer pairs from CSV.
# NOTE(review): the absolute "/content/..." path only resolves where that directory exists;
# adjust it when running this notebook elsewhere.
df = pd.read_csv("/content/100_Unique_QA_Dataset.csv")

In [None]:
# Preview the first rows to confirm the 'question' and 'answer' columns used below.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [None]:
# Tokenize: lowercase, strip basic punctuation, split on whitespace
def Tokenize(text):
  """Split `text` into lowercase, whitespace-separated tokens.

  The characters '?', '!', ',' and '.' are deleted before splitting. Any other
  punctuation (quotes, hyphens, ...) stays attached to its token.
  """
  # One translation table deletes all four punctuation marks in a single pass.
  strip_table = str.maketrans('', '', '?!,.')
  return text.lower().translate(strip_table).split()

In [None]:
# Token -> integer id mapping. '<UK>' (id 0) is the placeholder entry that
# text_to_indices falls back to for tokens missing from the vocabulary.
vocab = {'<UK>':0}

In [None]:
# Build the vocabulary from the question and answer tokens
def build_vacab(row):
  """Register every token of one question/answer row in the global `vocab`.

  Both fields are tokenized up front (question first, then answer). Each token
  not yet present receives the next free integer id, i.e. the current size of
  `vocab`. Nothing is returned; `vocab` is mutated in place.
  """
  tokens = [*Tokenize(row['question']), *Tokenize(row['answer'])]

  for word in tokens:
    # setdefault leaves already-known tokens (and their ids) untouched.
    vocab.setdefault(word, len(vocab))

In [None]:
# Run build_vacab on every row purely for its side effect of filling `vocab`.
# build_vacab returns nothing, so the Series displayed below carries no information.
df.apply(build_vacab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [None]:
# Inspect the token -> id mapping built from the dataset.
vocab

{'<UK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 "'to": 12,
 'kill': 13,
 'a': 14,
 "mockingbird'": 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 "'1984'": 67,
 'george-orwell': 68,
 'currency': 69,
 'u

In [None]:
# Convert words to numerical values
def text_to_indices(text, vocab):
  """Encode `text` as a list of vocabulary ids, one per token.

  The text is split with Tokenize. A token found in `vocab` maps to its id;
  any other token maps to the id stored under the '<UK>' key.
  """
  return [
    vocab[word] if word in vocab else vocab['<UK>']
    for word in Tokenize(text)
  ]


In [None]:
# Sanity check: per the recorded output, "satyam" is not in the vocabulary
# and therefore encodes to 0, the '<UK>' id.
text_to_indices("what is satyam", vocab)

[1, 2, 0]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class QADataset(Dataset):
  """Dataset yielding (question_ids, answer_ids) tensor pairs from a DataFrame.

  Each item is encoded on access with text_to_indices, so the tensors have one
  entry per token and their lengths differ from row to row.
  """

  def __init__(self, df, vocab):
    # df: frame with 'question' and 'answer' columns; vocab: token -> id mapping
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.df)

  def __getitem__(self, index):
    """Return the encoded question and answer of the row at position `index`."""
    row = self.df.iloc[index]
    question_ids = text_to_indices(row['question'], self.vocab)
    answer_ids = text_to_indices(row['answer'], self.vocab)
    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [None]:
# Wrap the DataFrame and the vocabulary so rows are encoded to id tensors on access.
dataset = QADataset(df, vocab)

In [None]:
# Serve one shuffled sample per step.
# NOTE(review): no padding or collate_fn is supplied and the encoded questions differ
# in length, so presumably larger batches could not be stacked; confirm before raising batch_size.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [None]:
# Print every (question, answer) batch once to eyeball the encoded tensors.
# Per the recorded output, each comes back with a leading batch dimension of 1.
for question , answer in dataloader:
  print(question, answer)

tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([[53]])
tensor([[  1,   2,   3,   4,   5, 113]]) tensor([[114]])
tensor([[  1,   2,   3,   4,   5, 238, 239]]) tensor([[240]])
tensor([[ 42, 292, 293, 118, 294, 159, 295, 296]]) tensor([[297]])
tensor([[ 42, 301, 302, 118,  14, 303, 304, 159, 305, 306, 307, 308]]) tensor([[309]])
tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([[28]])
tensor([[ 1,  2,  3,  4,  5, 73]]) tensor([[74]])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([[7]])
tensor([[  1,   2,   3,   4,   5, 135]]) tensor([[136]])
tensor([[ 42, 217, 118, 218, 219,  19,  14, 220,  43]]) tensor([[221]])
tensor([[ 42, 201,   2,  14, 202, 203, 204, 205]]) tensor([[206]])
tensor([[ 78,  79, 196,  81,  19,   3, 197, 198, 199]]) tensor([[200]])
tensor([[ 78,  79, 151, 152,  14, 153, 154]]) tensor([[155]])
tensor([[ 1,  2,  3, 69,  5,  3, 70, 71]]) tensor([[72]])
tensor([[ 42,  18, 118,   3, 187, 188]]) tensor([[189]])
tensor([[  1,   2,   3, 

In [None]:
import torch.nn as nn

In [None]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  The model reads a batch of token-id sequences and produces, for each
  sequence, one logit per vocabulary entry computed from the RNN's final
  hidden state.

  Args:
    vocab_size: number of entries in the vocabulary; sizes both the embedding
      table and the output layer.
    embedding_dim: width of each token embedding (default 50).
    hidden_size: width of the RNN hidden state (default 60).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=60):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    # batch_first=True so the RNN consumes the (batch, seq, embedding) layout
    # that the embedding layer produces.
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    # The classifier input width must match the RNN hidden size.
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return logits of shape (batch_size, vocab_size).

    Args:
      question: integer tensor of token ids, shape (batch_size, seq_len).
    """
    embedded_question = self.embedding(question)  # (batch_size, seq_len, embedding_dim)

    # The per-step outputs are not needed; only the final hidden state,
    # shaped (1, batch_size, hidden_size) for this one-layer, one-direction RNN.
    _, final = self.rnn(embedded_question)

    # Drop the leading layer dimension -> (batch_size, hidden_size).
    return self.fc(final.squeeze(0))

In [None]:
# Training hyperparameters: optimizer step size and number of passes over the dataset.
learning_rate = 0.01
epochs = 20

In [None]:
# Size the model to the vocabulary: one embedding row and one output logit per token id.
model = SimpleRNN(len(vocab))

In [None]:
# Answer prediction is treated as classification over vocabulary ids:
# cross-entropy on the model's logits, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Train for `epochs` passes, reporting the mean per-batch loss after each pass.
for epoch in range(epochs):
    running_loss = 0.0

    for question, answer in dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Drop the token axis (dim 1) so the target is a 1-D vector of class ids,
        # the form CrossEntropyLoss takes alongside (batch, classes) logits.
        target = answer.squeeze(1).long()

        logits = model(question)
        loss = criterion(logits, target)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss = {running_loss / len(dataloader):.4f}")


Epoch 1, Loss = 0.0012
Epoch 2, Loss = 0.0011
Epoch 3, Loss = 0.0010
Epoch 4, Loss = 0.0009
Epoch 5, Loss = 0.0010
Epoch 6, Loss = 0.0009
Epoch 7, Loss = 0.0009
Epoch 8, Loss = 0.0009
Epoch 9, Loss = 0.0008
Epoch 10, Loss = 0.0008
Epoch 11, Loss = 0.0008
Epoch 12, Loss = 0.0007
Epoch 13, Loss = 0.0007
Epoch 14, Loss = 0.0007
Epoch 15, Loss = 0.0006
Epoch 16, Loss = 0.0006
Epoch 17, Loss = 0.0006
Epoch 18, Loss = 0.0006
Epoch 19, Loss = 0.0005
Epoch 20, Loss = 0.0005


In [None]:
def predict(model, question, threshold=0.5):
  """Print the model's answer to `question`, or "I don't know" when unsure.

  The question is encoded with text_to_indices using the global `vocab`, run
  through `model`, and the highest-probability vocabulary entry is printed.

  Args:
    model: module mapping a (1, seq_len) tensor of token ids to
      (1, vocab_size) logits.
    question: the question as plain text.
    threshold: minimum softmax probability the best answer must reach;
      below it "I don't know" is printed instead of the answer.

  Returns:
    None. The result is only printed.
  """
  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # A question with no tokens yields an empty, non-integer tensor that the
  # embedding layer cannot consume, so answer "I don't know" directly.
  if not numerical_question:
    print("I don't know")
    return

  # add the batch dimension -> shape (1, seq_len)
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # inference only: skip building the autograd graph
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find the best probability and its vocabulary index
  value, index = torch.max(probs, dim=1)

  # Return here so a low-confidence guess is not printed after "I don't know".
  if value.item() < threshold:
    print("I don't know")
    return

  # vocab keys are in id order (ids are assigned as len(vocab) on insertion),
  # so the key at position `index` is the token with that id.
  print(list(vocab.keys())[index.item()])

In [None]:
# Try the trained model on a question from the dataset (see the df.head() preview).
predict(model, "What is the largest planet in our solar system?")

jupiter


In [None]:
# List the tokens in insertion order; position i holds the token with id i,
# which is the lookup predict relies on.
list(vocab.keys())

['<UK>',
 'what',
 'is',
 'the',
 'capital',
 'of',
 'france',
 'paris',
 'germany',
 'berlin',
 'who',
 'wrote',
 "'to",
 'kill',
 'a',
 "mockingbird'",
 'harper-lee',
 'largest',
 'planet',
 'in',
 'our',
 'solar',
 'system',
 'jupiter',
 'boiling',
 'point',
 'water',
 'celsius',
 '100',
 'painted',
 'mona',
 'lisa',
 'leonardo-da-vinci',
 'square',
 'root',
 '64',
 '8',
 'chemical',
 'symbol',
 'for',
 'gold',
 'au',
 'which',
 'year',
 'did',
 'world',
 'war',
 'ii',
 'end',
 '1945',
 'longest',
 'river',
 'nile',
 'japan',
 'tokyo',
 'developed',
 'theory',
 'relativity',
 'albert-einstein',
 'freezing',
 'fahrenheit',
 '32',
 'known',
 'as',
 'red',
 'mars',
 'author',
 "'1984'",
 'george-orwell',
 'currency',
 'united',
 'kingdom',
 'pound',
 'india',
 'delhi',
 'discovered',
 'gravity',
 'newton',
 'how',
 'many',
 'continents',
 'are',
 'there',
 'on',
 'earth',
 '7',
 'gas',
 'do',
 'plants',
 'use',
 'photosynthesis',
 'co2',
 'smallest',
 'prime',
 'number',
 '2',
 'invent