In [1]:
import pandas as pd


In [3]:
#Load the dataset
df =pd.read_csv('/content/100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [4]:
#tokenize
def tokenize(text):
   text = text.lower()
   text = text.replace('?','')
   text = text.replace("","")
   return text.split()


In [5]:
tokenize( "what is the capital of Germany")

['what', 'is', 'the', 'capital', 'of', 'germany']

In [12]:
#Vocab
vocab ={'<UNK>':0}


In [13]:
def build_vocab(row):
  print(row['question'], row['answer'])
  tokenize_questions = tokenize(row['question'])
  tokenize_answer = tokenize(row['answer'])
  merge_tokenize = tokenize_questions + tokenize_answer

  for token in merge_tokenize:
    if token not in vocab:
      vocab[token] = len(vocab)

In [14]:
df.apply(build_vocab, axis=1)

What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [15]:
len(vocab)

326

In [16]:
#convert text to numerical

def text_to_numerical(sentence,vocab):
  indexed_text = []
  for token in tokenize(sentence):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])
  return indexed_text



In [17]:
text_to_numerical("what is campusx",vocab)

[1, 2, 0]

In [20]:
from torch.utils.data import Dataset,DataLoader
import torch

class QADataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        question = self.df.iloc[index]['question']
        answer = self.df.iloc[index]['answer']

        numerical_question = text_to_numerical(question, self.vocab)
        numerical_answer = text_to_numerical(answer, self.vocab)

        return (
            torch.tensor(numerical_question, dtype=torch.long),
            torch.tensor(numerical_answer, dtype=torch.long)
        )


In [21]:
dataset = QADataset(df,vocab)

In [22]:
dataloader = DataLoader(dataset,batch_size=1,shuffle=True)

In [23]:
for question, answer in dataloader:
  print(question,answer)

tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]]) tensor([[52]])
tensor([[  1,   2,   3, 222,   5, 223, 224, 225]]) tensor([[226]])
tensor([[ 42, 301, 302, 118,  14, 303, 304, 159, 305, 306, 307, 308]]) tensor([[309]])
tensor([[ 10,  96,   3, 104, 241]]) tensor([[242]])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([[23]])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([[106]])
tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([[61]])
tensor([[  1,   2,   3, 235,   5, 236]]) tensor([[237]])
tensor([[ 78,  79, 151, 152,  14, 153, 154]]) tensor([[155]])
tensor([[  1,   2,   3,  69,   5, 156]]) tensor([[157]])
tensor([[  1,   2,   3, 147, 148,  19, 149]]) tensor([[150]])
tensor([[  1,   2,   3,   4,   5, 288]]) tensor([[289]])
tensor([[ 10, 140,   3, 141, 172,   5,   3,  70, 173]]) tensor([[174]])
tensor([[ 42,   2,   3, 276, 212, 277]]) tensor([[278]])
tensor([[ 1,  2,  3, 37, 38, 39, 40]]) tensor([[41]])
tensor([[ 42, 137,  

In [24]:
import torch.nn as nn


In [50]:
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
        self.rnn = nn.RNN(input_size=50, hidden_size=64, batch_first=True)
        self.fc = nn.Linear(64, vocab_size)

    def forward(self, question):
        embedded = self.embedding(question)
        output, hidden = self.rnn(embedded)
        logits = self.fc(hidden.squeeze(0))
        return logits





In [66]:
learning_rate = 0.001
epochs = 20

In [67]:
model = SimpleRNN(len(vocab))

In [68]:
criteria = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

In [69]:
#Training Loop

for epoch in range(epochs):
  total_loss = 0

  for question,answer in dataloader:
    optimizer.zero_grad()

    #forward pass

    output = model(question)

    #loss calculation

    loss = criteria(output,answer[0])

    #gradient calculation

    loss.backward()

    #update
    optimizer.step()

    total_loss = total_loss + loss.item()

    print(f"Epoch{epoch+1}, Loss :{total_loss:4f}")




Epoch1, Loss :5.977479
Epoch1, Loss :11.619404
Epoch1, Loss :17.565052
Epoch1, Loss :23.594137
Epoch1, Loss :29.485584
Epoch1, Loss :35.171918
Epoch1, Loss :41.446091
Epoch1, Loss :46.997931
Epoch1, Loss :52.973187
Epoch1, Loss :59.431196
Epoch1, Loss :65.387921
Epoch1, Loss :71.789455
Epoch1, Loss :77.926933
Epoch1, Loss :83.292551
Epoch1, Loss :89.204816
Epoch1, Loss :95.335165
Epoch1, Loss :101.203071
Epoch1, Loss :107.112060
Epoch1, Loss :113.013514
Epoch1, Loss :118.752610
Epoch1, Loss :124.161299
Epoch1, Loss :130.021074
Epoch1, Loss :135.543647
Epoch1, Loss :141.026884
Epoch1, Loss :147.173638
Epoch1, Loss :153.024897
Epoch1, Loss :159.093737
Epoch1, Loss :165.245022
Epoch1, Loss :170.824929
Epoch1, Loss :176.711901
Epoch1, Loss :183.164813
Epoch1, Loss :188.923104
Epoch1, Loss :194.626947
Epoch1, Loss :200.896986
Epoch1, Loss :206.568041
Epoch1, Loss :212.887962
Epoch1, Loss :218.476767
Epoch1, Loss :225.035460
Epoch1, Loss :231.132951
Epoch1, Loss :236.966425
Epoch1, Loss :242

In [106]:
def predict(model,question,threshold=0.6):
  numerical_questions = text_to_numerical(question,vocab)

  questions_tensor = torch.tensor(numerical_questions).unsqueeze(0)

  output = model(questions_tensor)

  probs = torch.nn.functional.softmax(output , dim=1)

  value,index =  torch.max(probs, dim=1)

  if value < threshold:
    print("i don't know")

  prediction = list(vocab.keys())[index]

  return prediction




In [107]:
print( predict(model, "what is the capital of japan"))

tokyo
