In [None]:
import torch
from torch.utils.data import Dataset,DataLoader

In [None]:
import pandas as pd

# Load the question/answer table. The path is absolute, so the CSV must exist at exactly this location.
df=pd.read_csv('/content/100_Unique_QA_Dataset.csv')

# Preview the first rows
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [None]:
# tokenization
def toekenize(text):
  """Lower-case ``text``, delete every ``?`` and ``'``, then split on whitespace.

  Returns the list of remaining word tokens (empty when nothing is left).
  """
  # a single translate pass removes both punctuation characters
  stripped = text.lower().translate(str.maketrans('', '', "?'"))
  return stripped.split()

In [None]:
# Quick check of the tokenizer on a sample question
toekenize('What is the capital of France?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [None]:
# Token -> index mapping; index 0 is reserved for tokens that are not in the vocabulary
vocab={'<UNK>':0}

In [None]:
def build_vocab(row):
  """Register every unseen token of one question/answer row in the global ``vocab``.

  ``row`` is indexed with 'question' and 'answer'; both texts are passed through
  ``toekenize``. Each new token receives the current size of ``vocab`` as its
  index. Nothing is returned.
  """
  question_tokens=toekenize(row['question'])
  answer_tokens=toekenize(row['answer'])

  # question tokens are visited first, then the answer tokens
  for token in question_tokens+answer_tokens:
    # setdefault leaves known tokens untouched; len(vocab) is read before the insert
    vocab.setdefault(token,len(vocab))

In [None]:
# Call build_vocab once per row (axis=1) purely for its side effect on `vocab`;
# build_vocab returns nothing, so the resulting Series carries no useful values.
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [None]:
# convert words to numerical indices
def text_to_indices(text,vocab):
  """Tokenize ``text`` and map each token to its index in ``vocab``.

  Tokens that are missing from ``vocab`` are mapped to the index stored under
  '<UNK>'. Returns a list of indices, one per token.
  """
  return [
    # '<UNK>' is only looked up when a token is actually unknown
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in toekenize(text)
  ]

In [None]:
# Encode a sample question with the vocabulary built so far
text_to_indices('What is the capital of India?',vocab)

[1, 2, 3, 4, 5, 73]

In [None]:
class QADataset(Dataset):
  """Dataset that serves one (question, answer) pair of index tensors per DataFrame row."""

  def __init__(self,df,vocab):
    # both are stored as given; rows are encoded lazily in __getitem__
    self.vocab=vocab
    self.df=df

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return self.df.shape[0]

  def __getitem__(self,index):
    """Return ``(question_tensor, answer_tensor)`` for the row at position ``index``."""
    row=self.df.iloc[index]
    # encode the question first, then the answer
    encoded=[text_to_indices(row[column],self.vocab) for column in ('question','answer')]
    return tuple(torch.tensor(ids) for ids in encoded)

In [None]:
# Wrap the DataFrame and the vocabulary in the Dataset class
dataset=QADataset(df,vocab)

In [None]:
# Inspect the first (question, answer) pair of index tensors
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [None]:
# One sample per batch, so questions of different lengths never have to be padded together;
# shuffle=True draws the samples in a new random order on every pass.
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [None]:
# Print every (question, answer) batch once to eyeball the encoded data
for question,answer in dataloader:
  print(question,answer)

tensor([[ 42,   2,   3, 274, 211, 275]]) tensor([[276]])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([[184]])
tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([[162]])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([[106]])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([[36]])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([[131]])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([[244]])
tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([[154]])
tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]]) tensor([[52]])
tensor([[ 42, 125,   2,  62,  63,   3, 126, 127]]) tensor([[128]])
tensor([[  1,   2,   3,  37, 133,   5,  26]]) tensor([[134]])
tensor([[ 42, 299, 300, 118,  14, 301, 302, 158, 303, 304, 305, 306]]) tensor([[307]])
tensor([[ 10,  75, 111]]) tensor([[112]])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]]) tensor([[316]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tenso

In [None]:
from torch import nn

In [None]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of rows in the embedding table and number of output logits.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return vocabulary logits for a batch of token-index sequences.

    Args:
      question: integer tensor of token indices, shape (batch, seq_len).

    Returns:
      Tensor of shape (batch, vocab_size) holding unnormalised scores.
    """
    embedded_question = self.embedding(question)
    # nn.RNN returns (hidden state at every step, final hidden state); only the final one is used
    _, final = self.rnn(embedded_question)
    # final is (1, batch, hidden_size) for this single-layer RNN; drop the leading axis
    output = self.fc(final.squeeze(0))

    return output

In [None]:
# Step size handed to the Adam optimizer
learning_rate = 0.001
# Number of full passes over the dataloader in the training loop
epochs = 20

In [None]:
# Embedding table and output layer are both sized to the vocabulary
model = SimpleRNN(len(vocab))

In [None]:
# Cross-entropy between the vocabulary logits and the answer token index
criterion = nn.CrossEntropyLoss()
# Adam updates every parameter of the model
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# training loop

for epoch in range(epochs):

  # running sum of the per-sample losses seen during this epoch
  total_loss = 0

  for question, answer in dataloader:

    # clear the gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass
    output = model(question)

    # loss -> output shape (1,324) - (1)
    # answer[0] drops the batch axis so the target is 1-D
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item()

  # ".4f" prints four decimal places (":4f" would only set a minimum field width)
  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 525.035896
Epoch: 2, Loss: 448.679077
Epoch: 3, Loss: 372.650608
Epoch: 4, Loss: 315.863257
Epoch: 5, Loss: 265.653503
Epoch: 6, Loss: 216.520490
Epoch: 7, Loss: 172.051905
Epoch: 8, Loss: 134.465436
Epoch: 9, Loss: 102.928721
Epoch: 10, Loss: 78.587404
Epoch: 11, Loss: 60.328188
Epoch: 12, Loss: 47.222810
Epoch: 13, Loss: 37.524258
Epoch: 14, Loss: 30.362027
Epoch: 15, Loss: 25.264956
Epoch: 16, Loss: 21.239214
Epoch: 17, Loss: 18.304376
Epoch: 18, Loss: 15.547821
Epoch: 19, Loss: 13.459711
Epoch: 20, Loss: 11.720937


In [None]:
def predict(model, question, threshold=0.5):
  """Print the model's answer to ``question``, or "I don't know" when it is unsure.

  Args:
    model: network that maps a (1, seq_len) tensor of token indices to
      (1, vocab_size) logits.
    question: raw question text; it is encoded with the global ``vocab``.
    threshold: minimum softmax probability the best token must reach to be printed.

  Prints the result; returns nothing.
  """

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # nothing is left after tokenization, so there is nothing to feed the model
  if not numerical_question:
    print("I don't know")
    return

  # add a batch axis; the embedding layer needs integer (long) indices
  question_tensor = torch.tensor(numerical_question, dtype=torch.long).unsqueeze(0)

  # inference only, so skip building the autograd graph
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  # .item() turns the one-element tensors into plain Python numbers
  if value.item() < threshold:
    print("I don't know")
  else:
    # vocab indices were assigned in insertion order, so position == index
    print(list(vocab.keys())[index.item()])

In [None]:
# Ask the trained model a question that appears in the dataset
predict(model, "What is the largest planet in our solar system?")

jupiter


In [None]:
# Another question that appears in the dataset
predict(model, "What is the capital of Germany?")

berlin


In [None]:
# Scratch: embed one encoded question to see the (seq_len, embedding_dim) shape
x=nn.Embedding(324, embedding_dim=10)
x(dataset[10][0]).shape

torch.Size([6, 10])

In [None]:
# Scratch: freshly initialised embedding + RNN on one question; the result is a
# (per-step outputs, final hidden state) tuple
nn.RNN(10, 64)(nn.Embedding(324, embedding_dim=10)(dataset[10][0]))

(tensor([[ 1.3268e-01, -3.7005e-01, -2.5335e-01, -1.7172e-01,  3.0222e-01,
          -3.0812e-01,  4.5108e-01, -4.5076e-01,  1.8515e-01, -4.1935e-01,
           3.8042e-01,  1.8750e-01, -1.3946e-01,  1.7428e-01, -9.1689e-02,
           1.9235e-02,  3.5503e-01,  1.3101e-01, -4.0827e-01, -6.2103e-02,
          -1.8250e-01,  3.4634e-01, -1.5364e-01, -1.8798e-01, -1.9749e-01,
           4.1045e-01, -1.3071e-01, -3.5750e-01, -1.5391e-01,  1.1992e-01,
          -1.3495e-01,  4.0197e-01, -5.5352e-02,  5.1964e-02,  1.1266e-01,
           4.0077e-01,  2.2518e-02, -1.5778e-01, -6.0768e-01,  4.1087e-01,
           1.5758e-01,  2.9518e-01, -1.1458e-01, -4.7741e-01, -4.1242e-01,
          -4.0421e-02,  4.6304e-02, -5.0064e-01,  2.2927e-01,  3.6061e-01,
           2.6899e-03, -2.5346e-01,  5.2350e-01,  5.0044e-01, -2.0586e-02,
           1.7942e-01, -2.2073e-01,  5.7796e-02,  9.9088e-02, -3.3711e-01,
           5.6770e-01,  2.7455e-01, -5.4607e-02, -3.1337e-01],
         [-2.3672e-02,  4.0704e-01,  

In [None]:
# Scratch: compare the per-step output at position 5 (y[0][5]) with the final hidden state (y[1])
y=nn.RNN(10, 64)(nn.Embedding(324, embedding_dim=10)(dataset[10][0]))
y[0][5],y[1]

(tensor([-0.0139,  0.0480,  0.1607,  0.4661, -0.0915, -0.1745,  0.1955, -0.3491,
          0.1729,  0.1647, -0.1811, -0.4110,  0.2132,  0.2890,  0.3537, -0.2983,
         -0.6836,  0.1660,  0.4011, -0.0265, -0.1760, -0.0553,  0.3738, -0.1576,
         -0.0484,  0.1427, -0.2766, -0.0918, -0.1613,  0.0485, -0.3672,  0.2628,
         -0.2188, -0.0320, -0.0152, -0.0291,  0.2888,  0.0783, -0.2979,  0.1405,
          0.4196,  0.2605,  0.1415, -0.3071, -0.1631, -0.1310, -0.0436,  0.1497,
         -0.1115,  0.4950,  0.0876,  0.1786,  0.1746, -0.2364, -0.2519, -0.2753,
         -0.0537,  0.1468,  0.2176,  0.0472, -0.0940,  0.0996,  0.1269,  0.0336],
        grad_fn=<SelectBackward0>),
 tensor([[-0.0139,  0.0480,  0.1607,  0.4661, -0.0915, -0.1745,  0.1955, -0.3491,
           0.1729,  0.1647, -0.1811, -0.4110,  0.2132,  0.2890,  0.3537, -0.2983,
          -0.6836,  0.1660,  0.4011, -0.0265, -0.1760, -0.0553,  0.3738, -0.1576,
          -0.0484,  0.1427, -0.2766, -0.0918, -0.1613,  0.0485, -0.36

In [None]:
# Scratch: random tensor of shape (2, 3, 2)
torch.rand(2,3,2)

tensor([[[0.7267, 0.6293],
         [0.8957, 0.9706],
         [0.8192, 0.6885]],

        [[0.6700, 0.0032],
         [0.2618, 0.3503],
         [0.6850, 0.8073]]])

In [None]:
# Scratch: squeeze(0) drops the leading size-1 axis, leaving shape (3, 2)
torch.rand(1,3,2).squeeze(0).shape

torch.Size([3, 2])

In [None]:
# Scratch: the same three layer types SimpleRNN uses, applied one at a time to watch the shapes.
# Note that this rebinds the notebook-level names x and y.
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

# give the first encoded question a batch axis
a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
# c: hidden state at every step, d: final hidden state
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

# drop d's leading axis before the linear layer
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [None]:
# First encoded question as a 1-D index tensor.
# NOTE(review): the recorded output below is a torch.Size, which looks like it came from an
# earlier `.shape` version of this cell — re-run to confirm.
dataset[0][0]

torch.Size([6])