In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("100_Unique_QA_Dataset.csv")

In [3]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [4]:
# tokenize
def tokenize(text):
    """Split ``text`` into lower-case word tokens.

    Question marks and apostrophes are removed before splitting on
    whitespace; every other character (hyphens, digits, ...) is kept.
    """
    return text.lower().replace("?", "").replace("'", "").split()

In [5]:
tokenize("What is the capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

In [6]:
# vocab
vocab = {"<UNK>" : 0}

In [7]:
vocab

{'<UNK>': 0}

In [8]:
def build_vocab(row):
    """Register every unseen token of one row in the module-level ``vocab``.

    The row's "question" and "answer" texts are both tokenized first; each
    token that is not yet a key of ``vocab`` is then assigned the next free
    index (the current size of ``vocab``). Nothing is returned.
    """
    for word in tokenize(row["question"]) + tokenize(row["answer"]):
        # setdefault leaves existing entries untouched and only inserts new words.
        vocab.setdefault(word, len(vocab))

In [9]:
# Fills the module-level `vocab` as a side effect; build_vocab returns nothing,
# which is why the result displayed for this cell is a Series of None values.
df.apply(build_vocab, axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [10]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [11]:
len(vocab)

324

In [12]:
# converting text to numerical indices
def text_to_indices(text, vocab):
    """Translate ``text`` into a list of vocabulary indices.

    The text is tokenized with ``tokenize``; each token found in ``vocab`` is
    replaced by its index, and every other token by the "<UNK>" index.
    """
    return [
        vocab[word] if word in vocab else vocab["<UNK>"]
        for word in tokenize(text)
    ]

In [13]:
text_to_indices("what is ajay", vocab)

[1, 2, 0]

In [14]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

In [16]:
class QADataset(Dataset):
    """Dataset that serves (question, answer) pairs as tensors of vocabulary indices."""

    def __init__(self, df, vocab):
        # Keep references only; the text is converted lazily in __getitem__.
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the ``index``-th row as ``(question_tensor, answer_tensor)``."""
        row = self.df.iloc[index]
        question_ids = text_to_indices(row["question"], self.vocab)
        answer_ids = text_to_indices(row["answer"], self.vocab)

        return torch.tensor(question_ids), torch.tensor(answer_ids)

In [17]:
dataset = QADataset(df, vocab)

In [18]:
# batch_size=1 sidesteps padding: the question tensors have different lengths and
# no collate_fn is supplied, so samples are served one at a time.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [19]:
for question, answer in dataloader:
    print(question, answer)

tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([[185]])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([[244]])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([[7]])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([[246]])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([[188]])
tensor([[ 1,  2,  3, 69,  5,  3, 70, 71]]) tensor([[72]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([[113]])
tensor([[  1,   2,   3,  37, 133,   5,  26]]) tensor([[134]])
tensor([[  1,   2,   3,  69,   5, 155]]) tensor([[156]])
tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([[162]])
tensor([[ 42, 200,   2,  14, 201, 202, 203, 204]]) tensor([[205]])
tensor([[  1,   2,   3, 221,   5, 222, 223, 224]]) tensor([[225]])
tensor([[ 78,  79, 195,  81,  19,   3, 196, 197, 198]]) tensor([[199]])
tensor([[ 10,  96,   3, 104, 239]]) tensor([[240]])
tensor([[ 42, 299, 30

In [20]:
from torch import nn

In [86]:
class SimpleRNN(nn.Module):
    """Single-layer RNN that maps a tokenized question to logits over the vocabulary.

    Args:
        vocab_size: number of vocabulary entries; used as the embedding table
            size and as the number of output classes.
        embedding_dim: size of each token embedding (default 50).
        hidden_size: size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, question):
        """Return logits of shape (batch, vocab_size) for a (batch, seq_len) index tensor."""
        embedded_question = self.embeddings(question)
        # Only the final hidden state is needed; the per-step outputs are discarded.
        _, final = self.rnn(embedded_question)
        # final is (1, batch, hidden_size) for this single-layer RNN; drop the layer axis.
        return self.fc(final.squeeze(0))

In [38]:
dataset[3][0]

tensor([ 1,  2,  3, 17, 18, 19, 20, 21, 22])

In [34]:
x = nn.Embedding(324, embedding_dim = 50) 
# The embedding layer maps every token index to a 50-dim vector, so a question with N tokens becomes an [N, 50] tensor (e.g. 6 tokens -> [6, 50]).

In [41]:
print(x(dataset[3][0]).shape)

torch.Size([9, 50])


In [44]:
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [45]:
print(x(dataset[0][0]).shape)
a = x(dataset[0][0])
a

torch.Size([6, 50])


tensor([[ 2.0228e+00,  1.2902e+00,  4.7001e-01,  1.3188e+00, -1.2374e-01,
         -6.7735e-01,  1.5142e+00, -5.9998e-01, -7.1084e-01, -1.2694e+00,
         -9.3589e-01, -5.0032e-01,  3.6159e-01, -4.5901e-03,  2.1516e-02,
          1.6140e+00,  7.3907e-01,  2.8858e-01, -5.2929e-01,  4.8753e-01,
         -6.3531e-01,  1.1588e+00,  9.8738e-01, -1.3891e-01, -4.0180e-01,
          1.5329e+00, -1.2546e-01, -2.8893e-01,  8.5959e-01,  4.3177e-01,
         -3.6928e-01, -9.7027e-01,  1.4610e+00, -5.5727e-01,  1.2439e+00,
         -8.5883e-01,  7.0716e-01,  2.6460e-01,  1.0229e-01, -2.4431e+00,
         -5.9472e-01,  5.1581e-01, -7.5611e-01, -1.5905e+00,  7.4599e-01,
         -1.0506e+00, -1.4279e-01,  4.7230e-01, -1.7027e+00, -5.2122e-01],
        [ 1.6101e-01,  1.6037e+00, -5.7788e-01,  6.5995e-02, -7.7933e-01,
          4.1706e-01, -9.6231e-01,  2.1237e+00,  1.0935e+00, -4.9366e-01,
          3.2673e-01, -1.0456e+00,  9.4265e-01,  5.5226e-01, -8.9401e-01,
          6.8184e-01,  7.2779e-01, -1

In [46]:
y = nn.RNN(50, 64)

In [47]:
y(a)

(tensor([[ 0.0343,  0.5501, -0.7140,  0.2809,  0.4558,  0.3066, -0.0506, -0.5815,
           0.4267,  0.7010,  0.1682,  0.3963,  0.1139, -0.0600, -0.0263, -0.5276,
           0.3610, -0.2113,  0.3841,  0.3248, -0.0533,  0.5559, -0.5290,  0.0324,
           0.3452, -0.7156, -0.1289, -0.2774,  0.1417, -0.2848,  0.3456,  0.2753,
          -0.0347, -0.6000, -0.5078, -0.1551,  0.5495,  0.4815,  0.0849, -0.2114,
          -0.5026,  0.4782,  0.6379, -0.2080,  0.8566, -0.1086,  0.3251,  0.4758,
          -0.3609, -0.5262, -0.3731, -0.3837, -0.7279,  0.4402,  0.6754,  0.2791,
          -0.4241,  0.5992,  0.0162,  0.6337, -0.5592, -0.4798, -0.4925,  0.4369],
         [-0.0712,  0.2222,  0.4529, -0.2210,  0.7302, -0.3973, -0.0742, -0.6222,
          -0.2711, -0.3236,  0.0349,  0.0576,  0.6769, -0.1995,  0.7648, -0.4688,
           0.7781, -0.1518,  0.5099,  0.1968, -0.6103, -0.1208,  0.0033, -0.3294,
          -0.0727,  0.2108,  0.3317, -0.5843,  0.5853, -0.1612, -0.2516,  0.6024,
          -0.72

In [56]:
# hidden states
y(a)[0].shape
# these are the outputs of intermediate stage in RNNs
# outputs like [o1, o2, o3, o4, o5, o6]

torch.Size([6, 64])

In [63]:
# final output
b = y(a)[1]
y(a)[1].shape
# This is the final hidden state; it equals the last row of the per-step outputs above,
# i.e. the final output is [o6].

torch.Size([1, 64])

In [64]:
z = nn.Linear(64, 324)

In [66]:
print(z(b).shape)
z(b)

torch.Size([1, 324])


tensor([[ 0.0432,  0.3001,  0.6356,  0.0551,  0.6062, -0.6885, -0.1355, -0.2815,
          0.2893,  0.0504,  0.2582,  0.2666, -0.3728,  0.1008,  0.3112,  0.1854,
         -0.0649,  0.4106, -0.0940,  0.7339, -0.0094,  0.1674, -0.1012,  0.0696,
          0.6726,  0.3734, -0.0992,  0.2714,  0.1761, -0.2645, -0.3378, -0.1272,
         -0.1231,  0.5825,  0.1101, -0.1220, -0.2049,  0.2044,  0.1554,  0.7725,
         -0.0603, -0.0491,  0.2144, -0.3450, -0.5859, -0.5520, -0.3242, -0.5169,
         -0.1089, -0.1414, -0.0416, -0.2338,  0.0865, -0.0989,  0.1894, -0.1820,
         -0.0528,  0.5004, -0.1386,  0.0953,  0.0532,  0.4272,  0.0981, -0.6189,
          0.3088, -0.1921, -0.2097, -0.0521,  0.1431,  0.0316, -0.3511, -0.5057,
         -0.1556, -0.0972,  0.3879, -0.2747, -0.0440, -0.2688,  0.0074,  0.1298,
         -0.0070, -0.1553, -0.3648, -0.8632, -0.0346, -0.3082,  0.2869,  0.0899,
         -0.2173,  0.0471, -0.2480, -0.3319,  0.3620, -0.1030,  0.1714,  0.1729,
         -0.1494, -0.2754, -

In [82]:
# Debugging the shapes WITHOUT batch_first: nn.RNN then reads its input as
# (seq_len, batch, features), so the (1, 6, 50) tensor below is treated as
# six independent sequences of length 1 rather than one sequence of 6 tokens.
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64)
z = nn.Linear(64, 324)

# Add an explicit leading axis of size 1 to the 6-token question.
a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
# c holds the per-step outputs, d the final hidden state.
c, d = y(b)
print("shape of c:", c.shape)
# d comes out as (1, 6, 64): one final state per "batch" entry, i.e. per token.
print("shape of d:", d.shape)

e = z(d)

# Hence e is (1, 6, 324) instead of one prediction for the whole question.
print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 6, 64])
shape of e: torch.Size([1, 6, 324])


In [81]:
# Same shape walk-through WITH batch_first=True: the (1, 6, 50) input is now
# read as (batch, seq_len, features), i.e. one sequence of 6 tokens.
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
# c holds the per-step outputs, d the final hidden state.
c, d = y(b)
print("shape of c:", c.shape)
# d is (1, 1, 64): a single final state for the single question.
print("shape of d:", d.shape)

# squeeze(0) removes the leading axis so the linear layer yields (1, 324).
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [87]:
learning_rate = 0.001
epochs = 20

In [88]:
model = SimpleRNN(len(vocab))

In [89]:
# CrossEntropyLoss works on raw logits, which is what SimpleRNN.forward returns
# (softmax is only applied later, at prediction time).
criterian = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [93]:
# training loop

for epoch in range(epochs):

    # Sum of the per-sample losses seen in this epoch (not an average).
    total_loss = 0

    for question, answer in dataloader:

        optimizer.zero_grad()

        # forward pass: logits of shape (batch, vocab_size)
        output = model(question)

        # loss: the DataLoader uses batch_size=1, so `answer` is (1, answer_len);
        # answer[0] drops the batch axis to obtain the class-index target.
        # NOTE(review): this only lines up with `output` for single-token answers.
        loss = criterian(output, answer[0])

        # backward
        loss.backward()

        # update
        optimizer.step()

        total_loss += loss.item()

    # `.4f` prints four decimals; the earlier `4f` only set a minimum width of 4.
    print(f"Epoch: {epoch+1} | loss: {total_loss:.4f}")

Epoch: 1 | loss: 10.302537
Epoch: 2 | loss: 9.025960
Epoch: 3 | loss: 7.976411
Epoch: 4 | loss: 7.127881
Epoch: 5 | loss: 6.389466
Epoch: 6 | loss: 5.756176
Epoch: 7 | loss: 5.204326
Epoch: 8 | loss: 4.739720
Epoch: 9 | loss: 4.309742
Epoch: 10 | loss: 3.941969
Epoch: 11 | loss: 3.625783
Epoch: 12 | loss: 3.336977
Epoch: 13 | loss: 3.076548
Epoch: 14 | loss: 2.850929
Epoch: 15 | loss: 2.638151
Epoch: 16 | loss: 2.450876
Epoch: 17 | loss: 2.279607
Epoch: 18 | loss: 2.120859
Epoch: 19 | loss: 1.981034
Epoch: 20 | loss: 1.850931


In [127]:
def predict(model, question, thresold=0.5):
    """Print the model's answer to ``question``, or "I dont know" when unsure.

    Args:
        model: trained network mapping a (1, seq_len) tensor of vocabulary
            indices to logits over the vocabulary.
        question: raw question text; it is tokenized and indexed with the
            module-level ``vocab``.
        thresold: minimum softmax probability required to print an answer
            (the spelling is kept so existing keyword callers keep working).

    Returns:
        None. The result is printed.
    """
    # convert question to numbers
    numerical_question = text_to_indices(question, vocab)

    # An empty token list would produce a float tensor, which an embedding
    # layer cannot index, so there is nothing to predict from.
    if not numerical_question:
        print("I dont know")
        return

    # convert to tensor and add the batch axis
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(question_tensor)  # logits
        probs = nn.functional.softmax(output, dim=1)
        value, index = torch.max(probs, dim=1)

    # Stop here when the model is not confident enough; previously the
    # low-confidence answer was still printed after "I dont know".
    if value.item() < thresold:
        print("I dont know")
        return

    # vocab indices were assigned in insertion order, so position == index.
    print(list(vocab.keys())[index.item()])
    print(index, value)

In [129]:
predict(model, "What is largest planet in our solar system")

jupiter
tensor([23]) tensor([0.9644], grad_fn=<MaxBackward0>)


In [130]:
list(vocab.keys())[23]

'jupiter'