In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("100_Unique_QA_Dataset.csv")
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [3]:
#tokenize
def tokenize(text):
    """Split ``text`` into lower-cased, whitespace-separated tokens.

    Question marks and single quotes are removed before splitting; all other
    characters (hyphens, digits, ...) are kept as part of their token.

    Returns:
        list[str]: the tokens, or an empty list when nothing is left.
    """
    cleaned = text.lower()
    for unwanted in ("?", "'"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.split()

In [4]:
tokenize("Who wrote 'Romeo and Juliet'?")

['who', 'wrote', 'romeo', 'and', 'juliet']

In [5]:
#vocabulary
vocab = {'<UNK>':0}
vocab


{'<UNK>': 0}

In [6]:
def build_vocab(row):
    """Register the tokens of one row in the module-level ``vocab`` dict.

    The row's ``'question'`` and ``'answer'`` fields are tokenized, and every
    token not yet in ``vocab`` is given the next free id (the size of ``vocab``
    at the time it is inserted). Returns ``None``; the work is the mutation of
    ``vocab``.
    """
    row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]
    for word in row_tokens:
        # setdefault leaves already-known tokens (and their ids) untouched
        vocab.setdefault(word, len(vocab))

In [7]:
df.iloc[1]['question']

'What is the capital of Germany?'

In [8]:
df.apply(build_vocab,axis = 1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [9]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [10]:
len(vocab)

324

In [11]:
#convert words to num indices
def text_to_indices(text,vocab):
    """Convert ``text`` into a list of vocabulary ids.

    Args:
        text: raw string; it is tokenized with ``tokenize``.
        vocab: mapping from token to integer id; must contain ``'<UNK>'`` if
            any token of ``text`` is missing from it.

    Returns:
        list: one id per token, with out-of-vocabulary tokens mapped to the
        id stored under ``'<UNK>'``.
    """
    return [
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in tokenize(text)
    ]


In [12]:
text_to_indices("What is campusx?",vocab=vocab)

[1, 2, 0]

In [13]:
import torch
from torch.utils.data import Dataset,DataLoader

In [14]:
class QADataset(Dataset):
    """Dataset serving (question ids, answer ids) tensor pairs from a dataframe.

    Args:
        df: dataframe with ``'question'`` and ``'answer'`` columns.
        vocab: token-to-id mapping passed to ``text_to_indices``.
    """

    def __init__(self,df,vocab):
        self.vocab = vocab
        self.df = df

    def __len__(self):
        """Number of question/answer rows."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the ``index``-th row as ``(question_tensor, answer_tensor)``."""
        row = self.df.iloc[index]
        question_ids = text_to_indices(row['question'], self.vocab)
        answer_ids = text_to_indices(row['answer'], self.vocab)
        return torch.tensor(question_ids), torch.tensor(answer_ids)


In [15]:
dataset = QADataset(df,vocab)

In [16]:
dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))

In [17]:
dataloader = DataLoader(dataset,batch_size=1,shuffle=True)
# With batch_size=1 every batch holds a single question, so sequences of different lengths never have to be padded.

In [18]:
for question,answer in dataloader:
    print(question,answer)

tensor([[ 10,  11, 157, 158, 159]]) tensor([[160]])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[ 42, 137,   2, 138,  39, 175, 269]]) tensor([[99]])
tensor([[ 42, 250, 251, 118, 252, 253]]) tensor([[254]])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([[260]])
tensor([[ 42, 101,   2,   3,  17]]) tensor([[102]])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]]) tensor([[194]])
tensor([[ 42, 263, 264,  14, 265, 266, 158, 267]]) tensor([[268]])
tensor([[  1,   2,   3,  69,   5, 155]]) tensor([[156]])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([[16]])
tensor([[10, 29,  3, 30, 31]]) tensor([[32]])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([[7]])
tensor([[ 10, 140,   3, 141, 142,  12, 143,  83,   3, 144]]) tensor([[145]])
tensor([[ 10, 308,   3, 309, 310]]) tensor([[311]])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]]) tensor([[316]])
tensor([[ 1,  2,  3,  4,  5, 99]]) tensor([[100]])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 294]]) tensor([[295]])
tensor([[  1,   2,   

In [19]:
#rnn architecture
import torch.nn as nn

class SimpleNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocab_size: number of embedding rows and number of output classes
            (one logit per vocabulary entry).
        embedding_dim: size of each token embedding (default 50).
        hidden_size: size of the RNN hidden state (default 64).
    """

    def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
        super().__init__()
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        self.embedding = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
        # By default nn.RNN expects (seq_len, batch_size, input_size); our data is
        # (batch_size, seq_len, input_size), hence batch_first=True.
        # nn.RNN returns a tuple (hidden state at every step, final hidden state),
        # so the layers cannot simply be chained in an nn.Sequential container.
        self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)
        # (batch_size, hidden_size) -> (batch_size, vocab_size)
        self.fc = nn.Linear(hidden_size,vocab_size)

    def forward(self,question):
        """Map token ids of shape (batch_size, seq_len) to logits of shape (batch_size, vocab_size)."""
        embedded_question = self.embedding(question)
        # final is (1, batch_size, hidden_size): one layer, one direction.
        # The per-step hidden states are not needed for classification.
        _,final = self.rnn(embedded_question)
        # drop the leading layer dimension before the classifier
        output = self.fc(final.squeeze(0))

        return output
        


Test Architecture

In [20]:
len(vocab)

324

In [21]:
test_question = torch.tensor([[ 42, 137,2,138,39,175,269]])

In [22]:
embedding = nn.Embedding(324,50)

In [29]:
embedding.weight[4]

tensor([-0.3929, -0.7555, -0.3856, -0.2767,  0.6938,  2.6430, -2.4449,  0.7403,
         2.6030, -0.9837, -1.4852, -1.7795, -1.2435,  1.1410,  0.0836,  0.6653,
         0.1743, -1.4229, -0.5034,  0.8689,  0.2606, -2.2237,  0.9127,  0.2160,
        -0.1072,  0.7021,  0.6581,  0.3915, -0.0606,  0.4988, -0.6211, -0.3967,
        -0.9004,  0.0266,  0.7802, -0.1563,  2.6160, -2.9990, -0.4288,  0.0816,
         0.3562,  0.3013,  0.7703,  2.8769, -0.8720, -0.9588, -0.8056,  0.6607,
        -0.4894, -1.0140], grad_fn=<SelectBackward0>)

In [None]:
test_question.shape

torch.Size([1, 7])

In [None]:
test_embed = embedding(test_question)

In [None]:
test_embed

tensor([[[-9.4084e-02,  1.3554e+00,  9.8286e-01,  2.2110e-01, -8.4011e-02,
           4.5728e-02, -1.2521e+00,  3.9102e-01,  1.1644e+00, -2.4370e-01,
           1.0514e+00,  3.0279e-01,  3.4027e-01,  2.5432e-01,  5.6675e-01,
          -8.6733e-01,  2.4489e+00,  2.2599e-01,  1.9926e+00,  9.0528e-01,
          -1.3234e+00,  1.0407e+00, -1.6158e+00,  1.9492e+00,  5.5116e-01,
           3.7522e-01, -7.1691e-02,  2.2426e-01,  5.5321e-01, -2.7482e+00,
          -1.8221e+00,  1.0434e+00,  1.0565e+00,  1.2589e+00, -5.2421e-01,
           8.1276e-01, -2.4360e+00, -1.9424e+00,  6.4586e-01,  3.9437e-01,
          -1.0603e+00,  9.3096e-03, -3.2104e-01, -1.9822e+00, -1.3030e-02,
           4.9360e-01, -2.3385e-01,  9.0409e-01,  1.6889e-01,  8.1372e-01],
         [-2.1467e-01, -1.3902e+00,  8.4556e-01,  7.8303e-01, -7.1867e-01,
          -8.8965e-01,  1.5101e+00,  7.2903e-01,  1.0124e+00,  7.4131e-01,
           6.6974e-01,  2.2667e-01, -1.2132e+00, -2.1732e-02,  4.1930e-02,
          -1.4682e+00, -

In [None]:
test_embed.shape

torch.Size([1, 7, 50])

In [None]:
rnn = nn.RNN(50,64,batch_first=True) #input_size=50, hidden_size=64
# By default nn.RNN expects input of shape (seq_len, batch_size, input_size),
# but our data is (batch_size, seq_len, input_size), so we pass batch_first=True.

In [None]:
first,second =rnn(test_embed)


In [None]:
first.shape

torch.Size([1, 7, 64])

In [None]:
second.shape

torch.Size([1, 1, 64])

In [None]:
second

tensor([[[ 0.1895, -0.6207, -0.0844,  0.1979, -0.5990, -0.2964, -0.7810,
          -0.7218,  0.5224,  0.8418, -0.6870,  0.8018, -0.5941, -0.5047,
          -0.5172, -0.0828,  0.4543,  0.3174,  0.2996,  0.7059,  0.2459,
          -0.7843,  0.5197,  0.0102, -0.2185,  0.5318, -0.2488,  0.4722,
          -0.0294,  0.0193, -0.6658,  0.3550, -0.6983,  0.3227, -0.0882,
           0.8325,  0.0640,  0.0467, -0.3258,  0.5134, -0.6654,  0.0277,
           0.2671,  0.1848,  0.1400, -0.4369,  0.3591,  0.0673,  0.3006,
           0.1492,  0.3928, -0.1928, -0.6377, -0.8901,  0.2189,  0.9212,
           0.3947,  0.5878, -0.0632,  0.1532, -0.4839,  0.8729, -0.3097,
           0.0690]]], grad_fn=<StackBackward0>)

In [None]:
second.squeeze(0).shape


torch.Size([64])

In [None]:
lr = 0.001
epochs = 20

In [None]:
model = SimpleNN(len(vocab))


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lr)

In [None]:
#training loop 
for epoch in range(epochs):
    total_loss = 0
    for question,answer in dataloader:
        # clear gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass: logits of shape (batch_size, num_classes)
        output = model(question)

        # CrossEntropyLoss takes raw logits plus class ids of shape (batch_size,);
        # answer is (1, 1) here, so answer[0] is the (1,)-shaped target
        loss = criterion(output,answer[0])

        # backward pass, then parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch: {epoch+1}, Loss : {total_loss/len(dataloader)}")





Epoch: 1, Loss : 0.11223381703926458
Epoch: 2, Loss : 0.09845897427035702
Epoch: 3, Loss : 0.08685827950636546
Epoch: 4, Loss : 0.0775203652265999
Epoch: 5, Loss : 0.06947166745861372
Epoch: 6, Loss : 0.06289379267642896
Epoch: 7, Loss : 0.056638387776911256
Epoch: 8, Loss : 0.05150830816063616
Epoch: 9, Loss : 0.04696841357896726
Epoch: 10, Loss : 0.04304931296242608
Epoch: 11, Loss : 0.03956645568832755
Epoch: 12, Loss : 0.036360717337164616
Epoch: 13, Loss : 0.033604884965138304
Epoch: 14, Loss : 0.031043976183152862
Epoch: 15, Loss : 0.028827987155980533
Epoch: 16, Loss : 0.026724991264442603
Epoch: 17, Loss : 0.024923955524961152
Epoch: 18, Loss : 0.023213663614458507
Epoch: 19, Loss : 0.021677443705913093
Epoch: 20, Loss : 0.020250137471076516


In [None]:
#predictions
def predict(model,question,threshold=0.5):
    """Print the model's answer to ``question``, or "I don't know" when unsure.

    Args:
        model: network mapping a (1, seq_len) tensor of token ids to logits of
            shape (1, vocab_size).
        question: raw question string; converted to ids with the global ``vocab``.
        threshold: minimum softmax probability the top class must reach for
            its token to be printed.

    Returns:
        None. The result is printed.
    """
    # convert the question to vocabulary ids
    numerical_question = text_to_indices(question,vocab)
    if not numerical_question:
        # An empty id list would become a float tensor the embedding cannot index.
        print("I don't know")
        return

    # the embedding layer expects (batch_size, seq_len), so add a batch dimension
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)

    # inference only: no autograd graph is needed
    with torch.no_grad():
        output = model(question_tensor)

    # convert logits to probabilities and take the most likely class
    probs = torch.nn.functional.softmax(output,dim=1)
    value,index = torch.max(probs,dim=1)

    if value.item() < threshold:
        print("I don't know")
        # stop here so the low-confidence guess is not printed as well
        return

    # look the token up by id instead of relying on dict insertion order
    index_to_token = {token_id: token for token, token_id in vocab.items()}
    print(index_to_token[index.item()])
    

In [None]:
predict(model,"What is the capital of france")

paris
