In [1]:
import pandas as pd

In [2]:
# Load the question/answer pairs and show the first five rows as a sanity check.
df = pd.read_csv(filepath_or_buffer='100_Unique_QA_Dataset.csv')
print(df.head(n=5))

                                          question      answer
0                   What is the capital of France?       Paris
1                  What is the capital of Germany?      Berlin
2               Who wrote 'To Kill a Mockingbird'?  Harper-Lee
3  What is the largest planet in our solar system?     Jupiter
4   What is the boiling point of water in Celsius?         100


In [7]:
# Characters deleted before splitting. Both parentheses are listed so that
# "(water)" becomes "water" rather than "water)". Hyphens are not stripped,
# so an answer such as 'Harper-Lee' stays a single token.
_PUNCTUATION = '?.,!;:"\'()'
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)


def tokenize(text):
    """Lower-case ``text``, delete punctuation, and split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens; an empty list for empty or
        whitespace-only input.
    """
    # A single translate pass replaces the former chain of str.replace calls.
    return text.lower().translate(_PUNCTUATION_TABLE).split()
tokens = tokenize("This is a sample sentence?")
tokens

['this', 'is', 'a', 'sample', 'sentence']

In [8]:
# Token -> integer id mapping, seeded with the '<UNK>' placeholder at id 0.
vocab = {}
vocab['<UNK>'] = 0

In [11]:
def build_vocab(row):
    """Register the tokens of one question/answer row in the global ``vocab``.

    Prints the raw question/answer pair and the combined token list, then
    gives every token not yet present in ``vocab`` the next free id (the
    current size of the dict). Returns None.

    Args:
        row: Mapping-like object with 'question' and 'answer' entries.
    """
    question, answer = row['question'], row['answer']
    print(question, answer)

    merged_tokens = tokenize(question) + tokenize(answer)
    print(merged_tokens)

    for token in merged_tokens:
        if token in vocab:
            continue
        # len(vocab) grows by one per insertion, so ids stay consecutive.
        vocab[token] = len(vocab)
    

In [12]:
# Row-wise pass over the frame; build_vocab is called for its side effect on
# `vocab`, so the resulting Series holds only None values.
df.apply(build_vocab, axis='columns')

What is the capital of France? Paris
['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
What is the capital of Germany? Berlin
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
Who wrote 'To Kill a Mockingbird'? Harper-Lee
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
What is the largest planet in our solar system? Jupiter
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
What is the boiling point of water in Celsius? 100
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
Who painted the Mona Lisa? Leonardo-da-Vinci
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
What is the square root of 64? 8
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
What is the chemical symbol for gold? Au
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
Which year did World War II end? 1945
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
What is the longe

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [13]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [14]:
def text_to_indices(text, vocab):
    """Translate ``text`` into a list of vocabulary ids.

    The text is split with ``tokenize``. A token present in ``vocab`` maps to
    its stored id; any other token maps to ``vocab['<UNK>']``.

    Args:
        text: String to convert.
        vocab: Mapping from token to integer id; must contain '<UNK>' if the
            text can hold tokens missing from the mapping.

    Returns:
        List of ids, one per token, in token order.
    """
    return [
        vocab[token] if token in vocab else vocab['<UNK>']
        for token in tokenize(text)
    ]

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

In [16]:
class QADataset(Dataset):
    """Map-style dataset that serves question/answer rows as index tensors.

    Args:
        df: DataFrame with 'question' and 'answer' columns; rows are read by
            position via ``iloc``.
        vocab: Token -> id mapping passed to ``text_to_indices``.
    """

    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the ``idx``-th (question, answer) pair as two id tensors."""
        row = self.df.iloc[idx]
        # Use the vocabulary given to the constructor. The previous code read
        # the notebook-level global `vocab`, silently ignoring `self.vocab`.
        question = text_to_indices(row['question'], self.vocab)
        answer = text_to_indices(row['answer'], self.vocab)
        return torch.tensor(question), torch.tensor(answer)

In [17]:
dataset = QADataset(df, vocab)

In [18]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [19]:
# Serve one (question, answer) pair per batch, in shuffled order.
Dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)


In [20]:
import torch.nn as nn

In [None]:
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
        self.rnn = nn.RNN(50,64)
        self.fc = nn.Linear(64, vocab_size)