# Imports

In [1]:
import os, time

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
from plotly import express as px

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

torch.__version__

'2.5.1'

Setup: random seed and compute device

In [2]:
# Fix the RNG seed so weight initialisation is repeatable between runs.
torch.manual_seed(44)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("CUDA is available, so using", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available, so using CPU only.")

CUDA is available, so using NVIDIA GeForce GTX 1660 Ti


# Data Loading

In [3]:
# Read the question/answer pairs and give the two columns fixed names,
# then display the frame as the cell output.
df = (
    pd.read_csv("Data/QnA/100-unique-qna-dataset.csv")
    .set_axis(['Question', 'Answer'], axis=1)
)
df

Unnamed: 0,Question,Answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


# Data Preparation

tokenize

In [4]:
def tokenize(text):
    """Split a sentence into lowercase word tokens.

    Question marks and apostrophes are removed, then the text is split on runs
    of whitespace, so repeated spaces or a space before the '?' never produce
    empty-string tokens.

    Args:
        text (str): Raw sentence, e.g. a question or an answer.

    Returns:
        list[str]: Tokens in their original order; an empty list when the text
        contains no words.
    """
    cleaned = text.lower().replace('?', '').replace("'", '') # removing ?, '

    # str.split() without a separator collapses consecutive whitespace and
    # ignores leading/trailing whitespace, unlike split(' ').
    return cleaned.split()

# Sanity check on a sample question that contains quotes and a trailing '?'.
text = "What is the capital of 'France'?"
tokenize(text)

['what', 'is', 'the', 'capital', 'of', 'france']

vocab space

In [5]:
vocab_space = {'<UNK>': 0} # for out of vocab

def build_vocab_space(row):
    """Add every unseen token of one question/answer row to ``vocab_space``.

    New tokens receive the next free integer id (the current vocabulary size).
    The function mutates the notebook-level ``vocab_space`` dict and returns
    nothing, so it can be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Mapping-like row exposing 'Question' and 'Answer' text fields.
    """
    tokens = tokenize(row['Question']) + tokenize(row['Answer'])

    for token in tokens:
        # Membership test instead of truthiness of .get(): a token already
        # stored with id 0 must not be treated as missing and re-numbered.
        if token not in vocab_space:
            vocab_space[token] = len(vocab_space)


# Run build_vocab_space on every row; it fills vocab_space as a side effect,
# so the value returned by apply is discarded.
_ = df.apply(build_vocab_space, axis=1)

# Number of entries in the vocabulary, including the '<UNK>' entry.
len(vocab_space)

324

vectors

In [6]:
def text_to_vector(text, vocab_space):
    """Convert a sentence into a list of vocabulary ids.

    The text is tokenized with ``tokenize``; each token is replaced by its id
    from ``vocab_space``. Tokens that are not in the vocabulary are replaced
    by the id stored under '<UNK>'.

    Args:
        text (str): Sentence to encode.
        vocab_space (dict): Mapping from token to integer id; must contain
            '<UNK>' whenever an unknown token can occur.

    Returns:
        list[int]: One id per token, in token order.
    """
    # Membership test rather than truthiness of .get(): a known token whose id
    # is 0 is a valid entry and must not be mistaken for an unknown word.
    return [
        vocab_space[token] if token in vocab_space else vocab_space['<UNK>']
        for token in tokenize(text)
    ]


# "Ahbaz" is not expected to be in the vocabulary, so it should come back as
# the '<UNK>' id.
text = "Ahbaz What is the capital of 'France'?"
text_to_vector(text, vocab_space)

[0, 1, 2, 3, 4, 5, 6]

dataset

In [7]:
class QnADataset(Dataset):
    """Dataset that yields (question, answer) pairs as tensors of token ids.

    Args:
        df: DataFrame with 'Question' and 'Answer' text columns.
        vocab_space (dict): Token-to-id mapping handed to ``text_to_vector``.
    """

    def __init__(self, df, vocab_space):
        self.df = df
        self.vocab_space = vocab_space

    def __len__(self):
        """Return the number of question/answer rows."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the ``idx``-th row as ``(question_ids, answer_ids)`` tensors."""
        row = self.df.iloc[idx]

        # text -> token -> vector, skipping padding since batch_size=1.
        # Encode with the vocabulary given to this instance, not whatever
        # notebook-level `vocab_space` happens to exist.
        que_vector = text_to_vector(row['Question'], self.vocab_space)
        ans_vector = text_to_vector(row['Answer'], self.vocab_space)

        # vector -> tensor; the dtype is explicit so that even an empty vector
        # yields an integer tensor usable as embedding indices.
        que_tensor = torch.tensor(que_vector, dtype=torch.long)
        ans_tensor = torch.tensor(ans_vector, dtype=torch.long)

        return que_tensor, ans_tensor
    

# Wrap the question/answer frame and the vocabulary in a torch Dataset.
dataset = QnADataset(df, vocab_space)

dataloader

In [8]:
# One sample per batch, served in dataset order, so the variable-length
# questions never have to be padded to a common length.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    pin_memory=True,
)

# Model Training

defining model

In [9]:
class QnAModel(nn.Module):
    """Embedding -> RNN -> linear classifier over the vocabulary.

    The question is embedded token by token, run through a recurrent layer,
    and the RNN's final hidden state is projected to one logit per
    vocabulary entry.

    Args:
        vocab_size (int): Number of vocabulary entries; sizes both the
            embedding table and the output layer.
        embedding_dim (int): Width of each token embedding. Defaults to 50.
        hidden_size (int): Width of the RNN hidden state. Defaults to 64.
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()

        # Lookup table: token id -> dense vector of size embedding_dim.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, que):
        """Return vocabulary logits for a batch of token-id sequences.

        Args:
            que: Integer tensor of token ids, laid out as (batch, seq_len).

        Returns:
            Tensor of logits with one row per batch item and one column per
            vocabulary entry.
        """
        embedded_que = self.embedding(que) # (batch, seq_len) -> (batch, seq_len, emb)

        # Only the final hidden state is used; the per-step outputs are ignored.
        _step_outputs, final = self.rnn(embedded_que)

        # Drop the leading layer dimension of the final state before projecting.
        out = self.fc(final.squeeze(0))

        return out


# Build the network with one output per vocabulary entry and move it to the
# selected device in a single step.
model = QnAModel(vocab_size=len(vocab_space)).to(device)

defining loss and optimizer

In [10]:
# Training hyperparameters.
epochs = 30
learning_rate = 0.001

# Cross-entropy over the vocabulary logits, minimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

training pipeline

In [11]:
model.train()  # switch the model to training mode

avg_losses = []
for epoch in range(epochs):
    epoch_losses = []

    for batch_x, batch_y in dataloader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # clear the gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass + loss; each batch holds one sample, so batch_y[0] is
        # that sample's answer ids
        batch_loss = criterion(model(batch_x), batch_y[0])

        # backward pass, then parameter update
        batch_loss.backward()
        optimizer.step()

        epoch_losses.append(batch_loss.item())

    # mean loss over all samples seen in this epoch
    avg_loss = np.mean(epoch_losses)
    avg_losses.append(avg_loss)
    print(f"Epoch: {epoch + 1}, Loss: {avg_loss:.4f}")

Epoch: 1, Loss: 5.8311
Epoch: 2, Loss: 5.0880
Epoch: 3, Loss: 4.2272
Epoch: 4, Loss: 3.5061
Epoch: 5, Loss: 2.9173
Epoch: 6, Loss: 2.3700
Epoch: 7, Loss: 1.8747
Epoch: 8, Loss: 1.4504
Epoch: 9, Loss: 1.1078
Epoch: 10, Loss: 0.8453
Epoch: 11, Loss: 0.6513
Epoch: 12, Loss: 0.5102
Epoch: 13, Loss: 0.4073
Epoch: 14, Loss: 0.3313
Epoch: 15, Loss: 0.2739
Epoch: 16, Loss: 0.2295
Epoch: 17, Loss: 0.1942
Epoch: 18, Loss: 0.1656
Epoch: 19, Loss: 0.1423
Epoch: 20, Loss: 0.1231
Epoch: 21, Loss: 0.1074
Epoch: 22, Loss: 0.0944
Epoch: 23, Loss: 0.0836
Epoch: 24, Loss: 0.0745
Epoch: 25, Loss: 0.0667
Epoch: 26, Loss: 0.0601
Epoch: 27, Loss: 0.0544
Epoch: 28, Loss: 0.0495
Epoch: 29, Loss: 0.0451
Epoch: 30, Loss: 0.0413


In [12]:
# Plot the mean training loss per epoch. The x axis is derived from the
# recorded losses so it always matches their length, even if training was
# interrupted or `epochs` was changed afterwards.
px.line(
    x=list(range(1, len(avg_losses) + 1)),
    y=avg_losses,
    labels={'x': 'Epoch', 'y': 'Average loss'},
    title='Training loss per epoch',
)

# Model Evaluation

Prediction

In [13]:
def predict(model, question, threshold=0.5):
    """Print the model's answer to ``question``, or a low-confidence notice.

    The question is encoded with the notebook-level ``vocab_space``, run
    through ``model`` on the notebook-level ``device``, and the most probable
    vocabulary entry is printed together with its softmax probability.

    Args:
        model: Trained network returning vocabulary logits for a
            (1, seq_len) tensor of token ids. It is switched to eval mode.
        question (str): Question text.
        threshold (float): Minimum probability required to print an answer.

    Returns:
        None. The result is printed.
    """
    # que -> vector
    question_vector = text_to_vector(question, vocab_space)

    # A question without any token cannot be fed to the embedding layer.
    if not question_vector:
        print("I don't know, because the question contains no words")
        return

    # vector -> tensor with a leading batch dimension of 1
    question_tensor = torch.tensor(question_vector, dtype=torch.long).unsqueeze(0)
    question_tensor = question_tensor.to(device)

    model.eval()  # set model to evaluation mode

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(question_tensor)
        probs = nn.functional.softmax(output, dim=1)

    value, index = torch.max(probs, dim=1)
    confidence = value.item()

    if confidence < threshold:
        print(f"I don't know, because of low confidence {confidence}")
    else:
        # Look the answer up by id rather than by position in the dict, so the
        # result does not depend on insertion order.
        id_to_token = {token_id: token for token, token_id in vocab_space.items()}
        answer = id_to_token[index.item()]
        print(f"Answer: {answer} with confidence {confidence}")



# predict(model, "Ahbaz?")
# Ask a question with the same wording as the tokenizer demo above the model.
predict(model, "What is the capital of 'France'?")

Answer: paris with confidence 0.9526240229606628
