# Imports

In [1]:
import os, time

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
from plotly import express as px

# import nltk
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

torch.__version__

'2.5.1'

setups

In [2]:
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [3]:
torch.manual_seed(44)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available, so using", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available, so using CPU only.")

CUDA is available, so using NVIDIA GeForce GTX 1660 Ti


# Data Loading

In [4]:
with open("Data/Other/campusx-faq-dataset.txt", "r", encoding="utf-8") as f:
    document = f.read()

document = document.lower()

# Data Preparation

tokenize

In [5]:
document = document.lower()

tokens = word_tokenize(document)
unique_tokens = list(set(tokens))

len(tokens), len(unique_tokens)

(1018, 288)

vocab space

In [6]:
vocab_space = {'<UNK>': 0} # for out of vocab

for token in unique_tokens:
    if not vocab_space.get(token):
        vocab_space[token] = len(vocab_space)

len(vocab_space)

289

vectors

In [7]:
def text_to_vector(text, vocab_space):

    tokens = word_tokenize(text.lower())

    vector = []
    for token in tokens:
        if vocab_space.get(token):
            vector.append(vocab_space[token])
        else:
            vector.append(vocab_space['<UNK>'])

    return vector


text_to_vector("You have to attempt all the course assessments.", vocab_space)

[191, 8, 253, 174, 33, 61, 166, 178, 84]

In [8]:
sentences = document.split('\n')

vector_sentences = []
for sentence in sentences:
    vector_sentence = text_to_vector(sentence, vocab_space)
    vector_sentences.append(vector_sentence)

len(vector_sentences)

77

training sequences

In [9]:
training_sequences = []
for id, vector_sentence in enumerate(vector_sentences):
    for i in range(1, len(vector_sentence)):
        training_sequences.append(vector_sentence[:i + 1])

len(training_sequences)

942

padding

In [10]:
sequence_lengths = [len(sequence) for sequence in training_sequences]
longest_sequence = max(sequence_lengths)
longest_sequence

62

In [11]:
padded_training_sequences = []
for sequence in training_sequences:
    pad_list = [0] * (longest_sequence - len(sequence))
    padded_training_sequence = pad_list + sequence
    padded_training_sequences.append(padded_training_sequence)

padded_training_sequences = torch.tensor(padded_training_sequences, dtype=torch.long)
padded_training_sequences.shape

torch.Size([942, 62])

dataset

In [12]:
X = padded_training_sequences[:, :-1]
Y = padded_training_sequences[:, -1]

X.shape, Y.shape

(torch.Size([942, 61]), torch.Size([942]))

In [13]:
class FAQDataset(Dataset):

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
    
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
    

dataset = FAQDataset(X, Y)

dataloader

In [14]:
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, pin_memory=True)

# Model Training

defining model

In [15]:
class FAQModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=100) # OHE vector x embedding
        self.lstm = nn.LSTM(100, 150, batch_first=True)
        self.fc = nn.Linear(150, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        hidden_states, (final_hidden_state, final_cell_state) = self.lstm(embedded)
        out = self.fc(final_hidden_state.squeeze(0))

        return out
    

model = FAQModel(vocab_size=len(vocab_space))
model = model.to(device)

defining loss and optimizer

In [16]:
criterion = nn.CrossEntropyLoss()

epochs = 75
learning_rate = 0.001

optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

training pipeline

In [17]:
model.train() # set the model to training mode

avg_losses = []
for epoch in range(epochs):

    loss = []
    for batch in dataloader:
        batch_x, batch_y = batch
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # forward pass
        batch_y_pred = model(batch_x)
        
        # loss calculation
        batch_loss = criterion(batch_y_pred, batch_y)

        # backward pass
        optimizer.zero_grad()
        batch_loss.backward()

        # update parameters
        optimizer.step()

        loss.append(batch_loss.item())

    avg_loss = np.mean(loss)
    avg_losses.append(avg_loss)
    print(f"Epoch: {epoch + 1}, Loss: {avg_loss:.4f}")

Epoch: 1, Loss: 5.6052
Epoch: 2, Loss: 5.1677
Epoch: 3, Loss: 4.7654
Epoch: 4, Loss: 4.4974
Epoch: 5, Loss: 4.2118
Epoch: 6, Loss: 3.9390
Epoch: 7, Loss: 3.6598
Epoch: 8, Loss: 3.3943
Epoch: 9, Loss: 3.1473
Epoch: 10, Loss: 2.8935
Epoch: 11, Loss: 2.6557
Epoch: 12, Loss: 2.4524
Epoch: 13, Loss: 2.2440
Epoch: 14, Loss: 2.0503
Epoch: 15, Loss: 1.8784
Epoch: 16, Loss: 1.7163
Epoch: 17, Loss: 1.5718
Epoch: 18, Loss: 1.4376
Epoch: 19, Loss: 1.3065
Epoch: 20, Loss: 1.2000
Epoch: 21, Loss: 1.0914
Epoch: 22, Loss: 0.9977
Epoch: 23, Loss: 0.9108
Epoch: 24, Loss: 0.8369
Epoch: 25, Loss: 0.7679
Epoch: 26, Loss: 0.7064
Epoch: 27, Loss: 0.6533
Epoch: 28, Loss: 0.6001
Epoch: 29, Loss: 0.5598
Epoch: 30, Loss: 0.5232
Epoch: 31, Loss: 0.4858
Epoch: 32, Loss: 0.4542
Epoch: 33, Loss: 0.4307
Epoch: 34, Loss: 0.4047
Epoch: 35, Loss: 0.3791
Epoch: 36, Loss: 0.3569
Epoch: 37, Loss: 0.3402
Epoch: 38, Loss: 0.3263
Epoch: 39, Loss: 0.3087
Epoch: 40, Loss: 0.2975
Epoch: 41, Loss: 0.2831
Epoch: 42, Loss: 0.2714
E

In [18]:
px.line(x = range(1, epochs + 1), y = avg_losses)