#### **Section 1: Import Libraries**

We import all the necessary libraries. Notice that we no longer need to import `gensim`.



In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import re
import nltk
from sklearn.model_selection import train_test_split

# Ensure reproducibility
torch.manual_seed(42)

# Download necessary NLTK resources
nltk.download('punkt')

# Import word_tokenize explicitly from NLTK
from nltk.tokenize import word_tokenize

# Observations:
# - Added an import for `word_tokenize` explicitly after downloading the NLTK 'punkt' resource.
# - The word_tokenize function is now available globally in the script, and the error will no longer occur.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Girija\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### **Section 2: Load and Prepare Data**

The data loading process remains the same.



In [1]:
# Step 2: Load the Processed Data
#file_path = "../data/processed/customer_support_dataset_processed.csv"  # for complete set
file_path = "../data/processed/customer_support_train_dataset_processed_10%.csv" # for 10% for the complete set - for simple training
df = pd.read_csv(file_path)

# Ensure the dataset contains essential columns
if 'customer_query_cleaned' not in df.columns or 'support_response_cleaned' not in df.columns:
    raise ValueError("Dataset missing required columns: 'customer_query_cleaned' and 'support_response_cleaned'")

# Split data into input and output pairs
queries = df['customer_query_cleaned']
responses = df['support_response_cleaned']

# Split dataset into training and validation sets (90% train, 10% validation)
train_queries, val_queries, train_responses, val_responses = train_test_split(
    queries, responses, test_size=0.1, random_state=42
)

# Observations:
# - Loaded and validated the cleaned dataset.
# - Split the data into training and validation sets, which is critical for model evaluation and avoiding overfitting.


NameError: name 'pd' is not defined

#### **Section 3: Load Pre-trained GloVe Embeddings Without Gensim**

Instead of using Gensim, you will manually download the GloVe embeddings, read them, and then use them to create the embedding matrix.

##### **3.1 Download GloVe Embeddings Manually**

- You can download GloVe embeddings manually from the [GloVe Website](https://nlp.stanford.edu/projects/glove/). Choose, for example, the **glove.6B.zip** file and extract it.
- It contains multiple files like `glove.6B.50d.txt`, `glove.6B.100d.txt`, etc. We'll use `glove.6B.100d.txt` for 100-dimensional word embeddings.

##### **3.2 Load GloVe Embeddings in Python**



In [3]:
# Step 3: Load Pre-trained GloVe Embeddings (Without Gensim)
embedding_dim = 100
glove_path = "../glove.6B.100d.txt"  # Path to the downloaded GloVe file

# Initialize word2idx and embedding matrix lists
word2idx = {}
embedding_matrix = []

# Open the GloVe file and read the embeddings
print("Loading pre-trained GloVe embeddings (may take a few minutes)...")
with open(glove_path, 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        word2idx[word] = idx
        embedding_matrix.append(vector)

# Add special tokens with random embeddings
special_tokens = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
for token in special_tokens:
    word2idx[token] = len(embedding_matrix)
    embedding_matrix.append(np.random.normal(size=(embedding_dim,)))

# Convert embedding matrix to a tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

# Observations:
# - Loaded GloVe embeddings manually using Python without Gensim.
# - Created an embedding matrix and added random embeddings for special tokens.


Loading pre-trained GloVe embeddings (may take a few minutes)...


  embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)


#### **Section 4: Custom Dataset and DataLoader**

This section remains the same. The custom dataset is responsible for tokenizing the input and padding it to a fixed length.



In [4]:
# Step 4: Custom Dataset and DataLoader
class ChatDataset(Dataset):
    def __init__(self, queries, responses, word2idx, max_len=20):
        # Reset the index of queries and responses to ensure valid indexing
        self.queries = queries.reset_index(drop=True).fillna("")  # Replace NaN with empty string
        self.responses = responses.reset_index(drop=True).fillna("")  # Replace NaN with empty string
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, idx):
        # Convert text to token ids and pad/truncate to max_len
        try:
            query = self._text_to_sequence(self.queries[idx])
            response = self._text_to_sequence(self.responses[idx])
        except KeyError:
            print(f"KeyError: Index {idx} out of bounds for dataset length {len(self.queries)}")
            raise
        except Exception as e:
            print(f"Unexpected error at index {idx}: {e}")
            raise
        return torch.tensor(query, dtype=torch.long), torch.tensor(response, dtype=torch.long)

    def _text_to_sequence(self, text):
        # Handle non-string inputs
        if not isinstance(text, str):
            print(f"Invalid input detected: {text} (type: {type(text)}). Converting to empty string.")
            text = ""

        tokens = word_tokenize(text)  # Tokenize the text
        sequence = [self.word2idx.get(token, self.word2idx['<UNK>']) for token in tokens]
        sequence = [self.word2idx['<SOS>']] + sequence + [self.word2idx['<EOS>']]
        sequence = sequence[:self.max_len] + [self.word2idx['<PAD>']] * (self.max_len - len(sequence))
        return sequence

# DataLoader instances for training and validation
train_dataset = ChatDataset(train_queries, train_responses, word2idx)
val_dataset = ChatDataset(val_queries, val_responses, word2idx)

# DataLoader setup with reduced batch size to lower memory consumption
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=0, pin_memory=True)


#### **Section 5: Encoder-Decoder Model Design with Attention**

The encoder and decoder design with attention remains largely unchanged, except that we use the manually loaded GloVe embeddings.

##### **5.1 Encoder Definition**



In [5]:
# Encoder Definition
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_matrix, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.rnn = nn.LSTM(embedding_matrix.size(1), hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        # x: [batch_size, seq_len]
        embedded = self.embedding(x)  # embedded: [batch_size, seq_len, embedding_dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        return outputs, hidden, cell

# Observations:
# - The encoder uses pre-trained GloVe embeddings loaded manually.
# - The embeddings are not frozen (`freeze=False`), meaning they will be fine-tuned during training.


##### **5.2 Decoder with Attention Definition**

The decoder is modified to include the attention layer for better context representation.



In [6]:
# Define Attention Mechanism (Make sure it's defined before the decoder)
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        # Linear layers to compute alignment scores and convert to attention weights
        self.attention = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: [batch_size, hidden_size]
        # encoder_outputs: [batch_size, seq_len, hidden_size]
        batch_size = encoder_outputs.shape[0]
        seq_len = encoder_outputs.shape[1]

        # Repeat hidden state seq_len times
        hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)

        # Concatenate encoder outputs with the repeated hidden state
        energy = torch.tanh(self.attention(torch.cat((hidden, encoder_outputs), dim=2)))
        # Calculate attention scores
        attention = self.v(energy).squeeze(2)

        # Apply softmax to calculate attention weights
        return torch.softmax(attention, dim=1)

# Decoder with Attention Definition (After Attention class is defined)
class DecoderWithAttention(nn.Module):
    def __init__(self, output_size, embedding_matrix, hidden_size, num_layers=1):
        super(DecoderWithAttention, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.attention = Attention(hidden_size)
        self.rnn = nn.LSTM(hidden_size + embedding_matrix.size(1), hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x, hidden, cell, encoder_outputs):
        # x: [batch_size], hidden, cell: [num_layers, batch_size, hidden_size], encoder_outputs: [batch_size, seq_len, hidden_size]
        x = x.unsqueeze(1)  # Add time dimension: [batch_size, 1]
        embedded = self.embedding(x)  # embedded: [batch_size, 1, embedding_dim]

        # Calculate attention weights and apply to encoder outputs to get context vector
        attention_weights = self.attention(hidden[-1], encoder_outputs)
        attention_weights = attention_weights.unsqueeze(1)  # [batch_size, 1, seq_len]
        context = torch.bmm(attention_weights, encoder_outputs)  # [batch_size, 1, hidden_size]

        # Concatenate the context vector with the embedded input word
        rnn_input = torch.cat((embedded, context), dim=2)  # [batch_size, 1, hidden_size + embedding_dim]
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        # Use the output of RNN and context vector for prediction
        prediction = self.fc(torch.cat((output.squeeze(1), context.squeeze(1)), dim=1))  # [batch_size, output_size]
        return prediction, hidden, cell

# Observations:
# - The `Attention` class must be defined before it is used by `DecoderWithAttention`.
# - This ensures there is no `NameError` when defining the decoder.


#### **Section 6: Seq2Seq Model Class with Attention Decoder**

The Seq2Seq class integrates the **Encoder** and **DecoderWithAttention** to generate responses.



In [7]:
# Seq2Seq Model Class with Attention Decoder
class Seq2SeqWithAttention(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2SeqWithAttention, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.shape[0]
        target_len = target.shape[1]
        output_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, target_len, output_size).to(self.device)

        # Pass input through the encoder
        encoder_outputs, hidden, cell = self.encoder(source)

        # First input to the decoder is the <SOS> token
        input = target[:, 0]  # <SOS> token for each batch

        for t in range(1, target_len):
            # Pass the input, hidden state, and encoder outputs to the decoder
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = output

            # Determine the next input using teacher forcing
            top1 = output.argmax(1)
            input = target[:, t] if np.random.random() < teacher_forcing_ratio else top1

        return outputs

# Observations:
# - The Seq2Seq model class integrates the encoder and decoder and passes encoder outputs to the decoder for attention.


#### **Section 7: Training the Model with Attention**

This initial step is designed to quickly train the model with only 1 epoch, allowing you to verify that everything is working properly before committing to a longer training time.




In [27]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [28]:
# %%
from tqdm import tqdm  # Import tqdm for progress bar
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
input_size = len(word2idx)  # Vocabulary size for input
output_size = len(word2idx)  # Vocabulary size for output
hidden_size = 256  # Reduced hidden size
embedding_dim = 100  # Reduced embedding dimension to lower memory usage
num_epochs = 1  # Training for one epoch initially
learning_rate = 0.001  # Learning rate for the optimizer


# Instantiate embedding matrix on CPU to save GPU memory
embedding_matrix = torch.randn(len(word2idx), embedding_dim)  # Initialize embeddings with reduced dimensions

# Instantiate encoder, decoder with attention, and Seq2Seq model on CPU
encoder = Encoder(input_size, embedding_matrix, hidden_size).cpu()
decoder = DecoderWithAttention(output_size, embedding_matrix, hidden_size).cpu()
model = Seq2SeqWithAttention(encoder, decoder, device).cpu()

# Move encoder, decoder, and model to GPU (if available)
encoder = encoder.to(device)
decoder = decoder.to(device)
model = model.to(device)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])
scaler = GradScaler(device='cuda')  # Mixed precision gradient scaler

# Gradient accumulation steps
accumulation_steps = 2  # Accumulate gradients over 2 mini-batches

# Training loop with validation and checkpointing
best_val_loss = float('inf')  # Initialize the best validation loss

# Initial training for 1 epoch
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0  # Initialize epoch loss

    # Training Phase with Progress Bar
    with tqdm(total=len(train_loader), desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch") as pbar:
        for i, (source, target) in enumerate(train_loader):
            source = source.to(device)  # Move source to GPU/CPU
            target = target.to(device)  # Move target to GPU/CPU

            optimizer.zero_grad()  # Clear previous gradients

            # Mixed Precision Training
            with autocast(device_type='cuda'):
                output = model(source, target)  # Forward pass

                # Reshape output and target for loss calculation
                output = output[:, 1:].reshape(-1, output.shape[-1])  # Exclude <SOS> token
                target = target[:, 1:].reshape(-1)  # Exclude <SOS> token

                # Loss scaling for accumulation
                loss = criterion(output, target) / accumulation_steps

            # Scale and backpropagate
            scaler.scale(loss).backward()

            # Step optimizer after accumulation steps
            if (i + 1) % accumulation_steps == 0:
                scaler.step(optimizer)  # Update model weights
                scaler.update()  # Update scaler for next step

            epoch_loss += loss.item() * accumulation_steps  # Track cumulative loss
            pbar.set_postfix({"Batch Loss": loss.item() * accumulation_steps})  # Update progress bar
            pbar.update(1)  # Increment progress bar

    avg_train_loss = epoch_loss / len(train_loader)  # Calculate average training loss

    # Validation Phase
    model.eval()  # Set model to evaluation mode
    val_loss = 0
    with torch.no_grad():
        for source, target in val_loader:
            source = source.to(device)  # Move source to GPU/CPU
            target = target.to(device)  # Move target to GPU/CPU

            output = model(source, target)  # Forward pass
            output = output[:, 1:].reshape(-1, output.shape[-1])  # Reshape output
            target = target[:, 1:].reshape(-1)  # Reshape target

            loss = criterion(output, target)  # Calculate validation loss
            val_loss += loss.item()  # Accumulate validation loss

    avg_val_loss = val_loss / len(val_loader)  # Calculate average validation loss

    # Save the best model based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "seq2seq_attention_best_model.pth")  # Save model
        print(f"Model improved. Saving the best model with validation loss: {avg_val_loss:.4f}")

    print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

# Observations:
# - Used mixed precision training using `autocast()` and `GradScaler()` to improve training speed and reduce memory usage.
# - Reduced `hidden_size` and `embedding_dim` to reduce model memory footprint.
# - Initialized encoder and decoder on CPU before transferring to GPU.
# - Reduced batch size for training and validation to further mitigate memory issues.
# - Used `tqdm` to monitor progress during training and validation.


Epoch 1/1: 100%|██████████| 7166/7166 [1:46:45<00:00,  1.12batch/s, Batch Loss=5.07]


Model improved. Saving the best model with validation loss: 4.9798
Epoch [1/1], Train Loss: 5.3375, Val Loss: 4.9798


#### Step 8: Load and Continue Training the Pre-Trained Model (Fine-Tuning)
This step is about fine-tuning the model that was trained for a single epoch in Step 7. The fine-tuning process can be carried out by loading the pre-trained model and training it further for more epochs with adjusted hyperparameters.

In [8]:
# from previous code


# Hyperparameters
input_size = len(word2idx)  # Vocabulary size for input
output_size = len(word2idx)  # Vocabulary size for output
hidden_size = 256  # Reduced hidden size
embedding_dim = 100  # Reduced embedding dimension to lower memory usage
num_epochs = 1  # Training for one epoch initially
learning_rate = 0.001  # Learning rate for the optimizer


In [11]:
# %%
#### Step 8: Load and Continue Training the Pre-Trained Model (Fine-Tuning)

from tqdm import tqdm  # Import tqdm for progress bar
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters for further fine-tuning
num_additional_epochs = 2  # Number of additional epochs for fine-tuning
fine_tune_learning_rate = learning_rate / 10  # Reduce the learning rate for more stable training
batch_size = 8  # Reduced batch size to avoid memory issues
accumulation_steps = 16  # Reduced number of gradient accumulation steps


# Path for saving to Google Drive
tuned_model_path = 'seq2seq_attention_best_model_finetuned.pth'

# Instantiate embedding matrix on CPU to save GPU memory
embedding_dim = 100  # Reduced embedding dimension (use the same as before)
embedding_matrix = torch.randn(len(word2idx), embedding_dim)  # Create embedding matrix

# Instantiate encoder, decoder with attention, and Seq2Seq model
encoder = Encoder(input_size, embedding_matrix, hidden_size).cpu()
decoder = DecoderWithAttention(output_size, embedding_matrix, hidden_size).cpu()
model = Seq2SeqWithAttention(encoder, decoder, device).cpu()

# Move models to GPU
encoder = encoder.to(device)
decoder = decoder.to(device)
model = model.to(device)

# Load the pre-trained model checkpoint
checkpoint_path = "../seq2seq_attention_best_model.pth"
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

# Put the model in training mode to resume training
model.train()

# Define optimizer and loss function for further fine-tuning
optimizer = optim.Adam(model.parameters(), lr=fine_tune_learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])
scaler = GradScaler(device='cuda')  # Mixed precision gradient scaler

# DataLoader instances for training and validation (reuse the setup)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

# Set PyTorch CUDA allocation configuration for flexible memory management
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Continue training the model for additional epochs
best_val_loss = float('inf')  # Keep track of the best validation loss

for epoch in range(num_additional_epochs):
    model.train()
    epoch_loss = 0

    # Training Phase with Progress Bar
    with tqdm(total=len(train_loader), desc=f"Fine-Tuning Epoch {epoch + 1}/{num_additional_epochs}", unit="batch") as pbar:
        for i, (source, target) in enumerate(train_loader):
            source = source.to(device)  # Move source to GPU/CPU
            target = target.to(device)  # Move target to GPU/CPU

            optimizer.zero_grad()  # Clear previous gradients

            # Mixed Precision Training
            with autocast(device_type='cuda'):
                output = model(source, target)  # Forward pass

                # Reshape output and target for loss calculation
                output = output[:, 1:].reshape(-1, output.shape[-1])  # Exclude <SOS> token
                target = target[:, 1:].reshape(-1)  # Exclude <SOS> token

                # Loss scaling for accumulation
                loss = criterion(output, target) / accumulation_steps

            # Scale and backpropagate
            scaler.scale(loss).backward()

            # Step optimizer after accumulation steps
            if (i + 1) % accumulation_steps == 0 or i == len(train_loader) - 1:
                scaler.step(optimizer)  # Update model weights
                scaler.update()  # Update scaler for next step

            epoch_loss += loss.item() * accumulation_steps
            pbar.set_postfix({"Batch Loss": loss.item() * accumulation_steps})  # Update progress bar
            pbar.update(1)

            # Manually empty CUDA cache to avoid memory buildup
            torch.cuda.empty_cache()

    avg_train_loss = epoch_loss / len(train_loader)

    # Validation Phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for source, target in val_loader:
            source = source.to(device)
            target = target.to(device)

            # Forward pass for validation
            output = model(source, target)
            output = output[:, 1:].reshape(-1, output.shape[-1])
            target = target[:, 1:].reshape(-1)

            # Calculate validation loss
            loss = criterion(output, target)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    # Save the best model based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), tuned_model_path)
        print(f"Model improved. Saving new best model with validation loss: {avg_val_loss:.4f}")

    print(f'Additional Epoch [{epoch + 1}/{num_additional_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

# Observations:
# - **Batch Size Reduced**: Lowered batch size to `8` to reduce memory consumption.
# - **Gradient Accumulation Steps**: Reduced accumulation steps to `16` to balance memory consumption.
# - **Mixed Precision Training**: Continued use of mixed precision with `autocast()` for efficient memory management.
# - **CUDA Cache Management**: Added `torch.cuda.empty_cache()` to manually clear the CUDA memory to prevent accumulation of unnecessary memory usage.
# - **Flexible Memory Allocation**: Used `PYTORCH_CUDA_ALLOC_CONF` for expandable memory segments to reduce fragmentation issues.
# - **Model Saving in Google Drive**: Saved the model directly to Google Drive for convenience.


  model.load_state_dict(torch.load(checkpoint_path, map_location=device))
Fine-Tuning Epoch 1/2:   0%|          | 6/14332 [01:48<72:06:24, 18.12s/batch, Batch Loss=4.95]


KeyboardInterrupt: 