In [1]:
%pip install scikit-learn transformers datasets pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, random_split

from transformers import BertTokenizer, BertModel
import ast
import pandas as pd
import time

from Dependencies.Early_Stop import EarlyStopping
from Dependencies.AdditionalFunctions import topK_one_hot, smooth_multi_hot
from Dependencies.MovieDataset import MovieGenresDataset
from Dependencies.RNN_model_class import RNN

  from .autonotebook import tqdm as notebook_tqdm


### Initialize Model and Device

In [3]:
# Use the first CUDA device when PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(f"Using device: {device}")

# Instantiate the model, then move it to the selected device.
my_rnn = RNN()
my_rnn = my_rnn.to(device)

Using device: cuda:0


### Initialize the Dataset

In [4]:
# Sentinel written into label sequences when they are padded to a common length.
# -1 is preferred over an arbitrary large constant such as 5555
# (presumably because it can never be a valid class index -- confirm).
pad_value = -1

# Build the dataset wrapper once, then take the dataset object from getDs()
# and the per-movie class information from get_classes().
mgd_ds = MovieGenresDataset()
movie_genre_ds, movie_id_loc = mgd_ds.getDs(), mgd_ds.get_classes()

### **Training Functions**

In [None]:
def _build_batch_targets(target_batch, num_classes):
    """Convert a padded batch of class-index rows into a stacked tensor of smoothed multi-hot targets."""
    rows = []
    for target in target_batch:
        # Drop the pad_value entries that were added to equalise row lengths.
        valid_targets = target[target != pad_value]
        one_hot_target = topK_one_hot(valid_targets.cpu().numpy(), num_classes)
        rows.append(smooth_multi_hot(torch.tensor(one_hot_target), len(valid_targets)))
    return torch.stack(rows)


def _batch_loss(rnn, loss_func, movie_ovw_batch, target_batch, dev, num_classes):
    """Run one forward pass on `dev` and return the loss tensor for the batch."""
    y_hat = rnn(movie_ovw_batch.to(dev))
    # BCEWithLogitsLoss needs targets with the same dtype as the logits; the helper
    # functions may hand back integer or float64 tensors (not visible here), so cast.
    classes = _build_batch_targets(target_batch, num_classes).to(dev, dtype=y_hat.dtype)
    return loss_func(y_hat, classes)


def _validation_loss(rnn, loss_func, val_loader, dev, num_classes):
    """Return the mean per-batch loss over `val_loader`, or None if it yields no batches."""
    was_training = rnn.training
    rnn.eval()
    total, seen = 0.0, 0
    try:
        with torch.no_grad():
            for movie_ovw_batch, target_batch in val_loader:
                total += _batch_loss(rnn, loss_func, movie_ovw_batch, target_batch, dev, num_classes).item()
                seen += 1
    finally:
        # Restore whatever mode the model was in, even if validation raised.
        rnn.train(was_training)
    return total / seen if seen else None


def _grad_norm_sq(param):
    """Squared norm of `param.grad`, or NaN when no gradient has been populated."""
    grad = param.grad
    return float("nan") if grad is None else grad.norm().item() ** 2


def epoch_train(rnn, optimizer, dev, train_loader, val_loader, batch_size,
                num_classes=19, max_grad_norm=1e-4, val_every=1):
    """Train `rnn` for one pass over `train_loader` with periodic validation and early stopping.

    Each batch is processed as a whole: targets are converted to smoothed multi-hot
    vectors, the BCE-with-logits loss is back-propagated, gradients are clipped and the
    optimizer is stepped. The per-batch loss and the squared gradient norms of
    `rnn.rnnL1.weight_hh` / `rnn.rnnL2.weight_hh` (measured after clipping, before the
    optimizer step) are written to "track.csv" when the pass ends.

    Args:
        rnn: Model to train; must expose `rnnL1.weight_hh` and `rnnL2.weight_hh`.
        optimizer: Optimizer bound to the model's parameters.
        dev: Device (string or torch.device) on which batches are processed.
        train_loader: Iterable of `(inputs, padded_targets)` batches with a defined `len()`.
        val_loader: Iterable of validation batches in the same format, or None to skip
            validation and early stopping.
        batch_size: Not read by this function (the loader already batches); kept so
            existing callers keep working.
        num_classes: Width of the multi-hot target vector passed to `topK_one_hot`.
        max_grad_norm: `max_norm` handed to `clip_grad_norm_`.
        val_every: Run validation after every `val_every` training batches. The default
            of 1 validates after each batch, which walks the full validation set every
            time; raise it to reduce cost. A falsy value disables validation.

    Returns:
        None. Tracking data is written to "track.csv".
    """
    rnn.train()
    loss_arr, l1_grad_sq, l2_grad_sq = [], [], []

    # Built once per call so the early-stopping state survives across batches.
    loss_func = nn.BCEWithLogitsLoss()
    es = EarlyStopping()
    n_batches = len(train_loader)

    for i, (movie_ovw_batch, target_batch) in enumerate(train_loader):
        # --- Forward pass and loss ---
        loss = _batch_loss(rnn, loss_func, movie_ovw_batch, target_batch, dev, num_classes)

        # --- Backpropagation ---
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=max_grad_norm)

        # Record gradients before the optimizer step. One entry is appended per batch
        # (NaN when a gradient is missing) so all tracking columns stay the same length.
        l1_grad_sq.append(_grad_norm_sq(rnn.rnnL1.weight_hh))
        l2_grad_sq.append(_grad_norm_sq(rnn.rnnL2.weight_hh))

        optimizer.step()

        loss_value = loss.item()
        loss_arr.append(loss_value)

        # Print progress every 10 batches
        if (i + 1) % 10 == 0:
            print(f"Batch {i+1}/{n_batches},gradient = {l2_grad_sq[-1]}, Loss = {loss_value:.4f}")

        # --- Validation and early stopping ---
        if val_loader is not None and val_every and (i + 1) % val_every == 0:
            vloss = _validation_loss(rnn, loss_func, val_loader, dev, num_classes)
            # NOTE(review): assumes EarlyStopping.__call__(model, val_loss) returns True
            # while training should continue -- confirm against Dependencies.Early_Stop.
            if vloss is not None and not es(rnn, vloss):
                print(f"Early stopping triggered at batch {i+1}/{n_batches}.")
                break

    print("Epoch finished.")
    # Save tracking data
    df = pd.DataFrame({
        'l1_gradient_sq': l1_grad_sq,
        'l2_gradient_sq': l2_grad_sq,
        'loss_arr': loss_arr
    })
    df.to_csv("track.csv", index=False, header=True)

### **Dataset and DataLoader Setup**

In [None]:
from torch.nn.utils.rnn import pad_sequence

class MovieOverviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each stored overview entry with its label entry.

    Indexing returns `(overview, labels)`, where `overview` is the item stored at
    that position in `tokenized_overviews` and `labels` is the matching entry of
    `id_loc_set` wrapped in a new tensor.
    """

    def __init__(self, tokenized_overviews, id_loc_set):
        """Keep references to the overview collection and the label collection."""
        self.id_loc_set = id_loc_set
        self.tokenized_ovw = tokenized_overviews

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.id_loc_set)

    def __getitem__(self, idx):
        """Return the overview at `idx` together with its labels as a tensor."""
        overview = self.tokenized_ovw[idx]
        labels = torch.tensor(self.id_loc_set[idx])
        return overview, labels

def collate_fn(batch):
    """Collate `(sequence, labels)` samples into two padded, batch-first tensors.

    Sequences are padded with 0 and label rows with the module-level `pad_value`,
    so every row in each returned tensor has the same length.
    """
    features, targets = zip(*batch)
    return (
        pad_sequence(features, batch_first=True, padding_value=0),
        pad_sequence(targets, batch_first=True, padding_value=pad_value),
    )

In [None]:
# IMPORTANT: This cell pre-processes all movie overviews into embeddings.
# This should be run only ONCE to create the 'overview_embs.pt' file.
# Running this every time would be very slow.
# It saves the embeddings to the CPU to avoid taking up GPU memory.

import os

embedding_file = "overview_embs.pt"

if not os.path.exists(embedding_file):
    print("Embedding file not found. Creating embeddings...")
    overviews = movie_genre_ds["overview"]
    total_overviews = len(overviews)
    overview_ds = []
    # Use a temporary model on the correct device for tokenization
    temp_model = RNN().to(device)
    # The embeddings are only ever used as fixed inputs, so build them without autograd
    # and detach them: otherwise each stored tensor may keep its computation graph alive.
    # NOTE(review): tokenize_input is defined elsewhere; if it already runs under
    # no_grad this is a harmless no-op.
    with torch.no_grad():
        for i, overview in enumerate(overviews):
            # Move each embedding to the CPU before storing it, to keep GPU memory free.
            tokenized_ovw = temp_model.tokenize_input(overview, device=device).detach().cpu()
            overview_ds.append(tokenized_ovw)
            if (i+1) % 100 == 0:
                print(f"Processed {i+1}/{total_overviews} overviews")

    torch.save(overview_ds, embedding_file)
    print(f"Saved embeddings to {embedding_file}")
    del temp_model # Free up memory
else:
    print(f"Loading embeddings from {embedding_file}")

# Load the pre-computed embeddings (read back from disk on both paths)
tokenized_overview_tensors = torch.load(embedding_file)

### **Train RNN**

In [None]:
#from ignite.engine import Engine, Events
#from ignite.handlers import EarlyStopping

def score_function(engine):
    """Return the negated 'nll' metric stored on `engine.state.metrics`.

    Negating the value means a lower 'nll' produces a higher score.
    """
    metrics = engine.state.metrics
    return -metrics['nll']



if __name__ == "__main__":
    BATCH_SIZE = 4 # Defining batch size as a variable
    SPLIT_SEED = 42 # Fixed seed so the train/test/val split is the same on every run

    optimizer = optim.Adam(params=my_rnn.parameters(), lr=0.001, weight_decay=1.e-4)

    # Create the dataset instance with the pre-loaded embeddings
    full_dataset = MovieOverviewDataset(tokenized_overview_tensors, movie_id_loc)

    # Split 80% / ~10% / ~10%. The test split takes whatever is left after train and
    # val, so the three sizes always sum to len(full_dataset), as random_split requires.
    total_size = len(full_dataset)
    train_size = int(0.8 * total_size)
    val_size = (total_size - train_size) // 2
    test_size = total_size - train_size - val_size

    print(val_size)
    train_ds, test_ds, val_ds = random_split(
        full_dataset,
        [train_size, test_size, val_size],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )

    # Create DataLoaders
    train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate_fn)
    test_loader = DataLoader(dataset=test_ds, batch_size=1, shuffle=False, num_workers=0, collate_fn=collate_fn)
    # Validation must read the validation split, not the held-out test split.
    val_loader = DataLoader(dataset=val_ds, batch_size=1, shuffle=False, num_workers=0, collate_fn=collate_fn)

    print("Starting training...")
    epoch_train(my_rnn, optimizer=optimizer, dev=device, train_loader=train_loader, val_loader=val_loader, batch_size=BATCH_SIZE)

    #handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
    # Note: the handler is attached to an *Evaluator* (runs one epoch on validation dataset).
    #evaluator.add_event_handler(Events.COMPLETED, handler)

    print("Training complete. Saving model...")
    torch.save(my_rnn.state_dict(), "model_parameters.pt")
    print("Model saved to model_parameters.pt")