In [None]:
# Install the third-party packages used by this notebook into the kernel's environment.
# NOTE(review): versions are unpinned, and `torch` (imported in the next cell) is not
# part of this list — confirm it is already provided by the environment.
%pip install scikit-learn transformers datasets pandas

In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, random_split

import numpy as np
from transformers import BertTokenizer, BertModel
import pandas as pd
import math

from Dependencies.Early_Stop import EarlyStopping
from Dependencies.AdditionalFunctions import topK_one_hot, smooth_multi_hot
from Dependencies.MovieDataset import MovieGenresDataset
from Dependencies.RNN_model_class import RNN

  from .autonotebook import tqdm as notebook_tqdm


### Select the Compute Device

In [2]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda:0


### Initialize the Dataset

In [3]:
# Build the project dataset wrapper once and pull out the two pieces used by later cells.
mgd_ds = MovieGenresDataset()
movie_genre_ds = mgd_ds.getDs()  # its "overview" column is read when the embeddings are built
movie_id_loc = mgd_ds.get_classes()  # per-movie labels; presumably genre class indices — confirm in MovieGenresDataset

# Sentinel used by collate_fn to pad variable-length label sequences;
# create_smoothed_list filters it back out before encoding the labels.
# NOTE(review): assumes -1 can never be a valid label value — confirm.
pad_value = -1 # Using -1 is safer than a magic number like 5555

In [4]:
def create_smoothed_list(target_batch,class_lst):
    """Append one smoothed multi-hot target per row of ``target_batch`` to ``class_lst``.

    For every row, entries equal to the module-level ``pad_value`` are dropped,
    the remaining labels are encoded with ``topK_one_hot(labels, 19)`` and the
    encoding is passed to ``smooth_multi_hot`` together with the number of
    labels that survived the filtering.

    Args:
        target_batch: Iterable of label tensors, one per sample.
            Assumed to hold padded integer class indices — TODO confirm.
        class_lst: List that receives the results; it is mutated in place.

    Returns:
        The same ``class_lst`` object, grown by one entry per row.
    """
    def _encode_row(row):
        labels = row[row != pad_value]  # strip the padding sentinel
        # Tensor.numpy() needs host memory, hence the .cpu() hop first.
        multi_hot = topK_one_hot(labels.cpu().numpy(), 19)
        return smooth_multi_hot(torch.tensor(multi_hot), len(labels))

    for row in target_batch:
        class_lst.append(_encode_row(row))
    return class_lst

### **Training Functions**

In [None]:
def _grad_sq_norm(param):
    """Return the squared L2 norm of ``param.grad`` as a float, or NaN if there is no gradient."""
    if param.grad is None:
        return float("nan")
    return param.grad.norm().item() ** 2


def epoch_train(rnn, optimizer, dev, train_loader, val_loader, batch_size):
    """Train ``rnn`` for one full pass over ``train_loader`` and log per-batch statistics.

    For every batch the function builds smoothed multi-hot targets with
    ``create_smoothed_list``, computes ``BCEWithLogitsLoss`` on the model
    output, back-propagates, clips the global gradient norm to 1.0 and applies
    the optimizer update. The loss and the squared gradient norms of
    ``rnn.rnnL1.weight_hh`` and ``rnn.rnnL2.weight_hh`` (measured after
    clipping) are written to ``track.csv`` at the end of the epoch.

    Args:
        rnn: Model to train. Must expose ``rnnL1.weight_hh`` and
            ``rnnL2.weight_hh`` parameters.
        optimizer: Optimizer constructed over ``rnn``'s parameters.
        dev: Device the batches are moved to before the forward pass.
        train_loader: Iterable with a length that yields
            ``(movie_ovw_batch, target_batch)`` pairs.
        val_loader: Currently unused. The earlier per-step validation / early
            stopping logic was disabled and is not wired in here.
            TODO: re-introduce validation with a single EarlyStopping instance
            that lives across batches.
        batch_size: Unused; kept so existing callers keep working. The number
            of iterations is determined by ``train_loader`` itself.

    Returns:
        The ``pandas.DataFrame`` that was written to ``track.csv`` (columns
        ``l1_gradient_sq``, ``l2_gradient_sq``, ``loss_arr``; one row per batch).
    """
    rnn.train() # Set the model to training mode
    loss_func = nn.BCEWithLogitsLoss() # stateless, so build it once per epoch
    loss_arr = []
    l1_grad_sq = []
    l2_grad_sq = []
    num_batches = len(train_loader)

    # Iterate the loader directly: it already knows where the epoch ends, so no
    # manual next() calls or hand-computed bounds (which could raise StopIteration).
    for i, (movie_ovw_batch, target_batch) in enumerate(train_loader):
        movie_ovw_batch = movie_ovw_batch.to(dev)
        target_batch = target_batch.to(dev)

        # --- Prepare targets for the entire batch ---
        classes = torch.stack(create_smoothed_list(target_batch, [])).to(dev)

        # --- Forward pass and loss ---
        y_hat = rnn(movie_ovw_batch)
        loss = loss_func(y_hat, classes)

        # --- Backpropagation ---
        optimizer.zero_grad()
        loss.backward()
        # clip_grad_norm_ returns the total norm measured before clipping.
        total_norm = nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1.0)

        # Record exactly one value per batch in every list so the DataFrame
        # columns always have equal length (NaN marks a missing gradient).
        l1_sq = _grad_sq_norm(rnn.rnnL1.weight_hh)
        l2_sq = _grad_sq_norm(rnn.rnnL2.weight_hh)
        l1_grad_sq.append(l1_sq)
        l2_grad_sq.append(l2_sq)
        loss_arr.append(loss.item())

        # --- Optimizer step ---
        # Only update the weights when the gradients are usable; a NaN/inf
        # update would corrupt every parameter for the rest of training.
        if math.isfinite(float(total_norm)):
            optimizer.step()
        else:
            print(f"Batch {i+1}/{num_batches}: non-finite gradient norm ({float(total_norm)}), skipping optimizer step")

        # Print progress every 10 batches
        if (i + 1) % 10 == 0:
            print(f"Batch {i+1}/{num_batches},gradient = {l2_sq}, Loss = {loss.item():.4f}")
            # For debugging memory, uncomment the line below:
            # print(torch.cuda.memory_summary(device=dev))

    print("Epoch finished.")
    # Save tracking data
    df = pd.DataFrame({
        'l1_gradient_sq': l1_grad_sq,
        'l2_gradient_sq': l2_grad_sq,
        'loss_arr': loss_arr
    })
    df.to_csv("track.csv", index=False, header=True)
    return df

### **Dataset and DataLoader Setup**

In [6]:
from torch.nn.utils.rnn import pad_sequence

class MovieOverviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each pre-computed overview tensor with its labels.

    Both collections are stored as given; nothing is copied or validated.
    They are assumed to be aligned by position — TODO confirm at the call site.
    """

    def __init__(self, tokenized_overviews, id_loc_set):
        # Indexable collection of per-movie overview tensors.
        self.tokenized_ovw = tokenized_overviews
        # Indexable collection of per-movie label sequences.
        self.id_loc_set = id_loc_set

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.id_loc_set)

    def __getitem__(self, idx):
        """Return ``(overview, labels)`` for ``idx``; the labels are wrapped in a new tensor."""
        overview = self.tokenized_ovw[idx]
        labels = torch.tensor(self.id_loc_set[idx])
        return overview, labels

def collate_fn(batch):
    """Collate ``(sequence, labels)`` samples into two padded batch tensors.

    Sequences are padded with 0 and label tensors with the module-level
    ``pad_value``, both batch-first, so samples of different lengths can be
    stacked into a single tensor each.

    Args:
        batch: Iterable of ``(sequence_tensor, label_tensor)`` pairs as
            produced by the dataset's ``__getitem__``.

    Returns:
        Tuple ``(padded_sequences, padded_labels)``.
    """
    # Transpose the list of pairs into one tuple of features and one of targets.
    features, targets = zip(*batch)
    return (
        pad_sequence(features, batch_first=True, padding_value=0),
        pad_sequence(targets, batch_first=True, padding_value=pad_value),
    )

In [7]:
# IMPORTANT: This cell pre-processes all movie overviews into embeddings.
# The expensive computation runs only when 'overview_embs.pt' does not exist yet;
# on later runs the cached file is loaded instead, so re-running this cell is cheap.
# Embeddings are moved to the CPU before being stored to avoid taking up GPU memory.

import os

embedding_file = "overview_embs.pt"

if os.path.exists(embedding_file):
    # Cache hit: nothing to compute, the file is read back below.
    print(f"Loading embeddings from {embedding_file}")
else:
    print("Embedding file not found. Creating embeddings...")
    overview_ds = []
    # A throwaway RNN instance is created only for its tokenize_input helper.
    temp_model = RNN().to(device)
    total_overviews = len(movie_genre_ds['overview'])
    for i, overview in enumerate(movie_genre_ds["overview"]):
        # Move each result to the CPU before storing it so the growing list
        # does not occupy GPU memory.
        # NOTE(review): if tokenize_input runs a model forward pass, consider
        # wrapping this loop in torch.no_grad() — confirm in RNN_model_class.
        tokenized_ovw = temp_model.tokenize_input(overview, device=device).cpu()
        overview_ds.append(tokenized_ovw)
        if (i+1) % 100 == 0:
            print(f"Processed {i+1}/{total_overviews} overviews")

    torch.save(overview_ds, embedding_file)
    print(f"Saved embeddings to {embedding_file}")
    del temp_model # drop the reference to the temporary model

# Both branches end up here: the embeddings are always read back from disk.
tokenized_overview_tensors = torch.load(embedding_file)

Loading embeddings from overview_embs.pt


### **Train RNN**

In [8]:
if __name__ == "__main__":
    BATCH_SIZE = 4 # Defining batch size as a variable
    SPLIT_SEED = 42 # Fixed seed so the train/test/val split is identical after a kernel restart

    my_rnn = RNN().to(device)
    optimizer = optim.Adam(params=my_rnn.parameters(), lr=2.5e-4, weight_decay=1.5e-4)

    # Create the dataset instance with the pre-loaded embeddings
    full_dataset = MovieOverviewDataset(tokenized_overview_tensors, movie_id_loc)

    # Split dataset into train (80%), and test / validation (remaining 20%, halved).
    # test_size absorbs the rounding remainder so the three sizes always sum to the total.
    train_size = int(0.8 * len(full_dataset))
    val_size = (len(full_dataset) - train_size) // 2
    test_size = len(full_dataset) - train_size - val_size

    print(val_size)
    # Seed the split: without it the held-out sets change on every run, so a
    # model saved in one session could later be "tested" on samples it trained on.
    train_ds, test_ds, val_ds = random_split(
        full_dataset,
        [train_size, test_size, val_size],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )

    # Create DataLoaders
    train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=collate_fn)
    test_loader = DataLoader(dataset=test_ds, batch_size=1, shuffle=False, num_workers=0, collate_fn=collate_fn)
    # The validation loader must read the validation split; it previously wrapped
    # test_ds, which would have leaked the test set into model selection.
    val_loader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=collate_fn)

    # NOTE(review): passing threshold=None appears to leave NumPy's current
    # print threshold unchanged — confirm whether full-array printing was intended.
    np.set_printoptions(threshold=None)
    print("Starting training...")
    epoch_train(my_rnn, optimizer=optimizer, dev=device, train_loader=train_loader, val_loader=val_loader, batch_size=BATCH_SIZE)

    print("Training complete. Saving model...")
    torch.save(my_rnn.state_dict(), "model_parameters.pt")
    print("Model saved to model_parameters.pt")

998
Starting training...


NameError: name 'train' is not defined