In [None]:
# Do not run again!
#%pip install scikit-learn 

In [None]:
#!py -m pip install transformers
#!py -m pip install datasets

Consider re-running it to truly install every requirement

In [None]:
"""!py -m pip cache purge
!py -m pip install transformers torchvision torch"""
%pip install huggingface_hub[hf_xet]

In [8]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader

import transformers
from transformers import BertTokenizer
from transformers.models.bert.modeling_bert import BertModel

import threading
import ast # because the id list is in string form
import pandas as pd

from Dependencies.AdditionalFunctions import topK_one_hot, smooth_multi_hot

from Dependencies.MovieDataset import MovieGenresDataset

### Initialize RNN class

In [9]:
from Dependencies.RNN_model_class import RNN
my_rnn = RNN()

if torch.cuda.is_available() : device = "cuda:0"
else : device = "cpu"
my_rnn = RNN().to(device)

### Initialize the Dataset

In [10]:
mgd_ds = MovieGenresDataset()
movie_genre_ds= mgd_ds.getDs() #The entire ds = {title:description:classes}
movie_id_loc =mgd_ds.get_classes() #Indicates the class location in the classifier
genre_ids = movie_genre_ds['genre_ids'].map(ast.literal_eval).tolist() #The actual list of the genre_ids (was a list of strings representing lists)

global pad_value #the value set for padding sequences
pad_value=5555

### **Training Functions**

Epoch Training Function

In [11]:
import time


clear_event = threading.Event()

def subthread_func(iteration):
    time.sleep(0.5)
    print(f"Subthread running... Current Iteration: {iteration}")


def epoch_train(rnn, optimizer, dev, train_ds, batch_size=16):
    loss_arr = [] #a list that stores the gradient squared
    l1_grad_sq = [] #a list that stores the loss of the first rnn layer
    l2_grad_sq = [] #a list that stores the loss of the second rnn layer


    #--------------------TRAINING LOOP--------------------
    i=0
    for movie_ovw_batch, target_batch in train_ds:
        #outputs = rnn.tokenize_input(movie, dev)
        #target = torch.tensor([t for t in target[target!=5555]]).detach() #Padding value is set to be 5555, change accordingly
        avg_loss = 0
        i+=1
        for movie_ovw, target in zip(movie_ovw_batch, target_batch):
            classes = torch.tensor(topK_one_hot(target,19)).detach()
            # Get the smoothed distribution, for using a normalized probability distribution as the target

            y_hat = rnn.forward(movie_ovw.to(dev))# Forward Propagation
            
            classes = smooth_multi_hot(classes,len(target)).to(dev)

            loss_func = nn.BCEWithLogitsLoss() # This loss function applies a multiclass loss function (B.C.E = Binary Cross Entropy)
            loss = loss_func(y_hat,classes)#movie_ovw is already a tokenized vector
            loss_arr.append(loss)
            avg_loss += loss

            del movie_ovw, classes
        avg_loss = loss/batch_size

        # ----------Post_Batch_BackProp----------
        optimizer.zero_grad(set_to_none=True)
        avg_loss.backward(retain_graph=False)
        nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1.0)#gradient clipping
        optimizer.step()

        # Store the layer's squared gradient
        l1_grad_sq.append(my_rnn.rnnL1.weight_hh.grad.norm().item()**2)
        l2_grad_sq.append(my_rnn.rnnL2.weight_hh.grad.norm().item()**2)
        
        del avg_loss
        print(f"Current Iteration: {i*batch_size} (per batch of size {batch_size})")
        
        print(torch.cuda.memory_summary(device=device))
        torch.cuda.empty_cache()

    
    # Export all lists to a csv file 
    df = pd.DataFrame({
        'l1_gradient_sq':l1_grad_sq,'l2_gradient_sq':l2_grad_sq,'loss_arr':loss_arr
    })
    df.to_csv("track.csv", index=False, header=True)



In [12]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

class DS_Struct():
    def __init__(self, overview_set, id_loc_set, dev):
        self.overview_set = overview_set
        self.id_loc_set = id_loc_set
        self.dev = dev
        self.tokenized_ovw = torch.load("Dependencies\overview_embs.pt")#load the tokenized movie overviews from the SAVED file


    def __getitem__(self, idx):
        return self.tokenized_ovw[idx], torch.tensor(self.id_loc_set[idx])

    
    def __len__(self):
        return len(self.id_loc_set)#return the length of the internal list


def collate_fn(batch): 
    sequences, labels = zip(*batch)
    #lengths = [len(seq) for seq in sequences]
    
    padded = pad_sequence(sequences, batch_first=True)
    labels = pad_sequence(labels,batch_first=True,padding_value=pad_value) # the highest length is 9: max([len(id) for idme in movie_id_loc])) == 9
    
    return padded, torch.tensor(labels)

  self.tokenized_ovw = torch.load("Dependencies\overview_embs.pt")#load the tokenized movie overviews from the SAVED file


In [13]:
# Delete/Don't run me
"""overview_ds = []
for i, overview in enumerate(movie_genre_ds["overview"]):
    tokenized_ovw = my_rnn.tokenize_input(overview,device=device)
    overview_ds.append(tokenized_ovw.cpu())
    print(i)
torch.save(overview_ds, "overview_embs.pt")"""

'overview_ds = []\nfor i, overview in enumerate(movie_genre_ds["overview"]):\n    tokenized_ovw = my_rnn.tokenize_input(overview,device=device)\n    overview_ds.append(tokenized_ovw.cpu())\n    print(i)\ntorch.save(overview_ds, "overview_embs.pt")'

### **Train RNN**

In [14]:
from torch.utils.data import random_split
import csv


if __name__ == "__main__":
    optimizer = optim.Adam(params=my_rnn.parameters())

    # Loading the dataset as a class for later batching
    gen_ds = DS_Struct(movie_genre_ds["overview"], movie_id_loc, dev = device)


    train_ds, test_ds = random_split(gen_ds, [0.8,0.2])
    train_loader = DataLoader(dataset=train_ds , batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)
    test_loader = DataLoader(dataset=test_ds , batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)

    


    epoch_train(my_rnn, optimizer=optimizer, dev=device, train_ds = train_loader)
    torch.save(my_rnn.state_dict(), "model_parameters.pt")

    #df = pd.DataFrame(test_loader)
    #df.to_csv("test_data.csv", index=False, header=True)
    

  return padded, torch.tensor(labels)


ValueError: not enough values to unpack (expected 3, got 2)