## Imports

In [6]:
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead
import gzip
from typing import List
import json
from sklearn.metrics import mean_squared_error

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

## Load the movie title data and filter US movies to sample 20000 for training

In [2]:
fname = 'title.akas.tsv.gz'

with gzip.open(fname, 'rb') as f:
    movie_df = pd.read_table(f, sep='\t', na_values=["\\N","nan"])

# print(movie_df.head())

movie_df_sampled = movie_df[movie_df['region']=="US"].sample(20000)
print(movie_df_sampled.head())

  movie_df = pd.read_table(f, sep='\t', na_values=["\\N","nan"])


             titleId  ordering                           title region  \
28073451  tt30826278         1  King of Style: Micheal Jackson     US   
7263107   tt11703044         1                    Gregory Blue     US   
1854642    tt0368868         1               A Mission to Kill     US   
986485     tt0112782         4           The Dallas Connection     US   
31034484   tt5061158         1                  Truth or Scare     US   

         language        types attributes  isOriginalTitle  
28073451      NaN          NaN        NaN              0.0  
7263107       NaN          NaN        NaN              0.0  
1854642       NaN  imdbDisplay        NaN              0.0  
986485        NaN  imdbDisplay        NaN              0.0  
31034484      NaN          NaN        NaN              0.0  


## Statistics about the movie titles

In [3]:
movie_titles = movie_df_sampled['title'].tolist()
title_lengths = [len(title.split()) for title in movie_titles]

mean_length = np.mean(title_lengths)
std_length = np.std(title_lengths)
print("Mean length:",mean_length)
print("Std length:",std_length)
print("Max length",max(title_lengths))

max_len = int(mean_length + 3*std_length)
print("Max length for model:",max_len)

Mean length: 3.49345
Std length: 2.3171657466612094
Max length 36
Max length for model: 10


## Dataloader class

- For training, it encodes `<len> ## <word> asfgads <text> asd jjksd lksda`
- For testing, it encodes `<len> ## <word> asfgads <text> `

In [7]:
class MovieDataset(Dataset):  
    def __init__(self, tokenizer, movie_titles: List, max_len: int, dataset_type: str,max_seq_len: int=30) -> None:
        self.max_len = max_len
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.movies = movie_titles
        self.dataset_type = dataset_type
        self.result = []
        self.populate()


    def __len__(self) -> int:
        return len(self.result)


    def __getitem__(self, item: int) -> torch.Tensor:
        return self.result[item]
    
    def populate(self) -> None:
        for movie in self.movies:
            movie_words = movie.split()
            movie_len = len(movie_words)
            if movie_len > 1:
                prefix = f"<len> {movie_len-1} <word> {movie_words[0]} <text> "
                movie = (" ").join(movie_words[1:])
            else:
                prefix = f"<len> {movie_len} <word> movie <text> "
                movie = (" ").join(movie_words[:])

            encoded_prefix = self.tokenizer.encode(prefix)
            if self.dataset_type=="train":
                encoded_movie = self.tokenizer.encode(movie)
                if len(encoded_movie)>self.max_len:
                    encoded_movie = encoded_movie[:self.max_len]
                encoded_input = encoded_prefix + encoded_movie
                if len(encoded_input)>self.max_seq_len:
                    encoded_input = encoded_input[:self.max_seq_len-1]
                padded = encoded_input + [self.eos_id]*(self.max_seq_len-len(encoded_input))
            elif self.dataset_type=="test":
                padded = encoded_prefix
            # print(len(padded))
            self.result.append(torch.tensor(padded))


## Model Class

In [13]:
class GPT2Movie(torch.nn.Module):
    def __init__(self, device: str, pretrained_model: str=None):
        super().__init__()
        self.model = None
        if pretrained_model:
            self.model = AutoModelWithLMHead.from_pretrained(pretrained_model)
        else:
            self.model = AutoModelWithLMHead.from_pretrained("gpt2")
        self.model = self.model.to(device)
        self.tokenizer = tokenizer = AutoTokenizer.from_pretrained("gpt2")
        self.optimizer = optim.AdamW(self.model.parameters(), lr=5e-4)

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        return self.model(tensor)

    def train(self,train_dataloader, epochs: int) -> None:
        for epoch in range(epochs):
            total_loss = 0.0
            for idx, batch in enumerate(train_dataloader):
                with torch.set_grad_enabled(True):
                    self.optimizer.zero_grad()
                    batch = batch.to(device)
                    output = self.model(batch, labels=batch)
                    loss = output[0]
                    loss.backward()
                    self.optimizer.step()
                    # if idx % 100 == 0:
                    #     print("loss: %f, %d"%(loss, idx))
                    total_loss += loss.item()
            print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_dataloader)}")


    def save(self,filepath: str="model/") -> None:
        self.model.save_pretrained(save_directory=filepath)
        self.tokenizer.save_vocabulary(save_directory=filepath)
        
    def topk(self,probs: torch.Tensor, k: int=5) -> int:
        probs = torch.softmax(probs, dim= -1)

        tokensProb, topIx = torch.topk(probs, k=k)
        tokensProb = tokensProb / torch.sum(tokensProb)
        tokensProb = tokensProb.cpu().detach().numpy()

        choice = np.random.choice(k, 1, p = tokensProb)
        tokenId = topIx[choice][0]

        return int(tokenId)
    
    def inference(self, init_token: torch.Tensor, max_length: int=10) -> str:

        sequence = init_token.numpy().tolist()
        init_input = init_token.unsqueeze(0).to(device)

        with torch.set_grad_enabled(False):
            output = self.model(init_input)
            logits = output.logits[0,-1]

            sequence.append(self.topk(logits))

            for i in range(max_length):
                inp = torch.tensor(sequence).unsqueeze(0).to(device)
                output = self.model(inp)
                logits = output.logits[0,-1]
                res_id = self.topk(logits)

                if res_id == self.tokenizer.eos_token_id:
                    return self.tokenizer.decode(sequence)
                else: 
                    sequence.append(res_id)

        return self.tokenizer.decode(sequence)

    def eval(self,test_dataset) -> None:
        results = []
        within_max_len = 0
        within_req_len = 0
        equal_req_len = 0
        req_len = []
        gen_len = []
        for inp in test_dataset:
            ret_seq = self.inference(inp).strip()
            results.append(ret_seq)
            true_len = int(ret_seq.split("<text>")[0].split(" ")[1])
            output = ret_seq.split("<text>")[1].split(" ")[1:]
            # print(req_len,len(output),output)
            if len(output)<=max_len:
                within_max_len+=1
            if len(output)<=true_len:
                within_req_len+=1
                if len(output)==true_len:
                    equal_req_len+=1
            req_len.append(true_len)
            gen_len.append(len(output))
            
        
        result_json = {"within_max_len":within_max_len/len(test_dataset),
                        "within_req_len": within_req_len/len(test_dataset),
                        "equal_req_len":equal_req_len/len(test_dataset),
                        "MSE_genvreq":mean_squared_error(req_len,gen_len),
                        "gen_results":results}
                        
        json_file_path = "eval_results.json"

        with open(json_file_path, "w") as json_file:
            json.dump(result_json, json_file, indent=4)
        
        print(f"Output within max seq length: {within_max_len/len(test_dataset)}")
        print(f"Output within req seq length: {within_req_len/len(test_dataset)}")
        print(f"Output equal req seq length: {equal_req_len/len(test_dataset)}")
        print(f"MSE req vs gen seq length: {mean_squared_error(req_len,gen_len)}")
        print("-"*20)
        print(results[:10])
    

## Load the model

In [14]:
gpt2 = GPT2Movie(device,"model/")



## Load the tokenizer and dataset

In [None]:
dataset = MovieDataset(gpt2.tokenizer, movie_titles, max_len, dataset_type="train")
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)
print(len(dataset))

## Train

In [22]:
gpt2.train(train_dataloader=dataloader, epochs=20)

Epoch 1/20, Loss: 1.1102415719032288


## Save the model

In [None]:
gpt2.save("model/")

## Load test set

In [11]:
movie_test = movie_df[movie_df['region']=="US"].sample(1000)
movie_test = movie_test['title'].tolist()
test_dataset = MovieDataset(gpt2.tokenizer, movie_test, max_len, dataset_type="test")

## Evaluate on test set

In [15]:
gpt2.eval(test_dataset)

Output within max seq length: 1.0
Output within req seq length: 0.946
Output equal req seq length: 0.829
MSE req vs gen seq length: 1.386
--------------------
['<len> 1 <word> The <text> Wager', '<len> 2 <word> The <text> Wicked Ones', '<len> 1 <word> Global <text> Addiction', '<len> 3 <word> Chasing <text> a Booming Market', '<len> 6 <word> Walt <text> Disney World Christmas Day Parade', '<len> 5 <word> John <text> Brenkus Presents the GOAT', '<len> 1 <word> movie <text> Honeymooniacs', '<len> 3 <word> Tucker, <text> the Tucker Film', "<len> 2 <word> Her <text> Master's Voice", '<len> 5 <word> Gora: <text> Lad My Very Adventures']
