# Tranformer Pretrained


In [8]:
from transformers import AutoTokenizer, AutoModelWithLMHead
import json
import pandas as pd
from tqdm import tqdm
import torch
import os
from collections import Counter, OrderedDict
import matplotlib
from itertools import islice
from src.model.transformer import Transformer
import pickle
from tokenizers import Tokenizer, SentencePieceBPETokenizer

In [9]:
class _tokenizer():
    def __init__(self, dataset_type):
        self.tokenizer_dir = './dataset/tokenizer/description_bpe'
        self.dataset_type = dataset_type
        self.model = SentencePieceBPETokenizer(os.path.join(self.tokenizer_dir, "{}-vocab.json".format(self.dataset_type)),\
                                            os.path.join(self.tokenizer_dir, "{}-merges.txt".format(self.dataset_type)))
        
        self.encoder = self.model.get_vocab()
    def encode(self, target_str, is_pretokenized=False, add_special_tokens=True):
        return self.model.encode(target_str, pair=None, is_pretokenized=False, add_special_tokens=True).ids

    def decode(self, target_ids, skip_special_tokens=True):
        return self.model.decode(target_ids, skip_special_tokens=True)

In [24]:
description_tokenizer = _tokenizer(dataset_type='mpd')
song_vocab = pickle.load(open(os.path.join("./dataset/tokenizer/track", "mpd_vocab.pkl"), mode="rb"))
description_vocab = pickle.load(open(os.path.join("./dataset/tokenizer/description_split", "mpd_vocab.pkl"), mode="rb"))

input_size = len(song_vocab)
output_size = len(description_vocab)
embed_size=128
hidden_size=256
gpus = 0

# Instantiate the model with the parameters used during training
model = Transformer(input_size = input_size,
                    output_size=output_size,  
                    hidden_size=embed_size,  
                    e_layers=3, 
                    d_layers=3,  
                    heads=8,  
                    pf_dim=hidden_size,  
                    dropout=0.1,  
                    e_pos=False,
                    device=gpus)

# Load the pre-trained weights
# Load the checkpoint
checkpoint = torch.load("/Users/bestricemossberg/Projects/automated-playlist-description-generation-system/transfomer_pt/white/s:True_epos:False/best.ckpt", map_location=torch.device('cpu'))

# Print the keys in the checkpoint
for key in checkpoint.keys():
    print(key)

AssertionError: Torch not compiled with CUDA enabled

In [22]:
# Remove 'model.' prefix from state_dict keys
state_dict = {k.replace('model.', ''): v for k, v in checkpoint['state_dict'].items()}

# Load the state_dict into the model
model.load_state_dict(state_dict)

RuntimeError: Error(s) in loading state_dict for Transformer:
	size mismatch for encoder.tok_embedding.weight: copying a param with shape torch.Size([402527, 128]) from checkpoint, the shape in current model is torch.Size([215220, 128]).
	size mismatch for decoder.tok_embedding.weight: copying a param with shape torch.Size([1888, 128]) from checkpoint, the shape in current model is torch.Size([15063, 128]).
	size mismatch for decoder.fc_out.weight: copying a param with shape torch.Size([1888, 128]) from checkpoint, the shape in current model is torch.Size([15063, 128]).
	size mismatch for decoder.fc_out.bias: copying a param with shape torch.Size([1888]) from checkpoint, the shape in current model is torch.Size([15063]).

In [6]:
def load_mpd_dataset(directory_path, max_files=None):
    json_files = (file for file in os.listdir(directory_path) if file.endswith('.json'))
    playlists = []

    for filename in islice(json_files, max_files):
        with open(os.path.join(directory_path, filename), 'r') as f:
            data = json.load(f)
            playlists.extend(data['playlists'])

    return playlists

# Path to the dataset
dataset_path = "/Users/bestricemossberg/Projects/automated-playlist-description-generation-system/spotify_million_playlist_dataset/data"

# Load a limited dataset for testing (e.g., only the first 50 files)
playlists = load_mpd_dataset(dataset_path, max_files=50)

In [None]:
# Generate descriptions
descriptions = []
for playlist in playlists:
    # Encode the playlist title and add the batch dimension
    input_ids = tokenizer.encode(playlist, return_tensors='pt')

    # Generate text until the output length (which includes the context length) reaches 50
    output = model.generate(input_ids, max_length=50, temperature=0.7)

    # Decode the output and add it to the list of descriptions
    description = tokenizer.decode(output[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    descriptions.append(description)

# Now 'descriptions' is a list of descriptions generated by the model