In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import torch
import numpy as np
from tqdm.auto import tqdm

from preprocessing import tokenize, embeddings

## Loading Data

In [None]:
# Read the lyrics dataset from a CSV path relative to the working directory,
# print its (rows, columns) shape, and preview the first rows as the cell output.
df = pd.read_csv("Data/df_lyrics.csv")
print(df.shape)
df.head()

In [None]:
# Number of rows per genre, most frequent first.
# `value_counts` makes a single pass over the column and yields a stable order,
# unlike iterating over a `set` and re-filtering the frame once per genre.
# `dropna=False` keeps rows with a missing genre visible with their real count
# (an equality filter reports 0 for them because NaN never equals itself).
for genre, count in df["Genre"].value_counts(dropna=False).items():
    print(genre, count)

## Preprocessing

In [None]:
# Instantiate the project tokenizer and load its data from the JSON file,
# then run it on a short sample phrase; the result of that last call is shown
# as the cell output.
tokenizer = tokenize.Tokenizer()
tokenizer.load(path="Weights/tokenizer.json")
tokenizer.tokenize("I'm a little teapot", get_token_ids=True)

In [None]:
##########################################################
MAX_LENGTH = 1024  # max context length for the tokenizer
##########################################################
# Keep at most the first MAX_LENGTH elements of every lyric. Slicing already
# clamps to the sequence length, so shorter entries pass through unchanged.
# NOTE(review): the cut happens before tokenization, so it presumably counts
# characters rather than tokens — confirm this is the intended limit.
lyrics = [text[:MAX_LENGTH] for text in df["Lyrics"]]

# Tokenize the whole corpus in one call; the result is indexed by "token_ids" later on.
proprocessed_lyrics = tokenizer.tokenize(lyrics, get_token_ids=True)

In [None]:
# Pull the "token_ids" entry out of the tokenizer result and display how many
# items it contains.
# NOTE(review): presumably one token-id sequence per lyric — confirm against the tokenizer.
lyrics_token_ids = proprocessed_lyrics["token_ids"]
len(lyrics_token_ids)

In [None]:
####################################################################
SPLIT_SEED = 42  # fixed seed so the split is identical on every Run All
training_size = 45000
validation_size = 15000
test_size = len(lyrics_token_ids) - training_size - validation_size
####################################################################

# Fail loudly if the requested sizes do not fit the dataset; otherwise the
# slices below would silently come back shorter than requested (or empty).
if test_size < 0:
    raise ValueError(
        f"training_size + validation_size = {training_size + validation_size} "
        f"exceeds the number of available samples ({len(lyrics_token_ids)})"
    )

# Seeded random ordering of all sample indices.
shuffle_idx = np.random.default_rng(SPLIT_SEED).permutation(len(lyrics_token_ids))

# Consecutive, non-overlapping slices of the shuffled indices:
# [0, training_size) -> train, [training_size, validation_end) -> validation,
# the remainder -> test.
validation_end = training_size + validation_size
training_data = [lyrics_token_ids[i] for i in shuffle_idx[:training_size]]
validation_data = [lyrics_token_ids[i] for i in shuffle_idx[training_size:validation_end]]
test_data = [lyrics_token_ids[i] for i in shuffle_idx[validation_end:]]

# Every sample must land in exactly one split.
assert len(training_data) + len(validation_data) + len(test_data) == len(lyrics_token_ids)
assert len(test_data) == test_size

print(len(training_data), len(validation_data), len(test_data))

In [None]:
# Instantiate the project's embedding wrapper and load vectors from a text file.
# NOTE(review): the file name suggests word2vec-format vectors — confirm.
embedder = embeddings.Embedding()
embedder.load("Weights/embeddings_w2v.txt")

In [None]:
# Display the embedding dimensionality; it is used further down as both the
# embedding width and the hidden width of the model.
embedder.embeddings_size

In [None]:
# Display the vocabulary size; it is used further down as the number of rows
# of the embedding matrix and as the model's output width.
embedder.vocab_size

In [None]:
# Build a (vocab_size, embeddings_size) lookup table whose row `idx` holds the
# vector of the word that `tokenizer.index_word` maps to `idx`. Rows whose index
# never appears in `tokenizer.index_word` stay all-zero.
embedding_matrix = torch.zeros(embedder.vocab_size, embedder.embeddings_size)
for idx in tqdm(tokenizer.index_word, desc="Filling embedding matrix"):
    word = tokenizer.index_word[idx]
    # NOTE(review): presumably raises if `word` is missing from the embedding
    # vocabulary — confirm that every tokenizer word has a vector.
    vec = embedder.model.get_vector(word)
    # The per-iteration debug print was dropped: it dumped one full vector per
    # vocabulary entry; the tqdm bar already reports progress.
    embedding_matrix[idx] = torch.tensor(vec)

embedding_matrix.shape

## Modeling

In [None]:
class RNN(torch.nn.Module):
    """Token-level recurrent language model: embedding -> stacked RNN -> linear decoder.

    Args:
        embedding_dim: Width of the token embedding vectors (the RNN input size).
        hidden_dim: Width of the RNN hidden state.
        vocab_size: Number of distinct token ids; used both as the number of
            embedding rows and as the width of the decoder output.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_layers = 8):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Trainable lookup table from token id to embedding vector.
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim = embedding_dim)
        # `batch_first` is left at its default (False), so the recurrent layer
        # consumes sequence-first input: (seq_len, batch, embedding_dim).
        self.rnn = torch.nn.RNN(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = num_layers,
        )
        # Projects every hidden state to one score per vocabulary entry.
        self.decoder = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, input, hidden = None):
        """Run one forward pass.

        Args:
            input: Integer tensor of token ids, laid out sequence-first
                ((seq_len, batch), or (seq_len,) for a single unbatched sequence).
            hidden: Initial hidden state of shape (num_layers, batch, hidden_dim)
                (or (num_layers, hidden_dim) when unbatched). May be omitted or
                None, in which case ``torch.nn.RNN`` starts from an all-zero state.

        Returns:
            A ``(decoded, hidden)`` tuple: ``decoded`` holds the unnormalised
            vocabulary scores for every position (input shape + (vocab_size,)),
            ``hidden`` is the final hidden state of every layer.
        """
        embeds = self.word_embeddings(input)
        output, hidden = self.rnn(embeds, hidden)
        decoded = self.decoder(output)
        return decoded, hidden

In [None]:
# Embedding width and hidden width are both set to the loaded embedding
# dimensionality; `num_layers` keeps the class default.
# NOTE(review): `embedding_matrix` is not handed to the model here, so the
# embedding layer keeps PyTorch's default random initialisation — confirm
# whether the pretrained vectors are meant to be copied in.
rnn = RNN(
    embedding_dim=embedder.embeddings_size,
    hidden_dim=embedder.embeddings_size,
    vocab_size=embedder.vocab_size,
)
rnn

In [None]:
# Keep a handle on the model's embedding layer and list the shape of each of
# its parameter tensors.
model_embedder = rnn.word_embeddings
for param in model_embedder.parameters():
    print(param.shape)

In [None]:
# Probe word: fetch its vector from the loaded embeddings and display the
# integer id stored for it in `tokenizer.keras_tokenizer.word_index`.
word = "hello"
embed_vector = embedder.model.get_vector(word)
tokenizer.keras_tokenizer.word_index[word]

In [None]:
# Feed the probe word's id through the model's embedding layer and display the
# resulting vector (for comparison with `embed_vector`).
word_id = tokenizer.keras_tokenizer.word_index[word]
model_embedder(torch.tensor(word_id))