In [None]:
import itertools
from collections import Counter

import numpy as np
import pandas as pd
import pyLDAvis
import torch

In [None]:
import os

# Show the working directory: the data, model and plot paths used in this
# notebook are all relative, so they resolve against this location.
working_dir = os.getcwd()
print(working_dir)

In [None]:
# Make np.load default to allow_pickle=True so the pickled dict files used
# below can be read without passing the flag at every call site.
#
# SECURITY NOTE: unpickling can execute arbitrary code. Only load .npy files
# that come from a trusted source while this patch is active.
#
# The guard keeps the cell idempotent: re-running it must not capture the
# wrapper itself as "the original loader", which would make the wrapper call
# itself and fail.
if not getattr(np.load, "_allow_pickle_default", False):
    # save np.load
    np_load_old = np.load

    def _np_load_allow_pickle(*args, **kwargs):
        """Call the original np.load with allow_pickle=True unless the caller set it."""
        # setdefault (rather than forcing the keyword) lets callers pass
        # allow_pickle themselves without a duplicate-keyword TypeError.
        kwargs.setdefault("allow_pickle", True)
        return np_load_old(*args, **kwargs)

    # Marker attribute used by the guard above to recognise the patched loader.
    _np_load_allow_pickle._allow_pickle_default = True

    # modify the default parameters of np.load
    np.load = _np_load_allow_pickle


# restore np.load for future normal usage
# np.load = np_load_old

In [None]:
def softmax(x):
    """Row-wise softmax of a 2-D array.

    Parameters
    ----------
    x : array-like of shape [batch_size, n_classes]
        Unnormalised scores (logits).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose rows are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Softmax is invariant to subtracting a per-row constant. Shifting by the
    # row maximum makes the largest exponent exp(0) == 1, so np.exp cannot
    # overflow to inf and produce nan after the division.
    shifted = x - np.max(x, axis=1, keepdims=True)
    e = np.exp(shifted)
    n = np.sum(e, axis=1, keepdims=True)
    return e / n

In [None]:
# Directory (relative to the working directory) from which 'decoder.npy' and
# 'doc_decoder.npy' are loaded further down.
path_to_dir = "models/lyrics/"
# Number of topics iterated over when printing the top words per topic.
# NOTE(review): presumably must equal the number of rows of the trained topic
# vectors in the checkpoint - confirm when switching models.
n_topics = 10

In [None]:
dataset = pd.read_csv("data/song_df.csv")
# "integer -> word" decoder
# ([()] unwraps the dict stored inside the 0-d object array returned by np.load)
decoder = np.load(path_to_dir+'decoder.npy')[()]

# NOTE(review): this assumes the dict's insertion order matches the row order
# of the word-embedding matrix used later - TODO confirm.
vocab = list(decoder.values())

# for restoring document ids, "id used while training -> initial id"
doc_decoder = np.load(path_to_dir+'doc_decoder.npy')[()]
# NOTE(review): likewise assumes insertion order follows the training ids - TODO confirm.
docs = dataset.iloc[list(doc_decoder.values())]["tokenized_lyrics"].values

# Number of whitespace-separated tokens per document; missing lyrics count as 0.
doc_lengths = np.array([0 if pd.isna(tok) else len(tok.split()) for tok in docs])

# Flat list of every token in the corpus. Each document is a single string, so
# it has to be split first: chaining the raw strings would iterate over
# individual characters instead of words.
docs_compiled = list(itertools.chain.from_iterable(
    doc.split() for doc in docs if not pd.isna(doc)
))
# Count all tokens in one pass, then look each vocabulary word up (0 if absent),
# instead of rescanning the whole corpus once per vocabulary word.
token_counts = Counter(docs_compiled)
term_frequency = np.array([token_counts[word] for word in vocab])

In [None]:
# Sanity check: print the sizes of the document-level and vocabulary-level
# collections, in the order doc_decoder, doc_lengths, term_frequency, vocab.
for sized in (doc_decoder, doc_lengths, term_frequency, vocab):
    print(len(sized))

In [None]:
# Load the saved model state from under the configured model directory
# (same location as before, but no longer hard-coded a second time).
# map_location='cpu' remaps all tensors to the CPU, so the checkpoint can be
# read on a machine without a GPU.
# SECURITY NOTE: torch.load unpickles the file - only load trusted checkpoints.
state = torch.load(
    path_to_dir + 'lda2vec_models/60_epoch_model_state.pytorch',
    map_location='cpu',
)

In [None]:
# Pull the three learned parameter tensors out of the checkpoint as
# independent NumPy arrays (moved to CPU and cloned before conversion).
doc_weights, topic_vectors, resulted_word_vectors = (
    state[key].cpu().clone().numpy()
    for key in (
        'doc_weights.weight',
        'topics.topic_vectors',
        'neg.embedding.weight',
    )
)

# per-document distribution over the topics
topic_dist = softmax(doc_weights)

# documents represented as topic-weighted mixtures of the topic vectors
doc_vecs = topic_dist @ topic_vectors

# topic-vs-word scores normalised per topic:
# rows are topics, cols are vocab, vals are probabilities
topic_word_scores = topic_vectors @ resulted_word_vectors.T
topic_term_dist = softmax(topic_word_scores)


In [None]:
# Sanity check: print the shape of every matrix derived from the checkpoint,
# in the order doc_vecs, topic_dist, topic_vectors, resulted_word_vectors,
# doc_weights, topic_term_dist.
for matrix in (
    doc_vecs,
    topic_dist,
    topic_vectors,
    resulted_word_vectors,
    doc_weights,
    topic_term_dist,
):
    print(matrix.shape)

In [None]:
# Build the pyLDAvis dashboard from the topic-term and document-topic
# distributions plus the corpus statistics computed earlier.
dashboard = pyLDAvis.prepare(topic_term_dist, topic_dist, doc_lengths, vocab, term_frequency)

# Save a standalone HTML copy. Create the output folder first so the save
# does not fail when 'plots/' is missing.
os.makedirs("plots", exist_ok=True)
pyLDAvis.save_html(dashboard, "plots/lyrics_lda2vec.html")

# Keep this as the LAST expression of the cell: Jupyter only renders the value
# of a cell's final expression, so calling display() before save_html() threw
# the rendered dashboard away.
pyLDAvis.display(dashboard)

In [None]:
# Dot-product score of every word vector against every topic vector.
similarity = topic_vectors @ resulted_word_vectors.T
# argsort is ascending, so the last ten columns of each row are the ids of the
# ten highest-scoring words for that topic.
most_similar = np.argsort(similarity, axis=1)[:, -10:]

for j in range(n_topics):
    # Flip the slice so the highest-scoring word is listed first.
    best_first = most_similar[j][::-1]
    topic_words = ' '.join(decoder[i] for i in best_first)
    print('topic', j + 1, ':', topic_words)