In [61]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import torch

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

# https://github.com/DmitryUlyanov/Multicore-TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE

import sys
sys.path.insert(0,'.')
sys.path.insert(0,'/data_big/mlp/custom_lda2vec/lda2vec-pytorch')

from utils.lda2vec_loss import loss, topic_embedding

In [62]:
def softmax(x):
    # x has shape [batch_size, n_classes]
    e = np.exp(x)
    n = np.sum(e, 1, keepdims=True)
    return e/n

# Load data

In [63]:
# dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
# docs = dataset['data']

# store each document with an initial id
# docs = [(i, doc) for i, doc in enumerate(docs)]

docs = np.load('docs.npy')
# "integer -> word" decoder 
decoder = np.load('decoder.npy')[()]

# for restoring document ids, "id used while training -> initial id"
doc_decoder = np.load('doc_decoder.npy')[()]

In [64]:
# original document categories
# targets = dataset['target']
# target_names = dataset['target_names']
# targets = np.array([targets[doc_decoder[i]] for i in range(len(doc_decoder))])

# Load the trained model

In [72]:
# test_data = np.load('test_data.npy')
# batch = torch.from_numpy(test_data).cuda()

# doc_indices = batch[:, 0]
# pivot_words = batch[:, 1]
# target_words = batch[:, 2:]
            
# model = torch.load('model.pytorch')
# model.eval()
# test_doc_vectors = model(doc_indices, pivot_words, target_words)
# print(test_doc_vectors)

state = torch.load('model_state.pytorch', map_location=lambda storage, loc: storage)
n_topics = 10

doc_weights = state['doc_weights.weight'].cpu().clone().numpy()
topic_vectors = state['topics.topic_vectors'].cpu().clone().numpy()
resulted_word_vectors = state['neg.embedding.weight'].cpu().clone().numpy()

# distribution over the topics for each document
topic_dist = softmax(doc_weights)

# vector representation of the documents
doc_vecs = np.matmul(topic_dist, topic_vectors)

# Show topics

In [73]:
similarity = np.matmul(topic_vectors, resulted_word_vectors.T)
most_similar = similarity.argsort(axis=1)[:, -30:]

In [74]:
msl = list(map((lambda x: set(x)), most_similar.tolist()))
ints = set.intersection(*msl)

In [75]:
ints

{0}

In [76]:
most_similar_n = list(map((lambda x: list(x - ints)), msl))

In [77]:
most_similar_n

[[1,
  2,
  3,
  5,
  70,
  7,
  9,
  10,
  11,
  76,
  13,
  14,
  16,
  20,
  21,
  22,
  23,
  86,
  26,
  28,
  92,
  33,
  34,
  40,
  48,
  241,
  50,
  58,
  63],
 [1,
  2,
  4,
  5,
  134,
  7,
  8,
  9,
  10,
  11,
  204,
  141,
  14,
  15,
  16,
  84,
  22,
  23,
  28,
  30,
  31,
  35,
  41,
  42,
  46,
  47,
  48,
  51,
  60],
 [65,
  2,
  6,
  9,
  10,
  11,
  75,
  13,
  14,
  16,
  20,
  21,
  22,
  23,
  89,
  30,
  31,
  57,
  94,
  164,
  37,
  38,
  101,
  49,
  565,
  53,
  631,
  441,
  383],
 [129,
  2,
  3,
  1,
  5,
  7,
  9,
  74,
  11,
  76,
  14,
  16,
  17,
  145,
  19,
  20,
  21,
  22,
  23,
  81,
  26,
  28,
  34,
  37,
  38,
  40,
  50,
  114,
  185],
 [128,
  1,
  3,
  64,
  5,
  6,
  69,
  73,
  12,
  13,
  144,
  18,
  19,
  25,
  545,
  229,
  39,
  553,
  43,
  44,
  235,
  111,
  304,
  368,
  239,
  180,
  54,
  119,
  62],
 [1,
  2,
  4,
  7,
  135,
  9,
  10,
  8,
  11,
  14,
  15,
  16,
  147,
  20,
  21,
  22,
  23,
  84,
  85,
  213,
  30,
  

In [78]:
# similarity = np.matmul(topic_vectors, resulted_word_vectors.T)
# most_similar = similarity.argsort(axis=1)[:, -10:]

for j in range(n_topics):
#     topic_words_list = 
    topic_words = ' '.join([decoder[i] for i in reversed(most_similar_n[j])])
    print('topic', j + 1, ':', topic_words)

topic 1 : bad thing country life speak come talk love stand american time work go know good get want need fire real president think america people obama say mexican like amp
topic 2 : latino truth speak tcot support win makeamericagreatagain tell candidate run american go know party want republican need leader voter president think america vote people democrat say gop like amp
topic 3 : piece club slogan presidential apparent new lol man great see mouth campaign candidate run donaldtrumpforpresident go know good get want need fire way president think america nbc like hair
topic 4 : wall pay country come man great talk american time white go know good get mexico kill illegal want need real president big america people say amp mexican like build
topic 5 : business drop cut follow relationship recent sever ass company macys tie nbcuniversal univision pull officially comment mexico racist late fire immigrant remark call nbc say dump mexican amp end
topic 6 : presidential country elect win 

# Show learned document embeddings

In [None]:
tsne = TSNE(perplexity=200, n_jobs=4)
X = tsne.fit_transform(doc_vecs.astype('float64'))

In [None]:
def plot(X):
    # X has shape [n_documents, 2]
    
    plt.figure(figsize=(16, 9), dpi=120);
    cmap = plt.cm.tab20
    number_of_targets = 3
    
    for i in range(number_of_targets):
        
        label = target_names[i]
        size = 15.0
        linewidths = 0.5
        edgecolors = 'k'
        color = cmap(i)
        
        if 'comp' in label:
            marker = 'x'
        elif 'sport' in label:
            marker = 's'
            edgecolors = 'b'
        elif 'politics' in label:
            marker = 'o'
            edgecolors = 'g'
        elif 'religion' in label:
            marker = 'P'
            size = 17.0
        elif 'sci' in label:
            marker = 'o'
            size = 14.0
            edgecolors = 'k'
            linewidths = 1.0
        elif 'atheism' in label:
            marker = 'P'
            size = 18.0
            edgecolors = 'r'
            linewidths = 0.5
        else:
            marker = 'v'
            edgecolors = 'm'
        
        plt.scatter(
            X[targets == i, 0], 
            X[targets == i, 1], 
            s=size, c=color, marker=marker,
            linewidths=linewidths, edgecolors=edgecolors,
            label=label
        );
    
    leg = plt.legend()
    leg.get_frame().set_alpha(0.3)

In [None]:
plot(X)  # learned document vectors

# different colors and markers represent 
# ground truth labels of each document

# open this image in new tab to see it better

# Show initial document weights (vanilla lda)

In [87]:
doc_weights_init = np.load('doc_weights_init.npy')

In [88]:
tsne = TSNE(perplexity=200, n_jobs=4)
Y = tsne.fit_transform(doc_weights_init.astype('float64'))

KeyboardInterrupt: 

In [None]:
# to initialize topic assignments for lda2vec algorithm 
# I run normal LDA and used output distributions over topics 
# of each document

plot(Y)  # distribution over the topics for each document (output of LDA)

# different colors and markers represent 
# ground truth labels of each document

# open this image in new tab to see it better

# Explore learned topic distributions

In [None]:
tsne = TSNE(perplexity=200, n_jobs=4)
Z = tsne.fit_transform(topic_dist.astype('float64'))

In [None]:
plot(Z)  # learned distribution over the topics for each document

# these are topic assignments as on the plot above 
# but these ones are after the training of lda2vec

# different colors and markers represent 
# ground truth labels of each document

# open this image in new tab to see it better

In [None]:
# distribution of nonzero probabilities
dist = topic_dist.reshape(-1)
plt.hist(dist[dist > 0.01], bins=40);

In [None]:
# distribution of probabilities for some random topic
plt.hist(topic_dist[:, 10], bins=40);

In [None]:
# topic assignments for two random topics
plt.scatter(topic_dist[:, 10], topic_dist[:, 20]);

In [None]:
# correlation of topic assignments
corr = np.corrcoef(topic_dist.transpose(1, 0))
plt.imshow(corr);
plt.colorbar();

# Show a document and its topics

In [60]:
i = 200 # document id
print('DOCUMENT:')
for j, doc, type in docs:
    if (doc_decoder[i] == int(j)):
        print(j)
        print(doc_vecs[i])
        print(len(doc_vecs[i]))
        print(doc, type)
        break
        

# print([doc for j, doc, _ in docs if j == doc_decoder[i]][0], '\n')

print('DISTRIBUTION OVER TOPICS:')
s = ''
for j, p in enumerate(topic_dist[i], 1):
    s += '{0}:{1:.3f}  '.format(j, p)
    if j%6 == 0:
        s += '\n'
print(s)

print('\nTOP TOPICS:')
for j in reversed(topic_dist[i].argsort()[-3:]):
    topic_words = ' '.join([decoder[i] for i in reversed(most_similar[j])])
    print('topic', j + 1, ':', topic_words)

DOCUMENT:
269
[-0.17874435 -0.02110231  0.10985541  0.11945958 -0.1667027   0.15010776
 -0.11709902  0.142265   -0.07686004 -0.16921599  0.1240266  -0.11825462
 -0.09898441 -0.08517006 -0.12140565 -0.09969614  0.07401945  0.13598673
  0.16557884  0.09224206 -0.05806314  0.07982362 -0.14151505 -0.14326194
 -0.09792229 -0.03457102  0.11040005 -0.2025312  -0.11079633  0.08063796
 -0.1334653   0.18217127  0.0946084   0.16695362  0.10984115  0.04027427
 -0.06614006  0.18381554  0.05597054  0.16237077  0.08985563 -0.11956751
  0.14351575 -0.10284287 -0.1738466   0.06360093  0.18626697  0.06131935
  0.15995845 -0.10352875]
50
did you know that the u.s constitution is now under full pledge attack by the obama tyrant administration? wethepeople semst test
DISTRIBUTION OVER TOPICS:
1:0.041  2:0.043  3:0.047  4:0.046  5:0.037  6:0.041  
7:0.037  8:0.039  9:0.038  10:0.041  11:0.037  12:0.043  
13:0.039  14:0.039  15:0.039  16:0.042  17:0.039  18:0.040  
19:0.035  20:0.042  21:0.039  22:0.038  23: