In [None]:
#Quick cell to make jupyter notebook use the full screen width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Enable autoreloading from src
%load_ext autoreload
%autoreload 2

In [None]:
# Some plotting libraries
import matplotlib.pyplot as plt
# Select matplotlib's interactive "notebook" backend for figures in this notebook.
%matplotlib notebook
from bokeh.plotting import show, save, output_notebook, output_file
from bokeh.resources import INLINE 
# Route Bokeh output to the notebook, using the INLINE resources object.
output_notebook(resources=INLINE)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from src.data.numba_word_vectorizer import word_word_cooccurence_matrix
from src.data.em_method import em_sparse
import scipy.sparse
from sklearn.preprocessing import normalize
from enstop import PLSA
import umap
import umap.plot

In [None]:
from src import workflow
from src.data import Dataset

### Read in our data

In [None]:
from sklearn.datasets import fetch_20newsgroups

## Transform data

In this case we're going to be doing a joint word-document embedding. All we need are the newsgroup posts as a list of separate documents to start with.

This part is from https://github.com/lmcinnes/umap/blob/master/notebooks/Document%20embedding%20using%20UMAP.ipynb

In [None]:
%%time
dataset = fetch_20newsgroups(subset='test',
                             shuffle=True,
                             random_state=42)

In [None]:
# Report the corpus size: one line for the documents, one for the categories.
print(
    f'{len(dataset.data)} documents',
    f'{len(dataset.target_names)} categories',
    sep='\n',
)



Here are the categories of documents. As you can see many are related to one another (e.g. 'comp.sys.ibm.pc.hardware' and 'comp.sys.mac.hardware') but they are not all correlated (e.g. 'sci.med' and 'rec.sport.baseball').


In [None]:
# Bare expression so the notebook displays the category names.
dataset.target_names

Let's look at a couple of sample documents.

In [None]:
# Show the first three posts, each with the newsgroup it was posted to.
for post, label_id in zip(dataset.data[:3], dataset.target[:3]):
    newsgroup = dataset.target_names[label_id]

    print(f'Category: {newsgroup}')
    print('---------------------------')
    # Only the first 500 characters, to keep the output short
    print(post[:500])
    print('---------------------------')

Grab posts that are long enough (more than 100 characters) and drop exact duplicates

In [None]:
# One row per post, with the full post body in a single 'text' column.
raw_text = pd.DataFrame({'text': dataset.data})

In [None]:
# Bare expression so the notebook displays the frame of posts.
raw_text

In [None]:
# Keep posts longer than 100 characters and drop exact duplicates.
# The selection is rebuilt from `dataset.data` rather than from the `raw_text`
# DataFrame so that this cell can be re-run: `raw_text` is rebound to a 1-D
# NumPy array here, and a second run would otherwise fail on `raw_text.text`.
# (Original author's note: everything has at least length 100, so the length
# filter may be a no-op.)
# NOTE(review): np.unique returns the texts sorted, so the rows of `raw_text`
# no longer line up with `dataset.target` -- confirm nothing downstream needs
# that alignment.
raw_text = np.unique(np.array([post for post in dataset.data if len(post) > 100], dtype=object))

In [None]:
# Number of posts left after the length filter and de-duplication.
len(raw_text)

## XXX parse posts into their constituent parts

## Build Document Matrix

We will deal with documents, in this case, newsgroup posts, as follows:

A post is a multinomial distribution over our vocabulary. 

Step-by-step that means:
* A post is a bag of words
* TfidfVectorizer -> bag of words -> bag of normalized multinomial distributions over the vocabulary (i.e. weighted multinomials)
    * If we had used CountVectorizer we would have a bag of multinomial distributions
* Use Expectation-Maximization (EM) to remove the average from the matrix (think of it like projecting away from the global trends of language coming from grammar and common word usage)

In [None]:
%%time
word_matrix, token_to_index, index_to_token = word_word_cooccurence_matrix(raw_text, min_df=50)
raw_doc_matrix = TfidfVectorizer(vocabulary=token_to_index, norm='l1').fit_transform(raw_text)
raw_doc_matrix.eliminate_zeros()
print(raw_doc_matrix.shape)

With remove expectation:

    (18846, 8809)
    CPU times: user 40.1 s, sys: 969 ms, total: 41.1 s
    Wall time: 41.5 s

In [None]:
# Drop documents whose row sums to zero, i.e. posts that contain none of the
# vocabulary words; the same mask is applied to the texts and to the matrix.
row_totals = np.asarray(raw_doc_matrix.sum(axis=1)).ravel()
is_nonempty_row = row_totals != 0
doc_matrix = raw_doc_matrix[is_nonempty_row]
text = raw_text[is_nonempty_row]

### Global parameters for the joint embedding

`background_prior`: 
This is a positive number that sets the strength of the background: 1 is neutral, values below 1 underweight it and values above 1 overweight it. Higher values will tend to make things more orthogonal, and will cluster things more tightly (in theory) at the expense of global structure.

In [None]:
# Passed as `prior_noise` to every em_sparse call in this notebook.
background_prior = 5.0

`joint_dimension`: We will later learn a word embedding into this dimension with PLSA and then map the documents as an average of word vectors. The higher the better for accuracy, but it will be slower and more memory intensive. 300 is the word2vec range, so we started with that and it seemed good enough.

In [None]:
# Number of PLSA components, i.e. the dimension shared by word and document vectors.
joint_dimension = 300

## Do EM on the Document Matrix

In [None]:
%%time
D, mix_params = em_sparse(TfidfTransformer(norm='l1').fit_transform(doc_matrix), prior_noise=background_prior)

One of the upshots of using EM is that our matrix is sparser now

In [None]:
# Compare the stored-entry counts of the TF-IDF matrix and of the EM output.
print(
    f'Number of non-zero entries before EM: {raw_doc_matrix.nnz}',
    f'Number of non-zero entries after EM:   {D.nnz}',
    sep='\n',
)

## Build the Word Matrix

Since we're doing a joint embedding, we will treat each word like a document of its context (before it and after it which we treat separately), and then embed the words in the same way that we did the documents.

More precisely, think of a word as a document of "contexts containing that word"; that is, of two sets of documents: the context windows before the word, and the context windows after the word. We treat a word as two documents, and do exactly as we did above for each document (i.e. each set of context windows), concatenating the results into a vector of length 2 times the size of the vocabulary. 

In [None]:
%%time
W, s_w = em_sparse(TfidfTransformer(norm='l1').fit_transform(word_matrix), prior_noise=background_prior)
Wt, s_wt = em_sparse(TfidfTransformer(norm='l1').fit_transform(word_matrix.T), prior_noise=background_prior)

In [None]:
# Place the two EM results side by side (one block of columns each), then
# rescale every row to unit L1 norm.
stacked_contexts = scipy.sparse.hstack([W, Wt])
word_mat_directed = normalize(stacked_contexts, norm='l1')

## Give the Word Matrix and Doc Matrix the same basis

Because words are related (not independent), we don't want to think of a document as the average of the 1-hot encoded vectors corresponding to each word. Instead, we want to change basis so that we can consider a document as a weighted linear combination of the word vectors. If we do this naively, we'll end up with a huge dense matrix.

Instead, let's dimension reduce the word vectors, so that we're considering a document as a weighted linear combination of word vector topics. We'll use pLSA for this. Why? It is a linear dimension reduction technique for topic modelling that takes a bag of multinomials to a bag of multinomials. This is what we want. The dimension we reduce to will be the number of latent word-topics.

In [None]:
# PLSA topic model with `joint_dimension` components. The seed is fixed so a
# Restart-and-Run-All reproduces the same topics, in line with the seeded
# dataset shuffle and UMAP elsewhere in this notebook.
topicer = PLSA(n_components=joint_dimension, random_state=42)

In [None]:
%%time
# Fit the PLSA model on the L1-normalised word-context matrix.
topicer.fit(word_mat_directed)

In [None]:
# Word-by-topic embedding learned by PLSA.
word_by_topic = topicer.embedding_
# Project the documents into topic space as a matrix product. `@` is used
# because `*` means matrix multiplication only for legacy scipy sparse
# matrices; for sparse arrays and ndarrays it is element-wise.
D_low_temp = D @ word_by_topic

Note that our matrix `D_low_temp` is now a dense ndarray

In [None]:
# Bare expression so the notebook displays the shape of the projected matrix.
D_low_temp.shape

We've now averaged a bunch of things together again, and we have a central limit effect. We need to separate things away from the mean again. EM to the rescue!

In [None]:
# Second EM pass, on the topic-space document vectors. em_sparse is given a CSR
# copy of the dense projection and its result is converted back to dense.
# .toarray() returns a plain ndarray; .todense() would return np.matrix, which
# is deprecated and rejected by recent scikit-learn input validation.
D_low, s_list = em_sparse(scipy.sparse.csr_matrix(D_low_temp), prior_noise=background_prior)
D_low = D_low.toarray()

## Make the Joint Embedding

In [None]:
# Cap on how many documents are embedded alongside the words; the original run
# died with the full amount. The cap is clamped to the rows actually available
# so `n_docs` always equals the number of document rows stacked below.
# Use D_low.shape[0] instead to embed every document.
n_docs = min(2000, D_low.shape[0])

## use sqrt and euclidean distance instead of Hellinger
# np.asarray guarantees a plain ndarray even when D_low is an np.matrix.
w_and_d = np.sqrt(np.asarray(np.vstack((word_by_topic, D_low[:n_docs]))))

In [None]:
# Bare expression so the notebook displays the shape of the stacked word+document matrix.
w_and_d.shape

## XXX replace this with hellinger later, check that I get the same result

Let's set up some labels to use for hovering over our data

In [None]:
# Newsgroup name for every post, in the order of `dataset.target`.
# NOTE(review): `raw_text` was filtered and passed through np.unique, so these
# labels presumably do not line up row-for-row with the document matrix -- confirm
# before using them as hover data.
category_labels = list(map(dataset.target_names.__getitem__, dataset.target))
hover_df = pd.DataFrame({'category': category_labels})

In [None]:
# Bare expression so the notebook displays the category frame.
hover_df

In [None]:
def doc_top_words(row):
    """Return a sparse row's column indices ordered by decreasing stored value.

    Args:
        row: object exposing parallel `indices` and `data` arrays, such as a
            single-row CSR matrix.

    Returns:
        The entries of `row.indices`, rearranged so the column with the largest
        value in `row.data` comes first.
    """
    # Negating the values turns argsort's ascending order into a descending one.
    descending = np.argsort(-row.data)
    return row.indices[descending]

In [None]:
# For every document row: its vocabulary indices, highest weight first.
col_indices = [
    doc_top_words(doc_matrix.getrow(row_id))
    for row_id in range(doc_matrix.shape[0])
]
# One hover string per document: its (up to) ten top-weighted words, space-separated.
supported_words_array = np.array([
    " ".join(index_to_token[token_id] for token_id in top_ids[:10])
    for top_ids in col_indices
])

In [None]:
# Hover text for the word points: the token itself.
word_array = np.array([index_to_token[x] for x in range(W.shape[0])])
# Point labels: 0 for words, 1 for documents.
wd_labels = np.hstack((np.zeros(word_by_topic.shape[0]), np.ones(n_docs)))
# Only the first `n_docs` documents are embedded, so only their hover strings
# are kept; this gives the hover frame one row per plotted point, matching
# `wd_labels`, instead of one row per document in the corpus.
wd_hover_df = pd.DataFrame({'text': np.hstack([word_array, supported_words_array[:n_docs]])})

And finally, use UMAP to embed the words and docs

In [None]:
%%time
# Configure UMAP (10 neighbours, fixed seed); nothing is fitted in this cell.
mapping = umap.UMAP(n_neighbors=10, random_state=42)

In [None]:
%%time
# Fit UMAP on the stacked word+document vectors.
# NOTE(review): `embedding` presumably holds the fitted UMAP estimator returned
# by fit(), not the 2-D coordinates -- confirm.
embedding = mapping.fit(w_and_d)

In [None]:
# Interactive Bokeh scatter of the joint embedding, with hover text taken from
# wd_hover_df and points labelled by wd_labels.
p = umap.plot.interactive(
    embedding,
    labels=wd_labels,
    hover_data=wd_hover_df,
    point_size=5,
    width=800,
    height=800,
)
show(p)