If you haven't yet, start by setting up your environment and datasets by following the instructions in the README. It should be something like:
* `make create_environment`
* `conda activate covid_nlp`
* `make update_environment`
* `make data`

Several common packages that you may want to use (e.g. UMAP, HDBSCAN, enstop, sklearn) have already been added to the `covid_nlp` environment via `environment.yml`. To add more, edit that file and then run:
  `make update_environment`

In [None]:
# Quick cell to make jupyter notebook use the full screen width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Automatically pick up code changes in the `src` module, so edits to the
# project code are seen without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import json
import pandas as pd
import numpy as np

In [None]:
# Useful imports from easydata
from src import paths
from src.data import Dataset
from src import workflow

In [None]:
from src.data.numba_word_vectorizer import word_word_cooccurence_matrix
from src.data.em_method import em_sparse
from src.utils import RankedPoints

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from scipy import sparse
from sklearn.preprocessing import normalize
from enstop import PLSA
import umap
import umap.plot
import hdbscan
from wordcloud import WordCloud

In [None]:
# Some plotting libraries
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output
%matplotlib inline
from bokeh.plotting import show, save, output_notebook, output_file
from bokeh.resources import INLINE
# Direct bokeh output to the notebook, using the INLINE resources imported above
output_notebook(resources=INLINE)

## Load up the dataset

The metadata has been augmented with a `path` column giving where each file can be found, relative to `paths["interim_data_path"] / ds_name`.

In [None]:
# List the datasets currently available through `workflow`
workflow.available_datasets()

If the previous cell returned an empty list, go back and re-run `make data` as described at the top of this notebook.

In [None]:
# Name of the dataset to work with; passed to `Dataset.load` and used as a
# directory name under the interim data path further down
ds_name = 'covid_nlp_20200319'

In [None]:
# Load the dataset named by `ds_name`
meta_ds = Dataset.load(ds_name)

In [None]:
# Print only the first 457 characters of the dataset description
print(meta_ds.DESCR[:457])

In [None]:
# The processed dataframe is exposed as the `data` attribute of this data source
meta_df = meta_ds.data
# Preview the first few rows
meta_df.head()

## Basics on the dataset

The JSON files given in the `path` column of the metadata dataframe are the papers in `json` format (as dicts)
that include the following keys:
* `paper_id`
* `metadata`
* `abstract`
* `body_text`
* `bib_entries`
* `ref_entries`
* `back_matter`

where the `paper_id` is the sha hash from the metadata.

For example:

In [None]:
# Load the first paper listed in the metadata and show its top-level keys.
filename = paths['interim_data_path'] / ds_name / meta_df['path'][0]
# Use a context manager so the file handle is closed as soon as parsing is done.
with open(filename, 'rb') as fh:
    file = json.load(fh)
file.keys()

In [None]:
# Abstracts from the metadata, with missing values dropped
abstracts = meta_df.abstract.dropna()

In [None]:
# Peek at the first five abstracts
abstracts[:5]

In [None]:
# Number of non-missing abstracts
len(abstracts)

Shorten abstracts for display

In [None]:
# Maximum number of characters of each abstract to keep for display
max_abs_length = 140
meta_df['abstract_length'] = meta_df.abstract.str.len()
# Keep only rows with a non-empty abstract (a missing abstract has NaN length,
# which fails the `> 0` test), and renumber the rows.
data_df = meta_df[meta_df.abstract_length > 0].reset_index()
# Derive the shortened abstracts from `data_df` itself rather than from a
# separately filtered list, so they always line up with its rows even when
# some abstracts are empty strings.
short_abstracts = data_df.abstract.str[:max_abs_length].tolist()
data_df['short_abstracts'] = short_abstracts

### XXXX Hack around a zero row in the word matrix coming out of word_word_cooccurence_matrix
EM doesn't handle zero rows, so the next cell drops the abstracts that would otherwise produce one.

In [None]:
# Drop abstracts containing "subsp" and renumber the rows from 0.
# `drop=True` discards the previous positional index instead of inserting it as
# yet another column, which also lets this cell be re-run without a
# column-name clash.
data_df = data_df[~data_df.abstract.str.contains("subsp")].reset_index(drop=True)

## Build word matrix

In [None]:
# The text to vectorize: the abstracts that remain in `data_df`
raw_text = data_df.abstract

### Initial vectorization to the word-word matrix

This replaces the normal CountVectorizer step from TfidfVectorizer (CountVectorizer+TfidfTransformer)

In [None]:
%%time
# Build the word-word co-occurrence matrix from the abstracts, together with the
# two lookups returned alongside it (`index_to_token` is used below to label rows).
# NOTE(review): `min_df=50` is presumably a minimum document frequency for keeping
# a token -- confirm against `word_word_cooccurence_matrix`.
raw_word_matrix, token_to_index, index_to_token = word_word_cooccurence_matrix(raw_text, min_df=50)

In [None]:
# Token for each row of the word matrix, in row order; these become the hover labels
word_array = np.array([index_to_token[row] for row in range(raw_word_matrix.shape[0])])
hover_df = pd.DataFrame({'word': word_array})

In [None]:
# Indices of word-matrix rows with no stored entries.
# Without the above hack we get a zero row...
zero_rows = np.flatnonzero(raw_word_matrix.getnnz(axis=1) == 0)
len(zero_rows)

In [None]:
# Dimensions of the raw word matrix
raw_word_matrix.shape

In [None]:
%%time
# Apply an l1-normalized TF-IDF transform to the word matrix and, separately, to its transpose.
# NOTE(review): the names suggest "before"/"after" context directions -- confirm
# against `word_word_cooccurence_matrix`.
word_matrix_before = TfidfTransformer(norm='l1').fit_transform(raw_word_matrix)
word_matrix_after = TfidfTransformer(norm='l1').fit_transform(raw_word_matrix.T)

In [None]:
# Stack the two TF-IDF matrices side by side and l1-normalize the result.
# This version is built before the EM step is applied.
naive_word_matrix = normalize(sparse.hstack([word_matrix_before, word_matrix_after]), norm='l1')

In [None]:
# Inspect the resulting matrix
naive_word_matrix

## Run EM

In [None]:
# Value passed as `prior_noise` to `em_sparse` below
background_prior = 5.0

In [None]:
%%time
# Run EM on each matrix with the same prior. Each call returns a matrix, which
# replaces its input, plus a second value kept in `w_before` / `w_after`.
# NOTE: the inputs are overwritten, so re-running this cell applies EM to
# matrices that have already been processed.
word_matrix_before, w_before = em_sparse(word_matrix_before, prior_noise=background_prior)
word_matrix_after, w_after = em_sparse(word_matrix_after, prior_noise=background_prior)

In [None]:
# Stack the EM-processed matrices side by side and l1-normalize the result
word_matrix = normalize(sparse.hstack([word_matrix_before, word_matrix_after]), norm='l1')

In [None]:
# Inspect the resulting matrix
word_matrix

## Get word topics

In [None]:
# Number of components for the PLSA model below
topic_dimension = 30

In [None]:
# PLSA model with `topic_dimension` components
topicer = PLSA(n_components=topic_dimension)

In [None]:
%%time
# Fit the PLSA model to the word matrix
topicer.fit(word_matrix)

In [None]:
# Take the fitted model's `embedding_` attribute.
# NOTE(review): presumably one row per word and one column per topic -- confirm.
word_by_topic = topicer.embedding_

In [None]:
# Dimensions of the topic embedding
word_by_topic.shape

## Dimension reduce with UMAP

In [None]:
# 2-component UMAP using the hellinger metric, with a fixed `random_state`
mapping = umap.UMAP(n_components=2, n_neighbors=10, random_state=42, metric='hellinger')

In [None]:
%%time
# Keep the result of `fit`; the 2-D coordinates are read from its `.embedding_`
# attribute in later cells, and the object itself is passed to `umap.plot`.
embedding_2d = mapping.fit(word_by_topic)

In [None]:
# Passed to HDBSCAN as `min_cluster_size`; also reused further down as the
# number of words listed per cluster
min_cluster_size=15

In [None]:
# Cluster the 2-D embedding; `fit_predict` fits the model and hands back the
# label assigned to every point.
clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
labels = clusterer.fit_predict(embedding_2d.embedding_)

In [None]:
# Attach the cluster label of every word and summarize the cluster sizes.
hover_df['cluster'] = labels
value_counts = hover_df.cluster.value_counts()
# HDBSCAN labels noise points as -1; that label is not a cluster, so leave it
# out of the cluster count (it still appears in the value counts below).
n_clusters = int((value_counts.index != -1).sum())
print(f"Number of clusters: {n_clusters}")
print(f"Cluster value counts:\n{value_counts}")

In [None]:
# Interactive plot of the embedding, labelled by cluster, with `hover_df`
# supplying the hover data; `show` then renders it.
f = umap.plot.interactive(embedding_2d, labels=hover_df['cluster'],
                          hover_data=hover_df, point_size=3);
show(f)

<img src="../reports/figures/05-WordMAP-abstracts.png" alt="WordMAP embedding visualization" title="WordMAP embedding visualization" width="800"/>

### Rank points based on distance to a representative point

In [None]:
# Set up point ranking over the 2-D embedding and its clustering, using the euclidean metric
examples = RankedPoints(embedding_2d.embedding_, clusterer, metric='euclidean')

In [None]:
# Compute the distances to each cluster's center, then the rankings within each cluster.
# NOTE(review): presumably the rankings rely on the distances computed first -- keep this order.
examples.calculate_all_distances_to_center()
examples.get_all_cluster_rankings()

In [None]:
# Copy each point's rank within its cluster into the hover dataframe.
# NOTE(review): assumes `examples.embedding_df` shares `hover_df`'s index -- confirm.
hover_df['rank_in_cluster'] = examples.embedding_df['rank_in_cluster']

In [None]:
# For every cluster, collect its best-ranked words in two forms:
#   top_cluster_points_freq[cluster] -> {word: inverse_rank}, used as word-cloud frequencies
#   top_cluster_points[cluster]      -> HTML ordered list of the first `min_cluster_size` words
num_points = 50
top_cluster_points = {}
top_cluster_points_freq = {}

grouped_by_cluster = hover_df.groupby('cluster')

for cluster_id, group in grouped_by_cluster:
    # `assign` returns a new frame, so the extra column is never written onto a
    # slice of `hover_df` (avoids pandas' SettingWithCopyWarning).
    top_points = (
        group.sort_values('rank_in_cluster', ascending=True)
        .head(num_points)
        .assign(inverse_rank=lambda df: num_points - df.rank_in_cluster)
    )
    # Lower rank -> larger inverse rank -> more prominent word.
    top_cluster_points_freq[int(cluster_id)] = dict(zip(top_points.word, top_points.inverse_rank))
    list_items = ''.join(f'<li>{word}</li>' for word in top_points.word.head(min_cluster_size))
    top_cluster_points[int(cluster_id)] = '<ol>' + list_items + '</ol>'

### Generate word clouds based on ranking

In [None]:
# generate word cloud for word topic 
def generate_wordcloud(topic_words, topic_num):
    """Draw a word cloud for one word topic on a new matplotlib figure.

    Args:
        topic_words: word frequencies handed to
            ``WordCloud.generate_from_frequencies``.
        topic_num: identifier shown in the figure title as ``Topic <topic_num>``.
    """
    plt.figure(figsize=(16, 4))
    plt.axis("off")
    cloud = WordCloud(width=1600, height=400, background_color='black')
    plt.imshow(cloud.generate_from_frequencies(topic_words))
    plt.title(f"Topic {topic_num}", loc='left', fontsize=20)

In [None]:
# Id of the example cluster examined in the following cells
cluster=1

In [None]:
# Ranked word list (HTML) for the example cluster selected in the `cluster` variable
top_cluster_points[cluster]

<ol><li>32</li><li>sd</li><li>133</li><li>79</li><li>320</li><li>46</li><li>115</li><li>85</li><li>107</li><li>112</li><li>18</li><li>72</li><li>64</li><li>38</li><li>39</li></ol>

In [None]:
# Word cloud for the example cluster selected in the `cluster` variable
generate_wordcloud(top_cluster_points_freq[cluster], cluster)

<img src="../reports/figures/05-WordMAP-topic1.png" title="WordMAP topic 1 visualization" width="800"/>

## View largest word topics

In [None]:
# Number of largest word topics to display
num_topics = 10
# `value_counts` is sorted by size. Remove the HDBSCAN noise label (-1) explicitly
# rather than assuming it is the most frequent label; `errors='ignore'` covers the
# case where no point was labelled as noise.
top_clusters = value_counts.drop(-1, errors='ignore').index[:num_topics]

In [None]:
# Draw one word cloud per selected cluster.
# NOTE: this loop rebinds the notebook-level `cluster` variable.
for cluster in top_clusters:
    generate_wordcloud(top_cluster_points_freq[cluster], cluster)