If you haven't yet, start by setting up your environment and datasets by following the instructions in the README. It should be something like:
* `make create_environment`
* `conda activate covid_nlp`
* `make update_environment`
* `make data`

Several common packages that you may want to use (e.g. UMAP, HDBSCAN, enstop, sklearn) have already been added to the `covid_nlp` environment via `environment.yml`. To add more, edit that file and run:
  `make update_environment`

## Document embedding of abstracts
In this notebook we'll follow https://github.com/ddangelov/Top2Vec/blob/master/notebooks/top2vec_covid19_example.ipynb to embed abstracts using https://github.com/ddangelov/Top2Vec. 

The initial work was done on the individual sections of the papers; here we only use abstracts.

In [None]:
# Quick cell to make jupyter notebook use the full screen width
# (use the public IPython.display module; importing these names from
# IPython.core.display is deprecated)
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Automatically pick up code changes in the `src` module:
# autoreload mode 2 reloads modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [None]:
import json
import pandas as pd
import numpy as np

In [None]:
# Useful imports from easydata
from src import paths
from src.data import Dataset
from src import workflow

In [None]:
# other packages

# embedding + clustering
import hdbscan
import umap
import umap.plot  # submodule is not loaded by `import umap`; needed for umap.plot.interactive
from top2vec import Top2Vec

In [None]:
# Some plotting libraries
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output
%matplotlib inline
from bokeh.plotting import show, save, output_notebook, output_file
from bokeh.resources import INLINE
# Route bokeh `show()` output into the notebook, using the INLINE resources
output_notebook(resources=INLINE)

# Word clouds are used to visualise the topic words
from wordcloud import WordCloud

## Load up the dataset

The metadata has been augmented with where the files can be found relative to `paths["interim_data_path"]`

In [None]:
#paths['interim_data_path']

In [None]:
# List the datasets that are available to load (should not be empty)
workflow.available_datasets()

If the previous cell returned an empty list, go back and re-run `make data` as described at the top of this notebook.

In [None]:
# Name of the dataset to load; also used as a directory name under the interim data path
ds_name = 'covid_nlp_20200319'

In [None]:
# Load the dataset named by `ds_name`
meta_ds = Dataset.load(ds_name)

In [None]:
# Print only the first 457 characters of the dataset description
# NOTE(review): 457 looks hand-picked to end the excerpt cleanly — confirm against DESCR
print(meta_ds.DESCR[:457])

In [None]:
# The processed dataframe is the `data` attribute of this data source
meta_df = meta_ds.data
meta_df.head()

In [None]:
# Inspect how many rows there are per `file_type` (no filtering happens in this cell)
# NOTE(review): the stated goal is "published papers with a cc-by license" — confirm which file_type values correspond to that

meta_df.file_type.value_counts()

In [None]:
# Keep only rows whose file_type is one of the two "*_use_subset" values
meta_df = meta_df[meta_df['file_type'].isin(['comm_use_subset', 'noncomm_use_subset'])]

## Basics on the dataset

The JSON files given in the `path` column of the metadata dataframe are the papers in `json` format (as dicts)
that include the following keys:
* `paper_id`
* `metadata`
* `abstract`
* `body_text`
* `bib_entries`
* `ref_entries`
* `back_matter`

where the `paper_id` is the sha hash from the metadata.

For example:

In [None]:
# Take the first remaining row by position: after the file_type filter above,
# the row labelled 0 is not guaranteed to still be present in `meta_df`.
filename = paths['interim_data_path'] / ds_name / meta_df['path'].iloc[0]
# The context manager closes the file handle as soon as the JSON is parsed.
with open(filename, 'rb') as json_file:
    file = json.load(json_file)
file.keys()

# Embedding abstracts

In [None]:
# Abstract text for every paper that has one (rows with a missing abstract are dropped)
abstracts = meta_df['abstract'].dropna()

In [None]:
# Peek at the first five abstracts
abstracts[:5]

In [None]:
# Number of abstracts that will be embedded
len(abstracts)

## Train Top2Vec Model

Create a joint word and document embedding with Doc2Vec

In [None]:
import gensim

In [None]:
from src import paths

In [None]:
# train doc2vec model
# Build one TaggedDocument per abstract: the tokens come from gensim's
# simple_preprocess and the tag is the abstract's position in `abstracts`.
train_corpus = [
    gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(text), [doc_id])
    for doc_id, text in enumerate(abstracts)
]

In [None]:
%%time
model = gensim.models.doc2vec.Doc2Vec(documents=train_corpus, vector_size=300, min_count=50,
                                      window=15, sample=10e-5, negative=5, hs=0, workers=80,
                                      epochs=40, dm=0, dbow_words=1)

In [None]:
# Location of the saved Doc2Vec model under the processed-data directory
model_name = "top2vec_abstracts"
path = paths['processed_data_path'] / model_name

In [None]:
# Persist the trained model; the path is passed as a plain string
model.save(str(path))

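### Optional break point

If the model was already trained and saved in an earlier session, you can skip the training cells above and reload it from `path` here.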

In [None]:
# Reload the Doc2Vec model previously saved at `path`
model = gensim.models.doc2vec.Doc2Vec.load(str(path))

## Reduce dimension with UMAP

In [None]:
# Stack the per-document vectors (looked up by integer tag 0..count-1)
# into a single matrix with one row per document.
n_docs = model.docvecs.count
doc_matrix = np.vstack([model.docvecs[doc_id] for doc_id in range(n_docs)])

In [None]:
%%time
umap_model_2D = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine', random_state=42).fit(doc_matrix)

## Cluster with HDBSCAN

In [None]:
%%time
cluster = hdbscan.HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom').fit(umap_model_2D.embedding_)
labels = pd.Series(cluster.labels_)
print("Number of -1 labels: ", len(labels[labels==-1])/len(labels))
print("Number of clusters: ", len(set(labels)))

In [None]:
# Interactive plot of the 2D UMAP embedding with the HDBSCAN labels passed in as point labels,
# displayed with bokeh's `show`
fig_clust = umap.plot.interactive(umap_model_2D,labels=labels, theme='darkgreen')
show(fig_clust)

In [None]:
# One-column frame of abstracts; reset_index() moves the original index into
# a column and renumbers the rows 0..n-1.
data_df = abstracts.to_frame(name='abstract').reset_index()

In [None]:
# Drop the old-index column left behind by reset_index(). Re-assigning with
# errors='ignore' keeps this cell safe to re-run; the in-place drop raised
# KeyError the second time because the column was already gone.
data_df = data_df.drop(columns='index', errors='ignore')

In [None]:
# Generate top2vec vectors and topic words
#
# Topic vectors are centroids of dense areas of documents
# Topic words are word vectors closest to topic vector

# Attach each document's cluster label; rows of `data_df` and entries of
# `labels` are both numbered 0..n-1, which are also the Doc2Vec document tags.
doc_group_mapping = data_df.copy()
doc_group_mapping["label"] = labels

topic_vectors = []
sim_words_l = []
top_words_l = []
topic_index = 0  # not used in this cell; kept so any later cell reading it still works

# Remove outlier documents (label -1) as they are noise. The set difference
# does not fail when there is no noise, and sorting fixes the order so that
# position i in the result lists corresponds to cluster label i
# (assumes cluster labels are the contiguous integers 0..k-1 — TODO confirm).
lables_list = sorted(set(labels) - {-1})

for group in lables_list:

    # generate topic vector: the mean of the document vectors in this cluster
    # (float64 accumulation; no hard-coded vector size)
    vec_indices = doc_group_mapping.index[doc_group_mapping.label == group].tolist()
    topic_vector = np.mean(
        [model.docvecs[vec_index] for vec_index in vec_indices],
        axis=0,
        dtype=np.float64,
    )
    topic_vectors.append(topic_vector)

    # find closest word vectors to topic vector
    # (queried through `model.wv`; calling most_similar on the model itself is deprecated)
    sim_words = model.wv.most_similar(positive=[topic_vector], topn=50)
    sim_words_l.append(sim_words)
    top_words_l.append([word for word, _ in sim_words])



### Explore the Topics

In [None]:
# generate word cloud for topic
def generate_wordcloud(top_words, top_num):
    """Draw a word cloud for one topic on a new matplotlib figure.

    Parameters
    ----------
    top_words : iterable of (word, weight) pairs
        Converted to a dict and used as the word frequencies.
    top_num : any
        Topic identifier; shown in the figure title as "Topic <top_num>".
    """
    plt.figure(figsize=(16,4))
    plt.axis("off")

    word_weights = dict(top_words)
    cloud = WordCloud(width=1600, height=400, background_color='black')
    cloud_image = cloud.generate_from_frequencies(word_weights)

    plt.imshow(cloud_image, interpolation='bilinear')
    plt.title("Topic " + str(top_num), loc='left', fontsize=20)

In [None]:
# order topics by size (number of documents in dense cluster)
# value_counts() already returns the labels sorted by count, largest first.
# Re-sorting with sort_values(by="label") depended on how the pandas version
# names the counts column, and could end up sorting by label id instead.
label_counts = doc_group_mapping["label"].value_counts()
label_df = pd.DataFrame(label_counts)

# Skip the noise label (-1); filtering does not fail when there is no noise.
lables_list = [label for label in label_counts.index if label != -1]

# Word clouds for the 20 largest topics.
# NOTE(review): `sim_words_l` is indexed by cluster label here, which assumes
# it was filled in label order 0..k-1 — confirm against the cell that builds it.
for group in lables_list[0:20]:
    generate_wordcloud(sim_words_l[group], group)

## Now to Search by Topic