# Intro 

From our previous investigation into using unsupervised clustering approaches to extracting the main themes of feedback comments of those on the extremely vulnerable form pages, we concluded LDA with [t-SNE](https://www.machinegurning.com/rstats/tsne/) might be a useful approach.  

We use the standard language of NLP and refer to each comment (piece of feedback) as a *document*. The *corpus* is the collection of all documents.

In this notebook we: 

* Topic modeling with LDA
* Visualizing topic models with pyLDAvis
* Visualizing LDA results with t-SNE and bokeh  

# Objective

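Our objective is to extract the main themes from the feedback comments left on the extremely vulnerable form pages, using LDA topic modelling.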



In [None]:
%pylab inline

import os
import pickle as pk
import sys
from collections import OrderedDict

import pandas as pd
from scipy import sparse as sp

# The project package `src` is imported below, so the parent directory has to
# be on the import path *before* that import runs.
# NOTE(review): assumes `src` lives one directory above this notebook - confirm.
sys.path.append("..")

# pre-process and vectorize
import re
import nltk
import src.utils.regex as regex

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
from bokeh.models import Label
from bokeh.io import output_notebook

import seaborn as sns
import matplotlib.colors as mcolors
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# progress bar
from tqdm import tqdm, tqdm_notebook

# instantiate
tqdm.pandas(tqdm_notebook)

# pandas options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

# input and pre processing

* read in as pandas df  
* custom cleaning
* filter rows for relevant pages  
* lemmatize tokens
* convert comments column into array

In [None]:

# Location of the input data, relative to the notebook's working directory.
data_path = '../data'
# CSV holding the feedback responses; the columns used later in this notebook
# are 'Q3', 'PageSequence' and 'intents_clientID'.
feedback_data_path = os.path.join(data_path, 'joined_uis_all_of_march.csv')
# Load the whole file into a DataFrame.
df = pd.read_csv(feedback_data_path)



In [None]:
# Name of the free-text feedback column that is cleaned and modelled.
q3 = "Q3"
# Keep an untouched copy of the column, because df[q3] is overwritten with the
# cleaned text further down.
df['q3_copy'] = df[q3]

# These are terms that are functionally the same but people use different terms, this standardises them
# to be improved..., 
# Mapping of {variant: canonical term}. clean_text iterates this dict in
# insertion order, so longer phrases are listed before the shorter phrases
# they contain (e.g. "self isolation" before "isolation").
# NOTE(review): clean_text strips hyphens before these replacements are
# applied, so the "self-isolation" key cannot match at that point.
same_terms = {
    "travelling": "travel",
    "travellers": "travel",
    "holiday": "travel",
    "self-isolation": "quarantine",
    "selfisolation": "quarantine",
    "self isolation": "quarantine",
    "isolation": "quarantine",
    "statuatory sick pay": "ssp",
    "sick pay": "ssp",
}

def clean_text(text):
    """Normalise one feedback comment before tokenisation.

    Steps, in order: cast to ``str``; spell out the phone numbers 111 and 999
    so they survive the removal of digits; drop every character that is not a
    letter, whitespace or a colon; lower-case; remove coronavirus spellings
    (pattern supplied by ``regex.coronavirus_misspellings_and_typos_regex``)
    and the word "virus"; remove common "I want to know" phrasings; finally
    replace each variant listed in ``same_terms`` with its canonical term.

    Args:
        text: The raw comment. Any value is accepted because it is passed
            through ``str()`` first (so a missing value becomes ``"nan"``).

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text)
    # We'll be removing non alphabetical characters but we want to keep the non emergency phone number
    # '111' in, so we'll just replace that with text
    text = text.replace("111", "oneoneone")
    # Same for 999
    text = text.replace("999", "nineninenine")
    # Remove non alphabetical or space characters (raw string so that `\s`
    # is not treated as a string escape)
    text = re.sub(r"[^a-zA-Z\s:]", "", text)
    text = text.lower()
    text = re.sub(regex.coronavirus_misspellings_and_typos_regex() + "|virus", "", text)
    # People using different terms for "I want to know", so just remove those
    text = re.sub("wanted to find out|to look up about|to get an update|to find infos|to find info|to find out|to understand|to read the|check on advice|to check|ti get advice|to get advice|for information on", "", text)
    for word_to_replace, word_to_replace_with in same_terms.items():
        # str.replace returns a new string, so the result has to be re-bound;
        # otherwise the standardisation silently does nothing.
        text = text.replace(word_to_replace, word_to_replace_with)
    return text

# Clean every comment. progress_map is the tqdm-enabled counterpart of
# Series.map, used here because this step takes a while.
df[q3] = df[q3].progress_map(clean_text)
print("The number of rows and columns after cleaning: ", df.shape)

# Rows without a page sequence cannot be matched to a page, so drop them and
# renumber the index.
df = df.dropna(subset=['PageSequence']).reset_index(drop=True)
print("The number of rows and columns after dropping nulls for PageSequence: ", df.shape)


In [None]:
# Page slugs of interest: a respondent is kept only if one of these appears as
# an element of their '>>'-separated PageSequence.
corona_slugs = ['/coronavirus-extremely-vulnerable', '/done/coronavirus-extremely-vulnerable']


In [None]:
# Pattern of corona-related terms. It is only needed by the (currently
# disabled) relevant-term check described below; it is kept so that check can
# be re-enabled.
corona_related_items_regex = regex.coronavirus_misspellings_and_typos_regex() + '|sick pay|ssp|sick|isolation|closures|quarantine|closure|cobra|cruise|hand|isolat|older people|pandemic|school|social distancing|symptoms|cases|travel|wuhan|care|elderly|care home|carehome'

# We only want to cluster rows that are relevant to corona stuff,
# so we have the column 'has_corona_page'.
# It is true when the respondent visited at least one of the corona slugs.
# (The additional "comment contains a relevant term" condition is disabled:
# people may be using terms not in that list and we might miss out on some insights.)
corona_slug_set = set(corona_slugs)
# Computed column-wise rather than with iterrows(): this is faster, yields a
# proper boolean column and also works when the frame is empty.
df['has_corona_page'] = df['PageSequence'].apply(
    lambda sequence: any(slug in corona_slug_set for slug in sequence.split(">>"))
).astype(bool)
df = df[df['has_corona_page']].reset_index(drop=True)

# Remove duplicate users. The index is reset so that it is contiguous again;
# later cells attach per-document results to df and index-aligned assignments
# would otherwise produce NaNs.
df = df.drop_duplicates('intents_clientID').reset_index(drop=True)

print(df.shape)
df.head()

In [None]:
# One cleaned comment per element, in the same row order as df.
# NOTE(review): `array` is presumably numpy.array brought in by `%pylab inline`.
docs = array(df['Q3'])


# Preprocess and vectorise

In [None]:
def docs_preprocessor(docs, min_token_len=4):
    """Tokenise, filter and lemmatise a collection of documents.

    Each document is lower-cased and split into ``\\w+`` tokens. Tokens made
    up only of digits and tokens shorter than ``min_token_len`` characters are
    dropped, and the remaining tokens are lemmatised with WordNet.

    The input is not modified; a new list is returned.

    Args:
        docs: Iterable of documents (each is passed through ``str()``).
        min_token_len: Minimum token length to keep. The default of 4 keeps
            the previous behaviour, which removed tokens of up to three
            characters.

    Returns:
        A list with one list of lemmatised tokens per input document, in
        input order.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    processed = []
    for doc in docs:
        # Convert to lowercase and split into words.
        tokens = tokenizer.tokenize(str(doc).lower())
        processed.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            # Remove numbers (but not words that contain numbers) and short words.
            if not token.isdigit() and len(token) >= min_token_len
        ])

    return processed

In [None]:
# Replace the raw comments with their token lists (one list per document).
docs = docs_preprocessor(docs)


## Compute bigrams/trigrams
Since the topics are very similar, as per our clustering experiment, phrases rather than single words will help us distinguish between topics.



In [None]:
# Add bigrams and trigrams to docs (only ones that appear 10 times or more).
bigram = Phrases(docs, min_count=10)
# The trigram model is trained on the bigram-merged stream, so it also has to
# be applied to bigram-merged documents (not to the raw token lists).
trigram = Phrases(bigram[docs], min_count=10)

for doc in docs:
    # Both transforms are computed from the document *before* anything is
    # appended to it, so appended phrases are never fed back into the models.
    bigram_doc = bigram[doc]
    trigram_doc = trigram[bigram_doc]

    # Tokens containing '_' are merged phrases.
    bigram_tokens = [token for token in bigram_doc if '_' in token]
    already_added = set(bigram_tokens)
    # Phrases that only the second pass produced (trigrams and longer); the
    # bigrams carried through unchanged are skipped so they are not counted twice.
    longer_tokens = [
        token for token in trigram_doc
        if '_' in token and token not in already_added
    ]

    # The phrases are added alongside the original single-word tokens.
    doc.extend(bigram_tokens + longer_tokens)

## Remove rare and common tokens

In [None]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)
print('Number of unique words in initital documents:', len(dictionary))

# Filter out words that occur less than 10 documents, or more than 20% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.2)
print('Number of unique words after removing rare and common words:', len(dictionary))

## Vectorize data
The first step is to get a bag-of-words representation of each doc. With the bag-of-words corpus, we can move on to learn our topic model from the documents.



In [None]:
# Bag-of-words corpus: one doc2bow result per document, in the same order as
# `docs`. Tokens that were filtered out of the dictionary are not represented.
corpus = [dictionary.doc2bow(doc) for doc in docs]


In [None]:
# Sanity check: vocabulary size after filtering and number of documents.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# Train LDA model
LDA is an unsupervised technique, meaning that we don't know prior to running the model how many topics exist in our corpus. [Topic coherence](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf) is one of the main techniques used to estimate the number of topics. We also used the findings of the clustering notebook.

In [None]:
# to make this reproducible between runs
# https://stackoverflow.com/questions/34831551/ensure-the-gensim-generate-the-same-word2vec-model-for-different-runs-on-the-sam

# Set training parameters.
num_topics = 4
chunksize = 500 # size of the doc looked at every pass
passes = 20 # number of passes through documents
iterations = 400
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

# Visualise the topics
These topics were visualised using a package called LDAvis, which shows the intertopic distance (i.e. how similar or distinct the topics are), and the relative sizes of the topics. The figure below shows the output of the topic model using this visualisation, and it shows the words most strongly associated with each topic. The occurrence of specific words within each topic can also be visualised by selecting a word instead of a topic.

In [None]:
# Build the interactive LDAvis view from the trained model; as the last
# expression of the cell, the result is displayed in the notebook.
# NOTE(review): newer pyLDAvis releases reportedly expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
pyLDAvis.gensim.prepare(model, corpus, dictionary)


## Interpreting the viz

In the left panel, labelled Intertopic Distance Map, the circles represent the different topics and the distances between them. Similar topics appear closer together and dissimilar topics farther apart. The relative size of a topic's circle in the plot corresponds to the relative frequency of the topic in the corpus. An individual topic may be selected for closer scrutiny by clicking on its circle, or by entering its number in the "selected topic" box in the upper-left.

The right panel shows a bar chart of the top 30 terms. When no topic is selected in the plot on the left, the bar chart shows the 30 most "salient" terms in the corpus. A term's saliency is a measure of both how frequent the term is in the corpus and how "distinctive" it is in distinguishing between different topics. Selecting a topic in the left panel changes the bar chart to show the most "relevant" terms for the selected topic. Relevance is defined as in footer 2 and can be tuned with the parameter λ: a smaller λ gives more weight to a term's distinctiveness, while a larger λ corresponds to the probability of the term occurring in the topic.

Therefore, to get a better sense of terms per topic we'll use  λ =0.

## Suitable number of clusters?
Too many, too few? Try tweaking the number of clusters. We also need an objective way to evaluate the topic modelling, we try to do that below. We have to be creative in defining ways to evaluate. I do this in two steps:

* divide each document into two parts and see if the topics assigned to them are similar. => the more similar the better
* compare randomly chosen docs with each other. => the less similar the better


In [None]:
# Attach the token lists to the frame (docs has one entry per row of df).
df['tokenz'] = docs

# Split every document into a first and a second half. Integer division gives
# the same midpoint as truncating len/2 and does not depend on numpy's `int0`
# alias, which newer NumPy releases no longer provide.
docs1 = df['tokenz'].apply(lambda l: l[:len(l) // 2])
docs2 = df['tokenz'].apply(lambda l: l[len(l) // 2:])

In [None]:
# transform the data
corpus1 = [dictionary.doc2bow(doc) for doc in docs1]
corpus2 = [dictionary.doc2bow(doc) for doc in docs2]

# Using the corpus LDA model tranformation
lda_corpus1 = model[corpus1]
lda_corpus2 = model[corpus2]

In [None]:
from collections import OrderedDict


def get_doc_topic_dist(model, corpus, kwords=False):
    """Return the dense document-by-topic weight matrix for a corpus.

    Indexing the model with a document only returns the topics it reports for
    that document; this function fills every other topic with 0 so that each
    document becomes a vector of length ``model.num_topics``.

    Args:
        model: Trained topic model. ``model[bow]`` must yield
            ``(topic_id, weight)`` pairs (or, when ``model.per_word_topics``
            is true, a tuple whose first element is that list) and
            ``model.num_topics`` must give the number of topics.
        corpus: Iterable of bag-of-words documents. These must *not* already
            be transformed by the model, because each one is passed to
            ``model[...]`` here.
        kwords: When true, also return the index of the highest-weighted
            topic of every document.

    Returns:
        ``(top_dist, keys)`` where ``top_dist`` is a float array of shape
        ``(n_docs, model.num_topics)`` and ``keys`` is a list of dominant
        topic ids (empty when ``kwords`` is false).
    """
    # Taken from the model rather than from a notebook-level variable so the
    # function cannot get out of sync with the model it is given.
    n_topics = model.num_topics
    per_word = getattr(model, 'per_word_topics', False)

    top_dist = []
    keys = []

    for bow in corpus:
        doc_topics = model[bow]
        if per_word:
            # With per_word_topics the document topics are the first element.
            doc_topics = doc_topics[0]

        weights = [0.0] * n_topics
        for topic_id, weight in doc_topics:
            weights[topic_id] = weight

        top_dist.append(weights)
        if kwords:
            keys.append(int(array(weights).argmax()))

    # reshape keeps the (n_docs, n_topics) shape even for an empty corpus.
    return array(top_dist, dtype=float).reshape(-1, n_topics), keys

In [None]:
# get_doc_topic_dist applies the model itself, so it is given the
# bag-of-words halves. Passing the already-transformed lda_corpus1/2 would
# run the topic distributions through the model a second time.
top_dist1, _ = get_doc_topic_dist(model, corpus1)
top_dist2, _ = get_doc_topic_dist(model, corpus2)

print("Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):")
print(np.mean([cosine_similarity(c1.reshape(1, -1), c2.reshape(1, -1))[0][0] for c1,c2 in zip(top_dist1, top_dist2)]))

# 400 random (first half, second half) pairs, drawn by row position.
random_pairs = np.random.randint(0, len(df['Q3']), size=(400, 2))

print("Inter similarity: cosine similarity between random parts (lower is better):")
print(np.mean([cosine_similarity(top_dist1[i[0]].reshape(1, -1), top_dist2[i[1]].reshape(1, -1)) for i in random_pairs]))

Depending on these results, we could revisit and adjust the number of clusters.

## What is the Dominant topic and its percentage contribution in each document?
In LDA models, each document is composed of multiple topics. But, typically only one of the topics is dominant. The below code extracts this dominant topic for each sentence and shows the weight of the topic and the keywords in a nicely formatted output.

This way, you will know which document belongs predominantly to which topic.

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=dictionary):
    """Build a per-document table of dominant topic, its weight and keywords.

    Args:
        ldamodel: Trained LDA model (required despite the ``None`` default).
        corpus: Bag-of-words corpus to score with the model.
        texts: One entry per document, appended as the last column.

    Returns:
        DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords`` followed by one unnamed column holding ``texts``.
        Row *i* describes document *i* of ``corpus``. A document for which the
        model reports no topic gets NaN / empty values instead of being
        dropped, so rows stay aligned with ``texts``.

    Raises:
        TypeError: If ``ldamodel`` is not supplied.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained ldamodel")

    keywords_by_topic = {}  # the keyword string is the same for every document of a topic
    rows = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            rows.append([float('nan'), float('nan'), ''])
            continue

        # Dominant topic = highest weight (the first one wins on ties).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        rows.append([topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]])

    # Built in one go: appending row by row is quadratic and DataFrame.append
    # is not available in current pandas.
    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output. The index is reset because
    # concat aligns on index labels and the rows above are numbered 0..n-1.
    contents = pd.Series(texts).reset_index(drop=True)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:
# `texts` has to be the documents themselves (one entry per corpus row), not
# the vocabulary dictionary. A plain list is passed so the text column lines
# up with the corpus rows by position.
df_topic_sents_keywords = format_topics_sentences(ldamodel=model, corpus=corpus, texts=df['Q3'].tolist())

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

## The most representative sentence for each topic
Sometimes you want to get samples of sentences that most represent a given topic. This code gets the most exemplar sentence for each topic.

In [None]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 150

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

## Frequency Distribution of Word Counts in Documents
When working with a large number of documents, you want to know how big the documents are as a whole and by topic. Let’s plot the document word counts distribution.

In [None]:
# Length of the string form of every Text entry.
# NOTE(review): this counts characters, not words, although the plots below
# label the axis "Document Word Count" - confirm which one is intended.
doc_lens = [len(str(d)) for d in df_dominant_topic.Text]


In [None]:
# Plot
plt.figure(figsize=(16,7), dpi=160)
plt.hist(doc_lens, color='navy')
plt.text(750, 100, "Mean   : " + str(round(np.mean(doc_lens))))
plt.text(750,  90, "Median : " + str(round(np.median(doc_lens))))
plt.text(750,  80, "Stdev   : " + str(round(np.std(doc_lens))))
plt.text(750,  70, "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01))))
plt.text(750,  60, "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99))))

plt.gca().set(ylabel='Number of Documents', xlabel='Document Word Count')
plt.tick_params(size=16)
#plt.xticks(np.linspace(0,1000,9))
plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.show()

Using seaborn for the different topics

In [None]:
# One colour per topic, taken in order from matplotlib's Tableau palette.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# 2x2 grid: one panel per topic id 0..3.
# NOTE(review): the grid size is fixed, so it presumably has to be adjusted
# by hand if num_topics changes - confirm.
fig, axes = plt.subplots(2,2,figsize=(16,14), dpi=160, sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):    
    # Documents whose dominant topic is i.
    df_dominant_topic_sub = df_dominant_topic.loc[df_dominant_topic.Dominant_Topic == i, :]
    # Same length measure as the overall histogram (length of the string form).
    doc_lens = [len(str(d)) for d in df_dominant_topic_sub.Text]
    ax.hist(doc_lens, color=cols[i], bins = 20)
    ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i])
    # Density curve on a secondary y-axis so it does not share the count scale.
    # NOTE(review): `shade` may be deprecated in the installed seaborn - confirm.
    sns.kdeplot(doc_lens, color="black", shade=False, ax=ax.twinx())
    ax.set(xlabel='Document Word Count')
    ax.set_ylabel('Number of Documents', color=cols[i])
    ax.set_title('Topic: '+str(i), fontdict=dict(size=16, color=cols[i]))

fig.tight_layout()
# Leave room above the panels for the figure title.
fig.subplots_adjust(top=0.90)
plt.xticks(np.linspace(0,1000,9))
fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
plt.show()

## Common terms by topic
These don't seem to marry up with the topic labels from the interactive viz which is unhelpful...


In [None]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    List the strongest terms of one topic.

    lda_model    -- model exposing show_topic(topic_number, topn=...)
    topic_number -- id of the topic to inspect
    topn         -- how many terms to fetch
    output       -- when true, print every term with its weight (3 decimals)

    Returns the terms as a list, in the order the model reports them.
    """
    ranked = list(lda_model.show_topic(topic_number, topn=topn))

    if output:
        for term, frequency in ranked:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

    return [term for term, _ in ranked]

In [None]:
# Print the ten strongest terms of every topic and keep the top five of each
# in topic_summaries (one list per topic, in topic-id order).
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
# Topic ids start at 0.
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(model, topic_number=i, topn=10, output=True)
    topic_summaries.append(tmp[:5])
    # Blank line between topics (a bare `print` does nothing in Python 3).
    print()

From above, it's possible to inspect each topic and assign a human-interpretable label to it. Here I labeled them as follows:



## Word Clouds of Top N Keywords in Each Topic
Though you’ve already seen what are the topic keywords in each topic, a word cloud with the size of the words proportional to the weight is a pleasant sight. The coloring of the topics I’ve taken here is followed in the subsequent plots as well.

In [None]:
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# `stopwords` must be a collection of words; wordcloud's own STOPWORDS set is
# used here (the NLTK `stopwords` name is a corpus reader, not a word set).
# color_func reads the loop variable `i` when a cloud is generated, so every
# cloud is drawn in the colour of the topic currently being plotted.
cloud = WordCloud(stopwords=STOPWORDS,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

# (topic_id, [(word, weight), ...]) for every topic.
topics = model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= len(topics):
        # Fewer topics than panels: leave the remaining panels empty.
        continue
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

## Creating human readable labels for topics

In [None]:
# Hand-written labels keyed by topic id. They describe one particular trained
# model, so they have to be revisited whenever the model is retrained.
# Key 10 is the placeholder cluster used for documents without a topic.
top_labels = {0: 'High risk with underlying health issue',
              1:'Shopping and online shopping problems',
              2:'Home delivery slot access',
              3:'Vulnerable but not extremely, help',
             10: 'NA'}


## t-SNE

In [None]:
# English stop words as a set, for fast membership tests.
stops = set(stopwords.words('english'))

def paper_to_wordlist( paper, remove_stopwords=True ):
    '''
        Convert a text into a list of lemmatised words.

        Non-letters are replaced by spaces, the text is lower-cased and split
        on whitespace, stop words are removed (unless remove_stopwords is
        False), words of one or two characters are dropped and the remaining
        words are lemmatised with WordNet.

        paper            -- the text to convert
        remove_stopwords -- whether English stop words are removed

        Returns a list of words.
    '''
    # One lemmatizer per call, reused for every word.
    lemmatizer = WordNetLemmatizer()
    # 1. Remove non-letters
    paper_text = re.sub(r"[^a-zA-Z]", " ", paper)
    # 2. Convert words to lower case and split them
    words = paper_text.lower().split()
    # 3. Remove stop words (only when asked to)
    if remove_stopwords:
        words = [w for w in words if w not in stops]
    # 4. Remove short words and 5. lemmatize
    return [lemmatizer.lemmatize(w) for w in words if len(w) > 2]

In [None]:
# TF-IDF over 1- to 3-grams of the tokens produced by paper_to_wordlist.
# Terms must occur in at least 40 documents (min_df) and in at most 20% of
# the documents (max_df).
tvectorizer = TfidfVectorizer(input='content', analyzer = 'word', lowercase=True, stop_words='english',\
                                  tokenizer=paper_to_wordlist, ngram_range=(1, 3), min_df=40, max_df=0.20,\
                                  norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)

# Dense document-term matrix, one row per row of df.
dtm = tvectorizer.fit_transform(df['Q3']).toarray()

In [None]:
# Dense topic weights per document: every topic id starts at 0 and the topics
# reported by the model overwrite their slot.
top_dist = []
for d in corpus:
    tmp = dict.fromkeys(range(num_topics), 0)
    tmp.update(dict(model[d]))
    vals = list(tmp.values())
    top_dist.append(array(vals))

In [None]:
# Topic weights per document plus the dominant topic id of each document.
top_dist, lda_keys = get_doc_topic_dist(model, corpus, True)
# Vocabulary of the TF-IDF matrix, indexed by column. Newer scikit-learn
# releases only provide get_feature_names_out(); older ones only provide
# get_feature_names(), so whichever exists is used.
if hasattr(tvectorizer, 'get_feature_names_out'):
    features = list(tvectorizer.get_feature_names_out())
else:
    features = tvectorizer.get_feature_names()

In [None]:
# Short text representation of each document: its four highest-weighted
# TF-IDF terms, strongest first.
top_ws = []
for n in range(len(dtm)):
    inds = argsort(dtm[n])[::-1][:4]
    tmp = [features[i] for i in inds]

    top_ws += [' '.join(tmp)]

# Plain lists are assigned by position. Wrapping them in a DataFrame would
# align on index labels instead, which yields NaNs whenever df's index is not
# 0..n-1 (for example after dropping duplicate users).
df['Text_Rep'] = top_ws
# used as key later, need int
df['clusters'] = [int(key) for key in lda_keys]

# Colour per cluster id; 10 is the placeholder cluster for "no topic".
cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

df['colors'] = df['clusters'].apply(lambda l: cluster_colors[l])

In [None]:
#?TSNE

In [None]:
# Project the document-topic weights onto two dimensions for plotting.
# No random_state is passed, so the layout can differ between runs.
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(top_dist)

In [None]:
# Store the two t-SNE coordinates next to each document.
df['X_tsne'] =X_tsne[:, 0]
df['Y_tsne'] =X_tsne[:, 1]

In [None]:
# Render Bokeh plots inline in the notebook.
output_notebook()


In [None]:
# Quick look at the cluster id assigned to each document.
df['clusters']

In [None]:
# Data behind the scatter plot: one entry per document. The keys are the
# field names referenced by the glyph and the hover tooltip in the next cell.
source = ColumnDataSource(dict(
    x=df['X_tsne'],
    y=df['Y_tsne'],
    color=df['colors'],
    #label = df['Q3'].progress_map(top_labels),
    # Human-readable topic label, looked up by cluster id.
    label=df['clusters'].apply(lambda l: top_labels[l]),
#     msize= df['marker_size'],
    topic_key= df['clusters'],
    # Shown as "Title" in the tooltip: the cleaned comment text.
    title= df[u'Q3'],
    # Shown as "KeyWords" in the tooltip: the top TF-IDF terms of the comment.
    content = df['Text_Rep']
))

In [None]:
title = 'T-SNE visualization of topics'

# Interactive scatter of the documents in t-SNE space; both axes are hidden.
# NOTE(review): plot_width/plot_height and the `legend=` keyword may be
# deprecated or removed in newer Bokeh releases - confirm against the
# installed version.
plot_lda = figure(plot_width=1000, plot_height=600,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# One point per document, coloured by cluster and grouped in the legend by label.
plot_lda.scatter(x='x', y='y', legend='label', source=source,
                 color='color', alpha=0.8, size=10)#'msize', )

#plot_lda.scatter(x='x', y='y', legend='label', source=source,
#                 color='color', alpha=0.8, size=10)#'msize', )

# hover tools
# The @name placeholders refer to columns of `source`.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "Title: @title, KeyWords: @content - Topic: @topic_key "}
plot_lda.legend.location = "top_left"

show(plot_lda)

#save the plot
# save(plot_lda, '{}.html'.format(title))

#save the plot
# save(plot_lda, '{}.html'.format(title))

## Alternative t-SNE approach


In [None]:
# Get topic weights
topic_weights = []
for i, row_list in enumerate(model[corpus]):
    topic_weights.append([w for i, w in row_list[0]])

In [None]:
# Get topic weights and dominant topics ------------

# Get topic weights
topic_weights = []
for i, row_list in enumerate(model[corpus]):
    topic_weights.append([w for i, w in row_list[0]])

# Array of topic weights    
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh
output_notebook()
n_topics = 4
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), 
              plot_width=900, plot_height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)

# References

* https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html
* https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization
* https://tech.goibibo.com/key-topics-extraction-and-contextual-sentiment-of-users-reviews-20e63c0fd7ca  
* https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/