# Preliminaries

Here are some of my functions that print things nicely.

In [None]:
class ListTable(list):
    """A list of rows that renders itself as one bordered HTML table.

    Each element is an iterable of cell values; every value is formatted
    into its own left-aligned ``<td>``.
    """

    def _repr_html_(self):
        """Return the HTML markup for the table as a single string."""
        cell_template = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
        pieces = ["<table style= 'border: 1px solid black; display:inline-block'>"]
        for row in self:
            cells = ''.join(cell_template.format(value) for value in row)
            pieces.append("<tr>" + cells + "</tr>")
        pieces.append("</table>")
        return ''.join(pieces)
    
def check_float(potential_float):
    """Return True if ``float(potential_float)`` succeeds, otherwise False.

    Besides ``ValueError`` (e.g. the string ``"abc"``), ``float()`` raises
    ``TypeError`` for objects such as ``None`` or a list and ``OverflowError``
    for integers too large for a float; all of these mean "not usable as a
    float" here, so they yield False instead of propagating.
    """
    try:
        float(potential_float)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def round_if_float(v, prec=3):
    """Round ``v`` to ``prec`` decimals when it converts to a float.

    Values that ``check_float`` rejects are returned unchanged; accepted
    values come back as a rounded ``float`` (so a numeric string becomes a
    number).
    """
    return round(float(v), prec) if check_float(v) else v
    
from IPython.core.display import display, HTML
def list_table(the_list, color_nums=False):
    """Display ``the_list`` (an iterable of rows) as an HTML table.

    Every cell value is passed through ``round_if_float``. When
    ``color_nums`` is truthy its value is inserted as the CSS ``color`` of
    each non-zero numeric cell, which is also shown in bold; pass a colour
    string such as ``"red"``.

    Returns whatever ``display`` returns.
    """
    plain_cell = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
    colored_cell = "<td align='left' style='border: .5px solid gray; color: {1}; font-weight: bold'>{0}</td>"

    def render_cell(value):
        # Highlight only when colouring is on and the value is a non-zero number.
        if color_nums and check_float(value) and float(value) != 0:
            return colored_cell.format(round_if_float(value), color_nums)
        return plain_cell.format(round_if_float(value))

    parts = ["<table style= 'border: 1px solid black; display:inline-block'>"]
    for row in the_list:
        parts.append("<tr>" + ''.join(render_cell(value) for value in row) + "</tr>")
    parts.append("</table>")
    return display(HTML(''.join(parts)))
    
import numpy
class MultiTable(list):
    """A list of tables (each a list of rows) rendered side by side in HTML."""

    def _repr_html_(self):
        """Return the concatenated HTML for every table in the list."""

        def render_cell(value):
            # numpy.float32 values are shortened to three decimals for display.
            if isinstance(value, numpy.float32):
                value = str(round(value, 3))
            return "<td align='left' style='border: .5px solid gray;'>{0}</td>".format(value)

        def render_table(rows):
            body = ''.join(
                "<tr>" + ''.join(render_cell(value) for value in row) + "</tr>"
                for row in rows
            )
            return (
                "<table style= 'border: 1px solid black; display:inline-block; margin-right: 10px;'>"
                + body
                + "</table>"
            )

        return ''.join(render_table(table) for table in self)

In [None]:
!pip install pyLDAvis  # Only execute this once

# Latent Dirichlet Allocation

A bunch of the usual imports

The big new libraries here are `gensim` and `pyLDAvis`.

In [None]:
import wikipediaapi
import re
from string import punctuation
import nltk
import matplotlib.pyplot as plt
%matplotlib inline

from gensim import corpora
from gensim import models
import pyLDAvis

import warnings
warnings.filterwarnings('ignore')

## Use the API to get some articles from Wikipedia about the American Civil War

In [None]:
# Titles of the Wikipedia articles that make up the corpus.
# They are written with spaces here and converted to underscored page names
# before being requested.
pages = [
    "American Civil War",
    "Abraham Lincoln",
    "Slavery in the United States",
    "Slave states and free states",
    "Emancipation Proclamation",
    "Robert E. Lee",
    "Ulysses S. Grant",
    "Conclusion of the American Civil War",
    "Origins of the American Civil War",
    "Issues of the American Civil War"
]

In [None]:
import re

def underscorize(pagename):
    """Return ``pagename`` with every space character replaced by an underscore."""
    return pagename.replace(" ", "_")

https://pypi.org/project/Wikipedia-API/

In [None]:
# Wikipedia-API 0.6.0 and later require a descriptive user agent (Wikimedia's
# API etiquette); constructing the client without one fails.
# Replace the contact address below with your own.
wiki_wiki = wikipediaapi.Wikipedia(
        user_agent='lda-topic-modeling-notebook (your-contact@example.com)',
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

Grab the articles and put them in a dictionary.

We'll split the articles up by paragraphs.

In [None]:
# Download each article and store its paragraphs, keyed by the underscored
# page name.
page_dict = {}
for page in pages:
    pagename = underscorize(page)
    print(pagename)  # progress indicator: one line per requested page
    p_wiki = wiki_wiki.page(pagename)
    # The article text is split on newlines; each resulting line is treated
    # as one paragraph.
    page_text = p_wiki.text.split("\n")
    # Drop empty and single-character lines.
    page_paras = [para for para in page_text if len(para) > 1]
    page_dict[pagename] = page_paras

## Wordify each paragraph of each article

We are keeping one big list of all of the wordified paragraphs, as well as storing the wordified paragraphs for each article in a dictionary.

We'll use one of my highly tinkered with wordifiers.

In [None]:
def bruces_alpha_only(text, stem_text=False, alpha_only=True, lower_case=True):
    """Tokenize ``text`` into a list of words.

    Punctuation is first separated from adjacent words with a series of regex
    substitutions, then the text is split on whitespace.

    Args:
        text: The string to tokenize.
        stem_text: If True, each token is stemmed with NLTK's Porter stemmer.
            NLTK is only imported when this option is used.
        alpha_only: If True, keep only tokens that are purely alphabetic,
            start with ``<``, or look like a contraction (e.g. ``didn't``).
        lower_case: If True, lower-case the text before tokenizing.

    Returns:
        A list of token strings.
    """
    import re

    # Compiled once per call instead of once per token.
    contraction_patterns = re.compile(r"(?i)(.)('ll|'re|'ve|n't|'s|'m|'d)\b")

    def is_contraction(the_text):
        return contraction_patterns.search(the_text)

    def return_alpha_only (ltext):
        return [w for w in ltext if (len(w) > 0) and (w.isalpha() or w[0]=='<' or is_contraction(w))]

    if lower_case:
        text = text.lower()
    punctuation_class = r"([\.\-\/&\";:\(\)\?\!\]\[\{\}\*#])"

    # Separate most punctuation at end of words
    text = re.sub(r"(\w)" + punctuation_class, r'\1 \2 ', text)

    # Separate most punctuation at start of words
    text = re.sub(punctuation_class + r"(\w)", r'\1 \2', text)

    # Separate punctuation from other punctuation
    text = re.sub(punctuation_class + punctuation_class, r'\1 \2 ', text)

    # Put spaces between + and = signs and digits. Also %s that follow a digit, $s that come before a digit
    text = re.sub(r"(\d)([+=%])", r'\1 \2 ', text)
    text = re.sub(r"([\$+=])(\d)", r'\1 \2', text)

    # Separate commas if they're followed by space.
    # (E.g., don't separate 2,500)
    text = re.sub(r"(,\s)", r' \1', text)

    # When we have two double quotes make it 1.
    text = re.sub("\"\"", "\"", text)

    # Separate leading and trailing single and double quotes.
    text = re.sub(r"(\'\s)", r' \1', text)
    text = re.sub(r"(\s\')", r'\1 ', text)
    text = re.sub(r"(\"\s)", r' \1', text)
    text = re.sub(r"(\s\")", r'\1 ', text)
    text = re.sub(r"(^\")", r'\1 ', text)
    text = re.sub(r"(^\')", r'\1 ', text)
    # Split off any run of quotes that ends the text. The previous patterns
    # had a stray leading quote, so they only matched '' or '" and left a
    # single closing quote glued to the last word.
    text = re.sub(r"(['\"]+$)", r' \1', text)

    # Separate parentheses where appropriate
    text = re.sub(r"(\)\s)", r' \1', text)
    text = re.sub(r"(\s\()", r'\1 ', text)

    # Separate periods that come before newline or end of string.
    # (Raw string: "\." is not a valid escape in a normal string literal.)
    text = re.sub(r'\. *(\n|$)', ' . ', text)

    # separate single quotes in the middle of words
    # text = re.sub(r"(\w)(\')(\w)", r'\1 \2 \3', text)

    # separate out 's at the end of words
    text = re.sub(r"(\w)(\'s)(\s)", r"\1 s ", text)
    split_text = text.split()

    if stem_text:
        # Imported lazily so that NLTK is only needed when stemming is requested.
        from nltk import stem  # @UnresolvedImport
        stemmer = stem.PorterStemmer()
        split_text = [stemmer.stem(w) for w in split_text]

    if alpha_only:
        split_text = return_alpha_only(split_text)
    return split_text

In [None]:
# wordified_paras: every tokenized paragraph from every page, in one flat list.
# wordified_page_dict: the same tokenized paragraphs grouped by page name.
wordified_paras = []
wordified_page_dict = {}
for name, page in page_dict.items():
    wordified_page_paras = [bruces_alpha_only(para) for para in page]
    wordified_page_dict[name] = wordified_page_paras
    wordified_paras.extend(wordified_page_paras)

In [None]:
len(wordified_paras)

In [None]:
print(wordified_paras[0])

## Remove stopwords as usual

In [None]:
# Read the stop-word file (one word per line) inside a context manager so the
# file handle is closed as soon as the contents have been read.
with open("lists/stop-words_english_1_en.txt") as f:
    stop_list = f.read().split("\n")
# Also treat individual punctuation marks, letters and digits as stop words.
stop_list += list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’')
stop_list += list("abcdefghijklmnopqrstuvwxyz0123456789")
# A set gives constant-time membership tests when filtering tokens.
stop_list = set(stop_list)

In [None]:
# One token list per paragraph, with every stop-list token removed.
docs_no_stop = [
    [w for w in doc if w not in stop_list]
    for doc in wordified_paras
]

## Get the corpus in the form that Gensim likes

A gensim dictionary maps every token (i.e., word) to a number. It also computes some frequencies for us.

https://radimrehurek.com/gensim/corpora/dictionary.html

In [None]:
gensim_dict = corpora.Dictionary(docs_no_stop) 

In [None]:
gensim_dict.token2id["lincoln"]

In [None]:
gensim_dict[23]

In [None]:
gensim_dict.dfs[23]

In [None]:
print(docs_no_stop[0])

A gensim dictionary can be used to create a vector from a document.

Gensim does a better job of representing sparse vectors - vectors with lots of zeros in them.

In [None]:
gensim_dict.doc2bow(docs_no_stop[25])

### Convert all of the documents to bags of words, in the gensim manner

In [None]:
cw_bows = [gensim_dict.doc2bow(doc) for doc in docs_no_stop] 

## Do the LDA

In [None]:
import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.CRITICAL)

In [None]:
# logging.getLogger().setLevel(logging.INFO)
logging.getLogger().setLevel(logging.CRITICAL)

We just have to pass our bags of words and gensim dictionary to gensim.

But there are many, many parameters to specify. And many different ways we might pre-process. And they make a difference.

Also, this won't give the same result every time, unless we specify `random_state`.

In [None]:
# First model: 8 topics trained on the bag-of-words paragraphs.
# Parameter notes (per the gensim LdaModel documentation - confirm against the
# installed version):
#   passes         - number of passes over the whole corpus during training
#   update_every=0 - batch learning rather than online updates
#   iterations     - cap on the per-document inference iterations
#   alpha="auto"   - learn an asymmetric document-topic prior from the data
#   random_state   - fixed seed so repeated runs give the same topics
lda_m1 = models.LdaModel(cw_bows, 
                         id2word=gensim_dict, 
                         num_topics=8,
                         passes=10,
                         update_every=0,
                          chunksize=2000,
                          iterations=100,
                          gamma_threshold=.001,
                          decay=.5,
                          offset=1,
                          random_state=1,
                          alpha="auto")

## Examine the results

In [None]:
lda_m1.print_topics()

Here's a couple of functions that print the results a bit more nicely.

In [None]:
def top_words_from_topic(lda_model, gdict, n, to_print=10, min_oc=10, max_candidates=100):
    """Build a ``[word, weight]`` table of the top words of topic ``n``.

    Args:
        lda_model: Object with a ``show_topic(topic_id, topn)`` method that
            returns ``(word, weight)`` pairs, highest weight first.
        gdict: Object exposing ``token2id`` (word -> id) and ``dfs``
            (id -> document frequency).
        n: Topic number.
        to_print: Maximum number of words to include.
        min_oc: A word is kept only if its document frequency is strictly
            greater than this value.
        max_candidates: How many of the topic's top words are examined.

    Returns:
        A list whose first row is ``["word", "weight"]`` followed by up to
        ``to_print`` rows of ``[word, weight rounded to 3 decimals]``. Fewer
        rows are returned when not enough candidates pass the frequency
        filter (the previous implementation raised IndexError in that case).
    """
    result = [["word", "weight"]]
    for word, weight in lda_model.show_topic(n, max_candidates):
        # The header row does not count towards the limit.
        if len(result) - 1 >= to_print:
            break
        if gdict.dfs[gdict.token2id[word]] > min_oc:
            result.append([word, round(weight, 3)])
    return result

def top_words_from_topics(lda_model, gdict, to_print=10, min_oc=10):
    """Return the ``top_words_from_topic`` table for every topic of the model.

    The result is a list with one table per topic, ordered by topic number
    from 0 to ``lda_model.num_topics - 1``.
    """
    return [
        top_words_from_topic(lda_model, gdict, topic_id, to_print, min_oc)
        for topic_id in range(lda_model.num_topics)
    ]

In [None]:
MultiTable(top_words_from_topics(lda_m1, gensim_dict, min_oc=1))

## Run with some different settings

Let's change the `number of topics` to 5.

Also let's `filter extremes` (words that appear too much or too little).

And, let's up the number of `passes`.

In [None]:
import copy

# filter_extremes() edits the dictionary in place and re-numbers token ids.
# The first model still holds a reference to the unfiltered dictionary as its
# id2word mapping, so filter a copy and rebind the name instead of mutating
# the shared object.
gensim_dict = copy.deepcopy(gensim_dict)
gensim_dict.filter_extremes(no_below=5, no_above=0.25)
# Token ids changed, so the bags of words must be rebuilt.
cw_bows = [gensim_dict.doc2bow(doc) for doc in docs_no_stop]

In [None]:
# Second model. Compared with the first one it uses 5 topics instead of 8,
# 50 passes instead of 10 and a different random seed; the remaining
# arguments are the same.
# NOTE(review): assumes the cell that filters the dictionary and rebuilds
# cw_bows has been run immediately before this one.
lda_m2 = models.LdaModel(cw_bows, 
                         id2word=gensim_dict, 
                         num_topics=5,
                          passes=50,
                          update_every=0,
                          chunksize=2000,
                          iterations=100,
                          gamma_threshold=.001,
                          decay=.5,
                          offset=1,
                          random_state=2,
                          alpha="auto")

In [None]:
MultiTable(top_words_from_topics(lda_m2, gensim_dict, min_oc=1))

### pyLDAvis
pyLDAvis is a library that draws some useful diagrams

In [None]:
import pyLDAvis.gensim_models as gensimvis

vis_data = gensimvis.prepare(lda_m2, cw_bows, gensim_dict)

In [None]:
pyLDAvis.display(vis_data)

## Examine the topics in individual documents

### First gather all of the info we'll need

* each wordified page
* bow for each page
* topics for each overall page

In [None]:
# Per-page lookups, all keyed by page name:
#   wordified_pages - every token of the page in one flat list
#   article_bows    - bag of words for the whole page
#   article_topics  - the second model's topic weights for the whole page
pagenames = list(wordified_page_dict.keys())
wordified_pages = {}
article_bows = {}
article_topics = {}
for pagename in pagenames:
    # Concatenate the page's paragraph token lists.
    flat_list = [item for r in wordified_page_dict[pagename] for item in r]
    page_bow = gensim_dict.doc2bow(flat_list)
    wordified_pages[pagename] = flat_list
    article_bows[pagename] = page_bow
    article_topics[pagename] = lda_m2[page_bow]

In [None]:
article_topics[pagename]

### Put the topics for the pages in a matrix

In [None]:
import numpy as np
# One row per page, one column per topic; topics absent from a page's result
# keep the initial weight of 0.
tarray = np.zeros([len(pagenames), lda_m2.num_topics])
for n, pagename in enumerate(pagenames):
    for topic_id, weight in article_topics[pagename]:
        tarray[n, topic_id] = weight
tarray.shape

### Display a heatmap for the matrix

In [None]:
# Label each topic with its two highest-ranked words joined by a hyphen.
# Row 0 of every table is the header, so the words are in rows 1 and 2.
topic_labels = [
    "-".join([topic[1][0], topic[2][0]])
    for topic in top_words_from_topics(lda_m2, gensim_dict, min_oc=1)
]
topic_labels

In [None]:
# Heatmap of the page-by-topic matrix: rows are pages, columns are topics,
# darker cells mean larger weights (gist_yarg colormap).
import matplotlib
import matplotlib.cm as cm
fig = matplotlib.pyplot.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
dialogs = []  # NOTE(review): not used anywhere in this cell
(nrows, ncols) = tarray.shape
cax = ax.imshow(tarray, cmap=cm.gist_yarg, aspect="auto", interpolation='nearest')

# Major x ticks sit on the cell centres and carry the topic labels; minor
# ticks sit half a cell further on and are only used to draw the grid.
ind = np.arange(ncols)
ax.set_xticks(ind, minor=False)
ax.set_xticks(ind + .5, minor=True)
ax.get_xaxis().set_ticklabels(topic_labels, size="medium", rotation="vertical")

ind = np.arange(nrows)

# Same scheme for the y axis, labelled with the page names.
ax.set_yticks(ind, minor=False)
ax.set_yticks(ind + .5, minor=True)
ax.get_yaxis().set_ticklabels(pagenames, size="small", rotation="horizontal")

# Dotted grid lines on the minor ticks, i.e. between the cells.
ax.grid(True, which='minor', linestyle=':')

fig.set_facecolor("white")

### Look at the topics in an individual article

In [None]:
# Choose one article and compute the topic weights of each of its paragraphs.
# Swap which of the next two lines is commented out to look at another page.
# wordified_page = wordified_page_dict["American_Civil_War"]
wordified_page = wordified_page_dict["Emancipation_Proclamation"]
# NOTE: article_bows and article_topics are rebound here from the per-page
# dictionaries built earlier to per-paragraph sequences for this one article,
# so the page-level matrix cell cannot be re-run after this one without first
# re-running the cell that builds those dictionaries.
article_bows = [gensim_dict.doc2bow(para) for para in wordified_page]
article_topics = lda_m2[article_bows]

In [None]:
import numpy as np
# One row per topic, one column per paragraph of the selected article; topics
# absent from a paragraph's result keep the initial weight of 0.
tarray = np.zeros([lda_m2.num_topics, len(article_topics)])
for n, para in enumerate(article_topics):
    for topic_id, weight in para:
        tarray[topic_id, n] = weight
tarray.shape

In [None]:
# Heatmap of the topic-by-paragraph matrix: rows are topics, columns are the
# paragraphs of the selected article, darker cells mean larger weights.
import matplotlib
import matplotlib.cm as cm
fig = matplotlib.pyplot.figure(figsize=(25, 10))
ax = fig.add_subplot(111)
dialogs = []  # NOTE(review): not used anywhere in this cell
nrows, ncols = tarray.shape
cax = ax.imshow(tarray, cmap=cm.gist_yarg, aspect="auto", interpolation='nearest')
# x axis: one major tick per paragraph, labelled with the paragraph index;
# minor ticks half a cell further on are only used to draw the grid.
indices = np.arange(ncols)
ax.set_xticks(indices, minor=False)
ax.set_xticks(indices + .5, minor=True)
ax.get_xaxis().set_ticklabels(list(range(ncols)), size="medium", rotation="vertical")
# y axis: one major tick per topic, labelled with the topic labels.
indices = np.arange(nrows)
ax.set_yticks(indices, minor=False)
ax.set_yticks(indices + .5, minor=True)
ax.get_yaxis().set_ticklabels(topic_labels, size="small", rotation="horizontal")

# Dotted grid lines on the minor ticks, i.e. between the cells.
ax.grid(True, which='minor', linestyle=':')

fig.set_facecolor("white")

## Find the top paragraphs for each topic

First we build up a data structure. This will include the topic weights for each paragraph, as well as the text of each paragraph.

In [None]:
# Build one record per paragraph across all pages:
#   [topic weights, page name, paragraph index, tokenized paragraph, raw text]
para_topics = []
for pagename, wordified_page in wordified_page_dict.items():
    print(pagename)  # progress indicator
    article_bows = [gensim_dict.doc2bow(para) for para in wordified_page]
    article_topics = lda_m2[article_bows]
    for i, topics in enumerate(article_topics):
        # Index i lines up the tokenized paragraph with its original text.
        para_topics.append([topics, pagename, i, wordified_page[i], page_dict[pagename][i]])

These functions sort this big data structure by the weights of one of the topics and then print the top results.

In [None]:
import copy
def get_topic_weight(tlist, topicnum):
    """Return the weight of topic ``topicnum`` in ``tlist``.

    Args:
        tlist: Iterable of ``(topic_number, weight)`` tuples.
        topicnum: The topic number to look up.

    Returns:
        The weight paired with ``topicnum``, or -1 when the topic does not
        appear in ``tlist`` (including when ``tlist`` is empty).
    """
    for tup in tlist:
        if tup[0] == topicnum:
            return tup[1]
    # Reached only after every tuple has been checked. Previously this return
    # sat inside the loop, so only the first tuple was ever examined.
    return -1

def sort_by_topic(ptopics, topicnum):
    """Return a deep copy of ``ptopics`` ordered by weight of ``topicnum``.

    Each item's first element is taken to be its topic-weight list; items are
    sorted from highest to lowest weight as reported by ``get_topic_weight``.
    The input list is left untouched.
    """
    return sorted(
        copy.deepcopy(ptopics),
        key=lambda item: get_topic_weight(item[0], topicnum),
        reverse=True,
    )

def print_top_for_topic(ptopics, topicnum, to_print=10):
    """Return a table of the paragraphs with the highest weight for a topic.

    Args:
        ptopics: List of paragraph records; element 0 is the topic-weight
            list, element 1 the page name and element 4 the paragraph text.
        topicnum: Topic number to rank by.
        to_print: Number of paragraphs to include.

    Returns:
        A ``ListTable`` with a header row followed by one
        ``[page, text, weight]`` row per paragraph, weight rounded to three
        decimals.
    """
    # Sort the list that was passed in; previously the argument was ignored
    # and the global ``para_topics`` was always used.
    sorted_paras = sort_by_topic(ptopics, topicnum)
    result_table = [["page", "text", "weight"]]
    for presult in sorted_paras[:to_print]:
        result_table.append([presult[1], presult[4], round(get_topic_weight(presult[0], topicnum), 3)])
    return ListTable(result_table)

In [None]:
print_top_for_topic(para_topics, 0)