In [1]:
import numpy as np
import subprocess
import os
import string
from itertools import chain
from nltk.corpus import stopwords

### Get the data

Note: This requires `wget` and `unzip` to be installed on your system.

It retrieves data from `https://archive.org/download/Inaugural-Address-Corpus-1789-2009/inaugural.zip`

In [2]:
subprocess.call("./get_data.sh", shell=True);

### Read in the data

This reads in each speech as a list of words. In particular, it removes all punctuation, stopwords, lower-cases all the words, and splits on white space. Please be aware that you must have `nltk` installed on your system.

In [3]:
def read_data():
    speech_names = os.listdir('inaugural')
    
    docs = []
    for n in speech_names:
        with open('inaugural/' + n, 'rb') as f:
            docs.append(str(f.read()))
            
    translator = str.maketrans('', '', string.punctuation)
    stops = set(stopwords.words("english"))
    docs = [doc.translate(translator).lower().split() for doc in docs]
    docs = [[w for w in doc if w not in stops] for doc in docs]
    return docs

### Implement the collapsed gibbs sampler

See slides [here](https://n-s-f.github.io/talks/lda.html) for more details.

In [4]:
def gibbs(docs, num_topics, iterations=5000):
    num_docs = len(docs)
    
    corpus = list(set(chain(*docs)))
    num_words = len(corpus)
    corpus_lookup = dict(zip(corpus, range(num_words)))
    
    alpha = 1 / num_topics  # flat prior
    beta = 1 / num_words  # flat prior
    
    current_topics = [np.random.randint(0, num_topics, len(words))
                      for words in docs]
    
    topic_counts = np.zeros(num_topics)
    for doc_topics in current_topics:
        for topic in doc_topics:
            topic_counts[topic] += 1

    document_counts = np.zeros((num_docs, num_topics))    
    for d, topics in enumerate(current_topics):
        for t in range(num_topics):
            document_counts[d][t] = sum(topics == t)
            
    word_counts = np.zeros((num_topics, num_words))
    for d, doc in enumerate(docs):
        for w, word in enumerate(doc):
            topic = current_topics[d][w]
            word_id = corpus_lookup[word]
            word_counts[topic][word_id] += 1
            
    for it in range(iterations):
        if it % 10 == 0:
            print('iteration:', it)
        
        for d, doc in enumerate(docs):
            for w, word in enumerate(doc):
                topic = current_topics[d][w]
                document_counts[d][topic] -= 1
                word_counts[topic][corpus_lookup[word]] -= 1
                topic_counts[topic] -= 1

                probs = np.zeros(num_topics)
                for j in range(num_topics):
                    probs[j] = ((
                    (document_counts[d][j] + alpha)
                     * (word_counts[j][corpus_lookup[word]] + beta))
                     / (topic_counts[j] + (beta * num_words)))

                probs = probs / np.sum(probs)
                new_topic = np.where(np.random.multinomial(1, probs))[0][0]
                current_topics[d][w] = new_topic
                document_counts[d][new_topic] += 1
                word_counts[new_topic][corpus_lookup[word]] += 1
                topic_counts[new_topic] += 1

    return current_topics, document_counts, word_counts, topic_counts, corpus

### Reconstruct topics

Given how often each word appears in a topic, we can reconstruct the distributions of words for each topic.

In [5]:
def reconstruct_topics(results):
    topic_distributions = []
    for topic in range(results[2].shape[0]):
        percentages = results[2][topic] / np.sum(results[2][topic])
        dist = sorted(list(zip(results[4], percentages)), key=lambda x: -x[1])
        topic_distributions.append(dist)
    return topic_distributions

### Analyze the data

We'll assume there are 15 topics overall in the inauguration speeches.

In [6]:
docs = read_data()
results = gibbs(docs, num_topics=15, iterations=50)

LookupError: 
**********************************************************************
  Resource 'corpora/stopwords' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/home/noam/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

In [None]:
topic_distributions = reconstruct_topics(results)

In [None]:
[t[:10] for t in topic_distributions]