In [4]:
import pandas as pd
import altair as alt

import spacy
import re
import string

import gensim 
from gensim.models import LdaModel

import gensim.corpora as corpora
from gensim.corpora import Dictionary

from gensim import models

import pyLDAvis.gensim

import timeit
from pandarallel import pandarallel

pd.set_option('display.max_colwidth', 100)

In [5]:
# import briefings 
briefings_df = pd.read_csv('../data/whtfb/all_briefings.csv')

In [6]:
# Load sEnglish spacy model and stop words
nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS

In [7]:
# function for preprocessing each paragraph of transcript speach 
def preprocess(text, 
               min_token_len = 2, 
               irrelevant_pos = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']): 
    """
    Given text, min_token_len, and irrelevant_pos carry out preprocessing of the text 
    and return a preprocessed list of strings. 
    
    Parameters
    -------------
    text : (str) 
        the text to be preprocessed
    min_token_len : (int) 
        min_token_length required
    irrelevant_pos : (list) 
        a list of irrelevant pos tags
    
    Returns
    -------------
    (list) the preprocessed text as a list of strings
    """
    
    # convert input string to lowercase
    text = text.lower()
    
    # remove some of the obviously irrelevant character patterns
    text = re.sub('from', '', text)
    text = re.sub('subject', '', text)
    text = re.sub('organization', '', text)
    text = re.sub('distribution', '', text)
    text = re.sub('lines', '', text)
    text = re.sub('article', '', text)

    # remove multiple whitespace characters
    text = re.sub(r'\s+',' ', text)
    
    # tokenize with spacy, exluding stop words, short tokens, 
    # irrelevant POS, emails, urls, and strings containing 
    # non-alphanumeric chars
    
    doc = nlp(text)
    token_list = []
    for token in doc:
        if token.is_stop == False and len(token.text)>=min_token_len \
            and token.pos_ not in irrelevant_pos and token.like_email == False \
            and token.like_url == False and token.text.isalnum():
            token_list.append(token.lemma_)
        
    return token_list

#### Pre-process the raw text cells:

In [8]:
# parallelize preprocessing to reduce execution time
pandarallel.initialize(verbose=False)

# apply preprocessor to each row of text
briefings_df['pp_text'] = briefings_df.text.parallel_apply(preprocess)

In [9]:
briefings_df['pp_text']

0       [thank, thank, begin, like, extend, deep, condolence, victim, family, milwaukee, wisconsin, toda...
1       [lot, people, think, turn, good, thing, number, priority, standpoint, health, safety, american, ...
2       [total, 15, take, japan, hear, american, citizen, quarantine, get, well, feel, obligation, 42, f...
3       [china, know, start, speak, president, xi, great, talk, work, work, hard, count, report, come, c...
4       [bring, specialist, regarded, specialist, tomorrow, work, state, department, talented, want, und...
                                                       ...                                                 
7171    [follow, approach, follow, approach, think, million, people, dead, sweden, have, lot, difficulty...
7172    [minimum, number, reason, number, think, american, people, discipline, think, possible, guess, w...
7173    [okay, number, cut, half, people, right, unacceptable, people, spend, money, stimulus, care, com...
7174    [say, get, work, get

#### Create dictionary and document-term co-occurence matrix

In [23]:
# build dictionary
corpus = briefings_df['pp_text'].tolist()
dictionary = corpora.Dictionary(corpus)
len(dictionary)

6794

In [24]:
# build dictionary and filter extremes,
# removing tokens that appear in fewer than 2 paragraphs AND more than 5% of paragraphs
dictionary = corpora.Dictionary(corpus)
dictionary.filter_extremes(no_below = 2, no_above = 0.05)
len(dictionary)

4315

In [25]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

#### Build the topic model

In [26]:
num_topics = [7]
passes = [10]

for num in num_topics:
    for num_pass in passes:
        lda = models.LdaModel(corpus=doc_term_matrix, 
                      id2word=dictionary, 
                      num_topics=num, 
                      passes=num_pass,
                      random_state=123)
        print("Topics =",num,", Passes =", num_pass)
        print(lda.print_topics(num_words=8))
        print()
        print()

Topics = 7 , Passes = 10
[(0, '0.018*"healthcare" + 0.015*"supply" + 0.014*"mask" + 0.012*"worker" + 0.010*"government" + 0.010*"force" + 0.009*"national" + 0.009*"task"'), (1, '0.032*"ventilator" + 0.015*"million" + 0.015*"thousand" + 0.011*"testing" + 0.009*"inaudible" + 0.009*"sir" + 0.009*"mike" + 0.008*"send"'), (2, '0.031*"business" + 0.021*"money" + 0.020*"company" + 0.016*"billion" + 0.016*"small" + 0.014*"pay" + 0.013*"worker" + 0.009*"dollar"'), (3, '0.011*"united" + 0.011*"spread" + 0.010*"states" + 0.010*"guide" + 0.010*"oil" + 0.010*"open" + 0.010*"border" + 0.009*"place"'), (4, '0.018*"yeah" + 0.018*"china" + 0.016*"year" + 0.012*"life" + 0.011*"okay" + 0.011*"bad" + 0.011*"problem" + 0.010*"deal"'), (5, '0.031*"area" + 0.019*"jersey" + 0.013*"city" + 0.012*"system" + 0.012*"washington" + 0.011*"send" + 0.010*"louisiana" + 0.010*"african"'), (6, '0.017*"datum" + 0.012*"model" + 0.012*"community" + 0.012*"testing" + 0.011*"birx" + 0.010*"important" + 0.009*"fauci" + 0.009*

#### Visualize the topic model:

In [27]:
pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary, sort_topics=False)
viz

### Ideas for topic exploration:
 - Ventilators
 - China (spec: Wuhan)
 - PPE
 - Economy
 - Science