In [1]:
import pandas as pd
import altair as alt
import spacy
import re
import string

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim import models

import pyLDAvis.gensim

import timeit
from pandarallel import pandarallel

pd.set_option('display.max_colwidth', 100)

In [2]:
# Load the scored briefings CSV into a DataFrame (the path is relative to the
# notebook's working directory).
# NOTE(review): the frame displayed later in this notebook carries an
# "Unnamed: 0" column, which suggests the CSV was written together with its
# index -- confirm whether that column is needed downstream.
briefings_df = pd.read_csv('../data/scored_briefings.csv')

In [3]:
# Load the small English spaCy pipeline; `nlp` is the module-level object that
# preprocess() calls to tokenize, tag and lemmatize each paragraph.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): STOP_WORDS is not referenced by any visible cell (preprocess()
# relies on token.is_stop instead) -- confirm before relying on or removing it.
from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
# function for preprocessing each paragraph of transcript text
def preprocess(text, 
               min_token_len = 2, 
               irrelevant_pos = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']): 
    """
    Lowercase ``text``, collapse its whitespace, run it through the module-level
    spaCy pipeline ``nlp`` and return the lemmas of the tokens worth keeping.

    A token is kept only if it is not a stop word, is at least
    ``min_token_len`` characters long, has a part-of-speech tag that is not in
    ``irrelevant_pos``, does not look like an e-mail address or URL, and is
    purely alphanumeric.

    Parameters
    -------------
    text : (str)
        the text to be preprocessed; any non-string value (e.g. ``None`` or
        the float NaN pandas uses for empty cells) is treated as empty text
    min_token_len : (int)
        minimum number of characters a token must have to be kept
    irrelevant_pos : (list)
        spaCy coarse POS tags whose tokens are discarded (only read, never
        mutated, so the shared default list is safe)

    Returns
    -------------
    (list) the lemmas of the retained tokens, in document order; an empty
    list when there is nothing to process
    """

    # Missing values are not strings and have no .lower(); without this guard
    # a single empty cell would abort the whole (parallel) apply.
    if not isinstance(text, str):
        return []

    # convert input string to lowercase and collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text.lower())

    # Nothing but whitespace can never yield an alphanumeric token, so skip
    # the comparatively expensive spaCy call.
    if not text.strip():
        return []

    # tokenize with spacy, excluding stop words, short tokens,
    # irrelevant POS, emails, urls, and strings containing
    # non-alphanumeric chars
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop
        and len(token.text) >= min_token_len
        and token.pos_ not in irrelevant_pos
        and not token.like_email
        and not token.like_url
        and token.text.isalnum()
    ]

#### Pre-process the raw text prior to topic modelling:

In [5]:
# parallelize preprocessing to reduce execution time; initialize() must run
# before parallel_apply is available on the Series below
pandarallel.initialize(verbose=False)

# apply preprocessor to each row of text and keep the resulting token lists
# next to the raw text in a new 'pp_text' column
briefings_df['pp_text'] = briefings_df.text.parallel_apply(preprocess)

In [6]:
# inspect the preprocessed token lists (pandas truncates the display)
briefings_df['pp_text']

0       [thank, thank, begin, like, extend, deep, condolence, victim, family, milwaukee, wisconsin, toda...
1       [lot, people, think, turn, good, thing, number, priority, standpoint, health, safety, american, ...
2       [total, 15, take, japan, hear, american, citizen, quarantine, get, well, feel, obligation, 42, f...
3       [china, know, start, speak, president, xi, great, talk, work, work, hard, count, report, come, c...
4       [bring, specialist, regarded, specialist, tomorrow, work, state, department, talented, want, und...
                                                       ...                                                 
9674                                                                                            [crosstalk]
9675                                                                                                  [let]
9676    [american, president, lose, americans, course, week, die, entirety, vietnam, war, deserve, reelect]
9677    [yeah, lose, lot, pe

#### Create dictionary and document-term co-occurrence matrix

In [7]:
# Collect the token lists (one per paragraph) and build an unfiltered gensim
# dictionary from them; its size is displayed as a baseline for the filtering
# done in the next cell.
corpus = briefings_df['pp_text'].tolist()
dictionary = corpora.Dictionary(corpus)
len(dictionary)

7911

In [8]:
# build dictionary and filter extremes, removing tokens that appear in
# either: fewer than 2 paragraphs, or in more than 10% of all paragraphs.
# The dictionary is rebuilt from `corpus` here rather than reusing the one from
# the previous cell, so re-running this cell always starts from the full
# vocabulary.
dictionary = corpora.Dictionary(corpus)
dictionary.filter_extremes(no_below = 2, no_above = 0.1)
len(dictionary)

5072

In [9]:
# Words that initial experimentation showed to be irrelevant and harmful to
# the resulting topics; they are dropped from the vocabulary altogether.
remove_words = ['crosstalk', 'question', 'inaudible', 'mr', 'sir', 'dr']

# look up the dictionary ids of those words ...
del_indexes = [
    token_id
    for token_id, token in dictionary.items()
    if token in remove_words
]

# ... and remove them from the dictionary
dictionary.filter_tokens(bad_ids=del_indexes)

In [10]:
# convert every tokenized paragraph into its bag-of-words representation
doc_term_matrix = list(map(dictionary.doc2bow, corpus))

#### Build and visualize the topic model:

In [11]:
# Train the LDA topic model on the bag-of-words corpus. num_topics=5 matches
# the five labels assigned in `topic_labels` below, and random_state is fixed
# so that re-running the notebook reproduces the same topics.
lda = models.LdaModel(corpus=doc_term_matrix,
                      id2word=dictionary,
                      num_topics=5,
                      passes=20,
                      random_state=123)

# Render the interactive topic visualisation inline. sort_topics=False asks
# pyLDAvis not to reorder the topics, so they can be matched against the
# model's own topic ids when choosing labels.
pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary, sort_topics=False)
viz

In [12]:
# Human-readable names for the LDA topics, keyed by topic id (the position in
# the list below is the topic id, starting at 0).
topic_labels = dict(enumerate([
    'New York Outbreak',
    'Testing',
    'Ventilators',
    'Economy',
    'General',
]))

### Use the topic model to predict a topic for each text:

In [13]:
def get_most_prob_topic(unseen_document, model = lda):
    """
    Given an unseen_document, and a trained LDA model, this function
    finds the most likely topic (topic with the highest probability) from the 
    topic distribution of the unseen document and returns that topic's label.

    The text is run through ``preprocess`` and converted to a bag-of-words
    vector with the module-level ``dictionary``; the winning topic id is
    translated to a name via the module-level ``topic_labels``.

    NOTE(review): a document whose tokens are all missing from the dictionary
    (e.g. a bare "[crosstalk]") still receives a label -- such predictions are
    not informative and should be treated with caution.

    Parameters
    ------------
    unseen_document : (str) 
        the document to be labeled with a topic
    model : (gensim ldamodel) 
        the trained LDA model used for inference (defaults to ``lda``)

    Returns: 
    -------------
        (str) the most likely topic label

    Raises:
    -------------
    ValueError
        if the model returns no topic scores for the document
    KeyError
        if the winning topic id has no entry in ``topic_labels``

    Examples:
    ----------
    >> get_most_prob_topic("We're building so so so many ventilators.", 
                            model = lda)
    Ventilators
    """

    # preprocess unseen text and obtain bow vector
    unseen_doc_pp = preprocess(unseen_document)
    bow_vector = dictionary.doc2bow(unseen_doc_pp)

    # calculate topic scores for unseen text with the model that was passed
    # in (previously the global `lda` was always used, ignoring `model`)
    topic_scores = model[bow_vector]
    if not topic_scores:
        raise ValueError("model returned no topic scores for the document")

    # pick the (topic_id, score) pair with the highest score; on ties the
    # first pair wins, so the lowest topic id is returned
    best_topic, _ = max(topic_scores, key=lambda pair: pair[1])

    return topic_labels[best_topic]

In [14]:
# Predict a topic label for every paragraph, in row order.
# get_most_prob_topic takes the raw text, so each paragraph is preprocessed
# again inside the call.
predictions = [get_most_prob_topic(text) for text in briefings_df['text'].tolist()]

# add predicted topics to the briefings df
briefings_df['topic_pred'] = predictions

In [15]:
# inspect the frame with the new 'pp_text' and 'topic_pred' columns
briefings_df

Unnamed: 0,date,timestamp,speaker,text,tb_polarity,tb_subjectivity,v_compound_polarity,anger,anticipation,disgust,fear,joy,nrc_negative,nrc_positive,sadness,surprise,trust,pp_text,topic_pred
0,2020-02-26,05:39,Donald Trump,"Thank you very much everybody. Thank you very much. Before I begin, I’d like to extend my deepes...",0.078559,0.562093,0.7316,0.136364,0.136364,0.090909,0.181818,0.090909,0.272727,0.181818,0.136364,0.090909,0.090909,"[thank, thank, begin, like, extend, deep, condolence, victim, family, milwaukee, wisconsin, toda...",General
1,2020-02-26,06:59,Donald Trump,"A lot of people thought we shouldn’t have done it that early and we did, and it turned out to be...",0.284714,0.431381,0.9510,0.055556,0.222222,0.111111,0.166667,0.055556,0.166667,0.222222,0.055556,0.055556,0.166667,"[lot, people, think, turn, good, thing, number, priority, standpoint, health, safety, american, ...",New York Outbreak
2,2020-02-26,07:51,Donald Trump,"We have a total of 15. We took in some from Japan, you heard about that, because they’re America...",0.221088,0.506516,0.9888,0.000000,0.130435,0.000000,0.043478,0.173913,0.000000,0.391304,0.043478,0.086957,0.217391,"[total, 15, take, japan, hear, american, citizen, quarantine, get, well, feel, obligation, 42, f...",General
3,2020-02-26,09:58,Donald Trump,"China you know about. Where it started. I spoke with President Xi, we had a great talk. He’s wor...",-0.038796,0.439352,0.9124,0.142857,0.214286,0.000000,0.142857,0.214286,0.214286,0.357143,0.000000,0.071429,0.214286,"[china, know, start, speak, president, xi, great, talk, work, work, hard, count, report, come, c...",General
4,2020-02-26,10:52,Donald Trump,"We’re bringing in a specialist, a very highly regarded specialist tomorrow, who works actually a...",0.036440,0.635832,-0.8626,0.000000,0.153846,0.000000,0.076923,0.076923,0.230769,0.153846,0.076923,0.000000,0.153846,"[bring, specialist, regarded, specialist, tomorrow, work, state, department, talented, want, und...",General
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9674,2020-04-27,01:01:23,Unnamed (Reporter),[crosstalk 00:13:23].,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[crosstalk],New York Outbreak
9675,2020-04-27,01:01:24,Donald Trump,"Let’s do one more. Please, in the back.",0.250000,0.250000,0.3804,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[let],General
9676,2020-04-27,01:01:26,Unnamed,If an American president loses more Americans over the course of six weeks than died in the enti...,0.066667,0.200000,-0.8689,0.333333,0.333333,0.000000,0.000000,0.000000,0.000000,0.666667,0.000000,0.000000,0.666667,"[american, president, lose, americans, course, week, die, entirety, vietnam, war, deserve, reelect]",Testing
9677,2020-04-27,01:01:36,Donald Trump,"So yeah, we’ve lost a lot of people, but if you look at what original projections were, 2.2 mill...",0.241667,0.527083,0.9225,0.071429,0.142857,0.000000,0.071429,0.071429,0.142857,0.214286,0.071429,0.071429,0.071429,"[yeah, lose, lot, people, look, original, projection, million, head, 60, thousand, 70, thousand,...",General


In [16]:
# save topics df to csv; index=False keeps the DataFrame index out of the file.
# NOTE(review): the frame displayed above still contains an "Unnamed: 0"
# column, which would be written out as a regular column -- confirm that is
# intended.
briefings_df.to_csv("../data/topic_scored_briefings.csv",index=False)