In [18]:
import pandas as pd
import spacy
import re
import string

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim import models

import pyLDAvis.gensim

from pandarallel import pandarallel

pd.set_option('display.max_colwidth', 100)

In [19]:
# import briefings 
briefings_df = pd.read_csv('../data/scored_briefings.csv')

In [20]:
# Load English spacy model and stop words
nlp = spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS

In [21]:
# function for preprocessing each paragraph of transcript text
def preprocess(text, 
               min_token_len = 2, 
               irrelevant_pos = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE']): 
    """
    Given text, min_token_len, and irrelevant_pos carry out preprocessing of the text 
    and return a preprocessed list of strings. 
    
    Parameters
    -------------
    text : (str) 
        the text to be preprocessed
    min_token_len : (int) 
        min_token_length required
    irrelevant_pos : (list) 
        a list of irrelevant pos tags
    
    Returns
    -------------
    (list) the preprocessed text as a list of strings
    """
    
    # convert input string to lowercase
    text = text.lower()

    # remove multiple whitespace characters
    text = re.sub(r'\s+',' ', text)
    
    # tokenize with spacy, exluding stop words, short tokens, 
    # irrelevant POS, emails, urls, and strings containing 
    # non-alphanumeric chars
    
    doc = nlp(text)
    token_list = []
    for token in doc:
        if token.is_stop == False and len(token.text)>=min_token_len \
            and token.pos_ not in irrelevant_pos and token.like_email == False \
            and token.like_url == False and token.text.isalnum():
            token_list.append(token.lemma_)
        
    return token_list

#### Pre-process the raw text prior to topic modelling:

In [22]:
# parallelize preprocessing to reduce execution time
pandarallel.initialize(verbose=False)

# apply preprocessor to each row of text
briefings_df['pp_text'] = briefings_df.text.parallel_apply(preprocess)

In [23]:
briefings_df['pp_text']

0       [thank, thank, begin, like, extend, deep, condolence, victim, family, milwaukee, wisconsin, toda...
1       [lot, people, think, turn, good, thing, number, priority, standpoint, health, safety, american, ...
2       [total, 15, take, japan, hear, american, citizen, quarantine, get, well, feel, obligation, 42, f...
3       [china, know, start, speak, president, xi, great, talk, work, work, hard, count, report, come, c...
4       [bring, specialist, regarded, specialist, tomorrow, work, state, department, talented, want, und...
                                                       ...                                                 
9674                                                                                            [crosstalk]
9675                                                                                                  [let]
9676    [american, president, lose, americans, course, week, die, entirety, vietnam, war, deserve, reelect]
9677    [yeah, lose, lot, pe

#### Create dictionary and document-term co-occurence matrix

In [24]:
# build dictionary
corpus = briefings_df['pp_text'].tolist()
dictionary = corpora.Dictionary(corpus)
len(dictionary)

7911

In [25]:
# build dictionary and filter extremes, removing tokens that appear in
# either: fewer than 5 paragraphs, or in more than 10% of all paragraphs
dictionary = corpora.Dictionary(corpus)
dictionary.filter_extremes(no_below = 10, no_above = 0.1)
len(dictionary)

2037

In [26]:
# define words to be manually removed and retrieve their indexes
remove_words = ['crosstalk', 'question', 'inaudible', 'mr', 'sir', 'dr']
del_indexes = [k for k,v in dictionary.items() if v in remove_words]

# remove unwanted word ids from the dictionary
dictionary.filter_tokens(bad_ids=del_indexes)

In [27]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

#### Build and visualize the topic model:

In [28]:
lda = models.LdaModel(corpus=doc_term_matrix,
                      id2word=dictionary,
                      num_topics=6,
                      passes=20,
                      random_state=123)

pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(lda, doc_term_matrix, dictionary, sort_topics=False)
viz

In [29]:
topic_labels = {0:'Economy',
                1:'International',
                2:'Policy & Guidelines',
                3:'Testing',
                4:'Ventilators & NY Outbreak',
                5:'PPE'
               }

### Use the topic model to predict a topic for each text:

In [30]:
def get_most_prob_topic(unseen_document, model = lda):
    """
    Given an unseen_document, and a trained LDA model, this function
    finds the most likely topic (topic with the highest probability) from the 
    topic distribution of the unseen document and returns the best topic
    
    Parameters
    ------------
    unseen_document : (str) 
        the document to be labeled with a topic
    model : (gensim ldamodel) 
        the trained LDA model
    
    Returns: 
    -------------
        (str) the most likely topic label
    
    Examples:
    ----------
    >> get_most_prob_topic("We're building so so so many ventilators.", 
                            model = lda)
    Ventilators
    """
    
    # preprocess unseen text and obtain bow vector
    unseen_doc_pp = preprocess(unseen_document)
    bow_vector = dictionary.doc2bow(unseen_doc_pp)
    
    # calculate topic scores for unseen text
    scores_df = pd.DataFrame(lda[bow_vector], columns =['topic', 'score']) 
    
    # find topic name of max score
    topic_name = topic_labels[scores_df.loc[scores_df['score'].idxmax(), 'topic']]
    best_score = scores_df['score'].max()
    
    return topic_name, best_score;

In [31]:
# create empty lists to store prediction strings
predictions = []
scores = []

# call function for each unseen text, appending predictions to list
for text in briefings_df['pp_text'].tolist():
    # we'll only predict a topic for texts where there are4 or more token words present
    if len(text) > 4:
        topic, value = get_most_prob_topic(' '.join(text))
        predictions.append(topic)
        scores.append(value)
    else:
        predictions.append(None)
        scores.append(None)

# add prediction values to main df
briefings_df['topic_pred'] = predictions
briefings_df['topic_score'] = scores

In [32]:
briefings_df

Unnamed: 0,date,timestamp,speaker,text,anger,anticipation,disgust,fear,joy,nrc_negative,nrc_positive,sadness,surprise,trust,pp_text,topic_pred,topic_score
0,2020-02-26,05:39,Donald Trump,"Thank you very much everybody. Thank you very much. Before I begin, I’d like to extend my deepes...",0.136364,0.136364,0.090909,0.181818,0.090909,0.272727,0.181818,0.136364,0.090909,0.090909,"[thank, thank, begin, like, extend, deep, condolence, victim, family, milwaukee, wisconsin, toda...",International,0.544385
1,2020-02-26,06:59,Donald Trump,"A lot of people thought we shouldn’t have done it that early and we did, and it turned out to be...",0.055556,0.222222,0.111111,0.166667,0.055556,0.166667,0.222222,0.055556,0.055556,0.166667,"[lot, people, think, turn, good, thing, number, priority, standpoint, health, safety, american, ...",Policy & Guidelines,0.573967
2,2020-02-26,07:51,Donald Trump,"We have a total of 15. We took in some from Japan, you heard about that, because they’re America...",0.000000,0.130435,0.000000,0.043478,0.173913,0.000000,0.391304,0.043478,0.086957,0.217391,"[total, 15, take, japan, hear, american, citizen, quarantine, get, well, feel, obligation, 42, f...",International,0.337955
3,2020-02-26,09:58,Donald Trump,"China you know about. Where it started. I spoke with President Xi, we had a great talk. He’s wor...",0.142857,0.214286,0.000000,0.142857,0.214286,0.214286,0.357143,0.000000,0.071429,0.214286,"[china, know, start, speak, president, xi, great, talk, work, work, hard, count, report, come, c...",International,0.599763
4,2020-02-26,10:52,Donald Trump,"We’re bringing in a specialist, a very highly regarded specialist tomorrow, who works actually a...",0.000000,0.153846,0.000000,0.076923,0.076923,0.230769,0.153846,0.076923,0.000000,0.153846,"[bring, specialist, regarded, specialist, tomorrow, work, state, department, talented, want, und...",Testing,0.322573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9674,2020-04-27,01:01:23,Unnamed (Reporter),[crosstalk 00:13:23].,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[crosstalk],,
9675,2020-04-27,01:01:24,Donald Trump,"Let’s do one more. Please, in the back.",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[let],,
9676,2020-04-27,01:01:26,Unnamed,If an American president loses more Americans over the course of six weeks than died in the enti...,0.333333,0.333333,0.000000,0.000000,0.000000,0.000000,0.666667,0.000000,0.000000,0.666667,"[american, president, lose, americans, course, week, die, entirety, vietnam, war, deserve, reelect]",Policy & Guidelines,0.586939
9677,2020-04-27,01:01:36,Donald Trump,"So yeah, we’ve lost a lot of people, but if you look at what original projections were, 2.2 mill...",0.071429,0.142857,0.000000,0.071429,0.071429,0.142857,0.214286,0.071429,0.071429,0.071429,"[yeah, lose, lot, people, look, original, projection, million, head, 60, thousand, 70, thousand,...",International,0.455969


In [33]:
# save topics df to csv
briefings_df.to_csv("../data/topic_scored_briefings.csv",index=False)

In [34]:
briefings_df[briefings_df['topic_pred'].isnull()]

Unnamed: 0,date,timestamp,speaker,text,anger,anticipation,disgust,fear,joy,nrc_negative,nrc_positive,sadness,surprise,trust,pp_text,topic_pred,topic_score
20,2020-02-26,25:35,Donald Trump,[inaudible 00:25:35] please.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[inaudible],,
21,2020-02-26,25:38,Dr. Anthony Fauci,I just want to give you a very quick update on the…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[want, quick, update]",,
22,2020-02-26,25:41,Unnamed,[inaudible 00:04:41].,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[inaudible],,
27,2020-02-26,28:04,Donald Trump,"Okay, thank you [inaudible 00:07:05].",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[okay, thank, inaudible]",,
34,2020-02-26,30:08,Crowd,[crosstalk 00:09:08].,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[crosstalk],,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9670,2020-04-27,01:01:12,Unnamed,… that Kim Jong-un-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[kim, jong]",,
9672,2020-04-27,01:01:22,Unnamed,But anyways-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[],,
9673,2020-04-27,01:01:22,Donald Trump,"Okay, go ahead.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[okay],,
9674,2020-04-27,01:01:23,Unnamed (Reporter),[crosstalk 00:13:23].,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[crosstalk],,
