In [1]:
import pickle
import pandas as pd
import numpy as np
import os

## Train an LDA Topic Model

In [2]:
# Load the preprocessed texts and the (corpus, id2word) pair pickled upstream.
# The context managers close the file handles as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('data/texts_processed_lda.pkl', 'rb') as f:
    texts = pickle.load(f)
with open('data/corpus_lda.pkl', 'rb') as f:
    corpus, id2word = pickle.load(f)

In [7]:
def train_lda_model(corpus, id2word, texts, model_type='lda', start=10, limit=50, step=3):
    """Train one topic model per candidate topic count and pick the most coherent.

    A model is trained for every ``num_topics`` in ``range(start, limit + 1, step)``,
    each in its own worker thread, and scored with gensim's ``c_v`` coherence.
    The coherence curve is saved to ``{model_type}_coherence_values_topics.png``
    in the current working directory.

    Args:
        corpus: corpus handed to the gensim model constructor.
        id2word: dictionary handed to the model and to ``CoherenceModel``.
        texts: texts handed to ``CoherenceModel`` for the ``c_v`` score.
        model_type: ``'lda'`` trains ``gensim.models.ldamodel.LdaModel``; any other
            value trains the Mallet wrapper. The Mallet archive is downloaded and
            unzipped only when ``model_type == 'mallet'`` and the zip file is absent.
        start: first candidate topic count.
        limit: last candidate topic count (inclusive).
        step: stride between candidate topic counts.

    Returns:
        ``(model_dict, coherence_values_dict, best_num_topics)`` where both dicts
        are keyed by topic count and ``best_num_topics`` is the topic count with
        the highest coherence (the first one on ties).

    Raises:
        ValueError: if ``start``/``limit``/``step`` describe an empty range.
        Exception: whatever a worker raised while training or scoring a model.
    """
    import os
    from concurrent.futures import ThreadPoolExecutor

    import gensim
    import numpy as np
    from gensim.models import CoherenceModel

    topic_counts = list(range(start, limit + 1, step))
    if not topic_counts:
        raise ValueError(
            f'no topic counts to try: range({start}, {limit + 1}, {step}) is empty')

    if model_type == 'mallet' and not os.path.exists('mallet-2.0.8.zip'):
        os.system('wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip')
        os.system('unzip mallet-2.0.8.zip')
    mallet_path = 'mallet-2.0.8/bin/mallet'

    def _score(model, num_topics):
        # Shared scoring step: c_v coherence of a trained model against `texts`.
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
        coh_v = coherence_model.get_coherence()
        print(f'num of topic: {num_topics}/{limit}, coherence value: {coh_v}')
        return model, coh_v

    def lda_model_func(corpus, id2word, num_topics):
        print(f'Training model with {num_topics} topics')
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        return _score(model, num_topics)

    def mallet_model_func(mallet_path, corpus, num_topics, id2word):
        print(f'Training model with {num_topics} topics')
        # NOTE(review): gensim.models.wrappers was removed in gensim 4.0, so this
        # branch needs a gensim 3.x install -- confirm the pinned version.
        model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                 corpus=corpus,
                                                 num_topics=num_topics,
                                                 id2word=id2word,
                                                 iterations=50,
                                                 workers=1,
                                                 random_seed=42)
        return _score(model, num_topics)

    # One worker per candidate so that all models train concurrently. Leaving the
    # `with` block waits for every worker to finish.
    with ThreadPoolExecutor(max_workers=len(topic_counts)) as pool:
        if model_type == 'lda':
            futures = [pool.submit(lda_model_func, corpus, id2word, n) for n in topic_counts]
        else:
            futures = [pool.submit(mallet_model_func, mallet_path, corpus, n, id2word)
                       for n in topic_counts]

    coherence_values_dict = {}
    coherence_values = []
    model_dict = {}
    for n, future in zip(topic_counts, futures):
        # result() re-raises a worker's exception instead of hiding it.
        model, coh_v = future.result()
        model_dict[n] = model
        coherence_values_dict[n] = coh_v
        coherence_values.append(coh_v)

    import matplotlib.pyplot as plt
    plt.plot(topic_counts, coherence_values, marker='o')
    plt.xlabel('n_topics')
    plt.ylabel('coherence score')
    plt.savefig(f'{model_type}_coherence_values_topics.png')
    plt.close()

    # Map the argmax position back to its topic count; `start + position` is only
    # correct when step == 1.
    best_num_topics = topic_counts[int(np.argmax(coherence_values))]

    return model_dict, coherence_values_dict, best_num_topics

In [5]:
# Train topic models with different K (n_topics), or reuse the cached result.
# lda_models is a dictionary {K: model_K, K+1: model_K+1, ...}
# Note that the training call uses model_type='mallet', not plain gensim LDA.
if not os.path.exists('data/lda_models.pkl'):
    lda_models, coh_values, max_idx = train_lda_model(corpus, id2word, texts, model_type='mallet', start=10, limit=50, step=1)
else:
    # Close the cache file as soon as it has been read.
    with open('data/lda_models.pkl', 'rb') as f:
        lda_models, coh_values, max_idx = pickle.load(f)

In [15]:
# Show the coherence values of the different K (n_topics), best first.
# NOTE(review): in the output shown below the best coherence is at n_topics=25
# (row index 15), not K=15. The author judged the topics of the best-scoring
# model to be not good after manual inspection.
df_ntopics_coh = pd.DataFrame(list(coh_values.items()), columns=['n_topics', 'coh_val'])
df_ntopics_coh.sort_values(by='coh_val', ascending=False)

Unnamed: 0,n_topics,coh_val
15,25,0.547594
29,39,0.541422
21,31,0.530214
24,34,0.528043
22,32,0.527386
12,22,0.526305
18,28,0.524733
25,35,0.523584
38,48,0.523282
13,23,0.522512


In [11]:
# choose the model with K=39 (second-highest coherence in the table shown above)
lda_model = lda_models[39]

In [12]:
# Show the top-10 keywords of each topic of the K=39 model with their probabilities.
# formatted=False yields (topic_id, [(keyword, prob), ...]) tuples, as in the output below;
# num_topics=-1 asks gensim for all topics.
lda_models[39].show_topics(num_topics=-1, formatted=False)

[(0,
  [('percent', 0.07609897872683971),
   ('find', 0.029951963831590846),
   ('vaccine', 0.026649981835062366),
   ('accord', 0.024672021959391274),
   ('study', 0.01910951439066726),
   ('datum', 0.017050821458846326),
   ('high', 0.016889355346546644),
   ('research', 0.013829572518467687),
   ('researcher', 0.013530860210713277),
   ('show', 0.012957655512049408)]),
 (1,
  [('police', 0.060845002219442314),
   ('officer', 0.032516847584843224),
   ('man', 0.03183083814212501),
   ('black', 0.029692102820709415),
   ('protest', 0.02858641701303418),
   ('people', 0.014398127597756346),
   ('arrest', 0.011686372624187886),
   ('kill', 0.011637948428231307),
   ('protester', 0.010814737096969453),
   ('matter', 0.01015293975222953)]),
 (2,
  [('coronavirus', 0.04370887372606212),
   ('pandemic', 0.02627741028178551),
   ('federal', 0.02575654982030314),
   ('supply', 0.024966578120388213),
   ('government', 0.024037710297411325),
   ('make', 0.022292827751445386),
   ('effort', 0.01

In [13]:
# Index each topic: map topic id -> [(keyword, prob), ...], then collect
# [topic_id, [keyword, ...]] rows in ascending topic-id order.
topics = dict(lda_model.show_topics(num_topics=-1, num_words=10, formatted=False))
df_topic = [
    [topic_id, [pair[0] for pair in topics[topic_id]]]
    for topic_id in range(len(topics))
]
df_topic

[[0,
  ['percent',
   'find',
   'vaccine',
   'accord',
   'study',
   'datum',
   'high',
   'research',
   'researcher',
   'show']],
 [1,
  ['police',
   'officer',
   'man',
   'black',
   'protest',
   'people',
   'arrest',
   'kill',
   'protester',
   'matter']],
 [2,
  ['coronavirus',
   'pandemic',
   'federal',
   'supply',
   'government',
   'make',
   'effort',
   'ventilator',
   'response',
   'agency']],
 [3,
  ['travel',
   'flight',
   'home',
   'airline',
   'passenger',
   'day',
   'air',
   'hotel',
   'trip',
   'water']],
 [4,
  ['team',
   'season',
   'player',
   'game',
   'play',
   'league',
   'sport',
   'fan',
   'year',
   'start']],
 [5,
  ['coronavirus',
   'virus',
   'test',
   'covid',
   'people',
   'tested_positive',
   'testing',
   'positive',
   'symptom',
   'spread']],
 [6,
  ['case',
   'report',
   'number',
   'death',
   'health',
   'coronavirus',
   'confirm',
   'official',
   'accord',
   'covid']],
 [7,
  ['event',
   'plan',
 

## Analyze the topic model

In [14]:
import pandas as pd

# Load the article metadata and pull the columns that get attached to each
# (document, topic) pair into plain Python lists.
# NOTE(review): these lists are later indexed by a document's position in
# `corpus`, so df_news.csv is assumed to be in the same row order as the
# corpus -- confirm.
df_news = pd.read_csv('data/df_news.csv')
sources = df_news['source'].values.tolist()
dates = df_news['date'].values.tolist()
months = df_news['month'].values.tolist()
weeks = df_news['week'].values.tolist()
titles = df_news['title'].values.tolist()

In [15]:
# rank topics by their total probability mass over the corpus
def rank_topics(lda_model, corpus):
    """Sum each topic's probability over all documents and rank the topics.

    Returns a list of ``[topic_index, summed_probability]`` pairs, largest mass
    first. Topics that never appear keep a mass of 0.
    """
    mass = [0] * lda_model.num_topics
    for doc_topics in lda_model[corpus]:
        for topic_id, weight in doc_topics:
            mass[topic_id] += weight
    ranked = [[topic_id, total] for topic_id, total in enumerate(mass)]
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)

In [16]:
topic_ranks = rank_topics(lda_model, corpus)
topic_ranks

[[10, 2061.0078541963785],
 [6, 1921.8196428800134],
 [12, 1834.9685792438377],
 [8, 1831.638274807569],
 [4, 1813.6535288510402],
 [5, 1804.6654841283748],
 [24, 1793.5117213467095],
 [34, 1792.4524913080115],
 [26, 1791.5410840021204],
 [33, 1783.6205681449187],
 [23, 1751.1746272875278],
 [27, 1749.2953584470933],
 [11, 1710.9470210139702],
 [35, 1706.8106171505997],
 [0, 1698.7306810362497],
 [21, 1698.0432775828679],
 [1, 1694.9830529963629],
 [28, 1694.818070154643],
 [30, 1687.0208108210722],
 [13, 1669.0697457320612],
 [7, 1662.0866777675242],
 [32, 1660.9618627095483],
 [37, 1657.8836238821452],
 [36, 1657.5687942359689],
 [15, 1654.076354500273],
 [19, 1636.1253505695197],
 [25, 1633.6468073963586],
 [20, 1623.9863913720485],
 [2, 1623.3422341343485],
 [29, 1619.5717911290537],
 [18, 1619.3414882852069],
 [22, 1616.7250545453246],
 [38, 1614.2181218587525],
 [9, 1614.2012280913223],
 [14, 1610.6713650047177],
 [17, 1598.5037906547725],
 [3, 1598.5001286205097],
 [31, 1593.379

In [19]:
# Create a dataframe for the topics: the index, the keywords, and the probability mass.
# topic_ranks is sorted by mass (largest first), so its i-th entry is NOT topic i.
# Look each topic's mass up by topic id instead of by list position, and take the
# number of topics from the data rather than hard-coding it.
mass_by_topic = {topic_idx: mass for topic_idx, mass in topic_ranks}
df_topic = pd.DataFrame(
    [[row[0], row[1], mass_by_topic[row[0]]] for row in df_topic],
    columns=['topic_idx', 'topic_stems', 'probs'],
)
df_topic

Unnamed: 0,topic_idx,topic_stems,probs
0,0,"[percent, find, vaccine, accord, study, datum,...",1698.730681
1,1,"[police, officer, man, black, protest, people,...",1694.983053
2,2,"[coronavirus, pandemic, federal, supply, gover...",1623.342234
3,3,"[travel, flight, home, airline, passenger, day...",1598.500129
4,4,"[team, season, player, game, play, league, spo...",1813.653529
5,5,"[coronavirus, virus, test, covid, people, test...",1804.665484
6,6,"[case, report, number, death, health, coronavi...",1921.819643
7,7,"[event, plan, june, announce, due, july, hold,...",1662.086678
8,8,"[state, order, reopen, county, california, gov...",1831.638275
9,9,"[post, twitter, video, facebook, tweet, social...",1614.201228


In [20]:
# figure out the probability that each document is associated with each topic
# only keep those pairs with prob >= threshold (0.15 by default)
def get_doc2topics(ldamodel, corpus, sources, titles, dates, months, weeks, threshold=0.15):
    """Build a long-format table of (document, topic) pairs with their metadata.

    Args:
        ldamodel: model that yields, for ``ldamodel[corpus]``, one iterable of
            ``(idx_topic, prob)`` pairs per document.
        corpus: corpus to run through the model.
        sources, titles, dates, months, weeks: per-document metadata, indexed by
            the document's position in ``corpus``.
        threshold: pairs with ``prob < threshold`` are dropped.

    Returns:
        DataFrame with columns ``idx_doc, idx_topic, prob, source, title, date,
        month, week`` -- one row per retained pair.
    """
    columns = ['idx_doc', 'idx_topic', 'prob', 'source', 'title', 'date', 'month', 'week']
    data = []
    for idx_doc, rows in enumerate(ldamodel[corpus]):
        for idx_topic, prob in rows:
            # Honour the caller's threshold rather than a hard-coded 0.15.
            if prob < threshold:
                continue
            data.append([idx_doc, idx_topic, prob, sources[idx_doc], titles[idx_doc],
                         dates[idx_doc], months[idx_doc], weeks[idx_doc]])
    return pd.DataFrame(data, columns=columns)

# df_doc_topic_all is meant to hold every (document, topic) pair, hence a
# threshold of 0; the previous value of 1 would only keep pairs with prob >= 1.
df_doc_topic_all = get_doc2topics(lda_model, corpus, sources, titles, dates, months, weeks, 0)
# df_doc_topic uses the function's default threshold.
df_doc_topic = get_doc2topics(lda_model, corpus, sources, titles, dates, months, weeks)
df_doc_topic.head()

Unnamed: 0,idx_doc,idx_topic,prob,source,title,date,month,week
0,0,26,0.243999,fox,Celebrity livestreams you can watch while soci...,2020-04-05 23:55:51+00:00,4,14
1,1,27,0.304364,cnn,US stock futures rise and oil drops 8.5% after...,2020-04-05 23:49:03+00:00,4,14
2,2,2,0.15431,cnn,Washington governor slams Trump administration...,2020-04-05 23:49:03+00:00,4,14
3,5,4,0.289828,nyp,Mets might be MLB’s biggest loser in the coron...,2020-04-05 23:42:10+00:00,4,14
4,6,1,0.155861,nyp,Hasidic funerals flout social distancing rules...,2020-04-05 23:42:01+00:00,4,14


In [21]:
df_doc_topic.shape[0], df_doc_topic['idx_doc'].unique().shape

(31874, (30571,))

In [24]:
from collections import Counter

In [25]:
# count the # documents associated with each topic
Counter(df_doc_topic['idx_topic'].values.tolist())

Counter({26: 734,
         27: 1306,
         2: 371,
         4: 2647,
         1: 1224,
         8: 1087,
         11: 941,
         33: 1228,
         20: 673,
         10: 2035,
         24: 1616,
         6: 1139,
         5: 603,
         23: 1137,
         25: 594,
         13: 1336,
         12: 2296,
         16: 735,
         3: 741,
         18: 863,
         28: 575,
         34: 1456,
         30: 566,
         14: 625,
         0: 794,
         15: 59,
         38: 129,
         29: 774,
         7: 426,
         9: 451,
         19: 541,
         36: 115,
         22: 455,
         31: 321,
         37: 380,
         32: 141,
         17: 227,
         35: 458,
         21: 75})

In [29]:
# Create a validation set for fine-tuning the language model:
# articles that are not assigned to any topic (absent from df_doc_topic) go in this set.
# NOTE(review): assumes df_news['idx'] uses the same numbering as idx_doc
# (the document's position in the corpus) -- confirm.
idxes_doc_val = set(df_news['idx'].unique().tolist()) - set(df_doc_topic['idx_doc'].unique().tolist())

In [31]:
# count how many documents in each month
Counter(df_doc_topic['month'].values.tolist())

Counter({4: 8416, 3: 6896, 2: 1061, 1: 355, 5: 5973, 7: 4997, 6: 4176})

In [32]:
# count how many documents from each source
Counter(df_doc_topic['source'].values.tolist())

Counter({'fox': 10076,
         'cnn': 6272,
         'nyp': 6058,
         'nyt': 3406,
         'breit': 3558,
         'huff': 2504})

In [83]:
# save the data
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

df_ntopics_coh.to_excel('../data_processing/data/coh_values.xlsx', index=False)
df_doc_topic.to_csv('../data_processing/data/df_doc_topic.csv', index=False)
df_topic.to_csv('../data_processing/data/df_topics.csv', index=False)
_dump_pickle(topic_ranks, '../data_processing/data/topic_ranks.pkl')
_dump_pickle(lda_model, '../data_processing/data/lda_model.pkl')
_dump_pickle((lda_models, coh_values, max_idx), '../data_processing/data/lda_models.pkl')
_dump_pickle(idxes_doc_val, '../data_processing/data/idxes_val.pkl')
# pickle.dump(idxes_doc_val2, open('../data_processing/data/idxes_val2.pkl', 'wb'))
_dump_pickle(lda_model.show_topics(num_topics=-1, num_words=10, formatted=False),
             '../data_processing/data/topics.pkl')
# NOTE(review): `topic_labels` is not defined in any cell visible in this
# notebook; on a fresh kernel this line raises NameError -- define it (or load
# it) before this cell, or drop the line.
_dump_pickle(topic_labels, '../data_processing/data/topic_labels.pkl')

## Prepare for document-contextualized topic embeddings
#### Create a mask for each document. This mask indicates how much each token of the document will contribute to the document-contextualized topic embedding (the weights). The length of the mask is equal to the number of tokens in the document (tokenized by BERT). Each element in the mask indicates the weight of the corresponding token.

In [None]:
# tokenize the documents by BERT
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Close the pickle file as soon as it has been read.
with open('../data_processing/data/texts_processed_bert.pkl', 'rb') as f:
    texts_bert = pickle.load(f)
# Each document's tokens are joined into one space-separated string before
# encoding; padding/truncation make every encoding the same length.
text_encodings = tokenizer(pd.Series(texts_bert).apply(lambda x: ' '.join(x)).tolist(), padding=True, 
                           truncation=True)['input_ids']
text_encodings = pd.Series(text_encodings)

In [191]:
len(text_encodings[0])

512

In [243]:
text_encodings[0]

[101,
 15883,
 1035,
 2007,
 2545,
 24667,
 2078,
 2360,
 2016,
 1055,
 3046,
 2000,
 2022,
 5776,
 2007,
 2155,
 2076,
 21887,
 23350,
 24209,
 20486,
 10196,
 15798,
 2045,
 2022,
 7564,
 1997,
 8958,
 3268,
 25379,
 2000,
 2562,
 5470,
 11494,
 3251,
 2027,
 2128,
 2298,
 2005,
 2189,
 2831,
 2265,
 2030,
 2242,
 2842,
 3542,
 2100,
 1035,
 16123,
 2022,
 3677,
 1037,
 3679,
 2831,
 2265,
 2006,
 16021,
 23091,
 3444,
 8958,
 4113,
 6302,
 2011,
 17229,
 5754,
 7318,
 9581,
 3351,
 2182,
 1055,
 1037,
 2298,
 2012,
 1996,
 3268,
 25379,
 2250,
 2076,
 1996,
 21887,
 23350,
 24209,
 20486,
 10196,
 4908,
 1055,
 2444,
 2012,
 2188,
 2186,
 2029,
 2064,
 2022,
 5460,
 2006,
 9130,
 3073,
 5470,
 2007,
 2444,
 3371,
 2275,
 4685,
 2011,
 1996,
 2066,
 1997,
 6498,
 1035,
 24665,
 16429,
 2319,
 10294,
 14855,
 5397,
 25698,
 8183,
 5558,
 1998,
 2062,
 3582,
 2198,
 14687,
 2006,
 16021,
 23091,
 2000,
 2131,
 10651,
 2006,
 2010,
 2831,
 2265,
 2783,
 6888,
 3782,
 6676,
 3677,
 1037,

In [241]:
def search_in_list(list1, list2):
    '''
    Find every occurrence of the token sequence ``list1`` inside ``list2``.

    Used to locate a topic keyword (a list of token ids) in a tokenized
    document (a longer list of token ids).

    Returns a flat list with the positions in ``list2`` covered by each match,
    in order of occurrence. Overlapping matches each contribute their own
    positions, so an index can appear more than once. An empty ``list1``
    matches nothing and yields ``[]``.
    '''
    width = len(list1)
    if width == 0:
        # Nothing to search for; avoids indexing into an empty pattern.
        return []
    idxes = []
    # Only start positions that leave room for a full-width window are tried.
    for begin in range(len(list2) - width + 1):
        if list2[begin:begin + width] == list1:
            idxes.extend(range(begin, begin + width))
    return idxes

# One mask row per document; 512 columns matches the encoded length shown for
# text_encodings[0] above.
topic_masks = np.zeros((text_encodings.shape[0], 512))  # (n_docs, 512)
# Walk over every topic, using its top-50 keywords and their probabilities.
for topic in lda_model.show_topics(num_topics=-1, num_words=50, formatted=False):
    idx = topic[0]
    topic_stems = [each[0] for each in topic[1]]
    stem_probs = [each[1] for each in topic[1]]
    stem_probs = np.array(stem_probs)
    stem_probs /= stem_probs.sum()  # normalize the weights of the top-n keywords
    stem_encodings = tokenizer(topic_stems, truncation=True)['input_ids'] # encode topic keywords using BERT
    doc_idxes = df_doc_topic[df_doc_topic['idx_topic'] == idx]['idx_doc'].to_list() # find the documents associated with the topic
    doc_encodings = text_encodings[doc_idxes]
    for doc_idx in doc_idxes:
        doc_encoding = doc_encodings[doc_idx]
        # Row view into topic_masks: the in-place updates below write straight
        # into the big array, nothing is assigned back.
        # NOTE(review): a document associated with several topics accumulates
        # weight from each of them in this same row and is renormalized each
        # time -- confirm this mixing is intended.
        topic_mask = topic_masks[doc_idx]
        for stem_input_ids, stem_prob in zip(stem_encodings, stem_probs):
            # [1:-1] drops the first and last ids of the keyword encoding
            # (presumably BERT's [CLS]/[SEP] special tokens).
            idxes = search_in_list(stem_input_ids[1:-1], doc_encoding) # search each keyword in the document
            if idxes:
                # if found multiple occurrences of the keywords, 
                # then the weight of each occurrence will be devalued
                topic_mask[idxes] += stem_prob / len(idxes) 
        # Skip normalization when no keyword was found, to avoid dividing by zero.
        if topic_mask.mean() > 0:
            topic_mask /= topic_mask.sum()   # normalize the mask

In [240]:
doc_encoding

[101,
 1996,
 1057,
 1055,
 9068,
 1998,
 7521,
 2326,
 15529,
 2483,
 9499,
 2049,
 2976,
 7904,
 2008,
 6519,
 23743,
 5603,
 2022,
 2272,
 2251,
 2138,
 1997,
 1037,
 2502,
 5166,
 1035,
 2460,
 13976,
 3426,
 2011,
 1996,
 21887,
 23350,
 6090,
 3207,
 7712,
 2149,
 7521,
 4034,
 2360,
 2009,
 2453,
 2025,
 2191,
 26854,
 2302,
 7740,
 2393,
 15529,
 2483,
 2134,
 1035,
 1056,
 3073,
 1996,
 2193,
 1997,
 7904,
 7461,
 2021,
 4419,
 2739,
 2022,
 2425,
 2008,
 4034,
 3003,
 12367,
 2070,
 3095,
 2008,
 1996,
 6519,
 23743,
 5603,
 2071,
 4254,
 2055,
 7904,
 2030,
 2062,
 2084,
 2048,
 1035,
 2353,
 1997,
 1996,
 14877,
 15802,
 2000,
 1037,
 3120,
 1035,
 5220,
 2007,
 1996,
 3116,
 3053,
 1057,
 1055,
 9068,
 1998,
 7521,
 2326,
 15529,
 2483,
 7904,
 2022,
 2227,
 6519,
 23743,
 5603,
 2149,
 7521,
 2436,
 2485,
 2127,
 2258,
 2058,
 21887,
 23350,
 3571,
 15529,
 2483,
 2880,
 2022,
 2145,
 2907,
 2041,
 3246,
 2005,
 7740,
 2895,
 2000,
 4652,
 1996,
 6519,
 23743,
 5603,
 234

In [242]:
topic_masks[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.23186858, 0.        , 0.        , 0.        , 0.01271488,
       0.01271488, 0.01271488, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.0146984 , 0.        , 0.        ,
       0.        , 0.        , 0.02120842, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.06423558,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.01436782, 0.01436782, 0.        , 0.0146984 ,
       0.        , 0.05157156, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [244]:
# Persist the masks as nested Python lists; the context manager closes the file.
with open('../data_processing/data/topic_masks.pkl', 'wb') as f:
    pickle.dump(topic_masks.tolist(), f)