In [1]:
import nltk
from nltk.corpus import wordnet
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import string
import random
import re
import gensim
import pickle
from nltk import word_tokenize

from nltk.stem import WordNetLemmatizer
word_lemm = WordNetLemmatizer()

from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')

# nltk downloads
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

#initialize tqdm
tqdm.pandas()

DATA_DIR = '../../data/reddit/Article_data_2019/'

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kippy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kippy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kippy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/kippy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
df = pd.read_pickle(DATA_DIR + 'reddit_2019_06_15_with_article_text.pkl')
df = df.append(pd.read_pickle(DATA_DIR + 'reddit_2019jun16tojul1_articleurls_with_text.pkl'))

In [15]:
# Unstack article data dictionary into columns
df = pd.concat( [df.drop(['article_data'],axis=1), df['article_data'].progress_apply(pd.Series)]
                      ,axis = 1)
df.drop(0, axis = 1, inplace=True)


def df_column_uniquify(df):
    df_columns = df.columns
    new_columns = []
    for item in df_columns:
        counter = 0
        newitem = item
        while newitem in new_columns:
            counter += 1
            newitem = "{}_{}".format(item, counter)
        new_columns.append(newitem)
    df.columns = new_columns
    return df

df = df_column_uniquify(df)
df = df[(~df['title_1'].isnull()) & (~df['text'].isnull())] #remove nulls
df = df.drop_duplicates(subset=['url']) #okay to drop duplicates since we have text for all articles

HBox(children=(IntProgress(value=0, max=141539), HTML(value='')))




In [75]:
def convert_to_valid_pos(x):
    """Converts the pos tag returned by the nltk.pos_tag function to a format accepted by wordNetLemmatizer"""
    x = x[0].upper() # extract first character of the POS tag
    
    # define mapping for the tag to correct tag.
    tag_dict = {"J": wordnet.ADJ,
               "N": wordnet.NOUN,
               "R": wordnet.ADV,
               "V": wordnet.VERB}
    
    return tag_dict.get(x, wordnet.NOUN)

def get_lemma(sentence):
    """Given a sentence, derives the lemmatized version of the sentence"""
    pos_tagged_text = nltk.pos_tag(word_tokenize(sentence))
    
    lemm_list = []

    for (word, tag) in pos_tagged_text:
        lemm_list.append(word_lemm.lemmatize(word, pos = convert_to_valid_pos(tag)))
    
    return lemm_list

def prepare_text_for_lda(text):
    """Lemmatizes text, removes stopwords and short words from given text."""
    lemm_list = get_lemma(text)
    
    tokens = [i for i in lemm_list if i not in en_stopwords]
    
    tokens = [token.lower() for token in tokens if len(token) > 3]
    
    return tokens

In [76]:
df['title_lemmas'] = df['title_1'].progress_map(prepare_text_for_lda)
df['text_lemmas'] = df['text'].progress_map(prepare_text_for_lda)

HBox(children=(IntProgress(value=0, max=19423), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19423), HTML(value='')))

In [77]:
candidates = ["SANDERS","DELANEY","WARREN","HARRIS","GILLIBRAND","O'ROURKE","KLOBUCHAR","BOOKER",
    "BUTTIGIEG","GABBARD","YANG","INSLEE","HICKENLOOPER","WILLIAMSON","TULSI","CASTRO","BIDEN", "BERNIE",
    "BETO"]
candidates = set([x.lower() for x in candidates])

def intersection(lemma):
    lemma = [token.lower() for token in lemma]
    return candidates.intersection(lemma)

df['candidate_title'] = df['title_lemmas'].progress_map(intersection)
df['candidate_text'] = df['text_lemmas'].progress_map(intersection)

HBox(children=(IntProgress(value=0, max=19423), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19423), HTML(value='')))

In [78]:
!mkdir {DATA_DIR + 'LDA/'}
filename = DATA_DIR + 'LDA/df_with_candidate_intersections.pkl'
df.to_pickle(filename)
# df = pd.read_pickle(filename)

mkdir: ../../data/reddit/Article_data_2019/LDA/: File exists


In [79]:
### Let's do a run on articles about candidates only

df = df[df.progress_apply(lambda x: len(x.candidate_text) != 0, axis=1)]

HBox(children=(IntProgress(value=0, max=19423), HTML(value='')))

In [1]:
def save_corpus(field, directory, data_name):
    dictionary = gensim.corpora.Dictionary(df[field])
    corpus = list(df[field].progress_map(dictionary.doc2bow))
    pickle.dump(corpus, open(directory + data_name + '_corpus.pkl', 'wb'))
    dictionary.save(directory + data_name + '_dictionary.gensim')

save_corpus('title_lemmas', DATA_DIR + 'LDA/', 'article_title_2019_candidate_only')
save_corpus('text_lemmas', DATA_DIR + 'LDA/', 'article_text_2019_candidate_only')

NameError: name 'DATA_DIR' is not defined

In [2]:
dictionary = gensim.corpora.Dictionary.load(DATA_DIR + 'LDA/article_text_2019_candidate_only_dictionary.gensim')
corpus = pickle.load(open(DATA_DIR + 'LDA/article_text_2019_candidate_only' + '_corpus.pkl', 'rb'))

2019-07-20 09:09:45,972 : INFO : loading Dictionary object from ../../data/reddit/Article_data_2019/LDA/article_text_2019_candidate_only_dictionary.gensim
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-07-20 09:09:46,047 : INFO : loaded ../../data/reddit/Article_data_2019/LDA/article_text_2019_candidate_only_dictionary.gensim


In [3]:
print(dictionary)

Dictionary(147039 unique tokens: ['authorize', 'bernie', 'candidate', 'committee', 'contributions']...)


In [4]:
Topic_list = []
num_topics = 100
passes = 10
iterations = 400
eval_every = None
chunksize = 2000
# https://radimrehurek.com/gensim/models/ldamulticore.html
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus,
                                                   num_topics = num_topics, 
                                                   id2word = dictionary, 
                                                   passes=passes,
                                                   chunksize = chunksize,
                                                   eval_every = eval_every,
                                                   iterations = iterations,
                                                   workers = 3) #set this to cores - 1


2019-07-20 09:09:54,250 : INFO : using symmetric alpha at 0.01
2019-07-20 09:09:54,252 : INFO : using symmetric eta at 0.01
2019-07-20 09:09:54,279 : INFO : using serial LDA version on this node
2019-07-20 09:09:55,763 : INFO : running online LDA training, 100 topics, 10 passes over the supplied corpus of 19423 documents, updating every 6000 documents, evaluating every ~0 documents, iterating 400x with a convergence threshold of 0.001000
2019-07-20 09:09:55,764 : INFO : training LDA model using 3 processes
2019-07-20 09:09:55,843 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/19423, outstanding queue size 1
2019-07-20 09:09:56,269 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/19423, outstanding queue size 2
2019-07-20 09:09:56,278 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/19423, outstanding queue size 3
2019-07-20 09:09:56,290 : INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/19423, outst

2019-07-20 09:15:47,343 : INFO : merging changes from 6000 documents into a model of 19423 documents
2019-07-20 09:15:48,639 : INFO : topic #2 (0.010): 0.012*"climate" + 0.007*"people" + 0.007*"green" + 0.007*"change" + 0.006*"deal" + 0.006*"would" + 0.006*"buttigieg" + 0.005*"make" + 0.004*"energy" + 0.004*"year"
2019-07-20 09:15:48,641 : INFO : topic #51 (0.010): 0.030*"harris" + 0.014*"abortion" + 0.008*"california" + 0.008*"woman" + 0.006*"president" + 0.006*"court" + 0.006*"attorney" + 0.006*"kamala" + 0.006*"francisco" + 0.006*"would"
2019-07-20 09:15:48,644 : INFO : topic #86 (0.010): 0.058*"warren" + 0.008*"bankruptcy" + 0.005*"make" + 0.005*"elizabeth" + 0.005*"trump" + 0.004*"american" + 0.004*"campaign" + 0.004*"would" + 0.004*"president" + 0.004*"time"
2019-07-20 09:15:48,647 : INFO : topic #59 (0.010): 0.011*"biden" + 0.009*"barr" + 0.008*"trump" + 0.007*"woman" + 0.006*"mueller" + 0.005*"swalwell" + 0.005*"also" + 0.005*"democratic" + 0.005*"campaign" + 0.005*"call"
2019-

2019-07-20 09:17:58,477 : INFO : topic #73 (0.010): 0.007*"facility" + 0.005*"child" + 0.005*"trump" + 0.004*"group" + 0.004*"people" + 0.004*"would" + 0.004*"include" + 0.004*"member" + 0.003*"house" + 0.003*"tell"
2019-07-20 09:17:58,479 : INFO : topic #35 (0.010): 0.030*"police" + 0.027*"prison" + 0.026*"crime" + 0.023*"criminal" + 0.021*"justice" + 0.019*"officer" + 0.012*"incarceration" + 0.011*"reform" + 0.010*"sentence" + 0.010*"bill"
2019-07-20 09:17:58,504 : INFO : topic diff=0.474933, rho=0.280480
2019-07-20 09:18:05,088 : INFO : merging changes from 1423 documents into a model of 19423 documents
2019-07-20 09:18:05,882 : INFO : topic #6 (0.010): 0.110*"williamson" + 0.052*"marianne" + 0.027*"young" + 0.026*"people" + 0.020*"food" + 0.019*"love" + 0.017*"white" + 0.011*"woman" + 0.011*"team" + 0.009*"spiritual"
2019-07-20 09:18:05,884 : INFO : topic #87 (0.010): 0.050*"israel" + 0.014*"trump" + 0.012*"palestinian" + 0.012*"jewish" + 0.011*"israeli" + 0.010*"palestinians" + 0.

2019-07-20 09:19:32,497 : INFO : PROGRESS: pass 4, dispatched chunk #0 = documents up to #2000/19423, outstanding queue size 1
2019-07-20 09:19:32,825 : INFO : PROGRESS: pass 4, dispatched chunk #1 = documents up to #4000/19423, outstanding queue size 2
2019-07-20 09:19:32,826 : INFO : PROGRESS: pass 4, dispatched chunk #2 = documents up to #6000/19423, outstanding queue size 3
2019-07-20 09:19:32,827 : INFO : PROGRESS: pass 4, dispatched chunk #3 = documents up to #8000/19423, outstanding queue size 4
2019-07-20 09:19:32,828 : INFO : PROGRESS: pass 4, dispatched chunk #4 = documents up to #10000/19423, outstanding queue size 5
2019-07-20 09:19:32,829 : INFO : PROGRESS: pass 4, dispatched chunk #5 = documents up to #12000/19423, outstanding queue size 6
2019-07-20 09:19:33,056 : INFO : PROGRESS: pass 4, dispatched chunk #6 = documents up to #14000/19423, outstanding queue size 7
2019-07-20 09:19:33,649 : INFO : PROGRESS: pass 4, dispatched chunk #7 = documents up to #16000/19423, outst

2019-07-20 09:21:32,461 : INFO : topic #54 (0.010): 0.014*"logan" + 0.012*"politico" + 0.006*"official" + 0.006*"tlaib" + 0.005*"campaign" + 0.005*"sanders" + 0.005*"sign" + 0.005*"people" + 0.005*"staff" + 0.004*"state"
2019-07-20 09:21:32,463 : INFO : topic #1 (0.010): 0.069*"trump" + 0.031*"white" + 0.029*"press" + 0.028*"house" + 0.020*"president" + 0.019*"sanders" + 0.017*"secretary" + 0.012*"sarah" + 0.010*"grisham" + 0.010*"johnson"
2019-07-20 09:21:32,465 : INFO : topic #47 (0.010): 0.028*"attack" + 0.017*"iran" + 0.016*"muslim" + 0.016*"terrorist" + 0.015*"drone" + 0.015*"antifa" + 0.014*"studies" + 0.014*"iranian" + 0.013*"shoot" + 0.009*"white"
2019-07-20 09:21:32,493 : INFO : topic diff=0.371675, rho=0.252285
2019-07-20 09:21:57,482 : INFO : merging changes from 6000 documents into a model of 19423 documents
2019-07-20 09:21:58,938 : INFO : topic #7 (0.010): 0.029*"trump" + 0.019*"house" + 0.014*"democrats" + 0.011*"more" + 0.010*"senate" + 0.009*"president" + 0.008*"commit

2019-07-20 09:23:29,686 : INFO : topic diff=0.267232, rho=0.244620
2019-07-20 09:23:53,634 : INFO : merging changes from 6000 documents into a model of 19423 documents
2019-07-20 09:23:54,582 : INFO : topic #88 (0.010): 0.025*"democrats" + 0.020*"vote" + 0.017*"house" + 0.016*"senate" + 0.014*"democratic" + 0.014*"bill" + 0.013*"republicans" + 0.011*"would" + 0.010*"republican" + 0.009*"party"
2019-07-20 09:23:54,585 : INFO : topic #94 (0.010): 0.066*"brown" + 0.046*"golf" + 0.035*"club" + 0.011*"lecturer" + 0.011*"course" + 0.010*"jones" + 0.010*"andrews" + 0.010*"obama" + 0.009*"base" + 0.008*"golftus"
2019-07-20 09:23:54,587 : INFO : topic #59 (0.010): 0.029*"woman" + 0.023*"swalwell" + 0.013*"propublica" + 0.011*"illustration" + 0.009*"pressley" + 0.008*"march" + 0.008*"right" + 0.007*"10-15" + 0.007*"women" + 0.006*"9,500"
2019-07-20 09:23:54,589 : INFO : topic #86 (0.010): 0.166*"warren" + 0.030*"elizabeth" + 0.012*"massachusetts" + 0.009*"senator" + 0.008*"native" + 0.008*"ameri

2019-07-20 09:25:31,784 : INFO : topic #93 (0.010): 0.043*"north" + 0.022*"korea" + 0.015*"korean" + 0.015*"china" + 0.013*"border" + 0.013*"trump" + 0.013*"p.m." + 0.011*"sunday" + 0.009*"side" + 0.009*"meet"
2019-07-20 09:25:31,786 : INFO : topic #50 (0.010): 0.024*"health" + 0.022*"medicare" + 0.021*"2019" + 0.018*"care" + 0.012*"group" + 0.010*"organization" + 0.010*"american" + 0.008*"association" + 0.008*"industry" + 0.007*"lobbyist"
2019-07-20 09:25:31,788 : INFO : topic #70 (0.010): 0.060*"state" + 0.021*"vote" + 0.010*"electoral" + 0.010*"california" + 0.008*"election" + 0.008*"voter" + 0.007*"would" + 0.007*"college" + 0.007*"candidate" + 0.007*"presidential"
2019-07-20 09:25:31,808 : INFO : topic diff=0.205564, rho=0.237614
2019-07-20 09:25:31,834 : INFO : PROGRESS: pass 8, dispatched chunk #0 = documents up to #2000/19423, outstanding queue size 1
2019-07-20 09:25:32,058 : INFO : PROGRESS: pass 8, dispatched chunk #1 = documents up to #4000/19423, outstanding queue size 2
2

2019-07-20 09:27:21,274 : INFO : PROGRESS: pass 9, dispatched chunk #9 = documents up to #19423/19423, outstanding queue size 9
2019-07-20 09:27:25,543 : INFO : merging changes from 6000 documents into a model of 19423 documents
2019-07-20 09:27:27,033 : INFO : topic #73 (0.010): 0.012*"group" + 0.010*"member" + 0.007*"facility" + 0.006*"include" + 0.005*"post" + 0.005*"write" + 0.005*"clint" + 0.005*"homestead" + 0.004*"camp" + 0.004*"center"
2019-07-20 09:27:27,036 : INFO : topic #55 (0.010): 0.175*"booker" + 0.059*"cory" + 0.029*"jersey" + 0.022*"housing" + 0.014*"senator" + 0.014*"home" + 0.013*"newark" + 0.010*"mayor" + 0.009*"city" + 0.007*"latinx"
2019-07-20 09:27:27,040 : INFO : topic #74 (0.010): 0.011*"sleep" + 0.009*"chavez" + 0.009*"party" + 0.008*"populist" + 0.008*"social" + 0.007*"populism" + 0.007*"drug" + 0.006*"right" + 0.006*"democracy" + 0.005*"have"
2019-07-20 09:27:27,043 : INFO : topic #65 (0.010): 0.049*"reuters" + 0.038*"trump" + 0.025*"president" + 0.024*"phot

In [6]:
!mkdir {DATA_DIR + 'LDA/models'}
ldamodel.save(DATA_DIR + '/LDA/models/{}_model_{}_{}_len4.gensim'.format(num_topics, passes, 'article_text_2019_candidate_only'))



mkdir: ../../data/reddit/Article_data_2019/LDA/models: File exists


2019-07-20 10:46:19,091 : INFO : saving LdaState object under ../../data/reddit/Article_data_2019//LDA/models/100_model_10_article_text_2019_candidate_only_len4.gensim.state, separately None
2019-07-20 10:46:19,093 : INFO : storing np array 'sstats' to ../../data/reddit/Article_data_2019//LDA/models/100_model_10_article_text_2019_candidate_only_len4.gensim.state.sstats.npy
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-07-20 10:46:19,342 : INFO : saved ../../data/reddit/Article_data_2019//LDA/models/100_model_10_article_text_2019_candidate_only_len4.gensim.state
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-07-20 10:46:19,445 : INFO : saving LdaMulticore object under ../../data/reddit/Article_data_2019//LDA/models/100_model_10_article_text_2019_candidate_only_len4.gensim, separately ['expElogbeta', 'sstats']
2019-07-20 10:46:19,446 : INFO : storing np array 'expElogbeta' to ../../data/reddit/Article_data_2019//LDA/models/100_model_10_a

In [9]:
import csv

topics = ldamodel.print_topics(num_words = 20, num_topics=100)
for topic in topics:
    Topic_list.append(topic[1])
# save the topics for later use.
topic_df = pd.DataFrame({'topics':Topic_list})

def clean_topic_words(x):
    """Clean topic words as output by the algorithm"""
    clean_topic = re.findall("\".*?\"", x)
    clean_topic = [s.replace('\"', '') for s in clean_topic]
    return clean_topic

topic_df['topics'] = topic_df['topics'].map(clean_topic_words)

!mkdir {DATA_DIR + 'LDA/topics'}
topic_df.to_csv(DATA_DIR + "/LDA/topics/Topics_List_{}_model_{}_{}.csv".format(num_topics, passes, 'article_text_2019_candidate_only'),
                index=False, header=False, 
                quoting=csv.QUOTE_NONE, sep = '\n', escapechar='\\') # write out for later use

2019-07-21 08:37:14,041 : INFO : topic #0 (0.010): 0.051*"trump" + 0.040*"more" + 0.038*"2020" + 0.036*"biden" + 0.018*"slam" + 0.016*"project" + 0.013*"democrats" + 0.012*"july" + 0.012*"democratic" + 0.011*"advertisement" + 0.010*"reason" + 0.010*"speech" + 0.010*"optimism" + 0.010*"hill" + 0.010*"reveal" + 0.009*"word" + 0.009*"beyond" + 0.009*"what" + 0.009*"president" + 0.009*"report"
2019-07-21 08:37:14,043 : INFO : topic #1 (0.010): 0.084*"trump" + 0.038*"white" + 0.035*"house" + 0.030*"press" + 0.029*"president" + 0.025*"sanders" + 0.018*"secretary" + 0.012*"sarah" + 0.010*"reporter" + 0.009*"tweet" + 0.008*"tell" + 0.008*"grisham" + 0.008*"first" + 0.008*"donald" + 0.008*"conway" + 0.008*"official" + 0.007*"huckabee" + 0.007*"news" + 0.007*"administration" + 0.006*"last"
2019-07-21 08:37:14,046 : INFO : topic #2 (0.010): 0.036*"climate" + 0.019*"change" + 0.015*"green" + 0.013*"deal" + 0.012*"energy" + 0.009*"plan" + 0.009*"fuel" + 0.008*"fossil" + 0.007*"would" + 0.007*"polic

2019-07-21 08:37:14,092 : INFO : topic #21 (0.010): 0.052*"impeachment" + 0.017*"trump" + 0.017*"president" + 0.013*"democrats" + 0.013*"pelosi" + 0.013*"house" + 0.013*"congress" + 0.012*"proceeding" + 0.012*"would" + 0.012*"impeach" + 0.011*"question" + 0.011*"call" + 0.007*"2020" + 0.007*"report" + 0.006*"inquiry" + 0.006*"census" + 0.006*"begin" + 0.006*"democratic" + 0.005*"nancy" + 0.005*"also"
2019-07-21 08:37:14,094 : INFO : topic #22 (0.010): 0.008*"world" + 0.007*"american" + 0.007*"political" + 0.007*"power" + 0.006*"even" + 0.006*"america" + 0.006*"government" + 0.006*"people" + 0.006*"economic" + 0.005*"social" + 0.005*"capitalism" + 0.005*"like" + 0.005*"country" + 0.004*"socialism" + 0.004*"right" + 0.004*"democracy" + 0.004*"medium" + 0.004*"society" + 0.004*"united" + 0.004*"this"
2019-07-21 08:37:14,095 : INFO : topic #23 (0.010): 0.030*"u.s." + 0.024*"china" + 0.019*"trade" + 0.015*"iran" + 0.015*"trump" + 0.012*"tariff" + 0.010*"chinese" + 0.010*"administration" + 0

2019-07-21 08:37:14,175 : INFO : topic #42 (0.010): 0.079*"rourke" + 0.043*"beto" + 0.043*"o'rourke" + 0.040*"texas" + 0.017*"paso" + 0.011*"city" + 0.011*"cruz" + 0.010*"campaign" + 0.007*"congressman" + 0.006*"senate" + 0.005*"race" + 0.004*"member" + 0.004*"group" + 0.004*"presidential" + 0.004*"tribune" + 0.004*"year" + 0.004*"work" + 0.004*"2018" + 0.004*"cornyn" + 0.004*"democratic"
2019-07-21 08:37:14,177 : INFO : topic #43 (0.010): 0.048*"2019" + 0.019*"june" + 0.016*"afp/getty" + 0.008*"world" + 0.007*"april" + 0.006*"president" + 0.006*"protester" + 0.006*"reuters" + 0.006*"hold" + 0.006*"month" + 0.006*"first" + 0.006*"people" + 0.005*"minister" + 0.005*"india" + 0.005*"protest" + 0.005*"outside" + 0.005*"ramadan" + 0.005*"election" + 0.004*"state" + 0.004*"pride"
2019-07-21 08:37:14,181 : INFO : topic #44 (0.010): 0.073*"drug" + 0.043*"price" + 0.021*"prescription" + 0.014*"treasury" + 0.012*"secretary" + 0.011*"mnuchin" + 0.010*"pharmaceutical" + 0.008*"warmbier" + 0.008*"

2019-07-21 08:37:14,227 : INFO : topic #63 (0.010): 0.028*"sanders" + 0.025*"bernie" + 0.017*"ember" + 0.016*"trump" + 0.008*"people" + 0.008*"supporter" + 0.007*"candidate" + 0.007*"democratic" + 0.006*"baier" + 0.006*"donald" + 0.006*"think" + 0.006*"election" + 0.005*"many" + 0.004*"article" + 0.004*"moveon" + 0.004*"2020" + 0.004*"2016" + 0.004*"presidential" + 0.004*"would" + 0.004*"even"
2019-07-21 08:37:14,231 : INFO : topic #64 (0.010): 0.031*"puerto" + 0.025*"rico" + 0.012*"vaccine" + 0.011*"state" + 0.011*"federal" + 0.011*"island" + 0.009*"bottoms" + 0.007*"department" + 0.007*"hurricane" + 0.006*"rubio" + 0.006*"measles" + 0.006*"bill" + 0.006*"vaccination" + 0.006*"exemption" + 0.005*"disaster" + 0.005*"daca" + 0.005*"people" + 0.005*"atlanta" + 0.005*"public" + 0.005*"child"
2019-07-21 08:37:14,234 : INFO : topic #65 (0.010): 0.051*"reuters" + 0.038*"trump" + 0.026*"president" + 0.023*"photographer" + 0.010*"house" + 0.009*"white" + 0.009*"visit" + 0.008*"take" + 0.007*"t

2019-07-21 08:37:14,275 : INFO : topic #84 (0.010): 0.046*"abrams" + 0.021*"georgia" + 0.019*"stacey" + 0.008*"gubernatorial" + 0.006*"gillum" + 0.006*"year" + 0.006*"future" + 0.005*"group" + 0.005*"democrats" + 0.005*"governor" + 0.004*"democratic" + 0.004*"write" + 0.004*"last" + 0.004*"kemp" + 0.004*"make" + 0.004*"lose" + 0.004*"ticket" + 0.004*"candidate" + 0.003*"campaign" + 0.003*"perdue"
2019-07-21 08:37:14,277 : INFO : topic #85 (0.010): 0.014*"trump" + 0.009*"times" + 0.007*"tell" + 0.007*"business" + 0.007*"year" + 0.006*"story" + 0.006*"york" + 0.006*"father" + 0.006*"work" + 0.005*"family" + 0.004*"ivanka" + 0.004*"time" + 0.004*"would" + 0.004*"write" + 0.004*"company" + 0.004*"report" + 0.004*"house" + 0.003*"make" + 0.003*"know" + 0.003*"take"
2019-07-21 08:37:14,281 : INFO : topic #86 (0.010): 0.176*"warren" + 0.036*"elizabeth" + 0.013*"massachusetts" + 0.011*"senator" + 0.009*"american" + 0.008*"native" + 0.007*"plan" + 0.006*"policy" + 0.006*"presidential" + 0.006*"

mkdir: ../../data/reddit/Article_data_2019/LDA/topics: File exists


In [7]:
vis

In [8]:
ldamodel.print_topics()

2019-07-21 08:34:54,017 : INFO : topic #27 (0.010): 0.020*"jean" + 0.018*"think" + 0.017*"trump" + 0.015*"carroll" + 0.014*"tell" + 0.013*"michael" + 0.013*"like" + 0.012*"know" + 0.012*"story" + 0.012*"barbaro"
2019-07-21 08:34:54,020 : INFO : topic #38 (0.010): 0.089*"biden" + 0.023*"president" + 0.017*"vice" + 0.017*"former" + 0.015*"harris" + 0.011*"democratic" + 0.009*"campaign" + 0.009*"woman" + 0.008*"candidate" + 0.007*"make"
2019-07-21 08:34:54,022 : INFO : topic #0 (0.010): 0.051*"trump" + 0.040*"more" + 0.038*"2020" + 0.036*"biden" + 0.018*"slam" + 0.016*"project" + 0.013*"democrats" + 0.012*"july" + 0.012*"democratic" + 0.011*"advertisement"
2019-07-21 08:34:54,024 : INFO : topic #11 (0.010): 0.088*"news" + 0.024*"hall" + 0.022*"town" + 0.021*"network" + 0.020*"host" + 0.014*"msnbc" + 0.012*"carlson" + 0.011*"president" + 0.010*"medium" + 0.010*"make"
2019-07-21 08:34:54,026 : INFO : topic #61 (0.010): 0.021*"trump" + 0.021*"president" + 0.014*"call" + 0.014*"people" + 0.01

[(27,
  '0.020*"jean" + 0.018*"think" + 0.017*"trump" + 0.015*"carroll" + 0.014*"tell" + 0.013*"michael" + 0.013*"like" + 0.012*"know" + 0.012*"story" + 0.012*"barbaro"'),
 (38,
  '0.089*"biden" + 0.023*"president" + 0.017*"vice" + 0.017*"former" + 0.015*"harris" + 0.011*"democratic" + 0.009*"campaign" + 0.009*"woman" + 0.008*"candidate" + 0.007*"make"'),
 (0,
  '0.051*"trump" + 0.040*"more" + 0.038*"2020" + 0.036*"biden" + 0.018*"slam" + 0.016*"project" + 0.013*"democrats" + 0.012*"july" + 0.012*"democratic" + 0.011*"advertisement"'),
 (11,
  '0.088*"news" + 0.024*"hall" + 0.022*"town" + 0.021*"network" + 0.020*"host" + 0.014*"msnbc" + 0.012*"carlson" + 0.011*"president" + 0.010*"medium" + 0.010*"make"'),
 (61,
  '0.021*"trump" + 0.021*"president" + 0.014*"call" + 0.014*"people" + 0.012*"racist" + 0.009*"biden" + 0.009*"tweet" + 0.008*"comment" + 0.008*"2019" + 0.008*"attack"'),
 (28,
  '0.040*"percent" + 0.035*"poll" + 0.031*"voter" + 0.019*"trump" + 0.018*"biden" + 0.016*"candidate"