In [2]:
import nltk
from nltk.corpus import wordnet
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import string
import random
import re
import gensim
import pickle
from nltk import word_tokenize

from nltk.stem import WordNetLemmatizer
word_lemm = WordNetLemmatizer()

from nltk.corpus import stopwords
en_stopwords = set(stopwords.words('english'))

# nltk downloads
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

#initialize tqdm
tqdm.pandas()

DATA_DIR = '../../data/reddit/Article_data_2019/'

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kippy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kippy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kippy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/kippy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
candidates = ["SANDERS","DELANEY","WARREN","HARRIS","GILLIBRAND","O'ROURKE","KLOBUCHAR","BOOKER",
    "BUTTIGIEG","GABBARD","YANG","INSLEE","HICKENLOOPER","WILLIAMSON","TULSI","CASTRO","BIDEN", "BERNIE",
    "BETO", "ROURKE", "BENNETT", "BULLOCK", "BLASIO", "TIM RYAN", "GRAVEL"]
candidates = set([x.lower() for x in candidates])

candidate_dict = {'klobuchar': 'klobuchar',
 'bennett': 'bennett',
 'booker': 'booker',
 'warren': 'warren',
 'castro': 'castro',
 'williamson': 'williamson',
 'gabbard': 'gabbard',
 'bernie': 'sanders',
 "o'rourke": "o_rourke",
 'bullock': 'bullock',
 'tim ryan': 'tim_ryan',
 'sanders': 'sanders',
 'biden': 'biden',
 'hickenlooper': 'hickenlooper',
 'blasio': 'de_blasio',
 'yang': 'yang',
 'delaney': 'delaney',
 'gillibrand': 'gillibrand',
 'beto': 'o_rourke',
 'harris': 'harris',
 'inslee': 'inslee',
 'rourke': 'o_rourke',
 'gravel': 'gravel',
 'tulsi': 'tulsi',
 'buttigieg': 'buttigieg'}

In [None]:
df = pd.read_pickle(DATA_DIR + 'reddit_2019_06_15_with_article_text.pkl')
df = df.append(pd.read_pickle(DATA_DIR + 'reddit_2019jun16tojul1_articleurls_with_text.pkl'))

In [None]:
# Unstack article data dictionary into columns
df = pd.concat( [df.drop(['article_data'],axis=1), df['article_data'].progress_apply(pd.Series)]
                      ,axis = 1)
df.drop(0, axis = 1, inplace=True)


def df_column_uniquify(df):
    df_columns = df.columns
    new_columns = []
    for item in df_columns:
        counter = 0
        newitem = item
        while newitem in new_columns:
            counter += 1
            newitem = "{}_{}".format(item, counter)
        new_columns.append(newitem)
    df.columns = new_columns
    return df

df = df_column_uniquify(df)
df = df[(~df['title_1'].isnull()) & (~df['text'].isnull())] #remove nulls
df = df.drop_duplicates(subset=['url']) #okay to drop duplicates since we have text for all articles

In [None]:
def convert_to_valid_pos(x):
    """Converts the pos tag returned by the nltk.pos_tag function to a format accepted by wordNetLemmatizer"""
    x = x[0].upper() # extract first character of the POS tag
    
    # define mapping for the tag to correct tag.
    tag_dict = {"J": wordnet.ADJ,
               "N": wordnet.NOUN,
               "R": wordnet.ADV,
               "V": wordnet.VERB}
    
    return tag_dict.get(x, wordnet.NOUN)

def get_lemma(sentence):
    """Given a sentence, derives the lemmatized version of the sentence"""
    pos_tagged_text = nltk.pos_tag(word_tokenize(sentence))
    
    lemm_list = []

    for (word, tag) in pos_tagged_text:
        lemm_list.append(word_lemm.lemmatize(word, pos = convert_to_valid_pos(tag)))
    
    return lemm_list

def prepare_text_for_lda(text):
    """Lemmatizes text, removes stopwords and short words from given text."""
    lemm_list = get_lemma(text)
    
    tokens = [i for i in lemm_list if i not in en_stopwords]
    
    tokens = [token.lower() for token in tokens if len(token) > 3]
    
    return tokens

In [None]:
df['title_lemmas'] = df['title_1'].progress_map(prepare_text_for_lda)
df['text_lemmas'] = df['text'].progress_map(prepare_text_for_lda)

In [3]:


def intersection(lemma): 
    lemma = [token.lower() for token in lemma]
    intersect = candidates.intersection(lemma)
    return {candidate_dict[x] for x in intersect}

df['candidate_title'] = df['title_lemmas'].progress_map(intersection)
df['candidate_text'] = df['text_lemmas'].progress_map(intersection)

NameError: name 'df' is not defined

In [None]:
!mkdir {DATA_DIR + 'LDA/'}
filename = DATA_DIR + 'LDA/df.pkl'
df.to_pickle(filename)
# df = pd.read_pickle(filename)

In [None]:
def save_corpus(dataframe, field, directory, data_name):
    dictionary = gensim.corpora.Dictionary(dataframe[field])
    corpus = list(dataframe[field].progress_map(dictionary.doc2bow))
    pickle.dump(corpus, open(directory + data_name + '_corpus.pkl', 'wb'))
    dictionary.save(directory + data_name + '_dictionary.gensim')
    

!mkdir {DATA_DIR + 'LDA/candidates/'}
for candidate in set(candidate_dict.values()):
    drr = DATA_DIR + 'LDA/candidates/{}/'.format(candidate)
    !mkdir {drr}
    save_corpus(df[df.candidate_text.map(lambda x: candidate in x)], 'title_lemmas', drr, 'article_title_2019_candidate_{}'.format(candidate))
    save_corpus(df[df.candidate_text.map(lambda x: candidate in x)], 'text_lemmas', drr, 'article_text_2019_candidate_{}'.format(candidate))

In [7]:
from IPython.display import clear_output

Topic_list = []
num_topics = 10
passes = 10
iterations = 400
eval_every = None
chunksize = 2000

for candidate in set(candidate_dict.values()):
    drr = DATA_DIR + 'LDA/candidates/{}/'.format(candidate)    
    
    dictionary = gensim.corpora.Dictionary.load(drr + 'article_text_2019_candidate_{}'.format(candidate) + '_dictionary.gensim')
    corpus = pickle.load(open(drr + 'article_text_2019_candidate_{}'.format(candidate) + '_corpus.pkl', 'rb'))
    # https://radimrehurek.com/gensim/models/ldamulticore.html
    try:
        ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus,
                                                           num_topics = num_topics, 
                                                           id2word = dictionary, 
                                                           passes=passes,
                                                           chunksize = chunksize,
                                                           eval_every = eval_every,
                                                           iterations = iterations,
                                                           workers = 3) #set this to cores - 1
    except ValueError:
        pass
    
    !mkdir {drr + 'models/'}
    ldamodel.save(drr + 'models/' + '{}_model_{}.gensim'.format(num_topics, passes))
    clear_output(wait=True)


2019-07-21 16:25:49,826 : INFO : loading Dictionary object from ../../data/reddit/Article_data_2019/LDA/candidates/buttigieg/article_text_2019_candidate_buttigieg_dictionary.gensim
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-07-21 16:25:49,851 : INFO : loaded ../../data/reddit/Article_data_2019/LDA/candidates/buttigieg/article_text_2019_candidate_buttigieg_dictionary.gensim
2019-07-21 16:25:50,043 : INFO : using symmetric alpha at 0.1
2019-07-21 16:25:50,044 : INFO : using symmetric eta at 0.1
2019-07-21 16:25:50,052 : INFO : using serial LDA version on this node
2019-07-21 16:25:50,101 : INFO : running online LDA training, 10 topics, 10 passes over the supplied corpus of 3241 documents, updating every 6000 documents, evaluating every ~0 documents, iterating 400x with a convergence threshold of 0.001000
2019-07-21 16:25:50,103 : INFO : training LDA model using 3 processes
2019-07-21 16:25:50,125 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up t

2019-07-21 16:26:23,357 : INFO : topic diff=0.287333, rho=0.388646
2019-07-21 16:26:23,358 : INFO : PROGRESS: pass 5, dispatched chunk #0 = documents up to #2000/3241, outstanding queue size 1
2019-07-21 16:26:23,534 : INFO : PROGRESS: pass 5, dispatched chunk #1 = documents up to #3241/3241, outstanding queue size 2
2019-07-21 16:26:27,434 : INFO : topic #4 (0.100): 0.012*"trump" + 0.007*"democratic" + 0.006*"people" + 0.006*"democrats" + 0.005*"news" + 0.005*"like" + 0.005*"also" + 0.005*"abortion" + 0.005*"vote" + 0.005*"make"
2019-07-21 16:26:27,436 : INFO : topic #2 (0.100): 0.019*"president" + 0.013*"2020" + 0.012*"trump" + 0.012*"announce" + 0.011*"former" + 0.011*"getty" + 0.008*"mayor" + 0.007*"would" + 0.007*"senator" + 0.006*"democratic"
2019-07-21 16:26:27,437 : INFO : topic #8 (0.100): 0.025*"buttigieg" + 0.008*"people" + 0.008*"south" + 0.008*"bend" + 0.007*"think" + 0.007*"city" + 0.007*"pete" + 0.007*"mayor" + 0.006*"like" + 0.005*"make"
2019-07-21 16:26:27,438 : INFO :

2019-07-21 16:26:43,015 : INFO : not storing attribute id2word
2019-07-21 16:26:43,016 : INFO : not storing attribute dispatcher
2019-07-21 16:26:43,016 : INFO : not storing attribute state
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-07-21 16:26:43,018 : INFO : saved ../../data/reddit/Article_data_2019/LDA/candidates/buttigieg/models/10_model_10.gensim


In [None]:
import csv

topics = ldamodel.print_topics(num_words = 20, num_topics=100)
for topic in topics:
    Topic_list.append(topic[1])
# save the topics for later use.
topic_df = pd.DataFrame({'topics':Topic_list})

def clean_topic_words(x):
    """Clean topic words as output by the algorithm"""
    clean_topic = re.findall("\".*?\"", x)
    clean_topic = [s.replace('\"', '') for s in clean_topic]
    return clean_topic

topic_df['topics'] = topic_df['topics'].map(clean_topic_words)

!mkdir {DATA_DIR + 'LDA/topics'}
topic_df.to_csv(DATA_DIR + "/LDA/topics/Topics_List_{}_model_{}_{}.csv".format(num_topics, passes, 'article_text_2019_candidate_only'),
                index=False, header=False, 
                quoting=csv.QUOTE_NONE, sep = '\n', escapechar='\\') # write out for later use

In [8]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary=dictionary, sort=True)
vis

TypeError: prepare() got an unexpected keyword argument 'sort'