In [1]:
import nltk
from nltk.corpus import wordnet
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import string
import random
import re
import gensim
import pickle


from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# nltk downloads -- these must run BEFORE any corpus is read below; on a fresh
# environment stopwords.words(...) raises LookupError if the data is missing.
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')  # tokenizer models required by the word tokenizer used in get_lemma

# shared lemmatizer instance and English stopword list used by the text-prep helpers
word_lemm = WordNetLemmatizer()
en_stopwords = stopwords.words('english')

#initialize tqdm (registers progress_apply / progress_map on pandas objects)
tqdm.pandas()

# root folder for all input pickles and LDA outputs; note the trailing slash
DATA_DIR = '../../data/reddit/Article_data_2019/'

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kippy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kippy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kippy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/kippy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
# Load the two pickled article frames and stack them row-wise.
# pd.concat replaces DataFrame.append, which was deprecated and later removed from pandas.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.concat([
    pd.read_pickle(DATA_DIR + 'reddit_2019_06_15_with_article_text.pkl'),
    pd.read_pickle(DATA_DIR + 'reddit_2019jun16tojul1_articleurls_with_text.pkl'),
])

In [34]:
# Expand every article_data entry into its own set of columns and place those
# columns next to the remaining original columns.
article_fields = df['article_data'].progress_apply(pd.Series)
df = pd.concat([df.drop(columns=['article_data']), article_fields], axis=1)

# Discard the column labelled 0 created by the expansion
# (presumably from rows whose article_data is not a dict -- TODO confirm).
df = df.drop(columns=[0])


def df_column_uniquify(df):
    """Make the column labels of ``df`` unique by suffixing repeats.

    The first occurrence of a label is kept as-is. A later occurrence gets
    ``_1``, ``_2``, ... appended, using the smallest counter that yields a
    label not already assigned.

    The frame is modified in place (its ``columns`` attribute is replaced)
    and the same object is returned.
    """
    assigned = []
    for label in df.columns:
        candidate, suffix = label, 0
        # bump the suffix until the candidate no longer clashes with an earlier label
        while candidate in assigned:
            suffix += 1
            candidate = "{}_{}".format(label, suffix)
        assigned.append(candidate)

    df.columns = assigned
    return df

df = df_column_uniquify(df)

# remove nulls: keep only rows that have both a title_1 value and a text value
has_title = df['title_1'].notnull()
has_text = df['text'].notnull()
df = df[has_title & has_text]

# okay to drop duplicates since we have text for all articles
df = df.drop_duplicates(subset=['url'])

HBox(children=(IntProgress(value=0, max=141539), HTML(value='')))

In [36]:
def convert_to_valid_pos(x):
    """Map a POS tag from nltk.pos_tag to the POS constant expected by WordNetLemmatizer.

    Only the first character of the tag is inspected (case-insensitively):
    J -> adjective, R -> adverb, V -> verb, N -> noun. Any other tag is
    treated as a noun.
    """
    leading = x[0].upper()

    if leading == "J":
        return wordnet.ADJ
    if leading == "R":
        return wordnet.ADV
    if leading == "V":
        return wordnet.VERB

    # "N" and every unrecognised tag fall back to noun
    return wordnet.NOUN

def get_lemma(sentence):
    """Tokenize, POS-tag and lemmatize a sentence.

    Parameters
    ----------
    sentence : str
        Raw text to process.

    Returns
    -------
    list
        One lemma per token, in token order. Each token is lemmatized with the
        WordNet POS derived from its tag via ``convert_to_valid_pos``.

    Notes
    -----
    Tokenization needs NLTK's Punkt tokenizer data to be downloaded.
    """
    # Call the tokenizer through the already-imported `nltk` module: the bare name
    # `word_tokenize` is never imported in this notebook and raises NameError on a
    # fresh kernel.
    pos_tagged_text = nltk.pos_tag(nltk.word_tokenize(sentence))

    return [word_lemm.lemmatize(word, pos=convert_to_valid_pos(tag))
            for (word, tag) in pos_tagged_text]

def prepare_text_for_lda(text, min_length=5):
    """Lemmatize text, then drop stopwords and short tokens.

    Parameters
    ----------
    text : str
        Raw text to process.
    min_length : int, optional
        Minimum number of characters a token must have to be kept.
        The default of 5 matches the previous hard-coded ``len(token) > 4``.

    Returns
    -------
    list
        Lemmas from ``get_lemma(text)`` that are not English stopwords and are
        at least ``min_length`` characters long, in their original order and
        original casing.
    """
    # A set gives constant-time membership tests instead of scanning the stopword list per token.
    stop_set = set(en_stopwords)

    # Compare case-insensitively: the stopword list is lowercase, so capitalised
    # stopwords such as "There" or "Which" previously slipped through.
    return [token for token in get_lemma(text)
            if token.lower() not in stop_set and len(token) >= min_length]

In [37]:
# Run the LDA text preparation over the title and the body text, storing each
# result in its own *_lemmas column (title first, then text).
for source_col, lemma_col in (('title_1', 'title_lemmas'), ('text', 'text_lemmas')):
    df[lemma_col] = df[source_col].progress_map(prepare_text_for_lda)

HBox(children=(IntProgress(value=0, max=78409), HTML(value='')))

HBox(children=(IntProgress(value=0, max=78409), HTML(value='')))

In [38]:
!mkdir {DATA_DIR + 'LDA/'}

In [40]:
def save_corpus(field, directory, data_name):
    """Build and persist a gensim dictionary and bag-of-words corpus for one column.

    Reads the documents from the notebook-level ``df[field]``.

    Parameters
    ----------
    field : str
        Column of ``df`` holding the documents passed to ``gensim.corpora.Dictionary``.
    directory : str
        Output location; it is concatenated directly with ``data_name``, so it
        must end with a path separator.
    data_name : str
        Prefix for the two output files.

    Writes ``<directory><data_name>_corpus.pkl`` (pickled list of doc2bow
    results) and ``<directory><data_name>_dictionary.gensim``.
    """
    dictionary = gensim.corpora.Dictionary(df[field])
    corpus = list(df[field].progress_map(dictionary.doc2bow))

    # The context manager guarantees the pickle file is flushed and closed,
    # even if dumping fails part-way.
    with open(directory + data_name + '_corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)

    dictionary.save(directory + data_name + '_dictionary.gensim')

# Persist a dictionary + corpus pair for the title lemmas, then for the text lemmas.
lda_output_dir = DATA_DIR + 'LDA/'
for lemma_field, dataset_name in [('title_lemmas', 'article_title_2019'),
                                  ('text_lemmas', 'article_text_2019')]:
    save_corpus(lemma_field, lda_output_dir, dataset_name)

HBox(children=(IntProgress(value=0, max=78409), HTML(value='')))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


HBox(children=(IntProgress(value=0, max=78409), HTML(value='')))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [49]:
# Reload the dictionary and corpus written by save_corpus for the article text.
dictionary = gensim.corpora.Dictionary.load(DATA_DIR + 'LDA/article_text_2019_dictionary.gensim')

# NOTE: pickle can execute arbitrary code on load -- only read files this pipeline wrote itself.
# The context manager closes the file handle, which the bare open(...) call never did.
with open(DATA_DIR + 'LDA/article_text_2019' + '_corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

In [None]:
Topic_list = []  # filled with the topic strings after training
num_topics = 20
passes = 20
random_state = 42  # fixed seed so topic assignments are comparable between runs
# https://radimrehurek.com/gensim/models/ldamulticore.html
# NOTE(review): with several workers some run-to-run variation may remain
# even with a fixed seed -- confirm if exact reproducibility matters.
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus,
                                                   num_topics = num_topics, 
                                                   id2word = dictionary, 
                                                   passes=passes,
                                                   random_state = random_state,
                                                   workers = 3) #set this to cores - 1


In [None]:
!mkdir {DATA_DIR + 'LDA/models'}
ldamodel.save(DATA_DIR + '/LDA/models/{}_model_{}_{}.gensim'.format(num_topics, passes, 'article_text_2019'))

topics = ldamodel.print_topics(num_words = 20)
for topic in topics:
    Topic_list.append(topic[1])

In [None]:
# Display the trained model's topics using print_topics' default arguments.
ldamodel.print_topics()

In [None]:
import csv

# Save the topics for later use: wrap the collected topic strings in a
# one-column DataFrame ('topics') so they can be cleaned and written out below.
topic_df = pd.DataFrame({'topics':Topic_list})

def clean_topic_words(x):
    """Extract the double-quoted words from a topic string produced by the model.

    Every shortest ``"..."`` span in ``x`` is collected and returned with its
    quote characters removed. Returns an empty list when ``x`` contains no
    quoted span.
    """
    return [quoted.replace('\"', '') for quoted in re.findall("\".*?\"", x)]

topic_df['topics'] = topic_df['topics'].map(clean_topic_words)

!mkdir {DATA_DIR + 'LDA/topics'}
topic_df.to_csv(DATA_DIR + "/LDA/topics/Topics_List_{}_model_{}_{}.csv".format(num_topics, passes, 'article_text_2019'),
                index=False, header=False, 
                quoting=csv.QUOTE_NONE, sep = '\n', escapechar='\\') # write out for later use

In [None]:
# Print each topic's cleaned word list on its own line.
for topic in topic_df['topics'].tolist():
    print(topic)