In [45]:
import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import string
import random
import re
import gensim
import pickle


from nltk.stem import WordNetLemmatizer
word_lemm = WordNetLemmatizer()

from nltk.corpus import stopwords
en_stopwords = stopwords.words('english')

# nltk downloads
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

#initialize tqdm
tqdm.pandas()

DATA_DIR = '../../data/reddit/comments/'

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kippy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kippy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kippy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/kippy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
df = pd.read_pickle(DATA_DIR + 'reddit_2019_comments_clean1.pkl')
df['clean'] = df['clean'].progress_apply(lambda x: x.replace('\n\n',' ').replace('\n',' ').replace('\'s','s'))
df_dev = df.sample(1000)

HBox(children=(IntProgress(value=0, max=514797), HTML(value='')))

In [16]:
def convert_to_valid_pos(x):
    """Converts the pos tag returned by the nltk.pos_tag function to a format accepted by wordNetLemmatizer"""
    x = x[0].upper() # extract first character of the POS tag
    
    # define mapping for the tag to correct tag.
    tag_dict = {"J": wordnet.ADJ,
               "N": wordnet.NOUN,
               "R": wordnet.ADV,
               "V": wordnet.VERB}
    
    return tag_dict.get(x, wordnet.NOUN)

def get_lemma(sentence):
    """Given a sentence, derives the lemmatized version of the sentence"""
    pos_tagged_text = nltk.pos_tag(word_tokenize(sentence))
    
    lemm_list = []

    for (word, tag) in pos_tagged_text:
        lemm_list.append(word_lemm.lemmatize(word, pos = convert_to_valid_pos(tag)))
    
    return lemm_list

def prepare_text_for_lda(text):
    """Lemmatizes text, removes stopwords and short words from given text."""
    lemm_list = get_lemma(text)
    
    tokens = [i for i in lemm_list if i not in en_stopwords]
    
    tokens = [token for token in tokens if len(token) > 4]
    
    return tokens

In [27]:
df['lemmas'] = df['clean'].progress_map(prepare_text_for_lda)

HBox(children=(IntProgress(value=0, max=514797), HTML(value='')))

In [39]:
dictionary = gensim.corpora.Dictionary(df.lemmas)
corpus = list(df['lemmas'].progress_map(dictionary.doc2bow))
pickle.dump(corpus, open(DATA_DIR + 'reddit_2019_corpus.pkl', 'wb'))
dictionary.save(DATA_DIR + 'reddit_2019_dictionary.gensim')

HBox(children=(IntProgress(value=0, max=514797), HTML(value='')))

FileNotFoundError: [Errno 2] No such file or directory: '../../data/reddit/comments/reddit_2019_corpus.pkl'

In [None]:
#dictionary2 = gensim.corpora.Dictionary.load(DATA_DIR + 'reddit_2019_dictionary.gensim')

In [49]:
Topic_list = []
num_topics = 20
passes = 15
# https://radimrehurek.com/gensim/models/ldamulticore.html
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus,
                                                   num_topics = num_topics, 
                                                   id2word = dictionary, 
                                                   passes=passes,
                                                   workers = 3 ) #set this to cores - 1

ldamodel.save(DATA_DIR + 'models/{}_model_{}_{}.gensim'.format(num_topics, passes, 'reddit2019'))

topics = ldamodel.print_topics(num_words = 20)
for topic in topics:
    Topic_list.append(topic[1])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


FileNotFoundError: [Errno 2] No such file or directory: '../../data/reddit/comments//models/20_model_15_reddit2019.gensim.state'

In [78]:
import csv

# save the topics for later use.
topic_df = pd.DataFrame({'topics':Topic_list})

def clean_topic_words(x):
    """Clean topic words as output by the algorithm"""
    clean_topic = re.findall("\".*?\"", x)
    clean_topic = [s.replace('\"', '') for s in clean_topic]
    return clean_topic

topic_df['topics'] = topic_df['topics'].map(clean_topic_words)


topic_df.to_csv(DATA_DIR + "topics/Topics_List_{}_model_{}_{}.csv".format(num_topics, passes, 'reddit2019'),
                index=False, header=False, 
                quoting=csv.QUOTE_NONE, sep = '\n', escapechar='\\') # write out for later use

In [86]:
for topic in list(topic_df.topics):
    print(topic)

['money', 'people', 'would', 'system', 'million', 'government', 'change', 'company', 'business', 'problem', 'healthcare', 'climate', 'health', 'public', 'dollar', 'class', 'private', 'spend', 'worker', 'economy']
['president', 'public', 'foreign', 'Russia', 'trump', 'Russian', 'official', 'government', 'election', 'return', 'administration', 'office', 'political', 'interest', 'corrupt', 'elect', 'special', 'information', 'corruption', 'meeting']
['people', 'video', 'every', 'fucking', 'tweet', 'stupid', 'literally', 'person', 'bullshit', 'Twitter', 'nothing', 'speak', 'These', 'truth', 'minute', 'watch', 'People', 'picture', 'notice', 'idiot']
['medium', 'border', 'illegal', 'social', 'press', 'immigration', 'immigrant', 'cover', 'Mexico', 'build', 'Election', 'protest', 'network', 'coverage', 'sexual', 'excuse', 'contempt', 'propaganda', 'attention', 'project']
['state', 'party', 'election', 'voter', 'Republican', 'primary', 'Democratic', 'Democrat', 'Democrats', 'republican', 'Texas'