In [30]:
# imports 
import json
import multiprocessing 
import os
import re
import string
import sys
sys.path.append("../")
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

# from gensim.corpora import Dictionary

from datahandler import DataHandler

In [11]:
# helper functions: n-gram filtering/merging and token clean-up

# Membership in this collection is tested once per token over the whole corpus
# (see filter_stopwords / filter_ngram), so keep it in a set for O(1) lookups
# instead of scanning the list NLTK returns.
# NOTE(review): words() is called without fileids, which loads the stop words of
# every language NLTK ships rather than English only — confirm this is intended.
stopwords = set(nltk.corpus.stopwords.words())


def filter_ngram(ngram, n:int):
    """Decide whether a candidate n-gram should be kept as a phrase.

    ngram: sequence of word tokens; it is POS-tagged and indexed by position.
    n: n-gram order. Only 2 and 3 trigger the stop-word check.

    Returns False as soon as one of the checks rejects the n-gram, True otherwise.
    """
    tagged = nltk.pos_tag(ngram)

    # Rejected only when the first tag is neither JJ nor NN *and* the second
    # tag is not NN; the second tag is not looked at otherwise.
    first_tag_ok = tagged[0][1] in ['JJ', 'NN']
    if not first_tag_ok and tagged[1][1] not in ['NN']:
        return False

    # Positions checked against the stop-word collection, in the order tested.
    if n == 2:
        positions = (0, 1)
    elif n == 3:
        positions = (0, -1, 1)
    else:
        positions = ()
    if any(ngram[i] in stopwords for i in positions):
        return False

    # Presumably tokeniser/lemmatiser artefacts (contraction pieces, pronoun
    # placeholder) — TODO confirm against the preprocessing step.
    if 'n' in ngram or 't' in ngram or 'PRON' in ngram:
        return False
    return True


def merge_ngram(x, bigrams, trigrams):
    """Join every listed n-gram occurring in ``x`` into a single underscore token.

    x: document text.
    bigrams, trigrams: iterables of space-separated n-gram strings.

    Trigrams are merged before bigrams so the longer phrase wins. A phrase is
    only merged where it is not glued to a neighbouring word character, so
    "art work" no longer fires inside "smart working" (plain ``str.replace``
    did). Because ``_`` counts as a word character, a word already absorbed
    into a merged token is not merged a second time.

    Returns the text with the merged phrases.
    """
    for grams in (trigrams, bigrams):
        for gram in grams:
            # Cheap substring test first: most n-grams are absent from most
            # documents, so the regex is only built when it can match.
            if not gram or gram not in x:
                continue
            merged = '_'.join(gram.split())
            pattern = r'(?<!\w)' + re.escape(gram) + r'(?!\w)'
            # A callable replacement keeps backslashes in `merged` literal.
            x = re.sub(pattern, lambda _match, merged=merged: merged, x)
    return x
        

def filter_stopwords(x):
    """Split ``x`` on whitespace and return the tokens worth keeping.

    A token is dropped when it appears in the module-level ``stopwords``
    collection or is two characters long or shorter.
    """
    kept = []
    for token in x.split():
        if token in stopwords or len(token) <= 2:
            continue
        kept.append(token)
    return kept


def filter_pos(x):
    """Return the tokens of ``x`` whose NLTK part-of-speech tag is exactly 'NN'.

    x: list of tokens, passed to ``nltk.pos_tag``.
    Other noun tags (e.g. 'NNS', 'NNP') do not match and are dropped.
    """
    return [token for token, tag in nltk.pos_tag(x) if tag == 'NN']


In [12]:
seed = 123
data_dir = os.path.join(os.pardir, os.pardir, "web_data", "preproc")

print("Loading corpus")
corpus = DataHandler(data_dir, seed)

# Summary statistics exposed by the handler.
print("Total Word Count: {}".format(corpus.total_words))
print("Number of Docs in the Corpus: {}".format(corpus.total_docs))

# The handler's `data` mapping is keyed by document file path.
docs_fpath = corpus.data.keys()


def _read_text(path):
    """Return the full contents of the text file at ``path``."""
    with open(path, "r") as handle:
        return handle.read()


# file path -> raw document text
fpath_txt = {path: _read_text(path) for path in docs_fpath}

# One row per document: paths move from the index into 'file_name', the text
# lands in 'text', and the fresh integer index labels are turned into strings.
indexed_by_path = pd.DataFrame.from_dict(fpath_txt, orient='index')
column_names = {'index': 'file_name', 0: 'text'}
df = indexed_by_path.reset_index().rename(index=str, columns=column_names)

# From here on `corpus` is the Series of document texts, not the DataHandler.
corpus = df['text']
print("Finished loading corpus")


Loading corpus
Total Word Count: 2220710
Number of Docs in the Corpus: 5405
Finished loading corpus


In [13]:
min_bigram_frequency = 50
min_trigram_frequency = 50

# Tokenise the corpus once and feed the same token lists to both finders
# instead of splitting every document twice.
tokenized_docs = [doc.split() for doc in corpus]

# --- bigrams: drop n-grams seen fewer than the minimum, score the rest by PMI
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents(tokenized_docs)
finder.apply_freq_filter(min_bigram_frequency)
bigram_scores = finder.score_ngrams(bigram_measures.pmi)

# Naming the columns in the constructor also works when no n-gram survives the
# frequency filter (assigning `.columns` afterwards raises on an empty frame).
bigram_pmi = (pd.DataFrame(bigram_scores, columns=['bigram', 'pmi'])
                .sort_values(by='pmi', ascending=False))

# --- trigrams: same procedure
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = nltk.collocations.TrigramCollocationFinder.from_documents(tokenized_docs)
finder.apply_freq_filter(min_trigram_frequency)
trigram_scores = finder.score_ngrams(trigram_measures.pmi)

trigram_pmi = (pd.DataFrame(trigram_scores, columns=['trigram', 'pmi'])
                 .sort_values(by='pmi', ascending=False))

print("cell done")


cell done


In [14]:
min_pmi = 5
max_ngrams = 500


def _select_ngrams(scored, column, n, min_pmi, max_ngrams):
    """Return the first ``max_ngrams`` rows of ``scored`` that pass both filters.

    scored: DataFrame holding n-gram tuples in ``column`` and scores in 'pmi'.
    n: n-gram order forwarded to ``filter_ngram``.
    min_pmi: rows are kept only when their 'pmi' is strictly greater.
    max_ngrams: cap on the number of rows returned.

    Row order is preserved, so with ``scored`` sorted by descending PMI the
    result holds the highest-scoring survivors.
    """
    # Numeric threshold first: it is cheap, and it keeps the POS-tagging
    # filter away from rows that would be discarded anyway.
    candidates = scored[scored['pmi'] > min_pmi]
    keep = candidates[column].map(lambda gram: filter_ngram(gram, n)).astype(bool)
    return candidates[keep][:max_ngrams]


# The threshold must be applied to each row's PMI. The previous condition,
# `min_pmi > 5`, compared the constant with itself and was always False, so no
# n-gram was ever selected.
filtered_bigram = _select_ngrams(bigram_pmi, 'bigram', 2, min_pmi, max_ngrams)
filtered_trigram = _select_ngrams(trigram_pmi, 'trigram', 3, min_pmi, max_ngrams)

bigrams = [' '.join(gram) for gram in filtered_bigram['bigram']
           if len(gram[0]) > 2 or len(gram[1]) > 2]
# `and` binds tighter than `or`; the parentheses only spell out the grouping
# the expression already had. NOTE(review): confirm this grouping is intended.
trigrams = [' '.join(gram) for gram in filtered_trigram['trigram']
            if len(gram[0]) > 2 or (len(gram[1]) > 2 and len(gram[2]) > 2)]

print("cell done")


cell done


In [15]:
# Rewrite every document so each selected n-gram becomes one underscore-joined
# token; the work happens on a copy of the text Series.
corpus_w_ngrams = corpus.copy().map(lambda text: merge_ngram(text, bigrams, trigrams))

print("cell done")


cell done


In [16]:
# Tokenise and stop-word-filter every document in parallel. The context manager
# shuts the worker pool down even when map() raises, so workers are not leaked.
# NOTE: this rebinds `corpus_w_ngrams` from a Series of strings to a list of
# token lists, so the cell cannot be re-run without re-running the merge cell.
with multiprocessing.Pool() as pool:
    corpus_w_ngrams = pool.map(filter_stopwords, list(corpus_w_ngrams))
print("cell done")


cell done


In [17]:
# POS-filter every token list in parallel. The context manager shuts the worker
# pool down even when map() raises, so workers are not leaked.
with multiprocessing.Pool() as pool:
    final_corpus = pool.map(filter_pos, list(corpus_w_ngrams))
print("cell done")


cell done


In [21]:
# Vocabulary over the filtered documents.
dictionary = gensim.corpora.Dictionary(final_corpus)
# Prune tokens appearing in fewer than 10 documents or in more than 20% of them.
dictionary.filter_extremes(no_below=10, no_above=0.20)
# Bag-of-words encoding of every document against the pruned vocabulary.
corpus_bow = list(map(dictionary.doc2bow, final_corpus))
print("cell done")


cell done


In [22]:
# Fit a 5-topic LDA model. LDA training is stochastic, so pin the random state
# to the notebook-level `seed` to make the topics reproducible across runs.
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(corpus_bow, num_topics=5, id2word=dictionary, passes=40,
               iterations=200, chunksize=100, eval_every=None,
               random_state=seed)
print("cell done")


cell done


In [23]:
# Build the interactive topic visualisation (t-SNE layout) for the fitted
# model and write it out as a standalone HTML page.
html_path = 'web_lda_mp_debug.html'
p = pyLDAvis.gensim.prepare(ldamodel, corpus_bow, dictionary, mds='tsne')
pyLDAvis.save_html(p, html_path)

In [27]:
# Sweep the number of topics, recording the c_v coherence of each model and
# saving one visualisation per model.
coherence = []  # (num_topics, coherence score) pairs
Lda = gensim.models.ldamodel.LdaModel  # alias does not change per iteration
for ii in range(3,5):
    print('lda with {} topics'.format(ii))
    # Pin the random state so the coherence comparison is reproducible and
    # differences between runs come from the topic count, not the initialisation.
    ldamodel = Lda(corpus_bow, num_topics=ii, id2word=dictionary, passes=40,
                   iterations=200, chunksize=100, eval_every=None,
                   random_state=seed)
    print("fit model, computing coherence")
    cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=final_corpus,
                                                     dictionary=dictionary, coherence='c_v')
    coherence.append((ii, cm.get_coherence()))
    print("generating tsne viz")
    p = pyLDAvis.gensim.prepare(ldamodel, corpus_bow, dictionary, mds='tsne')
    title = 'web_lda_mp_debug_cm_{}.html'.format(ii)
    pyLDAvis.save_html(p, title)
    print("done")

lda with 3 topics
fit model, computing coherence
generating tsne viz
done
lda with 4 topics
fit model, computing coherence
generating tsne viz
done


In [28]:
# Split the (num_topics, score) pairs into two parallel lists for plotting.
# Note that `cm` now holds the list of scores, no longer a CoherenceModel.
n_topics = [topic_count for topic_count, _score in coherence]
cm = [score for _topic_count, score in coherence]

In [31]:
# Coherence as a function of the number of topics, saved to disk.
fig, ax = plt.subplots()
ax.plot(n_topics, cm)
ax.scatter(n_topics, cm)
ax.set_title('Number of Topics vs. Coherence')
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Coherence')
# One tick per evaluated topic count. The previous `x_val` is not defined
# anywhere in the notebook and raised NameError on a fresh kernel.
ax.set_xticks(n_topics)
fig.savefig("topic_coherence.png")
plt.close(fig)