In [9]:
import numpy as np
import pandas as pd
from loguru import logger
import re
import matplotlib.pyplot as plt
# use seaborn plotting defaults
import seaborn as sns; sns.set()
%matplotlib inline

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim 

from nltk.corpus import stopwords
#from utils import _remove_punctuation_and_symbols, _remove_stopwords
from RealOrNot import utils

  and should_run_async(code)
[nltk_data] Downloading package stopwords to /home/arya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Load the training data, then normalise the free-text column with two
# element-wise cleaning passes from the project's utils module.
col = "text"
train_df = pd.read_csv("data/train.csv")

# (log message, cleaner) pairs, applied in this order.
cleaning_passes = (
    ("Stripping punctuation and symbol from text", utils._remove_punctuation_and_symbols),
    ("Removing stopwords", utils._remove_stopwords),
)
for log_message, cleaner in cleaning_passes:
    logger.info(log_message)
    # Cast to str first so every cell handed to the cleaner is a string.
    train_df[col] = train_df[col].astype(str).apply(cleaner)


  and should_run_async(code)
2020-10-06 03:14:08.579 | INFO     | __main__:<module>:4 - Stripping punctuation and symbol from text
2020-10-06 03:14:08.620 | INFO     | __main__:<module>:7 - Removing stopwords


In [11]:
# Materialise the output of sent_to_words as a list so it can be indexed and
# iterated more than once (each entry appears to be one document's token list).
text_words = [*utils.sent_to_words(train_df[col])]

  and should_run_async(code)


In [13]:
# Spot-check: display the entry for the second document in text_words.
text_words[1]

  and should_run_async(code)


['forest', 'fire', 'near', 'ronge', 'sask', 'canada']

In [14]:
# Train the phrase detectors: the first model sees the raw token lists, the
# second is trained on the output of the first. A higher threshold means
# fewer token pairs are promoted to phrases.
bigram = gensim.models.Phrases(
    text_words,
    min_count=5,
    threshold=100,
)
bigrammed_words = bigram[text_words]
trigram = gensim.models.Phrases(bigrammed_words, threshold=100)

# Wrap each trained model in a Phraser, the faster way to apply it to a sentence.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(trained) for trained in (bigram, trigram)
)

# Example: run the first document through both phrasers.
first_doc_phrased = trigram_mod[bigram_mod[text_words[0]]]
print(first_doc_phrased)

  and should_run_async(code)


['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive']


In [17]:
# Part-of-speech tags to keep during lemmatization: noun, adjective, verb, adverb.
kept_pos_tags = ('NOUN', 'ADJ', 'VERB', 'ADV')

# Stage 1: stop-word removal on the token lists.
text_words_nostops = utils.remove_stopwords(text_words)

# Stage 2: bigram formation (only the bigram phraser is applied here;
# the trigram phraser is not used in this step).
text_words_bigrams = utils.make_bigrams(text_words_nostops, bigram_mod)

# Stage 3: lemmatization restricted to the tags listed above.
text_lemmatized = utils.lemmatization(
    text_words_bigrams,
    allowed_postags=list(kept_pos_tags),
)

# Show the first fully processed document.
first_lemmatized_doc = text_lemmatized[0]
print(first_lemmatized_doc)

  and should_run_async(code)


['reason', 'earthquake', 'may', 'forgive']


In [18]:
# Vocabulary: maps between integer ids and the lemmatized tokens.
id2word = corpora.Dictionary(text_lemmatized)

# The documents that make up the corpus are the lemmatized token lists.
texts = text_lemmatized

# Bag-of-words encoding: each document becomes a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# Inspect the encoding of the first document.
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1)]


  and should_run_async(code)
  self._context.run(self._callback, *self._args)


In [19]:
# Decode the first document's bag-of-words into (token, count) pairs
# by looking each token id up in the dictionary.
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

  and should_run_async(code)
  self._context.run(self._callback, *self._args)


[[('earthquake', 1), ('forgive', 1), ('may', 1), ('reason', 1)]]

In [35]:
# LDA hyperparameters, grouped so they can be read and tuned in one place.
lda_params = dict(
    num_topics=5,          # number of latent topics to fit
    random_state=100,      # fixed seed so the fit is repeatable
    update_every=2,
    chunksize=10,          # NOTE(review): small chunk size; confirm this is intentional
    passes=10,             # full passes over the corpus during training
    alpha='auto',          # let gensim estimate the document-topic prior
    per_word_topics=True,
)

# Fit the topic model on the bag-of-words corpus built earlier.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    **lda_params,
)

  and should_run_async(code)


In [36]:
# Number of bag-of-words documents in the corpus.
len(corpus)

  and should_run_async(code)


7613

In [37]:
# Print the (topic_id, weighted-keyword string) summary for the fitted topics.
topic_summaries = lda_model.print_topics()
print(topic_summaries)

# Apply the fitted model to the whole corpus.
doc_lda = lda_model[corpus]

[(0, '0.045*"weapon" + 0.028*"get" + 0.017*"go" + 0.017*"storm" + 0.016*"s" + 0.016*"wreck" + 0.013*"nuclear" + 0.012*"would" + 0.011*"part" + 0.010*"fall"'), (1, '0.022*"car" + 0.022*"kill" + 0.019*"people" + 0.017*"survive" + 0.015*"wreck" + 0.014*"break" + 0.013*"world" + 0.013*"back" + 0.011*"feel" + 0.010*"cause"'), (2, '0.044*"wound" + 0.017*"fire" + 0.016*"say" + 0.013*"watch" + 0.013*"wreck" + 0.012*"new" + 0.012*"look" + 0.010*"thunder" + 0.009*"earthquake" + 0.009*"even"'), (3, '0.018*"still" + 0.016*"see" + 0.015*"video" + 0.015*"wreck" + 0.014*"amp" + 0.014*"train" + 0.014*"think" + 0.013*"love" + 0.013*"help" + 0.013*"come"'), (4, '0.019*"wreck" + 0.018*"make" + 0.013*"shot" + 0.011*"take" + 0.010*"time" + 0.010*"sink" + 0.010*"great" + 0.010*"year" + 0.010*"be" + 0.009*"want"')]


  and should_run_async(code)


In [38]:
# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself. Perplexity is 2 ** (-bound), so a bound closer to zero (i.e. a
# lower perplexity) indicates a better fit.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', per_word_bound)

# Topic coherence (c_v measure), computed from the lemmatized texts and the dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=text_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Perplexity:  -9.761754279641604

Coherence Score:  0.5366301428020785


In [39]:
# Visualize the topics
# Switch pyLDAvis into notebook mode so the visualisation renders in the cell output.
pyLDAvis.enable_notebook()
# Build the visualisation data from the fitted model, the bag-of-words corpus and the dictionary.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Keep this as the last expression so the notebook displays the result.
pyLDAvis.display(vis)

  and should_run_async(code)
