<a href="https://colab.research.google.com/github/abhilashhn1993/Sentiment_Analysis_of_Tweets/blob/master/LDATopicModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install spacy
!pip install pyLDAvis

In [0]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [0]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [0]:
from google.colab import files
uploaded = files.upload()

In [0]:
import io
df_post = pd.read_csv(io.BytesIO(uploaded['cleanedPostTweets.csv']))
df_pre = pd.read_csv(io.BytesIO(uploaded['cleanedPreTweets.csv']))

**LDA topic Modeling for PTSD pre-diagnosis Tweets**

**Pre-PTSD Topics**

In [0]:
pre_Tweets = df_pre.Tweets.values.tolist()
pre_Tweets = list(sent_to_words(pre_Tweets))

In [0]:
id2word = corpora.Dictionary(pre_Tweets)

# Create Corpus
texts = pre_Tweets

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [28]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('good', 1), ('luck', 1), ('ray', 1), ('send', 1), ('support', 1)]]

In [0]:
# Build LDA model
pre_lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                          id2word=id2word,
                                          num_topics=4, 
                                          random_state=100,
                                          update_every=1,
                                          chunksize=100,
                                          passes=10,
                                          alpha='auto',
                                          per_word_topics=True)

In [30]:
pprint(pre_lda_model.print_topics())
doc_lda_pre = pre_lda_model[corpus]

[(0,
  '0.107*"nan" + 0.016*"hate" + 0.016*"wait" + 0.015*"tell" + 0.015*"eat" + '
  '0.013*"could" + 0.012*"lose" + 0.011*"stop" + 0.009*"ill" + '
  '0.008*"chroniclife"'),
 (1,
  '0.025*"think" + 0.022*"life" + 0.018*"come" + 0.015*"help" + 0.015*"fuck" + '
  '0.015*"always" + 0.014*"honestly" + 0.012*"keep" + 0.012*"right" + '
  '0.011*"literally"'),
 (2,
  '0.037*"good" + 0.031*"say" + 0.018*"look" + 0.018*"friend" + 0.017*"work" + '
  '0.016*"thank" + 0.014*"ever" + 0.013*"cry" + 0.013*"fucking" + '
  '0.012*"hour"'),
 (3,
  '0.034*"want" + 0.034*"love" + 0.030*"feel" + 0.025*"people" + 0.024*"time" '
  '+ 0.022*"need" + 0.017*"take" + 0.016*"bad" + 0.016*"try" + 0.015*"start"')]


In [31]:
# Compute Perplexity
print('\nPerplexity: ', pre_lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=pre_lda_model, texts=pre_Tweets, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.413445944372059

Coherence Score:  0.2780846378522641


In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(pre_lda_model, corpus, id2word)
vis

**Post Diagnosis Topics**

In [0]:
post_Tweets = df_post.Tweets.values.tolist()
post_Tweets = list(sent_to_words(post_Tweets))

In [0]:
id2word = corpora.Dictionary(post_Tweets)

# Create Corpus
texts = post_Tweets

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [0]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [0]:
# Build LDA model
post_lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                          id2word=id2word,
                                          num_topics=4, 
                                          random_state=100,
                                          update_every=1,
                                          chunksize=100,
                                          passes=10,
                                          alpha='auto',
                                          per_word_topics=True)

In [21]:
pprint(post_lda_model.print_topics())
doc_lda_post = post_lda_model[corpus]

[(0,
  '0.029*"good" + 0.027*"need" + 0.019*"shit" + 0.016*"try" + 0.015*"tell" + '
  '0.014*"talk" + 0.013*"girl" + 0.013*"could" + 0.012*"tonight" + '
  '0.011*"find"'),
 (1,
  '0.100*"nan" + 0.031*"love" + 0.024*"think" + 0.017*"thank" + 0.015*"bad" + '
  '0.011*"call" + 0.010*"fucking" + 0.009*"ask" + 0.009*"use" + 0.008*"new"'),
 (2,
  '0.041*"want" + 0.040*"time" + 0.029*"people" + 0.026*"take" + 0.020*"work" '
  '+ 0.016*"start" + 0.014*"watch" + 0.013*"give" + 0.011*"hit" + 0.011*"lol"'),
 (3,
  '0.029*"say" + 0.029*"feel" + 0.018*"look" + 0.016*"life" + 0.016*"come" + '
  '0.016*"friend" + 0.015*"help" + 0.014*"never" + 0.012*"right" + '
  '0.012*"also"')]


In [22]:
# Compute Perplexity
print('\nPerplexity: ', post_lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=post_lda_model, texts=post_Tweets, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.422046389754199

Coherence Score:  0.3214387529798701


In [23]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(post_lda_model, corpus, id2word)
vis