# Topic Modelling using LDA (Latent  Dirichlet Allocation)

In [1]:
import eland as ed
from eland.conftest import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gensim
from nltk.corpus import stopwords
import pyLDAvis.gensim
import pickle 
import pyLDAvis
from pprint import pprint
import gensim.corpora as corpora
from gensim.models import CoherenceModel, Phrases
import re
import warnings
warnings.filterwarnings('ignore')

### Importing the data from Elasticsearch

In [2]:
ed_df = ed.DataFrame('localhost', 'twitter', columns=['full_text_processed'])

# defining the full-text query we need: Retrieving records for full_text_processed with the condition is_retweet=False and is_quote_status=False
query_unique = {
    "bool": {
        "must": {
            "term":{"is_retweet":"false"},
        },
        "filter": {
            "term":{"is_quote_status":"false"}
        },
    }
}
# using full-text search capabilities with Eland:
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas()

In [3]:
df_tweets.head()

Unnamed: 0,full_text_processed
1262754556456271872,proactive plan pm narendra modi take stock cyc...
1262754552505184256,odisha bengal ha amphan last 2 day
1262754542589861888,many test given human race corona ampan
1262754528979345408,latest release indian meteorological departmen...
1262746620736598016,visit principal secretary govtcommerce amptran...


## Tokenising and removing short tweets (less than 4 words) 

In [4]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [5]:
df_tweets['full_text_processed'] = df_tweets['full_text_processed'].apply(lambda x: remove_emoji(x))
df_tweets['full_text_tokens'] = df_tweets['full_text_processed'].apply(lambda x: [w for w in x.split()])
df_tweets['length'] = df_tweets['full_text_tokens'].apply(lambda x: len(x))
df_tweets = df_tweets[df_tweets['length']>4]

In [6]:
df_tweets.head()

Unnamed: 0,full_text_processed,full_text_tokens,length
1262754556456271872,proactive plan pm narendra modi take stock cyc...,"[proactive, plan, pm, narendra, modi, take, st...",15
1262754552505184256,odisha bengal ha amphan last 2 day,"[odisha, bengal, ha, amphan, last, 2, day]",7
1262754542589861888,many test given human race corona ampan,"[many, test, given, human, race, corona, ampan]",7
1262754528979345408,latest release indian meteorological departmen...,"[latest, release, indian, meteorological, depa...",12
1262746620736598016,visit principal secretary govtcommerce amptran...,"[visit, principal, secretary, govtcommerce, am...",18


## Building Bigram and Trigram models

In [7]:
stop_words = stopwords.words('english')
stop_words.extend(['from','not', 'would', 'say', 'could', '_', 'be', 'go', 'do', 'rather', 'seem', 'due', 'via', 'done', 'said'])

tweets_list = df_tweets.full_text_tokens.to_list()
tweet_ids = df_tweets.index.to_list()

# Build the bigram and trigram models

bigram = Phrases(tweets_list, min_count=10, threshold=100) # higher threshold fewer phrases.
trigram = Phrases(bigram[tweets_list], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [8]:
tweets = [[word for word in gensim.utils.simple_preprocess(str(tweet))] for tweet in tweets_list]
tweets = [bigram_mod[tweet] for tweet in tweets]
tweets = [trigram_mod[bigram_mod[tweet]] for tweet in tweets]

## Building the Topic Model

In [9]:
# Create Dictionary
tweets_dict = corpora.Dictionary(tweets)

# Filtering extremes by removing tokens occuring in less than 10 tweets and have occured in more than 90% tweets
tweets_dict.filter_extremes(no_below=10, no_above=0.9)

# Create Corpus: Term Document Frequency
corpus = [tweets_dict.doc2bow(twt) for twt in tweets]

# Adding the TF-IDF for better insight 
tfidf = gensim.models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

## Based on Hyperparameter optimization - trying 2 approaches:
- Topics = 6, Alpha = 0.01
- Topics = 10, Alpha = 1

In [10]:
# LDA Model Parameters

NUM_TOPICS_1 = 10
ALPHA_1 = 1
NUM_TOPICS_2 = 6
ALPHA_2 = 0.01

In [11]:
def lda_model_build(corpus, dictionary, topics, alpha, texts):
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=topics, 
                                            random_state=100,
                                            passes=10,
                                            alpha=alpha,
                                            per_word_topics=True)
    
    print("\nModel, Topics=",topics)
    pprint(lda_model.print_topics())
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    return lda_model, coherence_lda

In [12]:
# Build first model - Topics=10
lda_model_1, score_1 = lda_model_build(corpus=tfidf_corpus, dictionary=tweets_dict, topics=NUM_TOPICS_1, alpha=ALPHA_1, texts=tweets)

# Build first model - Topics=6
lda_model_2, score_2 = lda_model_build(corpus=tfidf_corpus, dictionary=tweets_dict, topics=NUM_TOPICS_2, alpha=ALPHA_2, texts=tweets)


Model, Topics= 10
[(0,
  '0.068*"bengal" + 0.065*"west" + 0.040*"pm" + 0.040*"modi" + 0.032*"odisha" '
  '+ 0.028*"relief" + 0.027*"crore" + 0.017*"lakh" + 0.016*"say" + '
  '0.016*"visit"'),
 (1,
  '0.025*"nisarga" + 0.016*"like" + 0.015*"mumbai" + 0.014*"nisarg" + '
  '0.012*"come" + 0.011*"earthquake" + 0.010*"nature" + 0.010*"much" + '
  '0.009*"know" + 0.009*"hope"'),
 (2,
  '0.050*"storm" + 0.045*"hit" + 0.038*"super" + 0.028*"may" + 0.024*"coast" + '
  '0.021*"wind" + 0.015*"hour" + 0.014*"speed" + 0.012*"cyclonic" + '
  '0.012*"strong"'),
 (3,
  '0.083*"india" + 0.063*"bangladesh" + 0.042*"cyclone" + 0.033*"news" + '
  '0.033*"via" + 0.031*"amphan" + 0.028*"kolkata" + 0.025*"dead" + '
  '0.024*"million" + 0.020*"least"'),
 (4,
  '0.026*"make" + 0.025*"landfall" + 0.024*"time" + 0.024*"hurricane" + '
  '0.023*"today" + 0.021*"killed" + 0.019*"many" + 0.018*"tree" + '
  '0.017*"house" + 0.016*"two"'),
 (5,
  '0.022*"day" + 0.017*"please" + 0.013*"still" + 0.012*"get" + 0.011*"wa

In [13]:
## Coherence Scores

print("Model 1 - Topics = 10, Score =",score_1)
print("Model 2 - Topics = 6, Score =",score_2)

Model 1 - Topics = 10, Score = 0.46908414489685174


## Further analysis on Model 1 (Topics = 10)
Addressing certain questions and extracting more information out of the topics

## Extracting Dominant Topic for each tweet and its percentage contribution

In [76]:
def get_dominant_topic(lda_model, corpus):
    tweet_topics = []
    tweet_topics_percent = []
    for tweet in tfidf_corpus:
        topics_dist = lda_model.get_document_topics(tweet)
        dom_topic, percent = max(topics_dist, key=lambda item:item[1])
        tweet_topics.append(dom_topic)
        tweet_topics_percent.append(percent)
    return tweet_topics, tweet_topics_percent

In [77]:
tweet_topics, tweet_topics_percent = get_dominant_topic(lda_model_1, tfidf_corpus) ## Storing the topic assignments for each tweet

In [78]:
tweet_topics_df = pd.DataFrame(list(zip(tweets, tweet_topics, tweet_topics_percent)), columns=['Tokenized Tweet', 'Topic', 'Percentage Contribution'], index=tweet_ids)

In [79]:
tweet_topics_df.head()

Unnamed: 0,Tokenized Tweet,Topic,Percentage Contribution
1262754556456271872,"[proactive, plan, pm, narendra, modi, take_sto...",4,0.175766
1262754552505184256,"[odisha, bengal, ha, amphan, last, day]",4,0.13714
1262754542589861888,"[many, test, given, human_race, corona, ampan]",6,0.190081
1262754528979345408,"[latest, release, indian, meteorological_depar...",6,0.141426
1262746620736598016,"[visit, principal_secretary, govtcommerce, amp...",4,0.145735


## Number of Tweets for Each Topic

In [100]:
tweet_topics_df.groupby('Topic')['Tokenized Tweet'].agg('count')

Topic
0    10112
1    12243
2    11862
3    10066
4     7967
5    11603
6     8683
7     8207
8     7198
9     7396
Name: Tokenized Tweet, dtype: int64

## Visualize the topics


In [23]:
pyLDAvis.enable_notebook()
LDAvis_prepared_1 = pyLDAvis.gensim.prepare(lda_model_1, tfidf_corpus, tweets_dict)
LDAvis_prepared_2 = pyLDAvis.gensim.prepare(lda_model_2, tfidf_corpus, tweets_dict)

In [24]:
## Saving the HTML
pyLDAvis.save_html(LDAvis_prepared_1, '../reports/figures/LDA_topic_10.html')
pyLDAvis.save_html(LDAvis_prepared_2, '../reports/figures/LDA_topic_6.html')