### Loading necessary packages

In [8]:
pip install emot



In [9]:
import warnings
warnings.filterwarnings('ignore')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, words
from emot.emo_unicode import UNICODE_EMOJI
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS,WordCloud
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
import pandas as pd
from pprint import pprint
import string
import os
import re

In [10]:
#data loader class
class DataLoader:
  def __init__(self, file_name):
    self.file_name = file_name
    
 
  def read_csv(self):
    tweets_df=pd.read_csv(self.file_name)
    return tweets_df

In [11]:
#object creation
DataLoader_obj= DataLoader('/content/clean_processed_tweet_data.csv')

**Removing NULL valued raws** 

In [12]:
tweets_df=DataLoader_obj.read_csv()
tweets_df.dropna()

Unnamed: 0.1,Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
156,257,2022-04-22 17:14:49+00:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",habs guy lafleurs statue beside bell centre st...,0.000,0.10,en,1518,120,gfliverpool9966,1295,1507,False,Habs,JohnLuTSNMtl,Ashamed to be in Canada
176,281,2022-04-22 12:05:39+00:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",please enjoy affordablehousing cartoon todays,0.400,0.50,en,304,45,gfliverpool9966,1295,1507,False,affordablehousing,"TheoMoudakis, TorontoStar",Ashamed to be in Canada
240,404,2022-04-16 22:44:20+00:00,"<a href=""http://twitter.com/download/android"" ...",happy easter friend cheers,0.800,1.00,en,0,0,Cannonballs17,125,300,False,cheers,BZeit72,Planet 🌎
248,413,2022-04-03 01:10:45+00:00,"<a href=""http://twitter.com/download/android"" ...",hope like burbon,0.000,0.00,en,0,0,Cannonballs17,125,300,False,Burbon,lisamarinass,Planet 🌎
249,414,2022-04-03 00:33:01+00:00,"<a href=""http://twitter.com/download/android"" ...",mitchcers old fashioned infused cherry smoke w...,0.125,0.20,en,0,0,Cannonballs17,125,300,False,"burbon, possibilites",petenajarian,Planet 🌎
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16160,24062,2022-04-22 18:14:55+00:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",read daily mirror today dailymirror dailymirro...,0.000,0.00,en,10,5,steve4you4ever,491,4997,False,"DailyMirror, DailyMirrorOnline",Dailymirror_SL,"Colombo,srilanka"
16162,24064,2022-04-22 18:13:28+00:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",read daily mirror today dailymirror dailymirro...,0.000,0.00,en,13,4,steve4you4ever,491,4997,False,"DailyMirror, DailyMirrorOnline",Dailymirror_SL,"Colombo,srilanka"
16175,24078,2022-04-22 05:57:39+00:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",happy birthday nifty meri rozi roti aaj mst re...,0.800,1.00,en,141,9,NiveshValue,55,8,False,nifty,Bhai1Its,"Gurgaon, Haryana"
16228,24184,2022-04-20 23:47:58+00:00,"<a href=""http://twitter.com/download/android"" ...",india made people india,0.000,0.00,en,154,97,yashin143,4733,4711,False,India,_tanveersheikh,"Mumbai, INDIA"


In [13]:
len(tweets_df)

16386

In [14]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
class PrepareData:
  def __init__(self,df):
    self.df=df
    
  def preprocess_data(self):
    tweets_df = self.df.loc[self.df['lang'] =="en"]
    tweet_tokens = tweets_df['original_text']
      
    #text Preprocessing
    tweet_tokens = tweet_tokens.astype(str)
    tweet_tokens = tweet_tokens.apply(lambda x: x.lower())
    tweet_tokens = tweet_tokens.apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))   
    
    #Converting tweets to list of words For feature engineering
    sentence_list = [tweet for tweet in tweet_tokens]
    word_list = [sent.split() for sent in sentence_list]

    # Remove stopwords
    stop_words = list(stopwords.words('english'))
    emojis = list(UNICODE_EMOJI.keys())  # full list of emojis
    
    filtered_words = [w for w in word_list if w not in stop_words]
    filtered_words = [w for w in filtered_words if w not in emojis]
    filtered_words = [w for w in filtered_words if w in word_list]

    #Create dictionary which contains Id and word 
    word_to_id = corpora.Dictionary(filtered_words)
    corpus_1= [word_to_id.doc2bow(tweet) for tweet in filtered_words]

    return word_list, word_to_id, corpus_1

In [16]:
PrepareData_obj=PrepareData(tweets_df)
word_list ,id2word,corpus=PrepareData_obj.preprocess_data()

**Topic Modeling with LDA**

In [17]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [18]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('india', 0.02410113),
   ('follow', 0.023667822),
   ('read', 0.012196038),
   ('levels', 0.0105431415),
   ('power', 0.008893978),
   ('country', 0.007957049),
   ('army', 0.0073709213),
   ('never', 0.0064901584),
   ('state', 0.006002952),
   ('tomorrow', 0.005672645)]),
 (1,
  [('world', 0.011013914),
   ('minister', 0.008732659),
   ('make', 0.008318668),
   ('even', 0.007939606),
   ('chart', 0.006850983),
   ('still', 0.0066300933),
   ('president', 0.006059221),
   ('best', 0.0059319297),
   ('think', 0.005394838),
   ('national', 0.005372991)]),
 (2,
  [('today', 0.012290991),
   ('us', 0.011778534),
   ('day', 0.010885267),
   ('new', 0.010514958),
   ('details', 0.00979972),
   ('online', 0.009597191),
   ('know', 0.008708531),
   ('sec', 0.008075538),
   ('profile', 0.007899926),
   ('years', 0.0067606866)]),
 (3,
  [('people', 0.014751991),
   ('sri', 0.010106131),
   ('please', 0.009915105),
   ('need', 0.006965267),
   ('dont', 0.006792071),
   ('government', 0.

In [19]:
# Compute Perplexity

#It's a measure of how good the model is. The lower the better. Perplexity is a negative value
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
doc_lda = lda_model[corpus]


# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\n Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)


Perplexity:  -10.404027527499373

 Ldamodel Coherence Score/Accuracy on Tweets:  0.5018444123752539


**Anlayizing Results**

In [21]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
LDAvis_prepared

  from collections import Iterable
