### Loading necessary packages

In [2]:
import warnings
warnings.filterwarnings('ignore')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, words
from emot.emo_unicode import UNICODE_EMOJI
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS,WordCloud
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
import pandas as pd
from pprint import pprint
import string
import os
import re

In [3]:
#data loader class
class DataLoader:
  def __init__(self, file_name):
    self.file_name = file_name
    
 
  def read_csv(self):
    tweets_df=pd.read_csv(self.file_name)
    return tweets_df

In [4]:
#object creation
DataLoader_obj= DataLoader('/content/clean_processed_tweet_data.csv')

**Removing NULL valued raws** 

In [5]:
tweets_df=DataLoader_obj.read_csv()
tweets_df.dropna()

Unnamed: 0.1,Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
157,257,2022-04-22 17:14:49+00:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",RT @JohnLuTSNMtl: #Habs Guy Lafleur’s statue b...,0.000,0.10,en,1518,120,gfliverpool9966,1295,1507,False,Habs,JohnLuTSNMtl,Ashamed to be in Canada
177,281,2022-04-22 12:05:39+00:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",RT @TheoMoudakis: Please enjoy my #affordableh...,0.400,0.50,en,304,45,gfliverpool9966,1295,1507,False,affordablehousing,"TheoMoudakis, TorontoStar",Ashamed to be in Canada
241,404,2022-04-16 22:44:20+00:00,"<a href=""http://twitter.com/download/android"" ...",@BZeit72 \nHappy Easter my friend.\n#cheers ht...,0.800,1.00,en,0,0,Cannonballs17,125,300,False,cheers,BZeit72,Planet 🌎
249,413,2022-04-03 01:10:45+00:00,"<a href=""http://twitter.com/download/android"" ...",@lisamarinass Hope you had one like this !\n#B...,0.000,0.00,en,0,0,Cannonballs17,125,300,False,Burbon,lisamarinass,Planet 🌎
250,414,2022-04-03 00:33:01+00:00,"<a href=""http://twitter.com/download/android"" ...",@petenajarian Mitchcer's\nOld Fashioned infuse...,0.125,0.20,en,0,0,Cannonballs17,125,300,False,"burbon, possibilites",petenajarian,Planet 🌎
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16246,24062,2022-04-22 18:14:55+00:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",RT @Dailymirror_SL: Read the Daily Mirror toda...,0.000,0.00,en,10,5,steve4you4ever,491,4997,False,"DailyMirror, DailyMirrorOnline",Dailymirror_SL,"Colombo,srilanka"
16248,24064,2022-04-22 18:13:28+00:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",RT @Dailymirror_SL: Read the Daily Mirror toda...,0.000,0.00,en,13,4,steve4you4ever,491,4997,False,"DailyMirror, DailyMirrorOnline",Dailymirror_SL,"Colombo,srilanka"
16261,24078,2022-04-22 05:57:39+00:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @Bhai1Its: Happy birthday #nifty ....\nMeri...,0.800,1.00,en,141,9,NiveshValue,55,8,False,nifty,Bhai1Its,"Gurgaon, Haryana"
16314,24184,2022-04-20 23:47:58+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @_tanveersheikh: My India is made of these ...,0.000,0.00,en,154,97,yashin143,4733,4711,False,India,_tanveersheikh,"Mumbai, INDIA"


In [6]:
len(tweets_df)

16472

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
class PrepareData:
  def __init__(self,df):
    self.df=df
    
  def preprocess_data(self):
    tweets_df = self.df.loc[self.df['lang'] =="en"]
    tweet_tokens = tweets_df['original_text']
      
    #text Preprocessing
    tweet_tokens = tweet_tokens.astype(str)
    tweet_tokens = tweet_tokens.apply(lambda x: x.lower())
    tweet_tokens = tweet_tokens.apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))   
    
    #Converting tweets to list of words For feature engineering
    sentence_list = [tweet for tweet in tweet_tokens]
    word_list = [sent.split() for sent in sentence_list]

    # Remove stopwords
    stop_words = list(stopwords.words('english'))
    emojis = list(UNICODE_EMOJI.keys())  # full list of emojis
    
    filtered_words = [w for w in word_list if w not in stop_words]
    filtered_words = [w for w in filtered_words if w not in emojis]
    filtered_words = [w for w in filtered_words if w in word_list]

    #Create dictionary which contains Id and word 
    word_to_id = corpora.Dictionary(filtered_words)
    corpus_1= [word_to_id.doc2bow(tweet) for tweet in filtered_words]

    return word_list, word_to_id, corpus_1

In [11]:
PrepareData_obj=PrepareData(tweets_df)
word_list ,id2word,corpus=PrepareData_obj.preprocess_data()

**Topic Modeling with LDA**

In [12]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [13]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('you', 0.026271263),
   ('are', 0.022953043),
   ('not', 0.016795922),
   ('more', 0.011256309),
   ('what', 0.010357337),
   ('we', 0.010082784),
   ('they', 0.009942557),
   ('your', 0.008879352),
   ('go', 0.0066863354),
   ('do', 0.0064603896)]),
 (1,
  [('follow', 0.022955721),
   ('back', 0.014551646),
   ('know', 0.009248403),
   ('me', 0.0076309047),
   ('even', 0.0073262826),
   ('2', 0.0067427834),
   ('like', 0.0067103654),
   ('while', 0.0065391697),
   ('\U0001f7e9\U0001f7e9\U0001f7e9\U0001f7e9\U0001f7e9', 0.0059472397),
   ('wordle', 0.0053625084)]),
 (2,
  [('the', 0.05968852),
   ('rt', 0.050841812),
   ('to', 0.039525043),
   ('of', 0.031793933),
   ('in', 0.028648676),
   ('a', 0.025112996),
   ('and', 0.021748504),
   ('is', 0.021271637),
   ('for', 0.01927146),
   ('on', 0.015722204)]),
 (3,
  [('amp', 0.01951048),
   ('sri', 0.008832757),
   ('so', 0.008189451),
   ('power', 0.0070334054),
   ('why', 0.0063769906),
   ('says', 0.005877724),
   ('pm', 0.005

In [14]:
# Compute Perplexity

#It's a measure of how good the model is. The lower the better. Perplexity is a negative value
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
doc_lda = lda_model[corpus]


# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\n Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)


Perplexity:  -9.476325902737367

 Ldamodel Coherence Score/Accuracy on Tweets:  0.3858758426389203


**Anlayizing Results**

In [17]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
LDAvis_prepared

  from collections import Iterable
