# Importing The Libraries

In [316]:
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis.gensim_models as gensimvis
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
import time
nlp = spacy.load('en_core_web_lg')
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# Reading And Preprocessing The DataSet

In [359]:
raw_data = pd.read_csv('MTurk-Labels-20201022.csv')
tweet_data = raw_data[['Input.text']].copy()
tweet_data = tweet_data.rename(columns={'Input.text': 'raw_tweets'})
tweet_data = tweet_data.drop_duplicates(subset = ["raw_tweets"])
tweet_data.head()

Unnamed: 0,raw_tweets
0,Climate Lite personified. Hayhoe is a total fa...
2,"last I check, thanks to the ChinaVirus, ther..."
4,"He presented what you asked for, but it's no..."
6,"Wow, that is shockingly stupid"
9,"I feel like ""ok doomer"" is the appropriate res..."


#### Since we have duplicate rows 

# Data Cleaning and Preprocessing

In [360]:
"""
Remove the Emoticons and Url from the tweets
"""
def give_emoji_url_free_text(text):
    #Removing Emoji
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    #Removing URL
    clean_text = re.sub(r'http\S+', '', clean_text)
    return clean_text

In [361]:
"""
Create the StopWords list from Gensim, NLTK, and Custom Stopwords
"""
def create_stopwords():
    # Provided in the File
    with open("stopwords.txt") as f:
        custom_stopwords = f.read().splitlines() 
        
    more_custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
    all_stopwords = nlp.Defaults.stop_words.union(custom_stopwords).union(SW).union(stopwords).union(more_custom_stopwords)
    return all_stopwords

In [362]:
"""
Creating the tokens, and removing stopwords
and punctuations
"""
def tokenization():
    tokenizer = Tokenizer(nlp.vocab)
    all_stopwords = create_stopwords()
    tokens = []
    for doc in tokenizer.pipe(tweet_data['url_emoji_url_free_tweets'], batch_size=500):
        doc_tokens = []    
        for token in doc: 
            if token.text.lower() not in all_stopwords and token.text.lower() not in string.punctuation:
                doc_tokens.append(token.text.lower())   
        tokens.append(doc_tokens)
    # Makes tokens column
    return tokens


In [363]:
"""
Applying the lemmatization
"""
def lemmatization(text):
    lemmas = []
    doc = nlp(text)
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ not in 'PRON'): # ["NOUN","ADJ","VERB","ADV"]
            lemmas.append(token.lemma_)
    return lemmas

In [364]:
"""
Data Cleaning 
"""
def data_cleaning():
    tweet_data['url_emoji_url_free_tweets'] = tweet_data['raw_tweets'].apply(lambda x: give_emoji_url_free_text(x))
    tweet_data['tokens'] = tokenization()
    tweet_data['tokens_back_to_text'] = [' '.join(map(str, l)) for l in tweet_data['tokens']]
    tweet_data['lemma_tokens'] = tweet_data['tokens_back_to_text'].apply(lemmatization)
    tweet_data['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in tweet_data['lemma_tokens']]
    tweet_data.head()
data_cleaning()
tweet_data.head()

Unnamed: 0,raw_tweets,url_emoji_url_free_tweets,tokens,tokens_back_to_text,lemma_tokens,lemmas_back_to_text
0,Climate Lite personified. Hayhoe is a total fa...,Climate Lite personified. Hayhoe is a total fa...,"[climate, lite, personified., hayhoe, total, f...",climate lite personified. hayhoe total fake. d...,"[climate, lite, personify, hayhoe, total, fake...",climate lite personify hayhoe total fake deep ...
2,"last I check, thanks to the ChinaVirus, ther...","last I check, thanks to the ChinaVirus, there'...","[check,, chinavirus,, lot, carbon, atmosphere....","check, chinavirus, lot carbon atmosphere...or ...","[check, chinavirus, lot, carbon, atmosphere, w...",check chinavirus lot carbon atmosphere watch news
4,"He presented what you asked for, but it's no...","He presented what you asked for, but it's not ...","[presented, asked, for,, good, you,, ""almighty...","presented asked for, good you, ""almighty"" agw-...","[present, ask, good, almighty, agw, alarmist, ...",present ask good almighty agw alarmist truth u...
6,"Wow, that is shockingly stupid","Wow, that is shockingly stupid","[wow,, shockingly, stupid]","wow, shockingly stupid","[wow, shockingly, stupid]",wow shockingly stupid
9,"I feel like ""ok doomer"" is the appropriate res...","I feel like ""ok doomer"" is the appropriate res...","[feel, ""ok, doomer"", response]","feel ""ok doomer"" response","[feel, ok, doomer, response]",feel ok doomer response


# Creating the Bigram and Trigrams

Hyperparameters
min_count = 3
threshold=10

In [365]:
data_words = tweet_data['lemma_tokens'].tolist()
#BIGRAMS AND TRIGRAMS
bigram_phrases = gensim.models.Phrases(data_words, min_count=3, threshold=10)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=10)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

print (data_bigrams_trigrams[0][0:20])

['climate', 'lite', 'personify', 'hayhoe', 'total', 'fake', 'deep', 'swamp', 'elevate', 'celebrity', 'status', 'claim', 'extreme', 'weather', 'event', 'increase', 'ipcc']


In [366]:
id2word = Dictionary(data_bigrams_trigrams)
print(len(id2word))
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))
corpus = [id2word.doc2bow(d) for d in data_bigrams_trigrams]

1330
343


### Corpus is ready

# Grid Search

In [367]:
lemmas_df = tweet_data['lemmas_back_to_text']
print(type(lemmas_df[0]))


<class 'str'>


In [368]:
vectorizer = CountVectorizer(ngram_range = (1,3))
data_vectorized = vectorizer.fit_transform(tweet_data['lemmas_back_to_text'])

In [369]:
gs_start_time = time.time()

# Define Search Param
search_params = {'n_components': [10], 'learning_decay': [.5]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
              n_jobs=1,
             param_grid={'n_topics': [10], 
                         'learning_decay': [0.5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)

gs_end_time = time.time()


In [370]:
print(gs_end_time - gs_start_time, "seconds to finish")

1.1320929527282715 seconds to finish


In [371]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -26775.472060155735
Model Perplexity:  8005.459218766658


### The issue with grid search is it does not work with Model Coherence, it works with perplexity only


### Need to figure out how to hypertune parameters, for now done manually

# Experimenting Different Models With Different Params

In [338]:
def trainingLDAModel(corpus,id2word,num_topics,chunksize,passes):
    
    model = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=num_topics,
                       random_state=42,
                       chunksize=chunksize,
                       passes=passes)
    return model
    

In [339]:
def getCoherenceAndPerplexity(model,corpus):
    model_perplexity = model.log_perplexity(corpus)
    print('\nPerplexity: ', model_perplexity) 
    coherence_model = CoherenceModel(model=model, texts=data_bigrams_trigrams, 
                                       dictionary=id2word, coherence='c_v')
    model_coherence = coherence_model.get_coherence()
    print('\nCoherence Score: ', model_coherence)
    return model_perplexity,model_coherence

In [340]:
def getTopics(model):
    # Filtering for words 
    words = [re.findall(r'"([^"]*)"',t[1]) for t in model.print_topics()]
    # Create Topics
    topics = [' '.join(t[0:10]) for t in words]
    # Getting the topics
#     for id, t in enumerate(topics): 
#         print(f"------ Topic {id} ------")
#         print(t, end="\n\n")
    return words, topics

In [341]:
model_1_0_start_time = time.time()
model_1_0 = trainingLDAModel(corpus,id2word,5,2000,10)
model_1_0_end_time = time.time()
model_1_0_runtime = round(model_1_0_end_time - model_1_0_start_time, 2)
words_1_0,topics_1_0 =  getTopics(model_1_0)
model_1_0_perplexity,coherence_lda_model_1_0 = getCoherenceAndPerplexity(model_1_0,corpus)


Perplexity:  -6.210330137173361

Coherence Score:  0.4537877581627042


In [343]:
model_1_1_start_time = time.time()
model_1_1 = trainingLDAModel(corpus,id2word,10,2000,10)
model_1_1_end_time = time.time()
model_1_1_runtime = round(model_1_1_end_time - model_1_1_start_time, 2)
words_1_1,topics_1_1 =  getTopics(model_1_1)
model_1_1_perplexity,coherence_lda_model_1_1 = getCoherenceAndPerplexity(model_1_1,corpus)


Perplexity:  -6.492891231853672

Coherence Score:  0.4781788094022691


In [344]:
model_1_2_start_time = time.time()
model_1_2 = trainingLDAModel(corpus,id2word,15,2000,10)
model_1_2_end_time = time.time()
model_1_2_runtime = round(model_1_2_end_time - model_1_2_start_time, 2)
words_1_2,topics_1_1 =  getTopics(model_1_2)
model_1_2_perplexity,coherence_lda_model_1_2 = getCoherenceAndPerplexity(model_1_2,corpus)


Perplexity:  -6.590829690469389

Coherence Score:  0.4119105030747734


# Running LDA Models With Different Values Of Number Of Topics

In [372]:
def compute_coherence_values_topics(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word,random_state=42,chunksize=2000)
        model_list_topic.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    return model_list_topic, coherence_values_topic    

In [373]:
model_list_topic, coherence_values_topic = compute_coherence_values_topics(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=data_bigrams_trigrams,
                                                        start=2, limit=200, step=3)


In [374]:
limit=200; start=2; step=3;
x_topic = range(start, limit, step)
topic_ts = {'coherence_value': coherence_values_topic,
            'number_of_topics': x_topic}
topic_chart = pd.DataFrame(data=topic_ts)
topic_fig = px.line(topic_chart, x="number_of_topics", y="coherence_value")
topic_fig.show()

# Running LDA Models With Different Values Of Number Of Passes

In [230]:
#Defining a function to loop over iterations to find an optimal number of tipics
def compute_coherence_values_passes(dictionary,corpus,texts,start,limit,step):
    coherence_values_its = []
    model_list_its = []
    for passes in range(start, limit, step):
        model = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,# Fixed From First Iterations
                       random_state=42,
                       chunksize=2000,
                       passes=passes)
        model_list_its.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_its.append(coherencemodel.get_coherence())

    return model_list_its, coherence_values_its   

In [231]:
model_list_its, coherence_values_its = compute_coherence_values_passes(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=data_bigrams_trigrams,
                                                        start=10, limit=50, step=10)


In [234]:
#Printing First Model Time
limit=50; start=10; step=10;
x_2 = range(start, limit, step)

its_ts = {'coherence_value': coherence_values_its,
            'number_of_passes': x_2}

its_track_sheet = pd.DataFrame(data=its_ts)

its_fig = px.line(its_track_sheet, x="number_of_passes", y="coherence_value")
its_fig.show()

12.95


# Running LDA Models With Different Values Of Number Of Iterations

In [242]:
#Defining a function to loop over iterations to find an optimal number of iterations
def compute_coherence_values_iterations(dictionary, corpus, texts, start,limit, step):
    coherence_values_its = []
    model_list_its = []
    for iterations in range(start, limit, step):
        model = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,# Fixed From First Iterations
                       random_state=42,
                       chunksize=2000,
                       passes=20,# Fixed From Second Iterations
                       iterations=iterations)
        model_list_its.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values_its.append(coherencemodel.get_coherence())

    return model_list_its, coherence_values_its   

In [243]:


model_list_its, coherence_values_its = compute_coherence_values_iterations(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=data_bigrams_trigrams,
                                                        start=10, limit=150, step=10)


In [245]:
limit=150; start=10; step=10;
x_2 = range(start, limit, step)

its_ts = {'coherence_value': coherence_values_its,
            'number_of_iterations': x_2}

its_track_sheet = pd.DataFrame(data=its_ts)

its_fig = px.line(its_track_sheet, x="number_of_iterations", y="coherence_value")
its_fig.show()

43.0


#  Running LDA Models With Different Values Of Decay

### Decay = 0.5

In [263]:
model_2_1 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,
                       random_state=42,
                       chunksize=2000,
                       passes=20,
                       iterations=30,
                       decay=0.5)

In [265]:
model_2_1_perplexity = model_2_1.log_perplexity(corpus)
print('\nPerplexity: ', model_2_1_perplexity) 

coherence_model_2_1 = CoherenceModel(model=model_2_1, texts=data_bigrams_trigrams, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_2_1 = coherence_model_2_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_2_1)


Perplexity:  -6.397354650849029

Coherence Score:  0.4824599923684124


In [269]:
model_4_2 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,
                       random_state=42,
                       chunksize=2000,
                       passes=20,
                       iterations=30,
                       decay=0.7)

In [271]:
model_4_2_perplexity = model_4_2.log_perplexity(corpus)
print('\nPerplexity: ', model_4_2_perplexity) 

coherence_model_4_2 = CoherenceModel(model=model_4_2, texts=data_bigrams_trigrams, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_4_2 = coherence_model_4_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_4_2)


Perplexity:  -6.397683581973082

Coherence Score:  0.5005826977541236


In [284]:
model_4_3 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,
                       random_state=42,
                       chunksize=2000,
                       passes=20,
                       iterations=30,
                       decay=0.9)

In [286]:
model_4_3_perplexity = model_4_3.log_perplexity(corpus)
print('\nPerplexity: ', model_4_3_perplexity) 

coherence_model_4_3 = CoherenceModel(model=model_4_3, texts=data_bigrams_trigrams, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_4_3 = coherence_model_4_3.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_4_3)


Perplexity:  -6.402737761538751

Coherence Score:  0.501871023626274


### We are getting the highest coherence with decay = 0.9

#  Running LDA Models With Different Values Of Alpha

In [292]:
model_3_1 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,
                       random_state=42,
                       chunksize=2000,
                       passes=20,
                       iterations=30,
                       decay=0.9,alpha = 'asymmetric')

In [294]:
model_3_1_perplexity = model_3_1.log_perplexity(corpus)
print('\nPerplexity: ', model_3_1_perplexity) 

coherence_model_3_1 = CoherenceModel(model=model_3_1, texts=data_bigrams_trigrams, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_3_1 = coherence_model_3_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_3_1)


Perplexity:  -6.350832687416609

Coherence Score:  0.4229459310150331


In [296]:
model_3_2 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,
                       random_state=42,
                       chunksize=2000,
                       passes=20,
                       iterations=30,
                       decay=0.9,alpha = 'symmetric')


In [298]:
model_3_2_perplexity = model_3_2.log_perplexity(corpus)
print('\nPerplexity: ', model_3_2_perplexity) 

coherence_model_3_2 = CoherenceModel(model=model_3_2, texts=data_bigrams_trigrams, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_3_2 = coherence_model_3_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_3_2)


Perplexity:  -6.402437916006075

Coherence Score:  0.501871023626274


### Symmetric gives better result

# Minimum Probability

In [375]:


model_4_1 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,
                       random_state=42,
                       chunksize=2000,
                       passes=20,
                       decay=0.9,
                       iterations=30,
                       alpha = "symmetric",
                       minimum_probability=0.1)




In [376]:
model_4_1_perplexity = model_4_1.log_perplexity(corpus)
print('\nPerplexity: ', model_4_1_perplexity) 

coherence_model_4_2 = CoherenceModel(model=model_4_1, texts=data_bigrams_trigrams, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_4_2 = coherence_model_4_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_4_2)


Perplexity:  -6.5119233366358795

Coherence Score:  0.5173938737057657


In [377]:
model_4_3 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=8,
                       random_state=42,
                       chunksize=2000,
                       passes=20,
                       decay=0.9,
                       iterations=30,
                       alpha = "symmetric",
                       minimum_probability=0.7)

In [378]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_4_3_perplexity = model_4_3.log_perplexity(corpus)
print('\nPerplexity: ', model_4_3_perplexity) 

# Compute Coherence Score
coherence_model_4_3 = CoherenceModel(model=model_4_3, texts=data_bigrams_trigrams, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_4_3 = coherence_model_4_3.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_4_3)


Perplexity:  -6.511739818375075

Coherence Score:  0.5197734587137116


### Minumum Probability does not show an impact

In [317]:
vis = pyLDAvis.gensim_models.prepare(model_6_3,corpus,id2word,mds="mmds",R=30) # use gensim_models instead of gensim
vis


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



In [309]:
# Filtering for words 
words_6_3 = [re.findall(r'"([^"]*)"',t[1]) for t in model_6_3.print_topics()]

# Create Topics
topics_6_3 = [' '.join(t[0:10]) for t in words_6_3]

# Getting the topics
for id, t in enumerate(topics_6_3): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

  

------ Topic 0 ------
climate model think heat science time idiot $ tweet temperature

------ Topic 1 ------
climate co2 datum history propaganda topic earth warm coal science

------ Topic 2 ------
year co2 degree fact ice warm big tax good invest

------ Topic 3 ------
co2 point history forget read talk meet paper f stupid

------ Topic 4 ------
climate change scientist gerald year emission stop fuck time global

------ Topic 5 ------
climate good model scientist year build support narrative end sign

------ Topic 6 ------
climate carbon change warming global fact model science scientist long

------ Topic 7 ------
science co2 climate proof datum change emission answer challenge brain



https://towardsdatascience.com/twitter-topic-modeling-e0e3315b12e2

https://www.youtube.com/watch?v=UkmIljRIG_M

In [379]:
!pip3 freeze > requirements.txt
