#SoMe Topic Modeling Notebook | Release canvas 1 📖

## Installations and Libraries 💽

In [44]:
# Installations
import sys
if 'google.colab' in sys.modules:
    !pip install emoji --upgrade
    !pip install pandas-profiling==2.*
    !pip install plotly==4.*
    !python -m spacy download en_core_web_lg
    !pip install pyldavis
    !pip install gensim
    #!pip install --upgrade autopep8

Requirement already up-to-date: emoji in /usr/local/lib/python3.6/dist-packages (0.5.4)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
# Required Libraries

#Base and Cleaning 
import json
import requests
import pandas as pd
import numpy as np
import emoji
import regex
import re
import string
from collections import Counter
import time 

#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib as plt 
import pyLDAvis.gensim

#Natural Language Processing (NLP)
import spacy
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)

##Data Cleaning 🧹

In [46]:
# Loading the JSON file 
url_elon = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/elonmusk_followers_english.json'
url_dutchbros = 'https://raw.githubusercontent.com/Lambda-School-Labs/social-media-strategy-ds/feature/topic/python_notebooks/dutchbros_followers.json'

df = requests.get(url_elon).json()

# Converting the dataset to pandas DataFrame and renaming the columns 
df = pd.DataFrame(df.values())
df = df.rename(columns={0:'original_tweets'})

#Removing emojies from text
#Refrence 1 : https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
#Refrence 2 : https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)

#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

df

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets
0,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend https://t.co...,This kid will forever be a legend
1,"If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!
3,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam,@Bhuvan_Bam
4,I'm not crying you're crying.\nhttps://t.co/Bc...,I'm not crying you're crying. https://t.co/BcF...,I'm not crying you're crying.
...,...,...,...
9941,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...
9942,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...
9943,"@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu..."
9944,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...


##Topic Modeling ㊙️

###Tokenizing 🕵🏻‍♂

In [0]:
# Load spacy
# Make sure to restart the runtime after running installations and libraries tab
nlp = spacy.load('en_core_web_lg')


In [48]:
"""
Import Gensim and Wordcloud to use their stopwords as well and use the combined stopwords of ALL as the variable:
ALL_STOP_WORDS
"""
# Timing Start
program_start_time = time.time()

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)


# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&amp;', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']

# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)

# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)


tokens = []

for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
    doc_tokens = []    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())   
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

# Timing End
program_end_time = time.time()

# View df
df

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets,tokens
0,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend https://t.co...,This kid will forever be a legend,"[kid, forever, legend]"
1,"If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","[truly, believe, lebrons, mindset,, competitiv..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,"[buttlicker!, prices, lower!!!]"
3,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam,@Bhuvan_Bam,[@bhuvan_bam]
4,I'm not crying you're crying.\nhttps://t.co/Bc...,I'm not crying you're crying. https://t.co/BcF...,I'm not crying you're crying.,"[crying, you're, crying.]"
...,...,...,...,...
9941,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,"[@mirandasleeper, offerings, f-3, closed, prev..."
9942,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,"[defining, segment, “the, dance.”]"
9943,"@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","[@frank_miskelly, don’t, it,, love, it!, think..."
9944,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,"[excited, brother, @shufly10, embarks, new, jo..."


In [49]:
# See how long it took
print(program_end_time - program_start_time, "seconds to finish")

1.7005128860473633 seconds to finish


###Lemmatization🇬🇧

In [0]:
# Refrence 4 : https://stackoverflow.com/questions/45306988/column-of-lists-convert-list-to-string-as-a-new-column

# Timing Start
program_start_time = time.time()

# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []
    
    doc = nlp(text)
    
    # Something goes here :P
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    
    return lemmas

df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
# df[['original_tweet', 'lemmas_back_to_text']]

# Timing End
program_end_time = time.time()


In [51]:
#Printing Lemmetization Time
print(program_end_time - program_start_time, "seconds to finish")

76.06071639060974 seconds to finish


In [52]:
# Timing Start
program_start_time = time.time()

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"
    
    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it
    
    return tokens

# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

# Timing End
program_end_time = time.time()

# View those tokens (the 4th column)
df


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escape sequence \$


invalid escape sequence \w


invalid escap

Unnamed: 0,original_tweets,emoji_free_tweets,url_free_tweets,tokens,tokens_back_to_text,lemmas,lemmas_back_to_text,lemma_tokens
0,This kid will forever be a legend 😂 https://t....,This kid will forever be a legend https://t.co...,This kid will forever be a legend,"[kid, forever, legend]",kid forever legend,"[kid, forever, legend]",kid forever legend,"[kid, forever, legend]"
1,"If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","If you truly believe Lebrons mindset, competit...","[truly, believe, lebrons, mindset,, competitiv...","truly believe lebrons mindset, competitive fir...","[truly, believe, lebrons, mindset, competitive...",truly believe lebrons mindset competitive fire...,"[truly, believe, lebrons, mindset, competitive..."
2,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,BUTTLICKER! OUR PRICES HAVE NEVER BEEN LOWER!!!,"[buttlicker!, prices, lower!!!]",buttlicker! prices lower!!!,"[buttlicker, price, lower]",buttlicker price lower,"[buttlicker, price, lower]"
3,@Bhuvan_Bam ❤️❤️,@Bhuvan_Bam,@Bhuvan_Bam,[@bhuvan_bam],@bhuvan_bam,[@bhuvan_bam],@bhuvan_bam,[bhuvan_bam]
4,I'm not crying you're crying.\nhttps://t.co/Bc...,I'm not crying you're crying. https://t.co/BcF...,I'm not crying you're crying.,"[crying, you're, crying.]",crying you're crying.,"[cry, cry]",cry cry,"[cry, cry]"
...,...,...,...,...,...,...,...,...
9941,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,@MirandaSleeper The offerings in that f-3 are ...,"[@mirandasleeper, offerings, f-3, closed, prev...",@mirandasleeper offerings f-3 closed previous ...,"[@mirandasleeper, offering, f-3, close, previo...",@mirandasleeper offering f-3 close previous of...,"[mirandasleeper, offering, f-3, close, previou..."
9942,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,This will be the defining segment of “The Last...,"[defining, segment, “the, dance.”]",defining segment “the dance.”,"[define, segment, dance]",define segment dance,"[define, segment, dance]"
9943,"@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","@frank_miskelly I don’t like it, I LOVE IT! Bu...","[@frank_miskelly, don’t, it,, love, it!, think...","@frank_miskelly don’t it, love it! think sixth...","[@frank_miskelly, love, think, sixth, old, run...",@frank_miskelly love think sixth old run middl...,"[frank_miskelly, love, think, sixth, old, run,..."
9944,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,Excited for my brother @Shufly10 as he embarks...,"[excited, brother, @shufly10, embarks, new, jo...","excited brother @shufly10 embarks new journey,...","[excited, brother, @shufly10, embark, new, jou...",excited brother @shufly10 embark new journey o...,"[excited, brother, shufly10, embark, new, jour..."


In [53]:
#Printing Tokenization Time
print(program_end_time - program_start_time, "seconds to finish")


0.3407933712005615 seconds to finish


###id2word 📒

In [54]:
# Create a id2word dictionary
id2word = Dictionary(df['lemma_tokens'])
print(len(id2word))

21883


In [55]:
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
print(len(id2word))

8229


###Corpus Object & Generating Base Model Topics 📚

In [0]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

####Base Model

In [57]:
# Timing Start
base_model_program_start_time = time.time()

# Instantiating a LDA model 
model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)

# Timing End
base_model_program_end_time = time.time()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th

In [58]:
#Printing First Model Time
base_model_runtime = round(base_model_program_end_time - base_model_program_start_time, 2)
print(base_model_runtime)


25.250271320343018


In [0]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in model.print_topics()]


In [0]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]


In [61]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
people trump president need realdonaldtrump come say good work help

------ Topic 1 ------
know day new love happy good look try biden year

------ Topic 2 ------
time follow good retweet people know giveaway work big thank

------ Topic 3 ------
time think go people bitcoin let come today state new

------ Topic 4 ------
obamagate obama day think death tweet 3 say year rt



In [62]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 

# Compute Coherence Score
coherence_model = CoherenceModel(model=model, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th


Perplexity:  -8.453721220210504

Coherence Score:  0.17176779993227603


#### Base Model Topic Distance Visualization 📈

In [63]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model, corpus, id2word)

###Hyperparameter Tuning 🔧

####Enhanced Model 1 (number of topics) 🦾


In [64]:
# Let's start with parameter tuning for the LDA model and,
# Increase number of topics to be used later to get to 
# Optimal number of distinct topics

# Define chunksize and passes

# Timing Start
model_1_program_start_time = time.time()

model_1 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=20,
                       random_state=42,
                       chunksize=2000,
                       passes=10)

# Timing End
model_1_program_end_time = time.time()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th

In [65]:
#Printing First Model Time
model_1_runtime = round(model_1_program_end_time - model_1_program_start_time, 2)
print(model_1_runtime)


36.3963987827301


In [66]:
# Filtering for words 
words_1 = [re.findall(r'"([^"]*)"',t[1]) for t in model_1.print_topics()]

# Create Topics
topics_1 = [' '.join(t[0:10]) for t in words_1]

# Getting the topics
for id, t in enumerate(topics_1): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
good love world friend week bad stay thank market card

------ Topic 1 ------
people right stop check go leave government pay give understand

------ Topic 2 ------
year money old feel school bring real 50 tomorrow problem

------ Topic 3 ------
look power believe americans kind = result virus bill dog

------ Topic 4 ------
need come video join house launch youtube live tag player

------ Topic 5 ------
way ktov crossover girl rise mean soon car pr come

------ Topic 6 ------
today 4 wait word night drop share de think little

------ Topic 7 ------
bitcoin state crypto care blah hope gift national response plan

------ Topic 8 ------
happen person find health face god lead learn truth say

------ Topic 9 ------
work help send woman long story allow twitter support weekend

------ Topic 10 ------
let time start end + high talk 24 low listen

------ Topic 11 ------
think new know guy case home death die country coronavirus

------ Topic 12 ------
hour rt free 5 oba

In [67]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_1_perplexity = model_1.log_perplexity(corpus)
print('\nPerplexity: ', model_1_perplexity) 

# Compute Coherence Score
coherence_model_1 = CoherenceModel(model=model_1, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_1 = coherence_model_1.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_1)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th


Perplexity:  -9.742035949658115

Coherence Score:  0.4138281791570699


In [68]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_1, corpus, id2word)

####Model 2 (Increase Passes and define iterations)

In [69]:
#Change passes to asses if it'll improve coherence score
#passes = 50
model_2_program_start_time = time.time()

model_2 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=20,
                       random_state=42,
                       chunksize=2000,
                       passes=50,
                       iterations=100)

model_2_program_end_time = time.time()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th

In [70]:
model_2_runtime = round(model_2_program_end_time - model_2_program_start_time, 2)
print(model_2_runtime, "seconds to finish")


179.17814183235168 seconds to finish


In [71]:
# Filtering for words 
words_2 = [re.findall(r'"([^"]*)"',t[1]) for t in model_2.print_topics()]

# Create Topics
topics_2 = [' '.join(t[0:10]) for t in words_2]

# Getting the topics
for id, t in enumerate(topics_2): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")


------ Topic 0 ------
good love thank friend week bad stay market card x

------ Topic 1 ------
people right stop check go government leave give pay big

------ Topic 2 ------
year money old covid-19 feel school real bring project 50

------ Topic 3 ------
join power say americans ago kind = virus bill result

------ Topic 4 ------
need realdonaldtrump video buy house youtube tag player reach actually

------ Topic 5 ------
come long girl mean rise soon car better pr cause

------ Topic 6 ------
today world 4 wait ktov business word share drop night

------ Topic 7 ------
bitcoin crypto care hope national gift response plan trump important

------ Topic 8 ------
happen person health god face lead time glennjacobstn truth point

------ Topic 9 ------
help work woman post twitter story allow support weekend write

------ Topic 10 ------
let time start way end + talk 24 1 7

------ Topic 11 ------
think know new guy case home death coronavirus die country

------ Topic 12 ------
hour rt l

In [72]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_2_perplexity = model_2.log_perplexity(corpus)
print('\nPerplexity: ', model_2_perplexity) 

# Compute Coherence Score
coherence_model_2 = CoherenceModel(model=model_2, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_2 = coherence_model_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_2)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th


Perplexity:  -9.475841208321867

Coherence Score:  0.4133494959991747


In [73]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_2, corpus, id2word)


####Model 3 (Minimum probability)

In [93]:
model_3_program_start_time = time.time()

model_3 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=20,
                       random_state=42,
                       chunksize=2000,
                       passes=50,
                       minimum_probability=0.2, 
                       iterations=100)

model_3_program_end_time = time.time()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th

In [0]:
model_3_runtime = round(model_3_program_end_time - model_3_program_start_time, 2)
print(model_3_runtime, "seconds to finish")


In [76]:
# Filtering for words 
words_2 = [re.findall(r'"([^"]*)"',t[1]) for t in model_2.print_topics()]

# Create Topics
topics_2 = [' '.join(t[0:10]) for t in words_2]

# Getting the topics
for id, t in enumerate(topics_2): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")
    

------ Topic 0 ------
good love thank friend week bad stay market card x

------ Topic 1 ------
people right stop check go government leave give pay big

------ Topic 2 ------
year money old covid-19 feel school real bring project 50

------ Topic 3 ------
join power say americans ago kind = virus bill result

------ Topic 4 ------
need realdonaldtrump video buy house youtube tag player reach actually

------ Topic 5 ------
come long girl mean rise soon car better pr cause

------ Topic 6 ------
today world 4 wait ktov business word share drop night

------ Topic 7 ------
bitcoin crypto care hope national gift response plan trump important

------ Topic 8 ------
happen person health god face lead time glennjacobstn truth point

------ Topic 9 ------
help work woman post twitter story allow support weekend write

------ Topic 10 ------
let time start way end + talk 24 1 7

------ Topic 11 ------
think know new guy case home death coronavirus die country

------ Topic 12 ------
hour rt l

In [77]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_3_perplexity = model_3.log_perplexity(corpus)
print('\nPerplexity: ', model_3_perplexity) 

# Compute Coherence Score
coherence_model_3 = CoherenceModel(model=model_3, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_3 = coherence_model_3.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_3)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th


Perplexity:  -8.963115786917106

Coherence Score:  0.4985579012163873


In [78]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_2, corpus, id2word)


####Model 4 (alpha & eta)

In [79]:
model_4_start_time = time.time()

model_4 = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       num_topics=20,
                       random_state=42,
                       chunksize=2000,
                       passes=50,
                       minimum_probability=0.2, 
                       iterations=100,
                       alpha = 0.01,
                       eta=0.91)

model_4_end_time = time.time()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th

In [88]:
model_4_runtime = round(model_4_end_time - model_4_start_time, 2)
print(model_4_runtime, "seconds to finish")


219.39 seconds to finish


In [81]:
# Filtering for words 
words_2 = [re.findall(r'"([^"]*)"',t[1]) for t in model_2.print_topics()]

# Create Topics
topics_2 = [' '.join(t[0:10]) for t in words_2]

# Getting the topics
for id, t in enumerate(topics_2): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")
    

------ Topic 0 ------
good love thank friend week bad stay market card x

------ Topic 1 ------
people right stop check go government leave give pay big

------ Topic 2 ------
year money old covid-19 feel school real bring project 50

------ Topic 3 ------
join power say americans ago kind = virus bill result

------ Topic 4 ------
need realdonaldtrump video buy house youtube tag player reach actually

------ Topic 5 ------
come long girl mean rise soon car better pr cause

------ Topic 6 ------
today world 4 wait ktov business word share drop night

------ Topic 7 ------
bitcoin crypto care hope national gift response plan trump important

------ Topic 8 ------
happen person health god face lead time glennjacobstn truth point

------ Topic 9 ------
help work woman post twitter story allow support weekend write

------ Topic 10 ------
let time start way end + talk 24 1 7

------ Topic 11 ------
think know new guy case home death coronavirus die country

------ Topic 12 ------
hour rt l

In [82]:
# Compute Perplexity
# a measure of how good the model is. lower the better
model_4_perplexity = model_4.log_perplexity(corpus)
print('\nPerplexity: ', model_4_perplexity) 

# Compute Coherence Score
coherence_model_4 = CoherenceModel(model=model_4, texts=df['lemma_tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_4 = coherence_model_4.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_4)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or the python sum builtin instead.


Calling np.sum(generator) is deprecated, and in the future will give a different result. Use np.sum(np.fromiter(generator)) or th


Perplexity:  -8.870009298561314

Coherence Score:  0.5311231452909411


In [83]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model_2, corpus, id2word)


###Progress Tracksheet 🧑‍🔬

In [86]:
TS = {'model': ["base", 1, 2, 3, 4], 
      'runtime_seconds': [base_model_runtime, model_1_runtime,
                 model_2_runtime, model_3_runtime, model_4_runtime],
      'coherence_score': [coherence_lda_model_base, coherence_lda_model_1,
                          coherence_lda_model_2, coherence_lda_model_3, 
                          coherence_lda_model_4],
      'Perplexity': [base_perplexity, model_1_perplexity, model_2_perplexity,
                     model_3_perplexity, model_4_perplexity]}

track_sheet = pd.DataFrame(data=TS)

track_sheet.head()

Unnamed: 0,model,runtime_seconds,coherence_score,Perplexity
0,base,25.25027,0.171768,-8.453721
1,1,36.3964,0.413828,-9.742036
2,2,179.1781,0.413349,-9.475841
3,3,1591744000.0,0.498558,-8.963116
4,4,219.3909,0.531123,-8.870009


In [87]:
track_sheet['runtime_seconds'][0]

25.250271320343018

In [0]:
Track_Sheet.dtypes