In [50]:
import nltk
import time
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
df = pd.read_csv('covidexcel.csv')

In [21]:
x = df.iloc[22983].Tweet

In [4]:
df

Unnamed: 0,User,Date,Tweet,Permalink,Retweet count,Likes count,Tweet value
0,The Spectator Index,23/3/2020 22:11:21,UNITED STATES: Man in his sixties dies after t...,https://www.twitter.com/user/status/1242212365...,1158,2607,2638.90
1,The White House,23/3/2020 22:08:41,LIVE: Press Briefing with Coronavirus Task For...,https://www.twitter.com/user/status/1242211691...,2411,5061,34796.42
2,NCDC,23/3/2020 22:02:49,Four new cases of #COVID19 have been confirmed...,https://www.twitter.com/user/status/1242210215...,2949,1973,511.37
3,Scott Dworkin,23/3/2020 21:35:12,This video pointing out Trump’s coronavirus li...,https://www.twitter.com/user/status/1242203267...,1218,1275,1532.21
4,Caleb Hull,23/3/2020 21:31:17,Sen. @tedcruz UNLOADS on Dems from the Senate ...,https://www.twitter.com/user/status/1242202282...,3050,6152,314.39
...,...,...,...,...,...,...,...
22980,World Health Organization (WHO),22/1/2020 19:50:46,LIVE: Press conference on the Emergency Commit...,https://www.twitter.com/user/status/1220071324...,2499,2898,11808.43
22981,pictoline,22/1/2020 19:47:03,El #coronavirus de Wuhan (China): una breve ex...,https://www.twitter.com/user/status/1220070388...,10727,17985,3975.19
22982,CNW,22/1/2020 19:44:32,Roads in Wuhan on lockdown and blocked by poli...,https://www.twitter.com/user/status/1220069758...,7546,12698,131.05
22983,woppa 🎗😷,22/1/2020 19:38:54,When you eat bats and bamboo rats and shit and...,https://www.twitter.com/user/status/1220068338...,10664,11997,63.77


In [6]:
data = df.Tweet.values.tolist()

In [46]:
# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# # Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [48]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=5, step=5):
    """ Compute coherence values for various number of topics.
    Parameters:
        dictionary : Gensim dictionary
        corpus : Gensim corpus
        texts : List of input texts
        limit : Max num of topics
        start : Int of lowest number of topics to model
        step : Int of increment to change number of topics to model
    Returns:
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        start=time.time()
        model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           chunksize=10000,
                                           passes=1,
                                           per_word_topics=True)
        
        print(f'Topic modeling for {num_topics} topics took {time.time()-start} seconds.')
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [59]:
ml, cov = compute_coherence_values(id2word, corpus, texts, 11, 4, 2)

KeyboardInterrupt: 

In [None]:
def graph_coherence(coherence_values):
    """ Graph a list of coherence values to determine the optimal number of topics to model. """
    limit=9; start=4; step=1;
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

In [None]:
graph_coherence(cov)

In [191]:
optimal_model = ml[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

[(0,
  '0.046*"co" + 0.011*"people" + 0.009*"report" + 0.009*"case" + 0.008*"virus" '
  '+ 0.008*"trump" + 0.008*"spread" + 0.006*"know" + 0.005*"new" + '
  '0.005*"break"'),
 (1,
  '0.012*"country" + 0.011*"co" + 0.011*"people" + 0.010*"test" + '
  '0.008*"spread" + 0.008*"work" + 0.007*"know" + 0.007*"get" + 0.006*"take" + '
  '0.006*"say"'),
 (2,
  '0.030*"co" + 0.018*"case" + 0.018*"people" + 0.015*"test" + 0.012*"go" + '
  '0.009*"confirm" + 0.006*"https" + 0.005*"get" + 0.005*"say" + '
  '0.005*"spread"'),
 (3,
  '0.015*"say" + 0.010*"take" + 0.010*"covid" + 0.010*"people" + 0.008*"break" '
  '+ 0.007*"test" + 0.007*"virus" + 0.007*"amp" + 0.006*"spread" + '
  '0.006*"make"'),
 (4,
  '0.021*"co" + 0.015*"say" + 0.013*"day" + 0.010*"people" + 0.008*"take" + '
  '0.008*"get" + 0.007*"test" + 0.006*"virus" + 0.006*"free" + 0.005*"be"'),
 (5,
  '0.036*"co" + 0.020*"people" + 0.010*"https" + 0.009*"case" + 0.008*"test" + '
  '0.008*"virus" + 0.006*"come" + 0.006*"say" + 0.006*"death" 

In [60]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [61]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis



TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGABRT(-6), SIGABRT(-6), SIGABRT(-6), SIGABRT(-6), SIGABRT(-6), SIGABRT(-6), SIGABRT(-6), SIGABRT(-6)}