In [192]:
# A Jupyter Notebook that is designed to support the topic modelling analysis done as part of the CIM workshop
# "Virtual workshop on COVID-19 Testing on Twitter: Surfacing testing situations beyond the laboratory"
# https://warwick.ac.uk/fac/cross_fac/cim/news/covid-19-testing-on-twitter
#
# Main author: Cagatay Turkay
#                      
# Following largely the topic modelling example shared here: https://www.kaggle.com/errearanhas/topic-modelling-lda-on-elon-tweets
#
# Date: 22 June 2020
#
# Notes:
# - Works on a csv file of Tweets as exported from a TCAT server -- https://github.com/digitalmethodsinitiative/dmi-tcat
# - Contains some code to remove retweets, remove selected words, such as query terms
# - Uses an LDA model from Gensim to produce a topic model where each Tweet is treated as a single document
# - Once the model is ready, uses pyLDAViz to interactively visualise the model

import os
import pandas as pd
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
import gensim
from gensim import corpora, models, similarities
import logging
import tempfile
from nltk.corpus import stopwords
from string import punctuation
from collections import OrderedDict
import seaborn as sns
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import re
%matplotlib inline

init_notebook_mode(connected=True) #do not miss this line

import warnings
warnings.filterwarnings("ignore")

# This is an additional flag to remove retweets in the data
removeRetweets = True


In [227]:
datafile = 'data/temporal/tcat_TestingRelations-temporal1.csv'

In [232]:
tweets = pd.read_csv(datafile, encoding='latin1')
tweets = tweets.assign(created_at=pd.to_datetime(tweets.created_at))

print("Number of tweets: ",len(tweets['text']))
tweets.head(5)

Number of tweets:  76741


Unnamed: 0,ï»¿id,created_at,from_user_name,text,lang,location,lat,lng,from_user_id,from_user_realname,from_user_tweetcount,from_user_followercount,from_user_friendcount,from_user_favourites_count
0,1242049982619889664,1970-01-01 00:00:01.584962766,AlmostSenseless,RT @GaryBurgessCI: Guernsey coronavirus press ...,en,"Sark, Bailiwick of Guernsey",,,130773785,Victoria Stamps,87821,1196,962,13590
1,1242049999401345024,1970-01-01 00:00:01.584962770,XantahR,RT @balleralert: Idris Elbaâs Wife Sabrina D...,en,catch me if you can,,,1006648588791943168,Xantah????,17738,925,983,45897
2,1242050532623212546,1970-01-01 00:00:01.584962897,lettyruta,RT @PopCrave: Idris Elbaâs wife Sabrina test...,en,,,,1690940252,Letty,2737,160,121,189
3,1242053661309784067,1970-01-01 00:00:01.584963643,no__ddy,RT @spectatorindex: JUST IN: State prison inma...,en,,,,290487664,Nodthegang,3429,115,1010,9943
4,1242054190098235393,1970-01-01 00:00:01.584963769,emimatsgal,RT @PopCrave: Idris Elbaâs wife Sabrina test...,en,Nigeria,,,1048192736,Matsgal_Jnr,11733,1332,2398,23370


In [234]:
# Preparing a corpus for analysis 
# Depending on the flag, the retweets are removed or left in
retweetTweetCount = 0
originalTweetCount = 0
corpus=[]
a=[]
for i in range(len(tweets['text'])):
        a=tweets['text'][i]
        if removeRetweets:
            searchRes = re.search("^RT ", a)
            if not searchRes:
                corpus.append(a)
                originalTweetCount = originalTweetCount + 1
            else:
                retweetTweetCount = retweetTweetCount + 1
        else:  
            corpus.append(a)
            originalTweetCount = originalTweetCount + 1
        
#corpus[0:10]

TEMP_FOLDER = tempfile.gettempdir()
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

print("retweetTweetCount: ", retweetTweetCount)
print("originalTweetCount: ", originalTweetCount)

Folder "/var/folders/sl/wxrn89b1285_h_36lggkfydw0000gn/T" will be used to save temporary dictionary and corpus.
retweetTweetCount:  68254
originalTweetCount:  8487


In [235]:
# removing common words and tokenizing
# note here that some words can be removed from the modelling to increase the coherence/usefulness of the models
# This is something to align with the original TCAT query
list1 = ['RT','rt']
list2 = ['coronavirus', 'test', 'testing', 'tests', 'state']
#list2 = ['need', 'needs', 'needed']
#list2 = ['visit']
list3 = []
stoplist = stopwords.words('english') + list(punctuation) + list1 + list2 + list3

texts = [[word for word in str(document).lower().split() if word not in stoplist] for document in corpus]

dictionary = corpora.Dictionary(texts)
dictionary.save(os.path.join(TEMP_FOLDER, 'tempFile.dict'))  # store the dictionary, for future reference

#print(dictionary)
#print(dictionary.token2id)

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'tempFile.mm'), corpus)  # store to disk, for later use


2020-06-23 15:07:16,680 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-06-23 15:07:16,906 : INFO : built Dictionary(29828 unique tokens: ['better.', 'bloody', 'brother', 'dad', 'get']...) from 8487 documents (total 127901 corpus positions)
2020-06-23 15:07:16,907 : INFO : saving Dictionary object under /var/folders/sl/wxrn89b1285_h_36lggkfydw0000gn/T/tempFile.dict, separately None
2020-06-23 15:07:16,918 : INFO : saved /var/folders/sl/wxrn89b1285_h_36lggkfydw0000gn/T/tempFile.dict
2020-06-23 15:07:17,141 : INFO : storing corpus in Matrix Market format to /var/folders/sl/wxrn89b1285_h_36lggkfydw0000gn/T/tempFile.mm
2020-06-23 15:07:17,142 : INFO : saving sparse matrix to /var/folders/sl/wxrn89b1285_h_36lggkfydw0000gn/T/tempFile.mm
2020-06-23 15:07:17,142 : INFO : PROGRESS: saving document #0
2020-06-23 15:07:17,161 : INFO : PROGRESS: saving document #1000
2020-06-23 15:07:17,181 : INFO : PROGRESS: saving document #2000
2020-06-23 15:07:17,201 : INFO : PROGRESS: savi

In [236]:
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]  # step 2 -- use the model to transform vectors

2020-06-23 15:07:21,994 : INFO : collecting document frequencies
2020-06-23 15:07:21,994 : INFO : PROGRESS: processing document #0
2020-06-23 15:07:22,019 : INFO : calculating IDF weights for 8487 documents and 29828 features (124510 matrix non-zeros)


In [237]:
total_topics = 15
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=total_topics)
corpus_lda = lda[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold
#Show first n important word in the topics:
lda.show_topics(total_topics,5)

2020-06-23 15:07:25,049 : INFO : using symmetric alpha at 0.06666666666666667
2020-06-23 15:07:25,050 : INFO : using symmetric eta at 0.06666666666666667
2020-06-23 15:07:25,055 : INFO : using serial LDA version on this node
2020-06-23 15:07:25,106 : INFO : running online (single-pass) LDA training, 15 topics, 1 passes over the supplied corpus of 8487 documents, updating model once every 2000 documents, evaluating perplexity every 8487 documents, iterating 50x with a convergence threshold of 0.001000
2020-06-23 15:07:25,109 : INFO : PROGRESS: pass 0, at document #2000/8487
2020-06-23 15:07:26,003 : INFO : merging changes from 2000 documents into a model of 8487 documents
2020-06-23 15:07:26,027 : INFO : topic #12 (0.067): 0.032*"positive" + 0.014*"first" + 0.012*"new" + 0.012*"treatment" + 0.011*"york" + 0.011*"#coronavirus" + 0.011*"recovered" + 0.010*"blood" + 0.010*"patients" + 0.010*"via"
2020-06-23 15:07:26,028 : INFO : topic #3 (0.067): 0.020*"positive" + 0.012*"cases" + 0.011*"h

[(0,
  '0.026*"human" + 0.026*"us" + 0.025*"vaccine" + 0.021*"cure" + 0.018*"become"'),
 (1,
  '0.056*"positive" + 0.035*"new" + 0.026*"york" + 0.022*"cases" + 0.021*"#coronavirus"'),
 (2,
  '0.016*"experts" + 0.010*"health" + 0.009*"tuesday" + 0.009*"korea" + 0.009*"monday"'),
 (3,
  '0.023*"new" + 0.020*"cases" + 0.015*"york" + 0.014*"positive" + 0.013*"trump"'),
 (4,
  '0.024*"antibody" + 0.017*"rate" + 0.015*"community" + 0.013*"#coronavirus" + 0.013*"nation"'),
 (5,
  '0.051*"california" + 0.039*"first" + 0.039*"without" + 0.026*"symptoms" + 0.011*"positive"'),
 (6,
  '0.023*"positive" + 0.019*"wife" + 0.014*"negative" + 0.010*"bauchi" + 0.010*"coronavirus."'),
 (7,
  '0.101*"positive" + 0.080*"coronavirus:" + 0.079*"2" + 0.078*"new" + 0.075*"york"'),
 (8,
  '0.028*"kits" + 0.019*"department" + 0.016*"u.s.," + 0.016*"12,000" + 0.015*"denied"'),
 (9,
  '0.020*"1st" + 0.017*"governors" + 0.017*"week." + 0.014*"across" + 0.014*"@wsj"'),
 (10,
  '0.033*"becomes" + 0.022*"says" + 0.021

In [238]:
# A small piece of code to save the model as a csv file. Useful for plugging in to a different visualisation

topics = lda.get_topics()
topics

top_words_per_topic = []
for t in range(lda.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in lda.show_topic(t, topn = 10)])

pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("top_words_temporal1.csv", index=False)


In [239]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.gensim.prepare(lda, corpus_lda, dictionary, mds='tsne')
panel

In [208]:
# Export the above interactive visual as an html
pyLDAvis.save_html(panel, 'lda_temporal1TermsRemoved.html')