In [78]:
import pandas as pd
import numpy as np
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim
from gensim import corpora, models


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [79]:
# Read in dataset of Coronavirus tweets
tweets = pd.read_csv(r"Corona_combined_tweets.csv", encoding = "latin1")
tweets

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,2/3/2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",2/3/2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,2/3/2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,2/3/2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",3/3/2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...,...,...,...,...
44951,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral
44952,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative
44953,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive
44954,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral


In [80]:
# Separate documents from dataset
documents = tweets['OriginalTweet']
documents

0        TRENDING: New Yorkers encounter empty supermar...
1        When I couldn't find hand sanitizer at Fred Me...
2        Find out how you can protect yourself and love...
3        #Panic buying hits #NewYork City as anxious sh...
4        #toiletpaper #dunnypaper #coronavirus #coronav...
                               ...                        
44951    Airline pilots offering to stock supermarket s...
44952    Response to complaint not provided citing COVI...
44953    You know itÂs getting tough when @KameronWild...
44954    Is it wrong that the smell of hand sanitizer i...
44955    @TartiiCat Well new/used Rift S are going for ...
Name: OriginalTweet, Length: 44956, dtype: object

In [81]:
# Let us first clean our documents
def text_prepare(text):
    """Tokenization and Preprocessing."""
    
    # Make everything lowercase
    text = text.lower()
    
    # Remove misspelled words or words not found in GoogleNews embeddings (determined in data exploration and preprocessing)
    text = re.sub("doesnt", "does not", text)
    text = re.sub("dont", "do not", text)
    text = re.sub("isnt", "is not", text)
    text = re.sub("wasnt", "was not", text)
    text = re.sub("didnt", "did not", text)
    text = re.sub("behaviour", "behavior", text)
    text = re.sub("colour", "color", text)
    
    # Replace symbols,newline characters and remove stopwords. Then tokenize sentence 
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    good_symbols_re = re.compile('[^0-9a-z +_]')
    stopwords_set = set(stopwords.words('english'))
    text = re.sub("\n", " ", text)
    text = replace_by_space_re.sub(' ', text)
    text = good_symbols_re.sub('', text)
    text = re.sub("covid19", "coronavirus", text)
    text = re.sub("covid", "coronavirus", text)
    text = re.sub("corona", "coronavirus", text)
    text = re.sub("store", "supermarket", text)
    text = re.sub("grocery", "supermarket", text)
    text = re.sub("social distancing", "social-distancing", text)
    text = re.sub("socialdistancing", "social-distancing", text)
    text = re.sub("toilet paper", "toilet-paper", text)
    text = re.sub("toiletpaper", "toilet-paper", text)
    text = re.sub("hand sanitizer", "sanitizer", text)
    text = re.sub("oil prices", "oil-prices", text)
    text = re.sub("consumer demand", "consumer-demand", text)
    text = re.sub("online shopping", "online-shopping", text)
    text = re.sub("http", "", text)
    text = re.sub("https", "", text)
    text = re.sub(" tco ", "", text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])
    
    return text

documents = documents.map(lambda x: text_prepare(x))
documents = documents.map(lambda x: x.split())
documents

0        [trending, new, yorkers, encounter, empty, sup...
1        [couldnt, find, sanitizer, fred, meyer, turned...
2           [find, protect, loved, ones, coronavirusvirus]
3        [panic, buying, hits, newyork, city, anxious, ...
4        [toilet-paper, dunnypaper, coronavirusvirus, c...
                               ...                        
44951    [airline, pilots, offering, stock, supermarket...
44952    [response, complaint, provided, citing, corona...
44953    [know, getting, tough, kameronwilds, rationing...
44954    [wrong, smell, sanitizer, starting, turn, coro...
44955    [tartiicat, well, new, used, rift, going, 7000...
Name: OriginalTweet, Length: 44956, dtype: object

In [82]:
#  Now to create our dictionary, BoW and use TD-IDF instead of count

dic = gensim.corpora.Dictionary(documents)
dic.filter_extremes(keep_n=20000)

bow = []
for doc in documents:
  bow.append(dic.doc2bow(doc))


In [83]:
# Train LDA model

lda = gensim.models.ldamodel.LdaModel
model = lda(bow_tdidf, num_topics=3, id2word = dic, passes=10)

In [84]:
# Let us look at the top three topics

for topic in model.print_topics(num_topics = 3, num_words = 5): 
  print(topic)

(0, '0.014*"supermarket" + 0.007*"toilet-paper" + 0.006*"social-distancing" + 0.006*"people" + 0.005*"go"')
(1, '0.010*"consumer" + 0.006*"pandemic" + 0.005*"19" + 0.004*"demand" + 0.004*"crisis"')
(2, '0.014*"prices" + 0.008*"sanitizer" + 0.006*"oil" + 0.005*"gas" + 0.005*"oil-prices"')


We can see the three most common themes (topics) referenced in tweets referencing Covid are:


1.   Referenves to shopping in person (supermarket, social-distancing, people ...).
2.   References to the pandemic and presumably its effect on demand.
3.   References to the financial impact of the pandemic (price of various goods).




