# 1. Preprocessing Data

In [1]:
import pandas as pd
import re
import string
import os

from wordcloud import WordCloud
from pprint import pprint

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk import flatten

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import pyLDAvis.gensim_models

In [2]:
# Prefix Removal
def prefix_removal(content):
    colon = content.find(":")
    return content[colon+1:]

In [3]:
# Map to lowercase
def lowercase(content):
    return content.lower()

In [4]:
# Digit Removal
def digit_removal(content):
    return re.sub(r'\d+', '', content)

In [5]:
# Emoji Removal
def emoji_removal(content):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', content) # no emoji

In [6]:
# URL Removal
def url_removal(content):
    return re.sub(r'http\S+', '', content)

In [7]:
# Punctuation Removal by tokenizer
def punc_removal(content):
    punctuations = list(string.punctuation)
    punctuations.append('…')
    
    tk = TweetTokenizer(r'\w+')
    content_tokens = [i.strip("".join(punctuations)) for i in tk.tokenize(content) if i not in punctuations]
    return content_tokens

In [8]:
# Stop words removal
def stop_words_removal(content_tokens):
    stop_words = set(stopwords.words('english'))
#     stop_words.add('covid')
#     stop_words.add('coronaviru')
#     stop_words.add('coronavirus')
    return [w for w in content_tokens if not w in stop_words]

In [9]:
# Only keep english character
def non_english_removal(content_tokens):
    return [w for w in content_tokens if not re.findall("[^\u0000-\u05C0\u2100-\u214F]+", w) and len(w) > 2]

In [10]:
# Example of how to use the functions
print('\n')
print(">>>Example<<<\n")

content = "RT @DrLiMengYAN1: WHY ف_ج 你 and HOW the 2,999 ❤️😳 🈶️Lab-origin http://www.google.com of COVID-19 virus self-monitoring in a… "

print(">>>Original Content:", content, '\n')

content = lowercase(content)
print(">>>Lowercase:", content, '\n')

content = prefix_removal(content)
print(">>>Prefix Removal: ", content, '\n')

content = digit_removal(content)
print(">>>Digit Removal: ", content, '\n')

content = emoji_removal(content)
print(">>>Emoji Removal: ", content, '\n')

content = url_removal(content)
print(">>>Url Removal: ", content, '\n')

content_token = punc_removal(content)
print(">>>Punctuation Removal: ", content_token, '\n')

content_token = stop_words_removal(content_token)
print(">>>Stop Word Removal: ", content_token, '\n')

content_token = non_english_removal(content_token)
print(">>>Non English Character Removal: ", content_token, '\n')



>>>Example<<<

>>>Original Content: RT @DrLiMengYAN1: WHY ف_ج 你 and HOW the 2,999 ❤️😳 🈶️Lab-origin http://www.google.com of COVID-19 virus self-monitoring in a…  

>>>Lowercase: rt @drlimengyan1: why ف_ج 你 and how the 2,999 ❤️😳 🈶️lab-origin http://www.google.com of covid-19 virus self-monitoring in a…  

>>>Prefix Removal:   why ف_ج 你 and how the 2,999 ❤️😳 🈶️lab-origin http://www.google.com of covid-19 virus self-monitoring in a…  

>>>Digit Removal:   why ف_ج 你 and how the , ❤️😳 🈶️lab-origin http://www.google.com of covid- virus self-monitoring in a…  

>>>Emoji Removal:   why ف_ج  and how the ,  lab-origin http://www.google.com of covid- virus self-monitoring in a…  

>>>Url Removal:   why ف_ج  and how the ,  lab-origin  of covid- virus self-monitoring in a…  

>>>Punctuation Removal:  ['why', 'ف_ج', 'and', 'how', 'the', 'lab-origin', 'of', 'covid', 'virus', 'self-monitoring', 'in', 'a'] 

>>>Stop Word Removal:  ['ف_ج', 'lab-origin', 'covid', 'virus', 'self-monitoring'] 

>>>Non En

# 2. LDA

In [11]:
def find_file_name(file, country):
    for fname in os.listdir('Data/' + file + '/'):    # change directory as needed
        if country in fname:
            return 'Data/' + file + '/' + fname

In [12]:
def preprocess_tweet(content):
    content = lowercase(content)

    if content[:4] == 'RT @':
        content = prefix_removal(content)

    content = digit_removal(content)
    content = emoji_removal(content)
    content = url_removal(content)
    content_token = punc_removal(content)
    content_token = stop_words_removal(content_token)
    content_token = non_english_removal(content_token)
    return content_token

In [13]:
def read_file(directory):
    file = pd.read_csv(directory, index_col=False, on_bad_lines='skip', engine='python')
    
    print('diretory is', directory)
    twitters = []
    for i, t in enumerate(file['tweet']):
        content_token = preprocess_tweet(t)
        if len(content_token) >= 1:
            twitters.append(content_token)

        # Print examples
        if i < 2:
            print('line[{}] = {}'.format(i, t), '\n')
            
    if len(twitters) == 0:
        print("**********************************")
    return twitters


In [34]:
def find_tweets(country):
    root_dir = 'data/'
    early = find_file_name('Early_Processed', country)
    mid   = find_file_name('Mid_Processed', country)
    late  = find_file_name('Late_Processed', country)
    
    early_tweets = read_file(early)
    mid_tweets   = read_file(mid)
    late_tweets  = read_file(late)
    all_tweets = early_tweets + mid_tweets + late_tweets
    return [early_tweets, mid_tweets, late_tweets, all_tweets]

In [35]:
def LDAmodel(tweets, index=0, country='None'):
    names = ['Early', 'Mid', 'Late', 'All']
    print(len(tweets))
    id2word = corpora.Dictionary(tweets)

    # Create Corpus
    texts = tweets

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    print(i,len(corpus))


    # Build LDA model
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10)
    
    pyLDAvis.enable_notebook()
    vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(vis_data, 'RQ3 Visualization/' + names[index] + '/' + country + '.html')

In [37]:
countries = ['United States', 'United Kingdom', 'Thailand', 'India', 'Canada', 'Serbia', 'Australia', 'Nigeria', 'South Africa', 'Colombia']
for country in countries:
    results = find_tweets(country)
    for i, each in enumerate(results):
        LDAmodel(each, i, country)

diretory is Data/Early_Processed/8_United States Minor Outlying Islands.csv
line[0] = Spotting A Flying Praying Mantis Is Rare &amp; Hard Not To Notice ONCE SEEN! 4 The Past Ten Yrs My Training In Mixed Martial Arts Changed To Gi My Style &amp; Movements Within This Disipline To Take The Proper Stands &amp; Embraces For Your Love Ones Putting On A Gi For Our Life! #COVID19 https://t.co/9Q14PP9z3Y 

line[1] = Spotting A Flying Praying Mantis Is Rare &amp; Hard Not To Notice ONCE SEEN! 4 The Past Ten Yrs My Training In Mixed Martial Arts Changed To Gi My Style &amp; Movements Within This Disipline To Take The Proper Stands &amp; Embraces For Your Love Ones Putting On A Gi For Our Life! #COVID19 https://t.co/EX8fAACDGU 

diretory is Data/Mid_Processed/239919_United States_en.csv
line[0] = Sabotaging Trump was part of his plan all along.  #FireFauci 

line[1] = Apocalyptic forecast of 4,000 coronavirus deaths a day could be FIVE TIMES too high and had already been proved wrong when governm

  default_term_info = default_term_info.sort_values(


215567
1 215567


  default_term_info = default_term_info.sort_values(


29859
2 29859


  default_term_info = default_term_info.sort_values(


245434
3 245434


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/41378_United Kingdom.csv
line[0] = Spare a thought for the nut jobs with end of the world bunkers in their gardens. They've never been so joyous #lockdown #COVID19 

line[1] = @CNN @OneMoreGoodMan #Brexit supporting, Tax avoiding, @Conservatives supporting, @Dyson whose manufacturing base is in the Far East, gets the contract over British based companies such as MEC Medical. 🙄
#COVID19
 https://t.co/RXAE4fyBSI 

diretory is Data/Mid_Processed/97969_United Kingdom_en.csv
line[0] =  The 'new normal', in which Boris Johnson dictates how many humans you can interact with, is dystopian. Am I the only one who is genuinely far more worried about government overreach than COVID-19 (which has a 99.5%+ recovery rate)? 

line[1] =   The 'new normal', in which Boris J dictates how many people you can peacefully interact with, is dystopian. Am I the only one who is genuinely more worried about government overreach than COVID-19 (which has a 99.5%+ recovery rate)? Th

  default_term_info = default_term_info.sort_values(


95741
1 95741


  default_term_info = default_term_info.sort_values(


6153
2 6153


  default_term_info = default_term_info.sort_values(


143272
3 143272


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/37862_Thailand.csv
line[0] = Applause for all the healthworkers that are working effortlessly to keep us safe and healthy. Thank you ❤️ #covid19 #bangkok #thailand #โควิท19 (@RichardBarrow) https://t.co/qzrvXyvVtT 

line[1] = Massive traffic jam at the northern end of Phuket as many people try to flee the island before the midnight lockdown #Thailand #COVID19 https://t.co/g4OAvSwIWM 

diretory is Data/Mid_Processed/748_Thailand_en.csv
line[0] = COVID is Over! 

line[1] =   11 Ajax players test positive for coronavirus ahead of Champions League fixture: The Amsterdammers have confirmed just a… 

diretory is Data/Late_Processed/35568_Thailand.csv
line[0] = The NHS spend 6 billion pounds a year on obesity… 
#NHS #COVID19 #WakeUp 

line[1] = Before this year ends, would like to thank the warriors who helped people in worst of their times during the second wave of #COVID19 . Let’s all be prepared to fight again 🙏👍  #CoronaWarriors  

37860
0 37860


  default_term_info = default_term_info.sort_values(


494
1 494


  default_term_info = default_term_info.sort_values(


35568
2 35568


  default_term_info = default_term_info.sort_values(


73922
3 73922


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/37919_India.csv
line[0] = Jee lo thoda.
#IndiaFightsCorona #IndiaFightsCOVID19 #IndiaVsCorona #GoCorona #GoCoronaGo #GoCoronaCoronaGo #IndiaBattlesCoronavirus #IndiaFightCorona #IndiaLockdown #IndiaLockdownFor21Days #indialove #MyCountryTheNewAge #CoronaUpdate #CoronavirusPandemic #Coronaindia https://t.co/QfAcxLEePH 

line[1] = Dear Chics and Dappers of every Community, Let your Lift rooms and Basement breath.

#StayAtHomeAndStaySafe #Quarantine #Lockdown21 #SocialDistancing #Covid_19 #Coronavirus 

diretory is Data/Mid_Processed/16704_India_en.csv
line[0] =  Please back our project -     #coronavirus 

line[1] =  Please back our project -     #coronavirus 

diretory is Data/Late_Processed/11418_India.csv
line[0] = India approves 2 COVID vaccines, 1 COVID pill in a single day for emergency
#COVID19 #CovidVaccine #OmicronVariant #Latest #news #pupulse  

line[1] = Scientists Identify Antibodies that Can Neutralise Omicron Variant of COVID-19 
#Scientist

  default_term_info = default_term_info.sort_values(


13812
1 13812


  default_term_info = default_term_info.sort_values(


11418
2 11418


  default_term_info = default_term_info.sort_values(


63145
3 63145


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/16135_Canada.csv
line[0] = Some good news ! #Italy #COVID19 https://t.co/8Da68CLl4x 

line[1] = Kerry's #Covid_19 survival #drink of the day:

The Whiskey Sour.

@JPWisersCA https://t.co/KRSPtJpd3Z 

diretory is Data/Mid_Processed/12334_Canada_en.csv
line[0] =     IMHO, it is a crime for health authorities not to let the populace know of the efficacy of Vitamin D3 .… 

line[1] = Taking a stand.  Staff members at Glamorgan Junior Public School are refusing work after a COVID-19 outbreak was confirmed last week.   

diretory is Data/Late_Processed/4837_Canada.csv
line[0] = And infect their parents. 🤦‍♀️

In #Sarnia Lambton, the left most column is cases/100k in &lt;20, right most is 80+.

#COVID19 #COVID19Ontario #covid19ON   

line[1] = This is a really good read to better understand how quickly Omicron spreads.
How Long Does Omicron Take to Make You Sick? - The Atlantic #covid19 #omicron    

16135
0 16135


  default_term_info = default_term_info.sort_values(


10451
1 10451


  default_term_info = default_term_info.sort_values(


4837
2 4837


  default_term_info = default_term_info.sort_values(


31423
3 31423


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/5370_Serbia.csv
line[0] = Fuck you #covid_19 

line[1] = Counting #COVID19 deceased in #Italy 

#COVID2019italia 
#CoronavirusOutbreak https://t.co/b1PQOlsZNU 

diretory is Data/Mid_Processed/15171_Serbia_en.csv
line[0] =   . decision to ban the sale of firecrackers is adding misery to the lives of millions of workers in the Firew… 

line[1] =   The poisonous smoke emanating from fireworks is a health hazard for #COVID19 patients as well as those suffering from he… 

diretory is Data/Late_Processed/6349_Serbia.csv
line[0] = Maharashtra government issued fresh guidelines and imposed Section 144 in the state from 9 pm to 6 am

#COVID19 #Omicron #Maharashtra #section144 

 

line[1] = . has said that a certificate is not required for the administration of precaution doses to those about 60 with comorbidities

#COVID19 #Omicron #OmicronVariant 

 

5370
0 5370


  default_term_info = default_term_info.sort_values(


11278
1 11278


  default_term_info = default_term_info.sort_values(


6349
2 6349


  default_term_info = default_term_info.sort_values(


22997
3 22997


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/7478_Australia.csv
line[0] = Self-Isolation Day 10: Coming around the last turn &amp; into the final stretch. The house is cleaner than it’s ever been this year, because Shondelle is here kicking my ass! In the meantime, we are enjoying a yummy pork ragu with linguini. 
#selfisolation #covid_19 #nomnomnom https://t.co/6b5MtED836 

line[1] = #inthistogether @NMHC national mental health and wellbeing for #COVID19 tips for everyone to follow each day while physical distancing. https://t.co/46FW5ukJlD https://t.co/MS8X6X5p5f 

diretory is Data/Mid_Processed/3043_Australia_en.csv
line[0] =   1 of 6 Good morning  I'm very happy to share with you that my baby girl is breathing on her own for 3 days now. She's got… 

line[1] =   Uncomfortable truth: there is a reasonable chance that Victoria will locally eliminate #COVID19, leaving NSW as the only… 

diretory is Data/Late_Processed/3481_Australia.csv
line[0] = If you’re going to follow anyone, make it someone y

  default_term_info = default_term_info.sort_values(


1793
1 1793


  default_term_info = default_term_info.sort_values(


3481
2 3481


  default_term_info = default_term_info.sort_values(


12752
3 12752


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/9953_Nigeria.csv
line[0] = @NCDCgov @Fmohnigeria SPAIN, NETHERLANDS AND TURKEY have issued statements about defective medical orders received from China. Please do your DD and exercise caution as you order the needed supplies from China. We all understand you’re working on an impossible timeline #COVID19 https://t.co/3F3xDZgOdL 

line[1] = *Call family and friends to check up. Do not visit! Just call. That is the best emotional support we can give to each other at this time. If you love me, stay away from me; for now!* #StayAtHomeAndStaySafe #coronavirus #COVID19NIGERIA 

diretory is Data/Mid_Processed/2204_Nigeria_en.csv
line[0] =   Where is your outrage at COVID-19 palliatives that were hidden by those who are supposed to be leaders? Is that not haram… 

line[1] =   The second wave of COVID-19 is here  7 new patients admitted for COVID-19 last week, 5 landed in the ICU and are curre… 

diretory is Data/Late_Processed/123_Nigeria.csv
line[0] =  will mi

  default_term_info = default_term_info.sort_values(


1949
1 1949


  default_term_info = default_term_info.sort_values(


123
2 123


  default_term_info = default_term_info.sort_values(


12024
3 12024


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/6309_South Africa.csv
line[0] = We went Mexican for lunch. Corona and Tacos... And Monopoly to kill.some.hours. well.done lee.becker.9480 for winning. And the tacos were a triumph jomarivermeulen . #lockdown2020 #covid19 @ Kakamas, Northern Cape https://t.co/a9cmt33PIZ 

line[1] = Statement of the Gauteng Provincial Command Council on the adherence to the 21-Day lockdown regulations and further measures to be taken to flatten the curve #Covid_19 #CoronaVirusSA #GautengCOVID19 #StayHomeSA https://t.co/r3acKAaCq6 

diretory is Data/Mid_Processed/5087_South Africa_en.csv
line[0] =   Here are my latest graphs showing the weekly #COVID19 data and trends in South Africa 🇿🇦  I've received a number of queries… 

line[1] =   The team  has donated its prize money to  A Stellenbosch initiative which was established to provide… 

diretory is Data/Late_Processed/630_South Africa.csv
line[0] = It's pathetic that  does not want to refund my flight that I've needed to 

  default_term_info = default_term_info.sort_values(


4748
1 4748


  default_term_info = default_term_info.sort_values(


630
2 630


  default_term_info = default_term_info.sort_values(


11687
3 11687


  default_term_info = default_term_info.sort_values(


diretory is Data/Early_Processed/873_Colombia.csv
line[0] = Cracks #QuedateEnCasa #coronavirus #JuntosSaldremosAdelante https://t.co/MhYFqNuPBh 

line[1] = Fuck it up! #coronavirus https://t.co/dx0FSONTdT 

diretory is Data/Mid_Processed/11716_Colombia_en.csv
line[0] = Fuck Trump!!! 

line[1] = Trump is a pos! Vote him out! 

diretory is Data/Late_Processed/668_Colombia.csv
line[0] =  Like limiting  and the state of Florida of monoclonal therapy? #Florida #COVID19 #BidenIsAFailure #DoNotComply 

line[1] = New COVID-19 testing site opens THURSDAY in Orange County at the South Orange Youth Sports Complex. Hours 9am-5pm, or until capacity is reached. Plan to arrive early!   #testing #COVID19  

873
0 873


  default_term_info = default_term_info.sort_values(


6598
1 6598


  default_term_info = default_term_info.sort_values(


668
2 668


  default_term_info = default_term_info.sort_values(


8139
3 8139


  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
