## Data Cleaning & Consolidation

In [43]:
# !pip install googletrans



In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import os
from googletrans import Translator
import math
from datetime import datetime

### Text Preprocessing (functions)

In [43]:
# Text Preprocessing

def remove_urls(s):
    # re.sub(pattern,repl,string) is used to replace substrings. Will replace the matches in string with repl
    return re.sub(r'https?://\S+', "", s)

def remove_usernames(s):
    return re.sub(r'@\S+', "", s)

def remove_specialchar(text, remove_digits=False):    
    if not remove_digits:
        pattern = r'[^a-zA-z0-9\s]'
    else:
        pattern = r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

### Singapore Tweets in March (till 18 Mar)

In [None]:
# Get SG tweets in March

path = 'coronavirus-covid19-tweets/'
tweets_csv = ['2020-03-12 Coronavirus Tweets.csv',
             '2020-03-13 Coronavirus Tweets.csv',
             '2020-03-14 Coronavirus Tweets.csv',
             '2020-03-15 Coronavirus Tweets.csv',
             '2020-03-16 Coronavirus Tweets.csv',
             '2020-03-17 Coronavirus Tweets.csv',
             '2020-03-18 Coronavirus Tweets.csv']

tweets = pd.read_csv('coronavirus-covid19-tweets/2020-03-00 Coronavirus Tweets (pre 2020-03-12).csv')
tweets_sg = tweets[tweets['country_code'] == 'SG']

for file in tweets_csv:
    filepath = path + file
    df = pd.read_csv(filepath)
    df_sg = df[df['country_code'] == 'SG']
    
    tweets_sg = pd.concat([tweets_sg, df_sg], ignore_index=True)
    
tweets_sg.shape

In [21]:
# Export Singapore's tweets on COVID-19 in March
tweets_sg.to_csv('tweets_sg_mar.csv')

In [195]:
tweets_mar = pd.read_csv('tweets_sg_mar.csv', index_col=0)

print(tweets_mar.shape)
tweets_mar.head()

(356, 22)


Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,retweet_count,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang
0,1235359346659217408,4373254527,2020-03-05T00:19:54Z,AquibIkubal,Good morning. Crowded metro. Fully packed. Sin...,Twitter for Android,,,,False,...,0,SG,"Central Region, Singapore",admin,32,91,,2015-12-04T15:15:45Z,False,en
1,1235550008314744833,45325484,2020-03-05T12:57:32Z,bucksteeth,#pohchaipills is life. Fuck you #Covid_19 http...,Twitter for Android,,,,False,...,0,SG,"East Region, Singapore",admin,124,448,,2009-06-07T11:41:59Z,False,en
2,1235578864945913856,836488722157744130,2020-03-05T14:52:12Z,SulingLinCNA,Hmm not many good options to pump the economy ...,Twitter for iPhone,,,,False,...,0,SG,"Central Region, Singapore",admin,143,143,,2017-02-28T08:10:27Z,False,en
3,1235589693913190400,355284635,2020-03-05T15:35:13Z,ArunPrasanth_R,#Covid_19 #CoronavirusOutbreak #COVID19india ...,Twitter for iPhone,,,,True,...,0,SG,"West Region, Singapore",admin,103,209,,2011-08-15T03:32:43Z,False,en
4,1235716860881993729,282135000,2020-03-06T00:00:32Z,danielfyork,Didn’t think I’d be saying this a few weeks ag...,Twitter for iPhone,,,,False,...,2,SG,"Central Region, Singapore",admin,6251,2054,,2011-04-14T16:24:28Z,False,en


In [196]:
# list(tweets_mar.columns)

In [197]:
# Get list of tweets
tweets_list = tweets_mar['text'].tolist()
tweets_list[:10]

['Good morning. Crowded metro. Fully packed. Singapore is still moving fast irrespective of the coronavirus spread. Can they hold up!\n #singapore #coronavirus #CoronavirusOutbreak',
 '#pohchaipills is life. Fuck you #Covid_19 https://t.co/y3I51XQOq9',
 'Hmm not many good options to pump the economy in a #CoronavirusOutbreak. Impt when how the US economy does may determines who wins the Presidency in the 2020 https://t.co/kjowSxMh56',
 '#Covid_19 #CoronavirusOutbreak #COVID19india  STAY HYGIENE 💪🏼😷 https://t.co/rirBFf3bm9',
 'Didn’t think I’d be saying this a few weeks ago but I’m actually getting nervous about returning to the UK from Asia #COVID19 #CoronavirusOutbreak',
 'Updates on Covid-19 (Coronavirus Disease 2019)  #CoronavirusOutbreak - Case Summary in #Singapore (as of 5 Mar 2020, 1200h) #COVID19 \n@sporeMOH via: https://t.co/OzIGzEq9b9',
 'Updates on Covid-19 (Coronavirus Disease 2019)  #CoronavirusOutbreak - Case Summary in #Singapore (as of 6 Mar 2020, 1200h) #COVID19 \n@spo

#### Text Preprocessing

In [198]:
# Text Preprocessing

tweets_list_processed = []
for tweet in tweets_list:
    tweet = remove_urls(tweet)
    tweet = remove_usernames(tweet)
    tweet = remove_specialchar(tweet, remove_digits=False)
    tweets_list_processed.append(tweet)

tweets_list_processed

['Good morning Crowded metro Fully packed Singapore is still moving fast irrespective of the coronavirus spread Can they hold up\n singapore coronavirus CoronavirusOutbreak',
 'pohchaipills is life Fuck you Covid_19 ',
 'Hmm not many good options to pump the economy in a CoronavirusOutbreak Impt when how the US economy does may determines who wins the Presidency in the 2020 ',
 'Covid_19 CoronavirusOutbreak COVID19india  STAY HYGIENE  ',
 'Didnt think Id be saying this a few weeks ago but Im actually getting nervous about returning to the UK from Asia COVID19 CoronavirusOutbreak',
 'Updates on Covid19 Coronavirus Disease 2019  CoronavirusOutbreak  Case Summary in Singapore as of 5 Mar 2020 1200h COVID19 \n via ',
 'Updates on Covid19 Coronavirus Disease 2019  CoronavirusOutbreak  Case Summary in Singapore as of 6 Mar 2020 1200h COVID19 \n via ',
 'Groflchige Triage in der Intensivmedizin in Italien in Diskussion unter Druck des CoronavirusOutbreak Manahmen wie bei Megadesastern oder im

#### Translation 

In [199]:
# Initial exploratory data analysis shows that there are a lot of SG tweets in Malay. 
# We will attempt to detect tweets made in Malay (or any other languages) and translate that to English.

translator = Translator()
translated_tweet_list = translator.translate(tweets_list_processed, dest='en')

In [200]:
translated_tweets = []

num_translated_tweets = 0 
for translation in translated_tweet_list:
    if translation.origin != translation.text:
        num_translated_tweets += 1
    translated_tweets.append(translation.text)
    
print("The number of tweets that are not in English and have been translated is: ", num_translated_tweets)
translated_tweets

The number of tweets that are not in English and have been translated is:  287


['Good morning Crowded metro Fully packed Singapore is still moving fast irrespective of the coronavirus spread Can they hold up\n singapore coronavirus CoronavirusOutbreak',
 'pohchaipills is life Fuck you Covid_19',
 'Hmm not many good options to pump the economy in a CoronavirusOutbreak Impt when how the US economy does may determines who wins the Presidency in the 2020',
 'Covid_19 CoronavirusOutbreak COVID19india  STAY HYGIENE',
 'Didnt think Id be saying this a few weeks ago but Im actually getting nervous about returning to the UK from Asia COVID 19 Coronavirus Outbreak',
 'Updates on Covid19 Coronavirus Disease 2019  CoronavirusOutbreak  Case Summary in Singapore as of 5 Mar 2020 1200h COVID19 \n via',
 'Updates on Covid19 Coronavirus Disease 2019  CoronavirusOutbreak  Case Summary in Singapore as of 6 Mar 2020 1200h COVID19 \n via',
 'Groflchige Triage in intensive care in Italy in discussion under pressure of CoronavirusOutbreak measures like Mega disasters or war where it wa

In [201]:
tweets_mar['translated_text'] = translated_tweets
tweets_mar.to_csv('sg_tweets_covid19_march.csv')

print(tweets_mar.shape)
tweets_mar.head()

(356, 23)


Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang,translated_text
0,1235359346659217408,4373254527,2020-03-05T00:19:54Z,AquibIkubal,Good morning. Crowded metro. Fully packed. Sin...,Twitter for Android,,,,False,...,SG,"Central Region, Singapore",admin,32,91,,2015-12-04T15:15:45Z,False,en,Good morning Crowded metro Fully packed Singap...
1,1235550008314744833,45325484,2020-03-05T12:57:32Z,bucksteeth,#pohchaipills is life. Fuck you #Covid_19 http...,Twitter for Android,,,,False,...,SG,"East Region, Singapore",admin,124,448,,2009-06-07T11:41:59Z,False,en,pohchaipills is life Fuck you Covid_19
2,1235578864945913856,836488722157744130,2020-03-05T14:52:12Z,SulingLinCNA,Hmm not many good options to pump the economy ...,Twitter for iPhone,,,,False,...,SG,"Central Region, Singapore",admin,143,143,,2017-02-28T08:10:27Z,False,en,Hmm not many good options to pump the economy ...
3,1235589693913190400,355284635,2020-03-05T15:35:13Z,ArunPrasanth_R,#Covid_19 #CoronavirusOutbreak #COVID19india ...,Twitter for iPhone,,,,True,...,SG,"West Region, Singapore",admin,103,209,,2011-08-15T03:32:43Z,False,en,Covid_19 CoronavirusOutbreak COVID19india STA...
4,1235716860881993729,282135000,2020-03-06T00:00:32Z,danielfyork,Didn’t think I’d be saying this a few weeks ag...,Twitter for iPhone,,,,False,...,SG,"Central Region, Singapore",admin,6251,2054,,2011-04-14T16:24:28Z,False,en,Didnt think Id be saying this a few weeks ago ...


### Singapore Tweets in March (from 19 Mar)

In [266]:
# Get SG tweets in March

path = 'coronavirus-covid19-tweets/'
tweets_csv2 = ['2020-03-20 Coronavirus Tweets.csv',
             '2020-03-21 Coronavirus Tweets.csv',
             '2020-03-22 Coronavirus Tweets.csv']

tweets2 = pd.read_csv('coronavirus-covid19-tweets/2020-03-19 Coronavirus Tweets.csv')
tweets_sg2 = tweets2[tweets2['country_code'] == 'SG']

for file in tweets_csv2:
    filepath = path + file
    df = pd.read_csv(filepath)
    df_sg = df[df['country_code'] == 'SG']
    
    tweets_sg2 = pd.concat([tweets_sg2, df_sg], ignore_index=True)
    
tweets_sg2.shape

(247, 22)

In [267]:
# Export Singapore's tweets on COVID-19 in March
tweets_sg2.to_csv('tweets_sg_mar(2).csv')

In [287]:
tweets_mar2 = pd.read_csv('tweets_sg_mar(2).csv', index_col=0)

print(tweets_mar2.shape)
tweets_mar2.head()

(247, 22)


Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,retweet_count,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang
0,1240432252376379393,837548605,2020-03-19T00:17:49Z,SanghiAnand,The “experts” on TV have now moved from the R ...,Twitter for iPhone,,,,False,...,0,SG,"Central Region, Singapore",admin,129,95,,2012-09-21T11:03:25Z,False,en
1,1240432793881997312,76647624,2020-03-19T00:19:58Z,JayMcr,"Cumon people...\n\n""Social Distancing"" Does NO...",Instagram,,,,False,...,0,SG,"Central Region, Singapore",admin,2294,2400,,2009-09-23T13:49:42Z,False,en
2,1240433938377539584,90391638,2020-03-19T00:24:31Z,triciapang,824am and the train is barely packed at Hougan...,Twitter for iPhone,,,,False,...,0,SG,"North-East Region, Singapore",admin,170,171,,2009-11-16T13:22:19Z,False,en
3,1240436806396440576,1016482563328602112,2020-03-19T00:35:55Z,3e_like,In time likes this QC will go down. #mayor #co...,Twitter for iPhone,,,,False,...,0,SG,"North-East Region, Singapore",admin,1,2,,2018-07-10T00:41:43Z,False,en
4,1240438146317217794,544789016,2020-03-19T00:41:14Z,onefern,#Singapore #coronavirus #COVID19 https://t.co/...,Twitter for Android,,,,False,...,0,SG,"Central Region, Singapore",admin,77,325,,2012-04-04T03:01:24Z,False,und


In [289]:
# Get list of tweets
tweets_list2 = tweets_mar2['text'].tolist()
tweets_list2[:10]

['The “experts” on TV have now moved from the R word to the D word...the recovery shapes have moved from V to U to W, but L seems to be the favourite now. For us ordinary folk, best to take precautions and stay safe while carrying on with our work. \n#recession #depression #COVID19',
 'Cumon people...\n\n"Social Distancing" Does NOT mean "Self Isolation" So Please get a grip, get outta the House &amp; most importantly...Keep Exercising!\n⚠️\n⚠️\n⚠️\n#covid_19 #corona #NOTairborn… https://t.co/uOWeVfmcMV',
 '824am and the train is barely packed at Hougang #COVID19 #thursdaymorning #singapore',
 'In time likes this QC will go down. #mayor #covid19 #QC #QuezonCity',
 '#Singapore #coronavirus #COVID19 https://t.co/qqgzAv2NXY',
 '@Expedia i am trying to change my reservation since last 4 days and send you number of emails about it. There is no response. At least update your site to allow changing the reservation. #COVID19 #Expedia',
 '#coronavirus playlist on @Spotify https://t.co/GCiTQKdNE

#### Text Preprocessing

In [290]:
# Text Preprocessing

tweets_list_processed2 = []
for tweet in tweets_list2:
    tweet = remove_urls(tweet)
    tweet = remove_usernames(tweet)
    tweet = remove_specialchar(tweet, remove_digits=False)
    tweets_list_processed2.append(tweet)

tweets_list_processed2

['The experts on TV have now moved from the R word to the D wordthe recovery shapes have moved from V to U to W but L seems to be the favourite now For us ordinary folk best to take precautions and stay safe while carrying on with our work \nrecession depression COVID19',
 'Cumon people\n\nSocial Distancing Does NOT mean Self Isolation So Please get a grip get outta the House amp most importantlyKeep Exercising\n\n\n\ncovid_19 corona NOTairborn ',
 '824am and the train is barely packed at Hougang COVID19 thursdaymorning singapore',
 'In time likes this QC will go down mayor covid19 QC QuezonCity',
 'Singapore coronavirus COVID19 ',
 ' i am trying to change my reservation since last 4 days and send you number of emails about it There is no response At least update your site to allow changing the reservation COVID19 Expedia',
 'coronavirus playlist on  ',
 'Covid19 Nuove misure restrittive allingresso a Singapore \n\n\n\n\ncoronavirus covid ',
 'Shout out to Singapore government for deal

In [291]:
temp = pd.DataFrame(tweets_list_processed2)
temp.to_csv('temp.csv')

#### Translation 

In [292]:
# Initial exploratory data analysis shows that there are a lot of SG tweets in Malay. 
# We will attempt to detect tweets made in Malay (or any other languages) and translate that to English.

# translator = Translator()
# translated_tweet_list2 = translator.translate(tweets_list_processed2, dest='en')

In [293]:
# translated_tweets2 = []

# num_translated_tweets2 = 0 
# for translation in translated_tweet_list2:
#     if translation.origin != translation.text:
#         num_translated_tweets2 += 1
#     translated_tweets2.append(translation.text)
    
# print("The number of tweets that are not in English and have been translated is: ", num_translated_tweets2)
# translated_tweets2

In [294]:
# Read file with the translated tweets (that were generated from above code)

translated_tweets2 = pd.read_csv('translated_mar(2).csv', index_col=0)['0'].tolist()
translated_tweets2[:10]

['The experts on TV have now moved from the R word to the D wordthe recovery shapes have moved from V to U to W but L seems to be the favourite now For us ordinary folk best to take precautions and stay safe while carrying on with our work \nrecession depression COVID19',
 'Cumon people\n\nSocial Distancing Does NOT mean Self Isolation So Please get a grip get outta the House amp most importantlyKeep Exercising\n\n\n\ncovid_19 corona NOTairborn',
 '824am and the train is barely packed at Hougang COVID 19 thursday morning singapore',
 'In time like this BAC will go down mayor covid 19 QC Quezon City',
 'Singapore coronavirus COVID19',
 'i am trying to change my reservation since last 4 days and send you number of emails about it There is no response At least update your site to allow changing the reservation COVID19 Expedia',
 'coronavirus playlist on',
 'Covid19 New restrictive measures at the entrance to Singapore\n\n\n\n\ncoronavirus covid',
 'Shout out to Singapore government for de

In [295]:
tweets_mar2['translated_tweets'] = translated_tweets2
tweets_mar2.to_csv('sg_tweets_covid19_march(2).csv')

print(tweets_mar2.shape)
tweets_mar2.head()

(247, 23)


Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang,translated_tweets
0,1240432252376379393,837548605,2020-03-19T00:17:49Z,SanghiAnand,The “experts” on TV have now moved from the R ...,Twitter for iPhone,,,,False,...,SG,"Central Region, Singapore",admin,129,95,,2012-09-21T11:03:25Z,False,en,The experts on TV have now moved from the R wo...
1,1240432793881997312,76647624,2020-03-19T00:19:58Z,JayMcr,"Cumon people...\n\n""Social Distancing"" Does NO...",Instagram,,,,False,...,SG,"Central Region, Singapore",admin,2294,2400,,2009-09-23T13:49:42Z,False,en,Cumon people\n\nSocial Distancing Does NOT mea...
2,1240433938377539584,90391638,2020-03-19T00:24:31Z,triciapang,824am and the train is barely packed at Hougan...,Twitter for iPhone,,,,False,...,SG,"North-East Region, Singapore",admin,170,171,,2009-11-16T13:22:19Z,False,en,824am and the train is barely packed at Hougan...
3,1240436806396440576,1016482563328602112,2020-03-19T00:35:55Z,3e_like,In time likes this QC will go down. #mayor #co...,Twitter for iPhone,,,,False,...,SG,"North-East Region, Singapore",admin,1,2,,2018-07-10T00:41:43Z,False,en,In time like this BAC will go down mayor covid...
4,1240438146317217794,544789016,2020-03-19T00:41:14Z,onefern,#Singapore #coronavirus #COVID19 https://t.co/...,Twitter for Android,,,,False,...,SG,"Central Region, Singapore",admin,77,325,,2012-04-04T03:01:24Z,False,und,Singapore coronavirus COVID19


### Singapore Tweets (Jan - Early Mar)

In [237]:
# Retrieve all SG tweets from Jan - Early March
tweets_bulk = pd.read_csv('sg_tweets_covid19.csv', index_col=0)

print(tweets_bulk.shape)
tweets_bulk.head()

(6084, 5)


Unnamed: 0,name,texts,timestamp,lang,full text
0,PuteriAmirah01,b'RT @rndsmsl: Kiamat dh dekat knp aku masih m...,1580145448380,in,
1,joseph_yeow,b'RT @new_prykm: This video shows how South Ko...,1580145450152,en,
2,_naufalraffid,b'RT @L0s3r_AT_w0rk: Thomas had never seen suc...,1580145460572,en,
3,FaadehlehHakehm,b'RT @chey_cobb: Chinese officials say this is...,1580145610362,en,
4,miz4h,"b'RT @putridamiaa: spread this, it\xe2\x80\x99...",1580145646308,en,


In [238]:
def get_sgtweet_list(tweetdf):
    
    text_list = tweetdf['texts'].tolist()
    fulltext_list = tweetdf['full text'].tolist()
    num_tweets = len(text_list)

    sgtweets_list = []
    for i in range(0, num_tweets):
        try:
            if math.isnan(fulltext_list[i]):
                sgtweets_list.append(text_list[i])
        except:
            sgtweets_list.append(fulltext_list[i])
            
    return sgtweets_list 

In [239]:
sgtweet_list = get_sgtweet_list(tweets_bulk)
sgtweet_list[:5]

["b'RT @rndsmsl: Kiamat dh dekat knp aku masih malas solat \\xf0\\x9f\\x98\\x94'",
 "b'RT @new_prykm: This video shows how South Korea Airport is dealing with #coronavirus #\\xe0\\xb9\\x82\\xe0\\xb8\\x84\\xe0\\xb9\\x82\\xe0\\xb8\\xa3\\xe0\\xb8\\x99\\xe0\\xb9\\x88\\xe0\\xb8\\xb2\\xe0\\xb9\\x84\\xe0\\xb8\\xa7\\xe0\\xb8\\xa3\\xe0\\xb8\\xb1\\xe0\\xb8\\xaa from spreading.\\n\\n\\xe0\\xb8\\xad\\xe0\\xb8\\xa2\\xe0\\xb8\\xb2\\xe0\\xb8\\x81\\xe0\\xb9\\x80\\xe0\\xb8\\xab\\xe0\\xb9\\x87\\xe0\\xb8\\x99\\xe0\\xb9\\x81\\xe0\\xb8\\x9a\\xe0\\xb8\\x9a\\xe0\\xb8\\x99\\xe0\\xb8\\xb5\\xe0\\xb9\\x89\\xe0\\xb9\\x83\\xe0\\xb8\\x99\\xe0\\xb9\\x84\\xe0\\xb8\\x97\\xe0\\xb8\\xa2\\nCr.\\xe2\\x80\\xa6'",
 "b'RT @L0s3r_AT_w0rk: Thomas had never seen such bullshit before\\n#CoronaOutbreak https://t.co/RYukvuBtzT'",
 "b'RT @chey_cobb: Chinese officials say this is a photo of a new hospital building in #Wuhan.\\n\\nFunny, because a reverse image search reveals t\\xe2\\x80\\xa6'",
 "b'RT @putridamiaa: spread this, it\\

#### Text Preprocessing

In [240]:
# We observed that there are unicode character errors, so this function helps to remove that
def remove_unicode_errors(tweet):
    return re.sub(r'\\\S+', "", tweet)

In [241]:
# Text Preprocessing

sgtweet_list_processed = []
for tweet in sgtweet_list:
    tweet = remove_urls(tweet)
    tweet = remove_usernames(tweet)
    tweet = remove_specialchar(tweet, remove_digits=False)
    tweet = remove_unicode_errors(tweet)
    sgtweet_list_processed.append(tweet)

sgtweet_list_processed[:10]

['bRT  Kiamat dh dekat knp aku masih malas solat ',
 'bRT  This video shows how South Korea Airport is dealing with coronavirus  from spreading',
 'bRT  Thomas had never seen such bullshit before ',
 'bRT  Chinese officials say this is a photo of a new hospital building in Wuhan because a reverse image search reveals t',
 'bRT  spread this it worth to share  coronarvirus ',
 'bRT  Alleged leak from epidemiologist working for Brazil Ministry of Health says like someone had taken different par',
 'bRT  Please do retweet this We need to change our disgusting habits this is a really2 serious matter This is no joke nor a gam',
 'bRT  Wuhan mayor reveals FIVE MILLION residents have left the city before it went into lockdown coronavirusoutbreak ',
 'bRT  Please do retweet this We need to change our disgusting habits this is a really2 serious matter This is no joke nor a gam',
 'bRT  New video form the coronavirus quarantine zone in Wuhan A man demands hospital not to turn away patients His re

#### Translation 

In [242]:
# Number of tweets that are not in English
print("Number of tweets that are not in English is: ", tweets_bulk[tweets_bulk['lang'] != 'en'].shape[0])

Number of tweets that are not in English is:  669


In [243]:
# Note: Below code might hit errors, when GoogleTranslate API limit is reached
# translator = Translator()
# translated_sgtweet_list = translator.translate(sgtweet_list_processed, dest='en')

# translated_tweets_bulk = []
# for translation in translated_sgtweet_list:
#     translated_tweets_bulk.append(translation.text)
    
# translated_tweets_bulk

In [244]:
# Retrieve set of translated tweets for sgtweet_list_processed (tweets were split up and translated with above code due to API limits)

# DataFrame to continue concatenating translated tweets from other files
translated_tweets_bulk = pd.read_csv('translated_tweets_bulk/translated1.csv')

input_file_list = ['translated_tweets_bulk/translated2.csv',
                   'translated_tweets_bulk/translated4.csv',
                   'translated_tweets_bulk/translated5.csv',
                   'translated_tweets_bulk/translated6.csv',
                   'translated_tweets_bulk/translated7.csv',
                   'translated_tweets_bulk/translated8.csv',
                   'translated_tweets_bulk/translated9.csv',
                   'translated_tweets_bulk/translated10.csv',
                   'translated_tweets_bulk/translated11.csv',
                   'translated_tweets_bulk/translated12.csv'
                  ]

counter = 0
for file in input_file_list:
    df = pd.read_csv(file)
    counter += df.shape[0]    
    translated_tweets_bulk = pd.concat([translated_tweets_bulk, df], ignore_index=True)
    
print(translated_tweets_bulk.shape)
translated_tweets_bulk.head()

(6084, 1)


Unnamed: 0,0
0,News Apocalypse is near why I'm still lazy pra...
1,bRT This video shows how South Korea Airport ...
2,bRT Thomas had never seen such bullshit before
3,bRT Chinese officials say this is a photo of ...
4,bRT spread this it worth to share coronavirus


In [245]:
# Further clean the data to remove 'bRT'

translated = translated_tweets_bulk['0'].tolist()
translated_cleaned = []

for tweet in translated:
    tweet = str(tweet)
    tweet = re.sub(r'bRT\s', "", tweet)
    translated_cleaned.append(tweet)
    
translated_tweets_bulk = pd.DataFrame(translated_cleaned)

print(translated_tweets_bulk.shape)
translated_tweets_bulk.head()

(6084, 1)


Unnamed: 0,0
0,News Apocalypse is near why I'm still lazy pra...
1,This video shows how South Korea Airport is d...
2,Thomas had never seen such bullshit before
3,Chinese officials say this is a photo of a ne...
4,spread this it worth to share coronavirus


In [246]:
# Add translated tweets to DataFrame

tweets_bulk['translated_tweets'] = translated_tweets_bulk

print(tweets_bulk.shape)
tweets_bulk.head()

# Export data
tweets_bulk.to_csv('sg_tweets_covid19_bulk.csv')

(6084, 6)


### Combine Twitter Datasets

- Jan to Early March
- March (till 18 March)
- March (from 19 March)

This section combines the three datasets that were processed above

#### Jan to Early Mar

In [299]:
df1 = pd.read_csv('sg_tweets_covid19_bulk.csv', index_col=0)

print(df1.shape)
df1.head()

(6084, 6)


Unnamed: 0,name,texts,timestamp,lang,full text,translated_tweets
0,PuteriAmirah01,b'RT @rndsmsl: Kiamat dh dekat knp aku masih m...,1580145448380,in,,News Apocalypse is near why I'm still lazy pra...
1,joseph_yeow,b'RT @new_prykm: This video shows how South Ko...,1580145450152,en,,This video shows how South Korea Airport is d...
2,_naufalraffid,b'RT @L0s3r_AT_w0rk: Thomas had never seen suc...,1580145460572,en,,Thomas had never seen such bullshit before
3,FaadehlehHakehm,b'RT @chey_cobb: Chinese officials say this is...,1580145610362,en,,Chinese officials say this is a photo of a ne...
4,miz4h,"b'RT @putridamiaa: spread this, it\xe2\x80\x99...",1580145646308,en,,spread this it worth to share coronavirus


In [300]:
# Convert timestamp (in milliseconds) to datetime

df1['timestamp'] = pd.to_datetime(df1['timestamp'], unit='ms')
df1['timestamp'] = df1['timestamp'].apply(lambda x: x.date())
df1.columns = ['username', 'text', 'date', 'lang', 'full text', 'processed_tweets']

print(df1.shape)
df1.head()

(6084, 6)


Unnamed: 0,username,text,date,lang,full text,processed_tweets
0,PuteriAmirah01,b'RT @rndsmsl: Kiamat dh dekat knp aku masih m...,2020-01-27,in,,News Apocalypse is near why I'm still lazy pra...
1,joseph_yeow,b'RT @new_prykm: This video shows how South Ko...,2020-01-27,en,,This video shows how South Korea Airport is d...
2,_naufalraffid,b'RT @L0s3r_AT_w0rk: Thomas had never seen suc...,2020-01-27,en,,Thomas had never seen such bullshit before
3,FaadehlehHakehm,b'RT @chey_cobb: Chinese officials say this is...,2020-01-27,en,,Chinese officials say this is a photo of a ne...
4,miz4h,"b'RT @putridamiaa: spread this, it\xe2\x80\x99...",2020-01-27,en,,spread this it worth to share coronavirus


In [301]:
# Extract required columns only

df1_new = df1[['username', 'text', 'date', 'processed_tweets']].copy()

print(df1_new.shape)
df1_new.head()

(6084, 4)


Unnamed: 0,username,text,date,processed_tweets
0,PuteriAmirah01,b'RT @rndsmsl: Kiamat dh dekat knp aku masih m...,2020-01-27,News Apocalypse is near why I'm still lazy pra...
1,joseph_yeow,b'RT @new_prykm: This video shows how South Ko...,2020-01-27,This video shows how South Korea Airport is d...
2,_naufalraffid,b'RT @L0s3r_AT_w0rk: Thomas had never seen suc...,2020-01-27,Thomas had never seen such bullshit before
3,FaadehlehHakehm,b'RT @chey_cobb: Chinese officials say this is...,2020-01-27,Chinese officials say this is a photo of a ne...
4,miz4h,"b'RT @putridamiaa: spread this, it\xe2\x80\x99...",2020-01-27,spread this it worth to share coronavirus


#### March (till 18 Mar)

In [302]:
df2 = pd.read_csv('sg_tweets_covid19_march.csv', index_col=0)

print(df2.shape)
df2.head()

(356, 23)


Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang,translated_text
0,1235359346659217408,4373254527,2020-03-05T00:19:54Z,AquibIkubal,Good morning. Crowded metro. Fully packed. Sin...,Twitter for Android,,,,False,...,SG,"Central Region, Singapore",admin,32,91,,2015-12-04T15:15:45Z,False,en,Good morning Crowded metro Fully packed Singap...
1,1235550008314744833,45325484,2020-03-05T12:57:32Z,bucksteeth,#pohchaipills is life. Fuck you #Covid_19 http...,Twitter for Android,,,,False,...,SG,"East Region, Singapore",admin,124,448,,2009-06-07T11:41:59Z,False,en,pohchaipills is life Fuck you Covid_19
2,1235578864945913856,836488722157744130,2020-03-05T14:52:12Z,SulingLinCNA,Hmm not many good options to pump the economy ...,Twitter for iPhone,,,,False,...,SG,"Central Region, Singapore",admin,143,143,,2017-02-28T08:10:27Z,False,en,Hmm not many good options to pump the economy ...
3,1235589693913190400,355284635,2020-03-05T15:35:13Z,ArunPrasanth_R,#Covid_19 #CoronavirusOutbreak #COVID19india ...,Twitter for iPhone,,,,True,...,SG,"West Region, Singapore",admin,103,209,,2011-08-15T03:32:43Z,False,en,Covid_19 CoronavirusOutbreak COVID19india STA...
4,1235716860881993729,282135000,2020-03-06T00:00:32Z,danielfyork,Didn’t think I’d be saying this a few weeks ag...,Twitter for iPhone,,,,False,...,SG,"Central Region, Singapore",admin,6251,2054,,2011-04-14T16:24:28Z,False,en,Didnt think Id be saying this a few weeks ago ...


In [303]:
# Extract required columns only

df2_new = df2[['screen_name', 'text', 'created_at', 'translated_text']].copy()
df2_new.columns = ['username', 'text', 'date', 'processed_tweets']

print(df2_new.shape)
df2_new.head()

(356, 4)


Unnamed: 0,username,text,date,processed_tweets
0,AquibIkubal,Good morning. Crowded metro. Fully packed. Sin...,2020-03-05T00:19:54Z,Good morning Crowded metro Fully packed Singap...
1,bucksteeth,#pohchaipills is life. Fuck you #Covid_19 http...,2020-03-05T12:57:32Z,pohchaipills is life Fuck you Covid_19
2,SulingLinCNA,Hmm not many good options to pump the economy ...,2020-03-05T14:52:12Z,Hmm not many good options to pump the economy ...
3,ArunPrasanth_R,#Covid_19 #CoronavirusOutbreak #COVID19india ...,2020-03-05T15:35:13Z,Covid_19 CoronavirusOutbreak COVID19india STA...
4,danielfyork,Didn’t think I’d be saying this a few weeks ag...,2020-03-06T00:00:32Z,Didnt think Id be saying this a few weeks ago ...


In [304]:
# Extract date only from 'date' column

df2_new['date'] = df2_new['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))
df2_new['date'] = df2_new['date'].apply(lambda x: x.date())
df2_new.head()

Unnamed: 0,username,text,date,processed_tweets
0,AquibIkubal,Good morning. Crowded metro. Fully packed. Sin...,2020-03-05,Good morning Crowded metro Fully packed Singap...
1,bucksteeth,#pohchaipills is life. Fuck you #Covid_19 http...,2020-03-05,pohchaipills is life Fuck you Covid_19
2,SulingLinCNA,Hmm not many good options to pump the economy ...,2020-03-05,Hmm not many good options to pump the economy ...
3,ArunPrasanth_R,#Covid_19 #CoronavirusOutbreak #COVID19india ...,2020-03-05,Covid_19 CoronavirusOutbreak COVID19india STA...
4,danielfyork,Didn’t think I’d be saying this a few weeks ag...,2020-03-06,Didnt think Id be saying this a few weeks ago ...


#### March (from 19 Mar)

In [305]:
df3 = pd.read_csv('sg_tweets_covid19_march(2).csv', index_col=0)

print(df3.shape)
df3.head()

(247, 23)


Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang,translated_tweets
0,1240432252376379393,837548605,2020-03-19T00:17:49Z,SanghiAnand,The “experts” on TV have now moved from the R ...,Twitter for iPhone,,,,False,...,SG,"Central Region, Singapore",admin,129,95,,2012-09-21T11:03:25Z,False,en,The experts on TV have now moved from the R wo...
1,1240432793881997312,76647624,2020-03-19T00:19:58Z,JayMcr,"Cumon people...\n\n""Social Distancing"" Does NO...",Instagram,,,,False,...,SG,"Central Region, Singapore",admin,2294,2400,,2009-09-23T13:49:42Z,False,en,Cumon people\n\nSocial Distancing Does NOT mea...
2,1240433938377539584,90391638,2020-03-19T00:24:31Z,triciapang,824am and the train is barely packed at Hougan...,Twitter for iPhone,,,,False,...,SG,"North-East Region, Singapore",admin,170,171,,2009-11-16T13:22:19Z,False,en,824am and the train is barely packed at Hougan...
3,1240436806396440576,1016482563328602112,2020-03-19T00:35:55Z,3e_like,In time likes this QC will go down. #mayor #co...,Twitter for iPhone,,,,False,...,SG,"North-East Region, Singapore",admin,1,2,,2018-07-10T00:41:43Z,False,en,In time like this BAC will go down mayor covid...
4,1240438146317217794,544789016,2020-03-19T00:41:14Z,onefern,#Singapore #coronavirus #COVID19 https://t.co/...,Twitter for Android,,,,False,...,SG,"Central Region, Singapore",admin,77,325,,2012-04-04T03:01:24Z,False,und,Singapore coronavirus COVID19


In [306]:
# Extract required columns only

df3_new = df3[['screen_name', 'text', 'created_at', 'translated_tweets']].copy()
df3_new.columns = ['username', 'text', 'date', 'processed_tweets']

print(df3_new.shape)
df3_new.head()

(247, 4)


Unnamed: 0,username,text,date,processed_tweets
0,SanghiAnand,The “experts” on TV have now moved from the R ...,2020-03-19T00:17:49Z,The experts on TV have now moved from the R wo...
1,JayMcr,"Cumon people...\n\n""Social Distancing"" Does NO...",2020-03-19T00:19:58Z,Cumon people\n\nSocial Distancing Does NOT mea...
2,triciapang,824am and the train is barely packed at Hougan...,2020-03-19T00:24:31Z,824am and the train is barely packed at Hougan...
3,3e_like,In time likes this QC will go down. #mayor #co...,2020-03-19T00:35:55Z,In time like this BAC will go down mayor covid...
4,onefern,#Singapore #coronavirus #COVID19 https://t.co/...,2020-03-19T00:41:14Z,Singapore coronavirus COVID19


In [307]:
# Extract date only from 'date' column

df3_new['date'] = df3_new['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))
df3_new['date'] = df3_new['date'].apply(lambda x: x.date())
df3_new.head()

Unnamed: 0,username,text,date,processed_tweets
0,SanghiAnand,The “experts” on TV have now moved from the R ...,2020-03-19,The experts on TV have now moved from the R wo...
1,JayMcr,"Cumon people...\n\n""Social Distancing"" Does NO...",2020-03-19,Cumon people\n\nSocial Distancing Does NOT mea...
2,triciapang,824am and the train is barely packed at Hougan...,2020-03-19,824am and the train is barely packed at Hougan...
3,3e_like,In time likes this QC will go down. #mayor #co...,2020-03-19,In time like this BAC will go down mayor covid...
4,onefern,#Singapore #coronavirus #COVID19 https://t.co/...,2020-03-19,Singapore coronavirus COVID19


#### Merging datasets

In [309]:
# Combining the datasets above

combined_tweets1 = pd.concat([df1_new, df2_new], ignore_index=True)
combined_tweets2 = pd.concat([combined_tweets1, df3_new], ignore_index=True)

print(combined_tweets2.shape)
combined_tweets2.head()

(6687, 4)


Unnamed: 0,username,text,date,processed_tweets
0,PuteriAmirah01,b'RT @rndsmsl: Kiamat dh dekat knp aku masih m...,2020-01-27,News Apocalypse is near why I'm still lazy pra...
1,joseph_yeow,b'RT @new_prykm: This video shows how South Ko...,2020-01-27,This video shows how South Korea Airport is d...
2,_naufalraffid,b'RT @L0s3r_AT_w0rk: Thomas had never seen suc...,2020-01-27,Thomas had never seen such bullshit before
3,FaadehlehHakehm,b'RT @chey_cobb: Chinese officials say this is...,2020-01-27,Chinese officials say this is a photo of a ne...
4,miz4h,"b'RT @putridamiaa: spread this, it\xe2\x80\x99...",2020-01-27,spread this it worth to share coronavirus


In [310]:
# Export combined data
combined_tweets2.to_csv('combined_sg_tweets.csv')

In [311]:
import dill

In [312]:
#save notebook session
dill.dump_session('datacleaning_env.db')

In [154]:
#restore notebook session
dill.load_session('datacleaning_env.db')