In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gensim as gensim
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import re

import json

### Read and retrieve tweets from Singapore

In [None]:
# Read twitter data

tweets_data_path = 'coronavirus_tweets_20200203.txt'

def get_tweetlist(filepath):
    
    tweets_file = open(filepath, 'r')
    tweets_list = []

    for line in tweets_file:
        if len(line.strip()) > 0:
            tweet = json.loads(line) # each line is a JSON object
            if 'text' in tweet:
                tweets_list.append(tweet)
    
    return tweets_list

In [4]:
tweets_list = get_tweetlist(tweets_data_path)

In [5]:
#sample tweet
# tweets_list[5]

In [6]:
# Filter and retrieve tweets from Singapore, and return selected fields in a dataframe

def get_sgtweet_df(tweetlist):
    
    tweets_df = pd.DataFrame()

    tweets_df['text'] = list(map(lambda tweet: tweet['text'], tweets_list))

    tweets_df['lang'] = list(map(lambda tweet: tweet['lang'], tweets_list))

    tweets_df['country_code'] = list(map(lambda tweet: tweet['place']['country_code']
                            if tweet['place'] != None
                            else
                                None,
                        tweets_list))

    tweets_df['location'] = list(map(lambda tweet: tweet['user']['location']
                            if tweet['user'] != None
                            else
                                None,
                        tweets_list))

    tweets_df['timestamp'] = list(map(lambda tweet: tweet['timestamp_ms'], tweets_list))

    fulltext_list = []
    for tweet in tweets_list:
        try:
            fulltext_list.append(tweet['extended_tweet']['full_text'])
        except KeyError:
            fulltext_list.append(None)
    tweets_df['full text'] = fulltext_list 
    
    
    #get SG tweets only
    tweets_sg = tweets_df.loc[ (tweets_df['location'].str.contains('Singapore', na=False)) | (tweets_df['country_code'] == 'SG') ].copy()
    tweets_sg['timestamp'] = pd.to_datetime(tweets_sg['timestamp'], unit='ms')
    tweets_sg.head()
    
    return tweets_sg

In [7]:
sgtweets_df = get_sgtweet_df(tweets_list)
sgtweets_df.head()

Unnamed: 0,text,lang,country_code,location,timestamp,full text
152,RT @MoneyTalkR3: Despite the liquidity support...,en,,Singapore,2020-02-03 02:37:16.400,
672,RT @haloefekti: #CoronavirusOutbreak Russians ...,en,,Singapore,2020-02-03 02:38:47.464,
1519,RT @Indounik: While #coronavirus is on the min...,en,,"Outram Road, Singapore",2020-02-03 02:41:05.296,
1582,They must be law enforcement personnel. This i...,en,SG,India,2020-02-03 02:41:15.373,
1894,Hopefully this one succeeds. 🙏🏻#coronavirus\n#...,en,SG,Singapore,2020-02-03 02:42:14.787,


In [8]:
# Returns Singapore tweets (text field) as a list, for text preprocessing and analysis

def get_sgtweet_list(tweetdf):
    
    text_list = tweetdf['text'].tolist()
    fulltext_list = tweetdf['full text'].tolist()
    num_tweets = len(text_list)

    sgtweets_list = []
    for i in range(0, num_tweets):
        if fulltext_list[i] == None:
            sgtweets_list.append(text_list[i])
        else:
            sgtweets_list.append(fulltext_list[i])
            
    return sgtweets_list 

In [9]:
sg_tweets = get_sgtweet_list(sgtweets_df)
sg_tweets[:5]

['RT @MoneyTalkR3: Despite the liquidity support from the #PBOC and reverse repo cuts in #China the yuan has weakened below 7 versus the USD.…',
 'RT @haloefekti: #CoronavirusOutbreak Russians evacuated from China https://t.co/ubRZ2fmOIX',
 'RT @Indounik: While #coronavirus is on the minds of many in Bali where there have been no confirmed cases, Balinese animal health authoriti…',
 "They must be law enforcement personnel. This is what's fake news &amp; Rumor mongering.",
 'Hopefully this one succeeds. 🙏🏻#coronavirus\n#CoronavirusOutbreak \n\nhttps://t.co/Hu4waCjEEK']

### Text preprocessing

In [19]:
from nltk.tokenize import RegexpTokenizer

#splits string into substrings using a regular expression
#tokenises words that contain 1 or more (+) alphanumeric characters (\w) or '@'
retokenizer = RegexpTokenizer(r'[\w@]+')

# create stop words list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# add twitter specific stop words
stop_words = stop_words + ["rt", "got", "thats", "would", "going", "u", "get", "also", "one", "could", "said", "like", "via"]

# create lemmatizer
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def remove_urls(s):
    # re.sub(pattern,repl,string) is used to replace substrings. Will replace the matches in string with repl
    return re.sub(r'https?://\S+', "", s)

def remove_usernames(s):
    return re.sub(r'@\S+', "", s)

def remove_specialchar(text, remove_digits=False):    
    if not remove_digits:
        pattern = r'[^a-zA-z0-9\s]'
    else:
        pattern = r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [20]:
def process_text(tweetlist):
    processed_docs = []
    for doc in tweetlist:
        doc = doc.lower()
        doc = remove_urls(doc)
        doc = remove_usernames(doc)
        doc = remove_specialchar(doc, remove_digits=False)
        doc_tokens = retokenizer.tokenize(doc)
        doc_tokens = [token for token in doc_tokens if token not in stop_words]
        doc = ' '.join(doc_tokens)
        processed_docs.append(doc)
        
    return processed_docs        

In [22]:
processed_tweets = process_text(sg_tweets)
processed_tweets

['despite liquidity support pboc reverse repo cuts china yuan weakened 7 versus usd',
 'coronavirusoutbreak russians evacuated china',
 'coronavirus minds many bali confirmed cases balinese animal health authoriti',
 'must law enforcement personnel whats fake news amp rumor mongering',
 'hopefully succeeds coronavirus coronavirusoutbreak',
 'coronavirus train italy teenage chinese boy boards train woman comments loudly go',
 'chinas futures market slumped open mon iron ore screw thread crude oil palm oil eggs contracts fa',
 'novel coronavirus likely much wider deeper impact chinas economy trade singapore rest',
 'wrong real coronavirus number went model yesterday im shocked extended vacation immediately saw green prediction curve wise inflection point coming soon take care everyone',
 'coronavirus concerns may cause grab taxi late days warned coronavirus grab',
 'novel coronavirus likely much wider deeper impact chinas economy trade singapore rest',
 'thai doctors say key treating cor