In [1]:
import pandas as pd
import numpy as np
import json
import re

from pymongo import MongoClient

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
#### Define pre-processing subfunctions ####

def removeNonAlphaNumChars(tweets):
    """Strip punctuation from every tweet.

    Each hyphen is turned into a single space (so hyphenated words split into
    separate tokens); every remaining character that is neither a regex word
    character (``\\w``, which includes the underscore) nor whitespace is
    deleted.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A new list with one cleaned string per input tweet.
    """
    cleaned = []
    for text in tweets:
        dehyphenated = re.sub(r'\-', r' ', text)  # replace - with SPACE
        cleaned.append(re.sub(r'[^\w\s]', r'', dehyphenated))  # drop remaining punctuation
    return cleaned


# Taken from https://github.com/Deffro/text-preprocessing-techniques
def removeUnicode(tweets):
    r"""Drop escaped-unicode text and non-ASCII characters from every tweet.

    Two substitutions are applied to each tweet, in this order:
      1. literal escape text of the form ``\u`` followed by hex digits
         (e.g. the six characters ``\u002c``) is removed;
      2. every character outside the ASCII range 0x00-0x7f is removed.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A new list with one cleaned string per input tweet.
    """
    result = []
    for text in tweets:
        without_escapes = re.sub(r'(\\u[0-9A-Fa-f]+)', r'', text)
        result.append(re.sub(r'[^\x00-\x7f]', r'', without_escapes))
    return result
def replaceURL(tweets):
    """Replace every URL in each tweet with the placeholder token "URL".

    A URL is any whitespace-delimited run that starts with ``www.``,
    ``http://`` or ``https://``. Nothing else in the tweet is altered; in
    particular hashtags are left alone (stripping them is the job of
    removeHashtagInFrontOfWord).

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A new list with one string per input tweet.
    """
    # Raw string: "\." and "\s" are regex escapes, not Python string escapes.
    url_pattern = r'((www\.[^\s]+)|(https?://[^\s]+))'
    return [re.sub(url_pattern, 'URL', tweet) for tweet in tweets]
def replaceAtUser(tweets):
    """Replace every "@..." run in each tweet with the placeholder "atUser".

    The match starts at "@" and extends to the next whitespace character, so
    punctuation attached to a mention (e.g. the colon in "@user:") is consumed
    together with it.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A new list with one string per input tweet.
    """
    # Raw string so that "\s" reaches the regex engine instead of being an
    # invalid Python string escape (which triggers a warning on modern Python).
    mention_pattern = r'@[^\s]+'
    return [re.sub(mention_pattern, 'atUser', tweet) for tweet in tweets]
def removeHashtagInFrontOfWord(tweets):
    """Strip the leading "#" from hashtags, keeping the tagged word.

    A "#" is removed only when it is immediately followed by at least one
    non-whitespace character; a lone "#" is left in place.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A new list with one string per input tweet.
    """
    stripped = []
    for text in tweets:
        stripped.append(re.sub(r'#([^\s]+)', r'\1', text))
    return stripped
# end



#### DEFINE MAIN PREPROCESSING FUNCTIONS ####


def preprocessTweets(tweets):
    """Run the full text-cleaning pipeline over a collection of tweets.

    Tweets are lower-cased first, then passed through each cleaning step in a
    fixed order: URL placeholder, @user placeholder, hashtag stripping,
    punctuation removal, and finally unicode/non-ASCII removal. The
    placeholder steps run before punctuation removal, so URLs and mentions are
    replaced while their ":", "/", "." and "@" characters are still present.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A list of cleaned tweet strings, one per input tweet.
    """
    cleaned = [text.lower() for text in tweets]  # convert to lowercase
    cleaning_steps = (
        replaceURL,                  # replace URLs
        replaceAtUser,               # replace @user
        removeHashtagInFrontOfWord,  # remove hashtag
        removeNonAlphaNumChars,      # remove non alphanumeric characters
        removeUnicode,               # remove other random unicode strings
    )
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

def tokenize(tweets):
    """Split every tweet into word tokens with NLTK's ``word_tokenize``.

    Args:
        tweets: iterable of (already cleaned) tweet strings.

    Returns:
        A list with one token list per input tweet.
    """
    tokenized = []
    for text in tweets:
        tokenized.append(nltk.word_tokenize(text))
    return tokenized

def getFinalTokens(tokens, stop_words=None, lemma_fn=None):
    """Filter and lemmatize tokenized tweets.

    A word is kept only if it is longer than two characters and is not a stop
    word; each kept word is then lemmatized. The stop-word test is applied to
    the word as given, before lemmatization.

    Args:
        tokens: iterable of token lists, one per tweet.
        stop_words: optional iterable of words to drop. Defaults to the
            module-level ``stoplist``.
        lemma_fn: optional callable mapping a word to its lemma. Defaults to
            the ``lemmatize`` method of the module-level ``lemmatizer``.

    Returns:
        A list with one list of lemmatized tokens per input tweet.
    """
    # Fall back to the notebook globals so existing one-argument calls behave
    # exactly as before.
    if stop_words is None:
        stop_words = stoplist
    if lemma_fn is None:
        lemma_fn = lemmatizer.lemmatize
    # Build the lookup set once: membership tests against a list are linear
    # in the stop-word count and would be repeated for every word.
    stop_set = frozenset(stop_words)
    return [
        [lemma_fn(word) for word in doc if len(word) > 2 and word not in stop_set]
        for doc in tokens
    ]

In [3]:
#### Load data ####

# with open("covidtrack_50K.json") as f:
#     data = json.load(f)

client = MongoClient('localhost', 27017)
db = client.csm2020
collection = db.assignment3
# Only the tweet text is used downstream, so ask MongoDB for that field alone
# (and suppress _id) instead of pulling every field of every document into
# memory and discarding them afterwards.
df = pd.DataFrame(list(collection.find({}, {"text": 1, "_id": 0})))[["text"]]
print(df.head())

# Plain Python list of tweet strings for the preprocessing functions.
tweets = df["text"].values.tolist()
print("\nNumber of tweets: ", len(tweets))

                                                text
0  DAY-7\n\n#lockdown in #Lagos #Abuja #Nigeria a...
1  RT @Astartiel: Now that we know for a fact sad...
2  RT @2013Boodicca: The first UK conviction for ...
3  RT @PDChina: Made it! A 104-yr-old World War I...
4  RT @DarrenPlymouth: UK #coronavirus statistics...

Number of tweets:  50000


In [4]:
#### Preprocess tweets ####
# Extra stop words on top of NLTK's English list: the placeholder tokens
# inserted during preprocessing plus the covid terms, which are excluded
# because covid is the main topic of every tweet.
other_stopwords = "atUser URL RT covid coronavirus covid19 covid_19 covid-19"
stoplist = [*stopwords.words('english'), *other_stopwords.split()]
lemmatizer = WordNetLemmatizer()  # set lemmatizer

# clean -> tokenize -> filter + lemmatize
tweets_clean = preprocessTweets(tweets)
tokens = tokenize(tweets_clean)
final_tokens = getFinalTokens(tokens)

# Optional inspection of the intermediate stages:
# print("\nTweets: ", tweets[:10])
# print("\nCleaned tweets: ", tweets_clean[:10])
# print("\nTokens: ", tokens[:10])
print("\nFinal tokens: ", final_tokens[:10])



Final tokens:  [['day', 'lockdown', 'lagos', 'abuja', 'nigeria', 'preventive', 'measure', 'combating', 'team', 'deci'], ['know', 'fact', 'sadly', 'crazy', 'transfer', 'even', 'asymptomatically', 'pet', 'wildlife'], ['first', 'conviction', 'breaching', 'lockdown', 'restriction', 'maria', 'dinou', 'black', 'woman', 'york', 'fined'], ['made', '104', 'old', 'world', 'war', 'veteran', 'state', 'oregon', 'become', 'oldest', 'known', 'survivor'], ['statistic', '3802', 'new', 'case', '51608', 'total', '439', 'new', 'death', '5373', 'total', 'dead', 'stay', 'safe'], ['regime', 'allocating', 'fund', 'irgc', 'instead', 'addressing', 'iran'], ['together', 'get', 'together', 'ssot', 'southsudan'], ['joke', 'candidate', 'biden', 'doesnt', 'even', 'know', 'message', 'response', 'listen', 'inter'], ['1918', 'flu', 'teach', 'economy'], ['monday', 'evening', 'update', 'european', 'call', 'galway', 'medical', 'grad', 'public', 'health', 'emergency', 'team', 'meeting', 'engaging']]


In [11]:
#### Perform LDA ####

def dummy(doc):
    """Identity function: return ``doc`` unchanged.

    Passed to CountVectorizer as both ``tokenizer`` and ``preprocessor`` below,
    so the vectorizer consumes the already-tokenized documents as they are.
    """
    return doc

# Documents are already token lists, so tokenizing and preprocessing are
# identity functions. Terms appearing in more than 90% of documents or in
# fewer than 10 documents are dropped.
vectorizer = CountVectorizer(tokenizer=dummy,preprocessor=dummy,max_df=0.9, min_df=10)

tfreq = vectorizer.fit_transform(final_tokens)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    tfreq_fNames = vectorizer.get_feature_names_out().tolist()
else:
    tfreq_fNames = vectorizer.get_feature_names()
# print(tfreq_fNames)

nTopics = 20
# Fixed random_state so the fitted topics are reproducible across runs.
model = LatentDirichletAllocation(n_components=nTopics, random_state=0)
model.fit(tfreq)

def display_topic_names(model, feature_names, no_top_words):  # adapted from https://ourcodingclub.github.io/tutorials/topic-modelling-python/
    """Tabulate the highest-weighted words of every topic.

    Args:
        model: fitted topic model exposing ``components_``, iterated as one
            weight vector per topic.
        feature_names: sequence mapping a vocabulary index to its word.
        no_top_words: number of words to list per topic.

    Returns:
        pandas.DataFrame with one column per topic ("Topic <i> words"), rows
        ordered from the highest-weighted word downwards.
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice walks back from the end
        # and yields the no_top_words largest weights, largest first.
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)


def display_topic_weights(model, feature_names, no_top_words):  # adapted from https://ourcodingclub.github.io/tutorials/topic-modelling-python/
    """Tabulate the weights of the highest-weighted words of every topic.

    Companion to display_topic_names: same ordering and shape, but each cell
    holds the word's weight formatted to one decimal place.

    Args:
        model: fitted topic model exposing ``components_``, iterated as one
            weight vector per topic.
        feature_names: accepted for signature parity with
            display_topic_names; not used.
        no_top_words: number of weights to list per topic.

    Returns:
        pandas.DataFrame with one column per topic ("Topic <i> weights").
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

nTopWords = 10  # number of words listed per topic

# One column per topic, rows ordered from the highest-weighted word downwards.
topicWords = display_topic_names(
    model,
    feature_names=tfreq_fNames,
    no_top_words=nTopWords,
)
# Optional: export the table for later reference.
# topicWords.to_csv("words_perTopic.csv", encoding = "utf-8")
print(topicWords)

  Topic 0 words Topic 1 words Topic 2 words Topic 3 words Topic 4 words  \
0         death      pandemic           new      pandemic           amp   
1         south           lot    government          weve         would   
2         korea       symptom           say         point           day   
3          rate           day          case          play          help   
4           per      fighting           job       feeling       country   
5         china           due         death      watching        people   
6        global          give          done       arrived          news   
7       country         share        spread         exact           use   
8       another      hospital      american           may          died   
9         caput         today          poll         today           big   

  Topic 5 words Topic 6 words Topic 7 words Topic 8 words Topic 9 words  \
0          case         press      positive         death          stay   
1         death         

## Selected topics (3), semantic names, and top 5 words:

- Topic 9: __Situation in the UK__ (stay, boris, johnson, minister, getting)
- Topic 8: __News headlines__ (death, number, toll, breaking, recorded)
- Topic 6: __Media/press communications__ (press, live, pandemic, fight, briefing)

## Discussion:
None of the 20 topics stood out as being particularly surprising, but it was quite difficult to interpret and characterise many of the topics. First, I felt that some topics were not "distinct" (i.e. overlapped with one or more other topics). For example, Topics 5 and 8 are similar and could be characterised as "Headlines". Similarly, Topics 6 and 12 are also comparable, with both touching on the subject of media/press communications. Overall, a few key ideas could be inferred from the 20 topics, but I would say that assuming 20 different latent topics exist might be a bit of a reach. Moreover, all the tweets were pre-filtered and talk about the same global topic: Covid-19. Thus, compared to the examples shown in the lecture slides, the "quality" of each topic identified via LDA is not at the same level. 

I found that topics relating to specific geographic locations were the clearest. For example, Topics 0, 9, and 19 quite clearly suggest Covid-19 subtopics that pertain specifically to South Korea, the UK, and the US respectively. 

Finally, there were also a few topics (e.g. Topics 3, 4, 10, and 16) that contained words that I could not fully comprehend (e.g. weve, amp, http, nh). Clearly, "http" comes from URL strings in the tweets; but given that the pre-processing procedure removed URLs, I was surprised to see that "http" still appeared as a top keyword, and I can't explain this occurrence...  
