In [109]:
import re
import pandas as pd

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# parameters for function
name = 'GovCanHealth'
lang = 'english'

def preprocessing(text):
    text = re.sub('((www\.[\s]+)|(https?://[^\s]+))', ' ', text)
    text = re.sub('@[A-Za-z0-9_-]+', ' ', text)
    text = re.sub('RT', ' ', text)
    text = re.sub('#', ' ', text)
    text = re.sub('[^\w\s]', ' ', text)
    text = re.sub('[0-9]', ' ', text) # replace numbers
    text = re.sub('\s\s+', ' ', text) # replace several spaces
    return text

original_tweets = []
documents = []

f = pd.read_json('data/qcri/' + name +'_geo.json', lines=True)
for line in f['text']:
    original_tweets.append(line)
    documents.append(preprocessing(line))

try:    
    f = pd.read_csv('data/out/' + name + '.csv', 'rb', delimiter = '\t')
    for line in f['text']:
        original_tweets.append(line)
        documents.append(preprocessing(line))
except:
    print('No file in WHO data')


# source — https://pythonprogramminglanguage.com/kmeans-text-clustering/
stopwordsList = set(stopwords.words(lang)) # load list of stopwords of target language
stopwordsList.add('covid') # here we add some covid-related words
stopwordsList.add('covid19')
stopwordsList.add('corona')
stopwordsList.add('coronavirus')
stopwordsList.add('covid-19')
stopwordsList.add('amp')
stopwordsList.add('via')


vectorizer = TfidfVectorizer(stop_words=stopwordsList)
X = vectorizer.fit_transform(documents)

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    text = ''
    for ind in order_centroids[i, :15]:
        text = text + str(terms[ind]) + ', '
        #print(' %s' % terms[ind])
    print(text)


Top terms per cluster:
Cluster 0:
canada, health, cdnpoli, pandemic, people, public, us, new, test, masks, today, help, please, need, approved, 
Cluster 1:
please, editing, oph, tweets, btw, ant, impo, rather, posting, helpful, calling, tell, due, someone, updated, 
Cluster 2:
sources, information, info, consult, like, trustworthy, circulating, questionable, miracle, vaccinations, faster, internet, beware, cures, offers, 
Cluster 3:
epidemic, still, deck, releasing, strong, controls, models, technical, briefing, population, link, model, looking, deaths, thanks, 
Cluster 4:
broadcast, facebook, channel, phac, edt, officials, live, pm, healthy, situation, update, canadians, today, senior, canada, 


In [110]:
freq = {}

for d in documents:
    Y = vectorizer.transform([d])
    prediction = model.predict(Y)
    if str(prediction) not in freq:
        freq[str(prediction)] = 1
    else:
        freq[str(prediction)] += 1

freq

{'[2]': 1293, '[0]': 17813, '[3]': 222, '[1]': 477, '[4]': 758}

In [115]:
tweets = []
nums = []
for num, d in enumerate(documents):
    Y = vectorizer.transform([d])
    prediction = model.predict(Y)
    if len(tweets) < 20 and prediction == [4]:
        if str([d])[2:-2] not in tweets:
            tweets.append(str([d])[2:-2])
            nums.append(num)

tweets = []
for n in nums:
    tweets.append(original_tweets[n])

for t in sorted(tweets):
    print(t)
    print('- - - - - - - - - -')

An update on #COVID19 situation in Canada with #PHAC officials will be broadcast live today at 3:00PM EDT, on @GovCanHealth and on the Healthy Canadians Facebook channel: https://t.co/655ckkbYDX
- - - - - - - - - -
An update on #COVID19 situation in Canada with #PHAC senior officials will be broadcast live today at 12 PM EDT, on @GovCanHealth and on the Healthy Canadians Facebook channel: https://t.co/655ckkbYDX
- - - - - - - - - -
An update on #COVID19 situation in Canada with #PHAC senior officials will be broadcast live today at 12:00 EDT, on @GovCanHealth and on the Healthy Canadians Facebook channel: https://t.co/655ckkbYDX
- - - - - - - - - -
An update on #COVID19 situation in Canada, including #PHAC senior officials will be broadcast live today at 1:30PM EDT, on @GovCanHealth and on the Healthy Canadians Facebook channel: https://t.co/655ckkbYDX
- - - - - - - - - -
