In [1]:
# Import Python's json Encoder and Decoder
import json
from pprint import pprint # Pretty Print

# Parsing of json file as one list
# Name of file which combines location information with validated dengue tweets: 'tweet_master_data.json'
data = []
with open('tweet_master_data.json') as jdata:
    data = json.load(jdata)
    jdata.close()

## Example of a tweet data point

In [2]:
print('Example of a tweet data point')
pprint(data[0])

Example of a tweet data point
{'_id': '299714679532556288',
 'cc': 'BR',
 'country': 'Brasil',
 'county': 'Rondonópolis',
 'cr': '2013-02-07T22:02:02',
 'edits': [{'curator_id': '540e300a7673826b17a5604c',
            'date': '2015-07-06T01:18:27.900000',
            'field': 'tags',
            'new': 1},
           {'curator_id': '555232868624c82a1c6d2ca3',
            'date': '2015-07-06T17:40:18.407000',
            'field': 'tags',
            'new': 1,
            'old': 1}],
 'f': 'tw2013272123',
 'lang': 'pt',
 'loc': ' MT / PR',
 'microregion': 'Microrregião de Rondonópolis',
 'p': '48401b8f7232dfb8',
 'pln': -54.607,
 'plt': -16.572,
 'region': 'Região Centro-Oeste',
 'state': 'MT',
 't': 'Dengue 😫',
 'tags': {'540e300a7673826b17a5604c': 1, '555232868624c82a1c6d2ca3': 1},
 'tln': -54.649,
 'tlt': -16.463,
 'uid': '419780633',
 'v': True}


Each tweet comprises of several fields. Tweets are stored as a list of dictionaries in the json file 

### Keys in json file
* 'v' : validated (true/false)
* 'tags' : dictionary with key (curator ID) and value (label).  The labels are as follows: 1=junk, 2=report, 3=sickness
* 'edits' : a dictionary that keeps track of all tags applied by curators.  Not really useful unless you want to see if somebody is re-rating tweets or if they erase tags by accident.  
* "_id" : tweet ID (also the object ID for the mongo db)
* "lang" : language of tweet
* "loc" : user-entered location name
* "plt" : profile latitude coordinates
* "pln" : profile longitude
* "uid" : twitter user id
* "tlt" : tweet latitude
* "tln" : tweet longitude
* "cc" : country code
* "f" : our own backup coding
* "p" : twitter place ID (not sure if these can be looked up somehow via twitter) 
* "t" : tweet text
* "acr": time of the user’s account creation in UTC
* "cr" : time of the tweet in UTC 
* "flrs": number of followers
* "flng" : number of accounts following (friends)

In [3]:
# Check the size of dataset
print("Number of data points (tweets) = ", end = '')
print(len(data))

# EXtract portuguese tweets only from the dataset, since we are performing this analysis for Brazil
pt_tweets = []
for tweet in data:
    # Portuguese Tweets are encoded as 'pt' and spanish tweets are encoded as 'es' under the key 'lang'
    if tweet['lang'] == 'pt':
        pt_tweets.append(tweet)

print("Number of portuguese tweets = ", end = '')
print(len(pt_tweets))
# Release memory of redundant variables
del data

# The three types of tags used in manual classification of the tweets are 'report', 'junk' and 'sickness'
# Extraction of tweets tagged as junk(1) or sickness(3) only
pt13_tweets = []
for tweet in pt_tweets:
    # Identify curator ids of each tweet to refer their tags
    editlen = len(tweet['edits'])
    cidList = []
    for i in range(0,editlen):
        cidList.append(tweet['edits'][i]['curator_id'])
    cidListSet = set(cidList) # Eliminates redundancy in set elements
    cidList = list(cidListSet) 
    cid1 = cidList[0]
    cid2 = cidList[1]
    if (tweet['tags'][cid1] != 2 and tweet['tags'][cid2] != 2):
        pt13_tweets.append(tweet)

# Check the size of dataset containing portuguese tweets tagged as 'junk' or 'sickness' only
print("Number of tweets tagged as 'sick' or 'junk' only = ", end = '')
print(len(pt13_tweets))
# Release memory of redundant variables
del pt_tweets

# Differentiating tweets into those with common tags and those with different tags - easier for manipulation

cmntags = [] # For tweets with annotators' agreement
difftags = [] # For tweets with annotators' diasgreement
zerotag = []
for tweet in pt13_tweets:
    editlen = len(tweet['edits'])
    cidList = []
    for i in range(0,editlen):
        cidList.append(tweet['edits'][i]['curator_id'])
    cidListSet = set(cidList)
    cidList = list(cidListSet)
    cid1 = cidList[0]
    cid2 = cidList[1]
    # Curators do not agree on the annotation of all tweets - clash of tags
    if (tweet['tags'][cid1] == tweet['tags'][cid2]):
        cmntags.append(tweet)
    else:
        difftags.append(tweet)
    
#     # Tweets tagged as (0) - System Error
#     if ((tweet['tags'][cid1] == 0)|(tweet['tags'][cid2] == 0)):
#         zerotag.append(tweet)

print("Number of tweets with common tags by annotators = ", end = '')
print(len(cmntags)) # Check size of dataset containing tweets annotated with common tags
print("Number of tweets with different tags by annotators = ", end = '')
print(len(difftags)) # Check size of dataset containing tweets annotated with common tags
del pt13_tweets

Number of data points (tweets) = 13513
Number of portuguese tweets = 10116
Number of tweets tagged as 'sick' or 'junk' only = 9965
Number of tweets with common tags by annotators = 7758
Number of tweets with different tags by annotators = 2207


Only those tweets with annotators' agreement are considered for the training set

In [4]:
# Counting the number of tweets tagged as junk and sickness 
count3 = 0
for tweet in cmntags:
    cid = tweet['edits'][0]['curator_id']
    if(tweet['tags'][cid] == 3):
        count3 = count3 + 1
# end
print("Number of 'junk' tweets in training set:", end = " ")
print(len(cmntags)-count3)
print("Number of 'sickness' tweets in training set:", end = " ")
print(count3)

Number of 'junk' tweets in training set: 5261
Number of 'sickness' tweets in training set: 2497


In [5]:
import re
import string

# Function for pre-processing of tweets
def processTweet(tweet):

    # Convert to lower case
    tweet = tweet.lower()
    # Convert hyperlinks to a generic term 'URL' or an empty space
    tweet = re.sub('((www\.[^\s]+)|(https?:\/\/[^\s]+))','URL',tweet)
#     tweet = re.sub('((www\.[^\s]+)|(https?:\/\/[^\s]+))','',tweet)
    # Convert @username to USER or an empty space
    tweet = re.sub('(@[^\s]+)|(@[\s][^\s]+)','USER',tweet)
#     tweet = re.sub('(@[^\s]+)|(@[\s][^\s]+)','',tweet)
    # Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim special charaters
    tweet = tweet.strip('\'"')
    return tweet
#end

In [6]:
# Import NLTK library and portuguese components for manipulation of tweets
import nltk
stopwords = nltk.corpus.stopwords.words('portuguese')
# print(stopwords[0:10])
stopwords.append('USER')
stopwords.append('URL')

# Function to replace colloquial social meda words containing repeated character with a single instance of the character
def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.*)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

# Function to prepare the classifier feature vector from raw data
def getFeatureVector(tweet, featureVector):
    # split tweet into words
    words = tweet.split()
    for w in words:
        w = w.lower()
        w = re.sub(r'#([^\s]+)', r'\1', w)
        # replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        # strip punctuation
        w = w.strip('\'"?,*.(_!)/')
        w = w.replace('\\','')
        w = w.replace('/','')
        
        # check if the word starts with an alphabet
        # val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        val = re.search(r"^[a-zA-Z]", w)

        # check if the word contains only numbers
        valnum = re.search(r"^[0-9][0-9]*$", w)
    
        # Addition of emoticons to feature list - Commment the next paragraph to omit emojis
        u = w.encode('unicode-escape')
        bval = re.search(b'\\U', u)
        if (bval):
            s = u.split(b'\\U')
            for l in range(1,len(s)):
                a = (b'\\U' + s[l])
                astr = a.decode('unicode-escape')
                featureVector.append(astr)
                continue
                
        
        # ignore if it is a stop word or the word contains only numbers or the word does not start with an alphabet
        if (w in stopwords or (val is None) or valnum):
            continue
        else:
            featureVector.append(w.lower())
            
    featureVectorSet = set(featureVector) # Eliminate redundant features
    featureVector = list(featureVectorSet) 
    del featureVectorSet
    
    # Additional code to use bigrams and trigrams; these features have not been used for results reported in the paper
    # Addition of Bigrams as features        
#     for j in range(0,len(featureVector)-1):
#         featureVector.append((featureVector[j],featureVector[j+1]))
        
    # Addition of Trigrams as features        
#     for j in range(0,len(featureVector)-2):
#         featureVector.append((featureVector[j],featureVector[j+1],featureVector[j+2]))
    
    # Break down a hashtag into individual words whenever possible to get additional cues about topic of the tweet
    # Example: Hashtag1 - #ifyouknowwhatimean, Hashtag2 - #IfYouKnowWhatIMean
    # Hashtag1 can not be analysed further without a dictionary, but Hashtag2 can be fragmented at the capital letters            
    regex = re.compile(r'#([^\s]+)')
    matchObj = regex.findall(tweet)
#     print(matchObj)
    s = len(matchObj) # Multiple hashtags
    fv = []
    for i in range(0,s):
        word = matchObj[i]
        # Initialization
        startInd = len(word)
        stopInd = 0
        for i in range(0,len(word)):
            if (i==(len(word)-1)):
                        stopInd = i + 1
                        if (startInd == len(word)):
                            startInd = 0
                        # Single capital letter identified (Example: 'I')
                        fv.append(word[startInd:stopInd].lower()) # Single Capital Letter identified at the end of tag
                        continue
            if (word[i].isupper()):
                if (startInd != len(word)):
                    stopInd = i
                    # Word identified within the phrase
                    fv.append(word[startInd:stopInd].lower()) # 
                    startInd = i
                else:
                    if (i != 0):
                        startInd = 0
                        stopInd = i
                        # Word identified at the starting of the phrase
                        fv.append(word[startInd:stopInd].lower())
                        startInd = i
    
    featureVector.extend(fv)
    return featureVector
#end

In [7]:
#start extract_features
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        if (type(word) is tuple):
            temp = 'contains' + str(word)
            features[temp] = (word in tweet_words)
            del temp
        else:
            features['contains(%s)' % word] = (word in tweet_words)
    return features
#end

In [11]:
# Training & n-fold Cross Validation of Naive Bayes classifier
n = 5

count = 0
dsize = len(cmntags)
finacc = 0
finprec = 0
fintnr = 0
finrecall = 0
# flist = open('featureList.txt','w')

for i in range(0,n):
    testset = []
    trainset = []
    ind1 = int(i*dsize/n)
    ind2 = int((i+1)*dsize/n)
    testset = cmntags[ind1:ind2]
    trainset = cmntags[:ind1]
    trainset.extend(cmntags[ind2:])
    
    featureList = []
    tweetset = []
    
    #Exp
    tagList = []
    
#    trainset = cmntags[0:5999]
    for tweet in trainset:
        tweetFV = []
        text = tweet['t']
        cid = tweet['edits'][0]['curator_id']
        tag = tweet['tags'][cid]
        
        processedtext = processTweet(text)
        tweetFV = getFeatureVector(processedtext, tweetFV)
        
        # Addition of location data to feature vector - Comment the next 4 lines to omit location data from features
#         tweetFV.append(tweet['region'])
#         tweetFV.append(tweet['state'])
#         tweetFV.append(tweet['county'])
#         tweetFV.append(tweet['microregion'])
        
        tweetFVSet = set(tweetFV)
        tweetFV = list(tweetFVSet)
        del tweetFVSet
        featureList.extend(tweetFV)
        tweetset.append((tweetFV,tag))

    print(len(featureList))
    featureListSet = set(featureList)
    print(len(featureListSet))
    featureList = list(featureListSet)
    featureListStr = [str(item) for item in featureList]
#     flist.write("\t".join(featureListStr))
    
    print('Preparation of Feature Vectors completed')

    # Extract feature vector for all tweets in one shot
    training_set = nltk.classify.util.apply_features(extract_features, tweetset)

    # Train the classifier
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
    
    fn = 0
    fp = 0
    tp = 0
    tn = 0
    sick_tn = 0
    sick_tp = 0
    sick_fn = 0
    sick_tp = 0

    for j in range(0,len(testset)):
        tweet = testset[j]
        fv = []
        # Test the classifier
        testTweet = tweet['t']
        processedTestTweet = processTweet(testTweet)
        x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
#         pt_tweets[j]['ctags'] = x
        cid = tweet['edits'][0]['curator_id']
        if (x == tweet['tags'][cid]):
            if (x == 1):
                tn += 1
            else:
                tp += 1
        else:
            if (x == 1):
                fn += 1
            else:
                fp += 1

    acc = (tp + tn)/(tp + tn + fp + fn)
    prec = tp/(tp + fp)
    recall = tp/(tp + fn)
    tnr = tn/(tn + fp)
    print("Round", i, "in cross validation")
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", recall)
    print("TNR:", tnr)
    print('\n')
    
    finacc = finacc + acc
    finrecall = finrecall + recall
    finprec = finprec + prec
    fintnr = fintnr + tnr
    
    print(NBClassifier.show_most_informative_features(20))

finacc = finacc/n
finrecall = finrecall/n
finprec = finprec/n
fintnr = fintnr/n
print("Final Accuracy:", finacc)
print("Final Precision:", finprec)
print("Final Recall:", finrecall)
print("Final True Negative Rate:", fintnr)

44173
7738
Preparation of Feature Vectors completed
Round 0 in cross validation
Accuracy: 0.8175370728562218
Precision: 0.7019748653500898
Recall: 0.7696850393700787
TNR: 0.840843720038351


Most Informative Features
        contains(parado) = True                1 : 3      =    165.9 : 1.0
         contains(ebola) = True                1 : 3      =     35.1 : 1.0
      contains(mosquito) = True                1 : 3      =     30.2 : 1.0
         contains(dando) = True                1 : 3      =     19.2 : 1.0
          contains(agua) = True                1 : 3      =     17.9 : 1.0
         contains(saúde) = True                1 : 3      =     15.9 : 1.0
         contains(irmão) = True                3 : 1      =     15.4 : 1.0
      contains(hospital) = True                3 : 1      =     15.3 : 1.0
      contains(gusttavo) = True                3 : 1      =     14.8 : 1.0
          contains(acha) = True                3 : 1      =     14.1 : 1.0
       contains(cuidado) = True  

In [8]:
# To print the most informative features using the entire dataset

dsize = len(cmntags)
flist = open('featureList.txt','w')

trainset = cmntags

featureList = []
tweetset = []


for tweet in trainset:
    tweetFV = []
    text = tweet['t']
    cid = tweet['edits'][0]['curator_id']
    tag = tweet['tags'][cid]

    processedtext = processTweet(text)
    tweetFV = getFeatureVector(processedtext, tweetFV)
    
    # Addition of location data to feature vectors - commment the next 4 lines to omit location data from features
#     tweetFV.append(tweet['region'])
#     tweetFV.append(tweet['state'])
#     tweetFV.append(tweet['county'])
#     tweetFV.append(tweet['microregion'])
    
    tweetFVSet = set(tweetFV)
    tweetFV = list(tweetFVSet)
    del tweetFVSet
    featureList.extend(tweetFV)
    tweetset.append((tweetFV,tag))

print(len(featureList))
featureListSet = set(featureList)
print(len(featureListSet))
featureList = list(featureListSet)
featureListStr = [str(item) for item in featureList]
flist.write("\t".join(featureListStr))

print('FVs done')

# Extract feature vector for all tweets in one shote
training_set = nltk.classify.util.apply_features(extract_features, tweetset)

# Train the classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

57214
9368
FVs done


In [9]:
# print informative features about the classifier
print(NBClassifier.show_most_informative_features(100))

Most Informative Features
        contains(parado) = True                1 : 3      =    189.7 : 1.0
         contains(ebola) = True                1 : 3      =     42.3 : 1.0
      contains(mosquito) = True                1 : 3      =     31.3 : 1.0
         contains(dores) = True                3 : 1      =     28.8 : 1.0
          contains(agua) = True                1 : 3      =     22.3 : 1.0
         contains(dando) = True                1 : 3      =     21.6 : 1.0
          contains(irmã) = True                3 : 1      =     19.8 : 1.0
         contains(irmão) = True                3 : 1      =     19.6 : 1.0
             contains(💉) = True                3 : 1      =     19.0 : 1.0
          contains(copa) = True                1 : 3      =     16.6 : 1.0
      contains(gusttavo) = True                3 : 1      =     16.1 : 1.0
        contains(veneno) = True                1 : 3      =     15.0 : 1.0
     contains(professor) = True                3 : 1      =     14.7 : 1.0

In [64]:
# Write tweets into .csv files for 
# data2 = []
# with open('tweet_master_data.json') as f:
#     for line in f:
#         data2.extend(json.loads(line))

import csv
fp = open('brazil_sickness_tweets_val_NB.csv', 'w', newline='')
a = csv.writer(fp, delimiter=',')
a.writerow(('Time Stamp', 'Tweet Longitude', 'Tweet Latitude', 'Country', 'Region', 'State', 'County', 'Microregion'))

# Writing of commonly annotated sickness tweets into the csv file
for k in range(0,len(cmntags)):
    tweet = cmntags[k]
    cid = tweet['edits'][0]['curator_id']
    tag = tweet['tags'][cid]
    if(tag == 3):
        a.writerow((tweet['cr'], tweet['tln'], tweet['tlt'], tweet['country'], tweet['region'], tweet['state'], tweet['county'], tweet['microregion']))

In [65]:
# Classification of tweets annotated differently by different annotators
for k in range(0,len(difftags)):
    tweet = difftags[k]
    if (tweet['lang'] != 'pt')|(tweet['country'] != 'Brasil'):
        continue
    testTweet = tweet['t']
    fv = []
    processedTestTweet = processTweet(testTweet)
    x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
    if (x == 3):
        a.writerow((tweet['cr'], tweet['tln'], tweet['tlt'], tweet['country'], tweet['region'], tweet['state'], tweet['county'], tweet['microregion']))
# end

In [66]:
data2 = []
with open('tweet_master_data2.json') as f:
    for line in f:
        data2.extend(json.loads(line))

In [67]:
# Classification of non-validated tweets
for k in range(0,len(data2)):
    tweet = data2[k]
    if (tweet['lang'] != 'pt')|(tweet['country'] != 'Brasil'):
        continue
    testTweet = tweet['t']
    fv = []
    processedTestTweet = processTweet(testTweet)
    x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
    if (x == 3):
        a.writerow((tweet['cr'], tweet['tln'], tweet['tlt'], tweet['country'], tweet['region'], tweet['state'], tweet['county'], tweet['microregion']))
# end

In [68]:
fp.close()