In [1]:
# Import Python's json Encoder and Decoder
import json
from pprint import pprint # Pretty Print

# Parsing of json file line-by-line
# Name of file containing validated dengue tweets: 'dengueDump_3.23.16_validated.json'
# Name of file containing validated dengue tweets: 'dengueDump_3.23.16_nonvalidated.json'
# data = []
# with open('tweet_master_data.json') as f:
#     for line in f:
#         data.append(json.loads(line))
        
# Parsing of json file as one list
# Name of file which combines location info with validated dengue tweets: 'tweet_master_data.json'
data = []
with open('tweet_master_data.json') as jdata:
    data = json.load(jdata)
    jdata.close()

In [2]:
# Check the size of dataset
print(len(data))

13513


Example of a tweet data entity:

{'_id': '299714679532556288',
 'cc': 'BR',
 'country': 'Brasil',
 'county': 'Rondonópolis',
 'cr': '2013-02-07T22:02:02',
 'edits': [{'curator_id': '540e300a7673826b17a5604c',
            'date': '2015-07-06T01:18:27.900000',
            'field': 'tags',
            'new': 1},
           {'curator_id': '555232868624c82a1c6d2ca3',
            'date': '2015-07-06T17:40:18.407000',
            'field': 'tags',
            'new': 1,
            'old': 1}],
 'f': 'tw2013272123',
 'lang': 'pt',
 'loc': ' MT / PR',
 'microregion': 'Microrregião de Rondonópolis',
 'p': '48401b8f7232dfb8',
 'pln': -54.607,
 'plt': -16.572,
 'region': 'Região Centro-Oeste',
 'state': 'MT',
 't': 'Dengue 😫',
 'tags': {'540e300a7673826b17a5604c': 1, '555232868624c82a1c6d2ca3': 1},
 'tln': -54.649,
 'tlt': -16.463,
 'uid': '419780633',
 'v': True}

Keys in json file:
1. v - validated (true/false)
2. tags - dictionary with key (curator ID) and value (label).  The labels are as follows: 
1=junk, 2=report, 3=sickness
3. edits - a dictionary that keeps track of all tags applied by curators.  Not really useful unless you want to see if somebody is re-rating tweets or if they erase tags by accident.  

More keys in json file:
    "_id" : tweet ID (also the object ID for the mongo db)
    "lang" : language of tweet
    "loc" : user-entered location name
    "plt" : profile latitude coordinates
    "pln" : profile longitude
    "uid" : twitter user id
    "tlt" : tweet latitude
    "tln" : tweet longitude
    "cc" : country code
    "f" : our own backup coding
    "p" : twitter place ID (not sure if these can be looked up somehow via twitter) 
    "t" : tweet text
    "acr": time of the user’s account creation in UTC
    "cr" : time of the tweet in UTC 
    "flrs": number of followers
    "flng" : number of accounts following (friends)

In [3]:
# Extraction of portuguese tweets
pt_tweets = []
for tweet in data:
    # Portuguese Tweets are encoded as 'pt' and spanish tweets are encoded as 'es' under the key 'lang'
    if tweet['lang'] == 'pt':
        pt_tweets.append(tweet)

# Check the size of dataset containing only portuguese tweets
print(len(pt_tweets))
# Release memory of redundant variables
del data

10116


In [4]:
# Extraction of tweets tagged as junk(1) or sickness(3) only
pt13_tweets = []
for tweet in pt_tweets:
    # Identify curator ids of each tweet to refer their tags
    editlen = len(tweet['edits'])
    cidList = []
    for i in range(0,editlen):
        cidList.append(tweet['edits'][i]['curator_id'])
    cidListSet = set(cidList) # Eliminates redundancy in set elements
    cidList = list(cidListSet) 
    cid1 = cidList[0]
    cid2 = cidList[1]
    if (tweet['tags'][cid1] != 2 and tweet['tags'][cid2] != 2):
        pt13_tweets.append(tweet)

# Check the size of dataset containing portuguese tweets tagged as 'junk' or 'sickness' only
print(len(pt13_tweets))
# Release memory of redundant variables
del pt_tweets

9965


In [5]:
# Differentiating tweets into those with common tags and those with different tags - easier for manipulation

cmntags = [] # For tweets with annotators' agreement
difftags = [] # For tweets with annotators' diasgreement
zerotag = []
for tweet in pt13_tweets:
    editlen = len(tweet['edits'])
    cidList = []
    for i in range(0,editlen):
        cidList.append(tweet['edits'][i]['curator_id'])
    cidListSet = set(cidList)
    cidList = list(cidListSet)
    cid1 = cidList[0]
    cid2 = cidList[1]
    # Curators do not agree on the annotation of all tweets - clash of tags
    if (tweet['tags'][cid1] == tweet['tags'][cid2]):
        cmntags.append(tweet)
    else:
        difftags.append(tweet)
    
#     # Tweets tagged as (0) - System Error
#     if ((tweet['tags'][cid1] == 0)|(tweet['tags'][cid2] == 0)):
#         zerotag.append(tweet)

print(len(cmntags)) # Check size of 
print(len(difftags))
del pt13_tweets

7758
2207


Only those tweets with annotators' agreement are considered for the training set

In [6]:
# Counting the number of tweets tagged as junk and sickness 
count3 = 0
for tweet in cmntags:
    cid = tweet['edits'][0]['curator_id']
    if(tweet['tags'][cid] == 3):
        count3 = count3 + 1
# end
print("Number of 'junk' tweets in training set:", end = " ")
print(len(cmntags)-count3)
print("Number of 'sickness' tweets in training set:", end = " ")
print(count3)

Number of 'junk' tweets in training set: 5261
Number of 'sickness' tweets in training set: 2497


In [10]:
import re
import string

# Define a function for pre-processing of tweets
def processTweet(tweet):

    # Convert to lower case
#     tweet = tweet.lower()
    # Convert hyperlinks to a generic term 'URL' or an empty space
#     tweet = re.sub('((www\.[^\s]+)|(https?:\/\/[^\s]+))','URL',tweet)
    tweet = re.sub('((www\.[^\s]+)|(https?:\/\/[^\s]+))','',tweet)
    # Convert @username to USER or an empty space
#     tweet = re.sub('(@[^\s]+)|(@[\s][^\s]+)','USER',tweet)
    tweet = re.sub('(@[^\s]+)|(@[\s][^\s]+)','',tweet)
    # Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    # Replace #word with word
#     tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim special charaters
    tweet = tweet.strip('\'"')
    return tweet
#end

In [44]:
# Import NLTK library and portuguese components for manipulation of tweets
import nltk
stopwords = nltk.corpus.stopwords.words('portuguese')
# print(stopwords[0:10])
stopwords.append('USER')
stopwords.append('URL')

#start replaceTwoOrMore
def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.*)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
#end

#start getfeatureVector
def getFeatureVector(tweet, featureVector):
    #split tweet into words
    words = tweet.split()
    for w in words:
        w = w.lower()
        w = re.sub(r'#([^\s]+)', r'\1', w)
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,*.(_!)/')
        w = w.replace('\\','')
        w = w.replace('/','')
        
        # check if the word starts with an alphabet
        # val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        val = re.search(r"^[a-zA-Z]", w)

        # check if the word contains only numbers
        valnum = re.search(r"^[0-9][0-9]*$", w)
    
        # Addition of emoticons to feature list - Commment the next paragraph to omit emojis
        u = w.encode('unicode-escape')
        bval = re.search(b'\\U', u)
        if (bval):
            s = u.split(b'\\U')
            for l in range(1,len(s)):
                a = (b'\\U' + s[l])
                astr = a.decode('unicode-escape')
                featureVector.append(astr)
                continue
                
        
        # ignore if it is a stop word or the word contains only numbers or the word does not start with an alphabet
        if (w in stopwords or (val is None) or valnum)
            continue
        else:
            featureVector.append(w.lower())
            
    featureVectorSet = set(featureVector) # Eliminate redundant features
    featureVector = list(featureVectorSet) 
    del featureVectorSet
    
    # Addition of Bigrams as features        
#     for j in range(0,len(featureVector)-1):
#         featureVector.append((featureVector[j],featureVector[j+1]))
        
    # Addition of Trigrams as features        
#     for j in range(0,len(featureVector)-2):
#         featureVector.append((featureVector[j],featureVector[j+1],featureVector[j+2]))
    
    # Break down a hashtag into individual words whenever possible to get additional cues about topic of the tweet
    # Example: Hashtag1 - #ifyouknowwhatimean, Hashtag2 - #IfYouKnowWhatIMean
    # Hashtag1 can not be analysed further without a dictionary, but Hashtag2 can be fragmented at the capital letters            
    regex = re.compile(r'#([^\s]+)')
    matchObj = regex.findall(tweet)
#     print(matchObj)
    s = len(matchObj) # Multiple hashtags
    fv = []
    for i in range(0,s):
        word = matchObj[i]
        # Initialization
        startInd = len(word)
        stopInd = 0
        for i in range(0,len(word)):
            if (i==(len(word)-1)):
                        stopInd = i + 1
                        if (startInd == len(word)):
                            startInd = 0
                        # Single capital letter identified (Example: 'I')
                        fv.append(word[startInd:stopInd].lower()) # Single Capital Letter identified at the end of tag
                        continue
            if (word[i].isupper()):
                if (startInd != len(word)):
                    stopInd = i
#                     print(i)
#                     print(word[startInd:stopInd])
                    # Word identified within the phrase
                    fv.append(word[startInd:stopInd].lower()) # 
                    startInd = i
                else:
                    if (i != 0):
                        startInd = 0
                        stopInd = i
                        # Word identified at the starting of the phrase
                        fv.append(word[startInd:stopInd].lower())
                        startInd = i
    
    featureVector.extend(fv)
    return featureVector
#end

In [39]:
# Preparation of test data
data = []
with open('dengueDump_3.23.16_nonvalidated.json') as f:
    for line in f:
        data.append(json.loads(line))

len(data)

14611

In [40]:
pt_tweets = []
for tweet in data:
    # Portuguese Tweets are encoded as 'pt' and spanish tweets are encoded as 'es'
    if tweet['lang'] == 'pt':
        pt_tweets.append(tweet)

len(pt_tweets)
del data

In [41]:
#start extract_features
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        if (type(word) is tuple):
            temp = 'contains' + str(word)
            features[temp] = (word in tweet_words)
            del temp
        else:
            features['contains(%s)' % word] = (word in tweet_words)
    return features
#end

In [54]:
# Training & n-fold Cross Validation of Naive Bayes classifier

count = 0
n = 5
dsize = len(cmntags)
finacc = 0
finprec = 0
fintnr = 0
finrecall = 0
# flist = open('featureList.txt','w')

for i in range(0,n):
    testset = []
    trainset = []
    ind1 = int(i*dsize/n)
    ind2 = int((i+1)*dsize/n)
    testset = cmntags[ind1:ind2]
    trainset = cmntags[:ind1]
    trainset.extend(cmntags[ind2:])
    
    featureList = []
    tweetset = []
    
    #Exp
    tagList = []
    
#    trainset = cmntags[0:5999]
    for tweet in trainset:
        tweetFV = []
        text = tweet['t']
        cid = tweet['edits'][0]['curator_id']
        tag = tweet['tags'][cid]
        
        processedtext = processTweet(text)
        tweetFV = getFeatureVector(processedtext, tweetFV)
        
        # Addition of location data to feature vector - Comment the next 4 lines to omit location data form features
#         tweetFV.append(tweet['region'])
#         tweetFV.append(tweet['state'])
#         tweetFV.append(tweet['county'])
#         tweetFV.append(tweet['microregion'])
        
        tweetFVSet = set(tweetFV)
        tweetFV = list(tweetFVSet)
        del tweetFVSet
        featureList.extend(tweetFV)
        tweetset.append((tweetFV,tag))

    print(len(featureList))
    featureListSet = set(featureList)
    print(len(featureListSet))
    featureList = list(featureListSet)
    featureListStr = [str(item) for item in featureList]
#     flist.write("\t".join(featureListStr))
    
    print('FVs done')

    # Extract feature vector for all tweets in one shote
    training_set = nltk.classify.util.apply_features(extract_features, tweetset)

    # Train the classifier
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
    
    fn = 0
    fp = 0
    tp = 0
    tn = 0

    for j in range(0,len(testset)):
        tweet = testset[j]
        fv = []
        # Test the classifier
        testTweet = tweet['t']
        processedTestTweet = processTweet(testTweet)
        x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
        pt_tweets[j]['ctags'] = x
        cid = tweet['edits'][0]['curator_id']
        if (x == tweet['tags'][cid]):
            if (x == 1):
                tn = tn + 1
            else:
                tp = tp + 1
        else:
            if (x == 1):
                fn = fn + 1
            else:
                fp = fp + 1

    acc = (tp + tn)/(tp + tn + fp + fn)
    prec = tp/(tp + fp)
    recall = tp/(tp + fn)
    tnr = tn/(tn + fp)
    print("Round", i, "in cross validation")
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", recall)
    print("TNR:", tnr)
    finacc = finacc + acc
    finrecall = finrecall + recall
    finprec = finprec + prec
    fintnr = fintnr + tnr

finacc = finacc/n
finrecall = finrecall/n
finprec = finprec/n
fintnr = fintnr/n
print("Final Accuracy:", finacc)
print("Final Precision:", finprec)
print("Final Recall:", finrecall)
print("Final True Negative Rate:", fintnr)

43165
7900
FVs done
Round 0 in cross validation
Accuracy: 0.8201160541586073
Precision: 0.7026548672566372
Recall: 0.781496062992126
TNR: 0.8389261744966443
45417
8512
FVs done
Round 1 in cross validation
Accuracy: 0.8685567010309279
Precision: 0.7148760330578512
Recall: 0.8398058252427184
TNR: 0.8789473684210526
45594
8438
FVs done
Round 2 in cross validation
Accuracy: 0.8413926499032882
Precision: 0.747557003257329
Recall: 0.8345454545454546
TNR: 0.8451548451548452
43414
7895
FVs done
Round 3 in cross validation
Accuracy: 0.8344072164948454
Precision: 0.8003913894324853
Recall: 0.725177304964539
TNR: 0.8967611336032388
45982
8667
FVs done
Round 4 in cross validation
Accuracy: 0.8885309278350515
Precision: 0.7947154471544715
Recall: 0.8444924406047516
TNR: 0.9072543617998163
Final Accuracy: 0.8506007098845441
Final Precision: 0.7520389480317549
Final Recall: 0.8051034176699179
Final True Negative Rate: 0.8734087766951195


In [60]:
print(fp)
print(fn)
print(tp)
print(tn)

101
72
391
988


In [63]:
# To print the most informative features

dsize = len(cmntags)
flist = open('featureList.txt','w')

trainset = cmntags

featureList = []
tweetset = []


for tweet in trainset:
    tweetFV = []
    text = tweet['t']
    cid = tweet['edits'][0]['curator_id']
    tag = tweet['tags'][cid]

    processedtext = processTweet(text)
    tweetFV = getFeatureVector(processedtext, tweetFV)
    
    # Addition of location data to feature vectors - commment the next 4 lines to omit location data from features
#     tweetFV.append(tweet['region'])
#     tweetFV.append(tweet['state'])
#     tweetFV.append(tweet['county'])
#     tweetFV.append(tweet['microregion'])
    
    tweetFVSet = set(tweetFV)
    tweetFV = list(tweetFVSet)
    del tweetFVSet
    featureList.extend(tweetFV)
    tweetset.append((tweetFV,tag))

print(len(featureList))
featureListSet = set(featureList)
print(len(featureListSet))
featureList = list(featureListSet)
featureListStr = [str(item) for item in featureList]
flist.write("\t".join(featureListStr))

print('FVs done')

# Extract feature vector for all tweets in one shote
training_set = nltk.classify.util.apply_features(extract_features, tweetset)

# Train the classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

55893
9574
FVs done


In [None]:
# print informative features about the classifier
print(NBClassifier.show_most_informative_features(100))

In [64]:
# Write tweets into .csv files for 
# data2 = []
# with open('tweet_master_data.json') as f:
#     for line in f:
#         data2.extend(json.loads(line))

import csv
fp = open('brazil_sickness_tweets_val_NB.csv', 'w', newline='')
a = csv.writer(fp, delimiter=',')
a.writerow(('Time Stamp', 'Tweet Longitude', 'Tweet Latitude', 'Country', 'Region', 'State', 'County', 'Microregion'))

# Writing of commonly annotated sickness tweets into the csv file
for k in range(0,len(cmntags)):
    tweet = cmntags[k]
    cid = tweet['edits'][0]['curator_id']
    tag = tweet['tags'][cid]
    if(tag == 3):
        a.writerow((tweet['cr'], tweet['tln'], tweet['tlt'], tweet['country'], tweet['region'], tweet['state'], tweet['county'], tweet['microregion']))

In [65]:
# Classification of tweets annotated differently by different annotators
for k in range(0,len(difftags)):
    tweet = difftags[k]
    if (tweet['lang'] != 'pt')|(tweet['country'] != 'Brasil'):
        continue
    testTweet = tweet['t']
    fv = []
    processedTestTweet = processTweet(testTweet)
    x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
    if (x == 3):
        a.writerow((tweet['cr'], tweet['tln'], tweet['tlt'], tweet['country'], tweet['region'], tweet['state'], tweet['county'], tweet['microregion']))
# end

In [66]:
data2 = []
with open('tweet_master_data2.json') as f:
    for line in f:
        data2.extend(json.loads(line))

In [67]:
# Classification of non-validated tweets
for k in range(0,len(data2)):
    tweet = data2[k]
    if (tweet['lang'] != 'pt')|(tweet['country'] != 'Brasil'):
        continue
    testTweet = tweet['t']
    fv = []
    processedTestTweet = processTweet(testTweet)
    x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
    if (x == 3):
        a.writerow((tweet['cr'], tweet['tln'], tweet['tlt'], tweet['country'], tweet['region'], tweet['state'], tweet['county'], tweet['microregion']))
# end

In [68]:
fp.close()

In [None]:
tweet = "Ss #rksa pod"
tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
print(tweet)

f = open("Results-1-FV.txt","w")
# print(featureList, file=f)
f.writelines(["%s\t" % item  for item in featureList])

In [None]:
from sklearn import svm

def getSVMFeatureVectorAndLabels(tweets, featureList):
    sortedFeatures = sorted(featureList)
    map = {}
    feature_vector = []
    labels = []
    for t in tweets:
        label = 0
        map = {}
        #Initialize empty map
        for w in sortedFeatures:
            map[w] = 0

        tweet_words = t[0]
        tweet_tag = t[1]
        #Fill the map
        for word in tweet_words:
            #process the word (remove repetitions and punctuations)
            word = replaceTwoOrMore(word)
            word = word.strip('\'"?,.')
            #set map[word] to 1 if word exists
            if word in map:
                map[word] = 1
        #end for loop
        values = list(map.values())
        feature_vector.append(values)
        if(tweet_tag == 1):
            label = 1
        elif(tweet_tag == 3):
            label = 3
        labels.append(label)
    #return the list of feature_vector and labels
    return {'feature_vector' : feature_vector, 'labels': labels}
#end

def getSVMFeatureVector(tweet_words, featureList):
    sortedFeatures = sorted(featureList)
    map = {}
    feature_vector = []
    labels = []
    label = 0
    map = {}
    #Initialize empty map
    for w in sortedFeatures:
        map[w] = 0

    #Fill the map
    for word in tweet_words:
        #process the word (remove repetitions and punctuations)
        word = replaceTwoOrMore(word)
        word = word.strip('\'"?,.')
        #set map[word] to 1 if word exists
        if word in map:
            map[word] = 1
    #end for loop
    values = list(map.values())
    feature_vector.append(values)
    #return the list of feature_vector
    return feature_vector
#end

In [None]:
#SVM
n = 5
dsize = len(cmntags)
finacc = 0
acc = 0

for i in range(0,n):
    testset = []
    trainset = []
    ind1 = int(i*dsize/n)
    ind2 = int((i+1)*dsize/n)
    testset = cmntags[ind1:ind2]
    trainset = cmntags[:ind1]
    trainset.extend(cmntags[ind2:])

    featureList = []
    tweetset = []

    #Exp
    tagList = []

    #    trainset = cmntags[0:5999]
    for tweet in trainset:
        tweetFV = []
        text = tweet['t']
        cid = tweet['edits'][0]['curator_id']
        tag = tweet['tags'][cid]

        # Experiment
        tagList.append(tag)

        processedtext = processTweet(text)
        tweetFV = getFeatureVector(processedtext, tweetFV)
        featureList.extend(tweetFV)
        tweetset.append((tweetFV,tag))

    print(len(featureList))
    featureListSet = set(featureList)
    print(len(featureListSet))
    featureList = list(featureListSet)

    print('Got tweetset')
    result = getSVMFeatureVectorAndLabels(tweetset, featureList)
    print('Got FVs')
    clf = svm.SVC(kernel = 'linear')
    feature_vector = result['feature_vector']
    labels = result['labels']
    clf.fit(feature_vector, labels)
    print('Trained')
    

# acc = 0
# finacc = 0

    for tweet in testset:
        fv = []
        # Test the classifier
        testTweet = tweet['t']
        processedTestTweet = processTweet(testTweet)
        fv = getFeatureVector(processedTestTweet,fv)
        res = getSVMFeatureVector(fv, featureList)
        if (res == []):
            continue
        x = clf.predict(res)

    #         fp.write(testTweet)
    #         fp.write('\t')
    #         fp.write(str(x))
    #         fp.write('\t')
        cid = tweet['edits'][0]['curator_id']
    #         fp.write(str(tweet['tags'][cid]))
    #         fp.write('\n')
        if (x[0] == tweet['tags'][cid]):
            acc = acc + 1

    # fp.close()
    print(acc)
    acc = acc/len(testset)
    print(acc)
    finacc = finacc + acc

finacc = finacc/n
print(finacc)

    
    

In [None]:
print(x)
print(res)

In [None]:
tm = {}
tm['t1'] = 0
tm['t2'] = 1
tm['t3'] = 4
val = list(tm.values())
print(val)

In [None]:
tweet = testset[2]
text = tweet['t']
# text = "46 people alive \m/"
print(text)
ptt = processTweet(text)
print(ptt)
words = ptt.split()
print(words)
for w in words:
    u = w.encode('unicode-escape')
    print(u)
        #replace two or more with two occurrences
    w = replaceTwoOrMore(w)
    print(w)
        #strip punctuation
    w = w.strip('\'"?,*.')
    print(w)    
#       check if the word starts with an alphabet
    val = re.search(r"^[a-zA-Z]", w)
    print(val)
    val = re.search(b'\\U', u)
    print(val)
    

In [None]:
# Training & n-fold Cross Validation of Max Entropy classifier

n = 5
dsize = len(cmntags)
finacc = 0
fp = open('featureList.txt','w')

for i in range(0,1):
    testset = []
    trainset = cmntags
#     ind1 = int(i*dsize/n)
#     ind2 = int((i+1)*dsize/n)
#     testset = cmntags[ind1:ind2]
#     trainset = cmntags[:ind1]
#     trainset.extend(cmntags[ind2:])
    
    featureList = []
    tweetset = []
    
    #Exp
    tagList = []
    
#    trainset = cmntags[0:5999]
    for tweet in trainset:
        tweetFV = []
        text = tweet['t']
        cid = tweet['edits'][0]['curator_id']
        tag = tweet['tags'][cid]
        
        # Experiment
        tagList.append(tag)
        
        processedtext = processTweet(text)
        tweetFV = getFeatureVector(processedtext, tweetFV)
        fp.write(' '.join(tweetFV))
        featureList.extend(tweetFV)
        tweetset.append((tweetFV,tag))

    print(len(featureList))
    featureListSet = set(featureList)
    print(len(featureListSet))
    featureList = list(featureListSet)
    
    tagListSet = set(tagList)
    tagList = list(tagListSet)
    print(tagList)

    # Extract feature vector for all tweets in one shote
    training_set = nltk.classify.util.apply_features(extract_features, tweetset)

    # Train the classifier
    MaxEntClassifier = nltk.classify.maxent.MaxentClassifier.train(training_set, 'GIS', trace=3, \
    encoding=None, labels=None, gaussian_prior_sigma=0, max_iter = 5)
    print('Training done')
    
    acc = 0
    for tweet in testset:
        fv = []
        # Test the classifier
        testTweet = tweet['t']
        processedTestTweet = processTweet(testTweet)
        x = MaxEntClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
#         fp.write(testTweet)
#         fp.write('\t')
#         fp.write(str(x))
#         fp.write('\t')
        cid = tweet['edits'][0]['curator_id']
#         fp.write(str(tweet['tags'][cid]))
#         fp.write('\n')
        if (x == tweet['tags'][cid]):
            acc = acc + 1

# fp.close()
    print(acc)
    acc = acc/len(testset)
    print(acc)
    finacc = finacc + acc

finacc = finacc/n
print(finacc)


In [None]:
for tweet in data:
        tweet = cmntags[i]
        cid1 = tweet['edits'][0]['curator_id']
        cid2 = tweet['edits'][0]['curator_id']
        tag1 = tweet['tags'][cid1]
        tag2 = tweet['tags'][cid1]
        if (tag1 == 0)|(tag2 == 0):
            pprint(tweet)

In [None]:
import re
t = "blah #ToKi9llAMockingbird #onemore blah"
# t1 = re.match(r'#([^\s])', t)
# t = "#A"
regex = re.compile(r'#([^\s]+)')
matchObj = regex.findall(t)
print(matchObj)
s = len(matchObj)
fv = []
for i in range(0,s):
    word = matchObj[i]
    startInd = len(word)
    stopInd = 0
    for i in range(0,len(word)):
        if (i==(len(word)-1)):
                    stopInd = i + 1
                    if (startInd == len(word)):
                        startInd = 0
                    fv.append(word[startInd:stopInd])
                    continue
        if (word[i].isupper()):
            if (startInd != len(word)):
                stopInd = i
                print(i)
                print(word[startInd:stopInd])
                fv.append(word[startInd:stopInd])
                startInd = i
            else:
                if (i != 0):
                    startInd = 0
                    stopInd = i
                    fv.append(word[startInd:stopInd])
                    startInd = i
                
print(fv)
        
        
# matchObj = re.sub(r'#([^\s]+)',r'\1',matchObj)

In [None]:
s = "hhaahahhahahaa"
pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
s = pattern.sub(r"\1\1", s)
print(s)

In [None]:

testset = difftags
# trainset = []
# ind1 = int(i*dsize/n)
# ind2 = int((i+1)*dsize/n)
# testset = cmntags[ind1:ind2]
trainset = cmntags
# trainset.extend(cmntags[ind2:])

featureList = []
tweetset = []

#Exp
tagList = []

#    trainset = cmntags[0:5999]
for tweet in trainset:
    tweetFV = []
    text = tweet['t']
    cid = tweet['edits'][0]['curator_id']
    tag = tweet['tags'][cid]

    # Experiment
    tagList.append(tag)

    processedtext = processTweet(text)
    tweetFV = getFeatureVector(processedtext, tweetFV)
    featureList.extend(tweetFV)
    for i in range(0,len(tweetID)):
        if (tweetID[i] == tweet['_id']):
            tweetFV.extend(region[i])
            tweetFV.extend(state[i])
            tweetFV.extend(county[i])
            tweetFV.extend(mregion[i])
            break

    tweetset.append((tweetFV,tag))

featureList.extend(list(regionSet))
featureList.extend(list(stateSet))
featureList.extend(list(countySet))
featureList.extend(list(mregionSet))
featureList.extend(list(stateSet))
print(len(featureList))
featureListSet = set(featureList)
print(len(featureListSet))
featureList = list(featureListSet)
#     flist.write("\n".join(featureList))

tagListSet = set(tagList)
tagList = list(tagListSet)
print(tagList)
print('FVs done')

# Extract feature vector for all tweets in one shote
training_set = nltk.classify.util.apply_features(extract_features, tweetset)

# Train the classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

fn = 0
fp = 0
tp = 0
tn = 0
for tweet in testset:
    fv = []
    # Test the classifier
    testTweet = tweet['t']
    processedTestTweet = processTweet(testTweet)
    x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
    tweet['ctags'] = x

In [None]:
testset = difftags
for tweet in testset:
    fv = []
    # Test the classifier
    testTweet = tweet['t']
    processedTestTweet = processTweet(testTweet)
    x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
    tweet['ctags'] = x

In [None]:
count1 = 0
count3 = 0
for tweet in testset:
        if (tweet['ctags'] == 1):
            count1 = count1 + 1
        if (tweet['ctags'] == 3):
            count3 = count3 + 1
print(count1)
print(count3)
print(len(testset))

In [None]:
print(count1)
print(count3)
print(len(testset))
cache_testdata = testset
print(len(cache_testdata))

In [None]:
data = []
with open('tweet_master_data.json') as f:
    for line in f:
        data.append(json.loads(line))

In [None]:
import csv

# hello = [['Me','You'],['293', '219'],['13','15']]
hello = testset[0:3]
length = len(hello[0])

with open('test1.csv', 'w') as testfile:
    csv_writer = csv.writer(testfile)
#     for y in range(length):
    csv_writer.writerow([x['_id'] for x in hello])
    csv_writer.writerow([x['cc'] for x in hello])


    

In [None]:
import csv
fp = open('test.csv', 'w', newline='')
a = csv.writer(fp, delimiter=',')
a.writerow(('Time Stamp', 'Tweet Longitude', 'Tweet Latitude', 'Country', 'Region', 'State', 'County', 'Microregion'))


In [None]:
data2 = []
with open('tweet_master_data.json') as f:
    for line in f:
        data2.extend(json.loads(line))
        
dsize = len(data2)

In [None]:
count3cmn = 0
count3diff = 0
count2cmn = 0
countcmn = 0
for i in range(0,len(data2)):
    tweet = data2[i]
    if (tweet['lang'] == 'es')|(tweet['country'] != 'Brasil'):
        continue
    editlen = len(tweet['edits'])
    cidList = []
    for j in range(0,editlen):
        cidList.append(tweet['edits'][j]['curator_id'])
    cidListSet = set(cidList)
    cidList = list(cidListSet)
    cid1 = cidList[0]
    cid2 = cidList[1]
    # Curators do not agree on the annotation of all tweets - clash of tags
    if (tweet['tags'][cid1] == tweet['tags'][cid2]):
        countcmn = countcmn + 1
        if (tweet['tags'][cid1] == 3):
            count3cmn = count3cmn + 1
        if (tweet['tags'][cid1] == 2):
            count2cmn = count2cmn + 1
#     else:
#         if(tweet['ctags'] == 3):
#             count3diff = count3diff + 1
print(count3cmn)
print(countcmn)
print(count2cmn)
print(i)

In [None]:
        
for i in range(0,dsize):
    tweet = data2[i]
    if (tweet['lang'] == 'es')|(tweet['country'] != 'Brasil'):
        continue
    editlen = len(tweet['edits'])
    cidList = []
    for j in range(0,editlen):
        cidList.append(tweet['edits'][j]['curator_id'])
    cidListSet = set(cidList)
    cidList = list(cidListSet)
    cid1 = cidList[0]
    cid2 = cidList[1]
    # Curators do not agree on the annotation of all tweets - clash of tags
    if (tweet['tags'][cid1] == tweet['tags'][cid2]):
        continue
#         print('Common Tags')
#         if (tweet['tags'][cid1] == 3):
#             print('Sickness Tweet')
#             a.writerow((tweet['cr'], tweet['tln'], tweet['tlt'], tweet['country'], tweet['region'], tweet['state'], tweet['county'], tweet['microregion']))
    else:
#         print('Different Tags')
        testTweet = tweet['t']
        processedTestTweet = processTweet(testTweet)
        x = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet,fv)))
        data2[i].update({'ctags':x})
#         print('Classified')
#         if (x == 3):
#             print('Sickness Tweet')
#             a.writerow((tweet['cr'], tweet['tln'], tweet['tlt'], tweet['country'], tweet['region'], tweet['state'], tweet['county'], tweet['microregion']))       
# end

In [None]:
fp.close()

In [None]:
pprint(data2[6])

In [None]:
count3cmn = 0
count3diff = 0
for i in range(0,len(data2)):
    tweet = data2[i]
    if (tweet['lang'] == 'es')|(tweet['country'] != 'Brasil'):
        continue
    editlen = len(tweet['edits'])
    cidList = []
    for j in range(0,editlen):
        cidList.append(tweet['edits'][j]['curator_id'])
    cidListSet = set(cidList)
    cidList = list(cidListSet)
    cid1 = cidList[0]
    cid2 = cidList[1]
    # Curators do not agree on the annotation of all tweets - clash of tags
    if (tweet['tags'][cid1] == tweet['tags'][cid2]):
        if (tweet['tags'][cid1] == 3):
            count3cmn = count3cmn + 1
    else:
        if(tweet['ctags'] == 3):
            count3diff = count3diff + 1
print(count3cmn)
print(count3diff)
print(i)

In [None]:
print(i)

In [None]:
pprint(data2[6])