In [1]:
import builtins
from IPython.lib import deepreload
builtins.reload = deepreload.reload
import json

%load_ext autoreload
%autoreload 2

In [2]:
#group clean text, id and hashtags from tweets
from tweeta.text import extract_hashtags
from nltk.stem import PorterStemmer
import csv

grouped_data = []

ps = PorterStemmer()

with open("data/classified_hpv.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    print(reader.fieldnames)
    for each in reader:
        d = dict()
        d['id'] = str(each['id'])
        d['text'] = each['clean_text'].lower()
        d['raw'] = each['text'].lower()
        d['orignal_hashtags'] = extract_hashtags(each["text"].lower())
        d['hashtags'] = [
            ps.stem(ht[1:]) for ht in extract_hashtags(each["text"].lower())
        ]
        grouped_data.append(d)

grouped_data[:10]

['id', 'text', 'clean_text', 'place', 'user_location', 'us_state', 'created_at', 'username', 'user_id', 'class', 'is_quote_status']


[{'hashtags': [],
  'id': '418175306319421440',
  'orignal_hashtags': [],
  'raw': 'over half a million gardasil vaccines recalled due to glass particle contamination - http://t.co/knuerlgxnl',
  'text': 'over half a million gardasil vaccines recalled due to glass particle contamination -'},
 {'hashtags': [],
  'id': '418188525217923072',
  'orignal_hashtags': [],
  'raw': 'oh, and the hpv vaccine will make you sterile. promise.',
  'text': 'oh, and the hpv vaccine will make you sterile. promise.'},
 {'hashtags': ['hpv', 'vaccin'],
  'id': '418263863772327936',
  'orignal_hashtags': ['#hpv', '#vaccine'],
  'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
  'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>'},
 {'hashtags': ['hpv', 'vaccin'],
  'id': '418194978310209536',
  'orignal_hashtags': ['#hpv', '#vaccine']

In [4]:
#count how many distinct hashtags in the data
import collections
import operator

help = []

for each_group in grouped_data:
    help.extend(each_group['hashtags'])

counter = collections.Counter(help)
print("total number of distinct hashtags: {}".format(len(counter)))
print("total number of tweets: {}".format(len(grouped_data)))

ene = 0
for each_group in grouped_data:
    ene += sum([1 if each_group['hashtags'] else 0])
print("total number of tweets with hashtags: {}".format(ene))

cut_off = 100
filtered_res = []
for k, v in counter.items():
    if v >= cut_off:
        filtered_res.append(k)

print(filtered_res[:5])
print(
    "after filtered the results with cut-off={}, the left number of hashtags is {}".
    format(cut_off, len(filtered_res)))

sorted_hashtags_freq = sorted(
    counter.items(), key=operator.itemgetter(1), reverse=True)
print("top 20 hashtags: ")
sorted_hashtags_freq[:20]

total number of distinct hashtags: 12644
total number of tweets: 271533
total number of tweets with hashtags: 115165
['hpv', 'vaccin', 'gardasil', 'ffrnn', 'wakingup']
after filtered the results with cut-off=100, the left number of hashtags is 200
top 20 hashtags: 


[('hpv', 52454),
 ('vaccin', 20715),
 ('gardasil', 14558),
 ('cervicalcanc', 13457),
 ('cancer', 12978),
 ('learntherisk', 3738),
 ('health', 2939),
 ('hpvvaccin', 2724),
 ('studi', 2458),
 ('vax', 2368),
 ('vaccineswork', 2196),
 ('cervicalhealthmonth', 1797),
 ('vaccineinjuri', 1760),
 ('pathogenposs', 1389),
 ('endcanc', 1309),
 ('gyncsm', 1285),
 ('cdcwhistleblow', 1187),
 ('clinician', 1009),
 ('cervic', 941),
 ('women', 879)]

In [5]:
#obtain a set of hashtags with freq larger than 1000
top_hashtags = set()
for each in sorted_hashtags_freq:
    if each[1] > 2000:
        top_hashtags.add(each[0])
top_hashtags

{'cancer',
 'cervicalcanc',
 'gardasil',
 'health',
 'hpv',
 'hpvvaccin',
 'learntherisk',
 'studi',
 'vaccin',
 'vaccineswork',
 'vax'}

In [6]:
#load general stop words
from nltk.corpus import stopwords
import string
stopWords = set(stopwords.words('english'))
punc = [each for each in string.punctuation] + ["...", "...."]

#get word freq
flag = 0 #use flag to control if we need to keep the stop words in the twitter
from nltk.tokenize import word_tokenize
for each in grouped_data:  
    #filter off stop words and punctuation and hashtags
    if flag == 2:
        each['words'] = [
            w
            for w in
            map(lambda word: word if word not in stopWords and word not in punc and word not in hashtag_list else "",
                [word for word in word_tokenize(each['text'])]) if w
        ]
    #filter off hashtags and punctuation
    elif flag == 1:
        each['words'] = [
            w
            for w in
            map(lambda word: word if word not in punc and word not in hashtag_list else "",
                [word for word in word_tokenize(each['text'])]) if w
        ]
    #keep everyting
    elif flag == 0:
        each['words'] = [word for word in word_tokenize(each['text'])]
        

print(len(grouped_data))
grouped_data[:5]

#might not need to filter the stopwords
#provide three types of data set

271533


[{'hashtags': [],
  'id': '418175306319421440',
  'orignal_hashtags': [],
  'raw': 'over half a million gardasil vaccines recalled due to glass particle contamination - http://t.co/knuerlgxnl',
  'text': 'over half a million gardasil vaccines recalled due to glass particle contamination -',
  'words': ['over',
   'half',
   'a',
   'million',
   'gardasil',
   'vaccines',
   'recalled',
   'due',
   'to',
   'glass',
   'particle',
   'contamination',
   '-']},
 {'hashtags': [],
  'id': '418188525217923072',
  'orignal_hashtags': [],
  'raw': 'oh, and the hpv vaccine will make you sterile. promise.',
  'text': 'oh, and the hpv vaccine will make you sterile. promise.',
  'words': ['oh',
   ',',
   'and',
   'the',
   'hpv',
   'vaccine',
   'will',
   'make',
   'you',
   'sterile',
   '.',
   'promise',
   '.']},
 {'hashtags': ['hpv', 'vaccin'],
  'id': '418263863772327936',
  'orignal_hashtags': ['#hpv', '#vaccine'],
  'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave

In [7]:
#splite the data into two sets, one contains the tweets with labels, one contains tweets without labels
tw_labeled = []
tw_no_label = []

for each in grouped_data:
    if each['hashtags']:
        tw_labeled.append(each)
    else:
        tw_no_label.append(each) 

len(tw_labeled)

115165

In [92]:
#output to json respectively
file_labeled = "temp/labeled_tweets.csv"
file_noLabeled = "temp/not_labeled_tweets.csv"

def save2csv(data, file):
    headers = data[0].keys()
    with open(file, "w", newline="", encoding="utf-8") as fw:
        writer = csv.DictWriter(fw, fieldnames=headers)
        writer.writeheader()
        for each in data:
            writer.writerow(each)
            
save2csv(tw_labeled, file_labeled)
save2csv(tw_no_label, file_noLabeled)

In [93]:
tw_labeled[:2]

[{'hashtags': ['hpv', 'vaccin'],
  'id': '418263863772327936',
  'orignal_hashtags': ['#hpv', '#vaccine'],
  'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
  'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
  'words': ['rt',
   ':',
   'hpv',
   'vax',
   'coverage',
   'could',
   'be',
   '93',
   '%',
   'if',
   'doctors',
   'gave',
   'hpv',
   'vaccine',
   'each',
   'time',
   'a',
   'preteen',
   '/',
   'teen',
   'got',
   'any',
   'other',
   'vaccine',
   '>']},
 {'hashtags': ['hpv', 'vaccin'],
  'id': '418194978310209536',
  'orignal_hashtags': ['#hpv', '#vaccine'],
  'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine....',
  'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen

In [8]:
#extract the tweets with the tag(s) in the most freq tag list
modeling_data = []
neg_sample = []

top_hashtags_map = dict()
for i, each in enumerate(top_hashtags):
    top_hashtags_map[each] = i
flag = 0
for each in tw_labeled:
    each['hashtag_label'] = []
    for tg in each['hashtags']:
        if tg in top_hashtags:
            each['hashtag_label'].append(top_hashtags_map[tg])
            flag = 1
    if flag:
        modeling_data.append(each)
        flag = 0
    else:
        neg_sample.append(each)

modeling_data[:2]

[{'hashtag_label': [2, 5],
  'hashtags': ['hpv', 'vaccin'],
  'id': '418263863772327936',
  'orignal_hashtags': ['#hpv', '#vaccine'],
  'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
  'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
  'words': ['rt',
   ':',
   'hpv',
   'vax',
   'coverage',
   'could',
   'be',
   '93',
   '%',
   'if',
   'doctors',
   'gave',
   'hpv',
   'vaccine',
   'each',
   'time',
   'a',
   'preteen',
   '/',
   'teen',
   'got',
   'any',
   'other',
   'vaccine',
   '>']},
 {'hashtag_label': [2, 5],
  'hashtags': ['hpv', 'vaccin'],
  'id': '418194978310209536',
  'orignal_hashtags': ['#hpv', '#vaccine'],
  'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine....',
  'text': 'rt : hpv vax coverage could 

In [9]:
neg_sample[0]

{'hashtag_label': [],
 'hashtags': ['cervicalhealthawarenessmonth', 'obamacar'],
 'id': '418769437261455360',
 'orignal_hashtags': ['#cervicalhealthawarenessmonth', '#obamacare'],
 'raw': 'rt @ppsne: jan. is #cervicalhealthawarenessmonth. thanks to #obamacare, pap tests + the hpv vaccine are covered without a copay. http://t.c…',
 'text': 'rt : jan. is cervicalhealthawarenessmonth. thanks to obamacare, pap tests + the hpv vaccine are covered without a copay.',
 'words': ['rt',
  ':',
  'jan.',
  'is',
  'cervicalhealthawarenessmonth',
  '.',
  'thanks',
  'to',
  'obamacare',
  ',',
  'pap',
  'tests',
  '+',
  'the',
  'hpv',
  'vaccine',
  'are',
  'covered',
  'without',
  'a',
  'copay',
  '.']}

In [95]:
#twitter length
tmax = 0
tmin = 500
for each in modeling_data:
    size = len(each['words'])
    tmax = max(tmax, size)
    tmin = min(tmin, size)
    if tmin == 0:
        print(each)
print("The longest tweets has {} words".format(tmax))
print("The shortest tweets has {} words".format(tmin))
print("The averaged tweets has {} words".format((tmax - tmin)/2))
print("Total number of modeling sample: {}".format(len(modeling_data)))

The longest tweets has 39 words
The shortest tweets has 2 words
The averaged tweets has 18.5 words
Total number of modeling sample: 81049


In [98]:
#save data for modeling
file_data4CNN = "temp/tweets4classification.json"

with open(file_data4CNN, "w", encoding="utf-8") as f:
    json.dump({'data':modeling_data, 'categorical_num':len(top_hashtags)}, f)

with open(file_data4CNN, "r", encoding="utf-8") as f:
    md = json.load(f)

print(len(md['data']))
print(md['categorical_num'])
md['data'][:1]

81049
11


[{'hashtag_label': [1, 5],
  'hashtags': ['hpv', 'vaccin'],
  'id': '418263863772327936',
  'orignal_hashtags': ['#hpv', '#vaccine'],
  'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
  'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
  'words': ['rt',
   ':',
   'hpv',
   'vax',
   'coverage',
   'could',
   'be',
   '93',
   '%',
   'if',
   'doctors',
   'gave',
   'hpv',
   'vaccine',
   'each',
   'time',
   'a',
   'preteen',
   '/',
   'teen',
   'got',
   'any',
   'other',
   'vaccine',
   '>']}]

In [11]:
# save no labeled data as json
with open("temp/no_labeled_data.json", "w", encoding="utf-8") as f:
    json.dump({'data':tw_no_label}, f)

with open("temp/no_labeled_data.json", "r", encoding="utf-8") as f:
    md = json.load(f)
md['data'][0]

{'hashtags': [],
 'id': '418175306319421440',
 'orignal_hashtags': [],
 'raw': 'over half a million gardasil vaccines recalled due to glass particle contamination - http://t.co/knuerlgxnl',
 'text': 'over half a million gardasil vaccines recalled due to glass particle contamination -',
 'words': ['over',
  'half',
  'a',
  'million',
  'gardasil',
  'vaccines',
  'recalled',
  'due',
  'to',
  'glass',
  'particle',
  'contamination',
  '-']}

In [12]:
# save neg_sample data as json
with open("temp/neg_sample_data.json", "w", encoding="utf-8") as f:
    json.dump({'data':neg_sample}, f)

with open("temp/neg_sample_data.json", "r", encoding="utf-8") as f:
    md = json.load(f)
md['data'][0]

{'hashtag_label': [],
 'hashtags': ['cervicalhealthawarenessmonth', 'obamacar'],
 'id': '418769437261455360',
 'orignal_hashtags': ['#cervicalhealthawarenessmonth', '#obamacare'],
 'raw': 'rt @ppsne: jan. is #cervicalhealthawarenessmonth. thanks to #obamacare, pap tests + the hpv vaccine are covered without a copay. http://t.c…',
 'text': 'rt : jan. is cervicalhealthawarenessmonth. thanks to obamacare, pap tests + the hpv vaccine are covered without a copay.',
 'words': ['rt',
  ':',
  'jan.',
  'is',
  'cervicalhealthawarenessmonth',
  '.',
  'thanks',
  'to',
  'obamacare',
  ',',
  'pap',
  'tests',
  '+',
  'the',
  'hpv',
  'vaccine',
  'are',
  'covered',
  'without',
  'a',
  'copay',
  '.']}