# Keras generic implementation of tweetshashtag2vec

In [1]:
#import tensorflow backend
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
import keras
from keras.layers import *
from keras.models import Sequential, Model
from keras.optimizers import *
from keras.preprocessing import sequence
print(keras.__version__)
print(keras.backend.backend())
import numpy as np

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17913481044391591244
]
2.1.2
tensorflow


Using TensorFlow backend.


In [2]:
#load data
with open("temp/neg_sample_data.json", "r", encoding="utf-8") as f:
    md_nfreq_sample = json.load(f)
    
with open("temp/tweets4classification.json", "r", encoding="utf-8") as f:
    md_freq_sample = json.load(f)

In [3]:
md_nfreq_sample['data'][0]

{'hashtag_label': [],
 'hashtags': ['cervicalhealthawarenessmonth', 'obamacar'],
 'id': '418769437261455360',
 'orignal_hashtags': ['#cervicalhealthawarenessmonth', '#obamacare'],
 'raw': 'rt @ppsne: jan. is #cervicalhealthawarenessmonth. thanks to #obamacare, pap tests + the hpv vaccine are covered without a copay. http://t.c…',
 'text': 'rt : jan. is cervicalhealthawarenessmonth. thanks to obamacare, pap tests + the hpv vaccine are covered without a copay.',
 'words': ['rt',
  ':',
  'jan.',
  'is',
  'cervicalhealthawarenessmonth',
  '.',
  'thanks',
  'to',
  'obamacare',
  ',',
  'pap',
  'tests',
  '+',
  'the',
  'hpv',
  'vaccine',
  'are',
  'covered',
  'without',
  'a',
  'copay',
  '.']}

In [4]:
md_freq_sample['data'][0]

{'hashtag_label': [1, 5],
 'hashtags': ['hpv', 'vaccin'],
 'id': '418263863772327936',
 'orignal_hashtags': ['#hpv', '#vaccine'],
 'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
 'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
 'words': ['rt',
  ':',
  'hpv',
  'vax',
  'coverage',
  'could',
  'be',
  '93',
  '%',
  'if',
  'doctors',
  'gave',
  'hpv',
  'vaccine',
  'each',
  'time',
  'a',
  'preteen',
  '/',
  'teen',
  'got',
  'any',
  'other',
  'vaccine',
  '>']}

# gensim implementation of tweetshashtag2vec using doc2vec api

In [6]:
import gensim

## train a doc2vec based on a dataset with all hashtags

In [82]:
#data prepare
train_raw_data = md_freq_sample['data'] +  md_nfreq_sample['data']
print(train_raw_data[0]['words'])
print(len(train_raw_data))
train_raw_data[0]

['rt', ':', 'hpv', 'vax', 'coverage', 'could', 'be', '93', '%', 'if', 'doctors', 'gave', 'hpv', 'vaccine', 'each', 'time', 'a', 'preteen', '/', 'teen', 'got', 'any', 'other', 'vaccine', '>']
115165


{'hashtag_label': [1, 5],
 'hashtags': ['hpv', 'vaccin'],
 'id': '418263863772327936',
 'orignal_hashtags': ['#hpv', '#vaccine'],
 'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
 'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
 'words': ['rt',
  ':',
  'hpv',
  'vax',
  'coverage',
  'could',
  'be',
  '93',
  '%',
  'if',
  'doctors',
  'gave',
  'hpv',
  'vaccine',
  'each',
  'time',
  'a',
  'preteen',
  '/',
  'teen',
  'got',
  'any',
  'other',
  'vaccine',
  '>']}

In [83]:
from gensim.models.doc2vec import TaggedDocument
docs = [TaggedDocument(each['words'], each['orignal_hashtags']) for each in train_raw_data]
words_count = sum(len(each[0]) for each in docs)
tags_count = sum(len(each[1]) for each in docs)
print(words_count)
print(tags_count)
np.random.shuffle(docs)
docs[0]

1960039
239984


TaggedDocument(words=['new', 'concerns', 'about', 'the', 'human', 'papillomavirus', 'vaccine'], tags=['#vaccine'])

In [84]:
%%time 
#training model manully tune alpha
import multiprocessing
#using PV-DM w/ concatenation - window=5 (both sides) 
model = gensim.models.doc2vec.Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, alpha=0.025, min_alpha=0.025, workers=multiprocessing.cpu_count(), iter = 50)
model.build_vocab(docs)

for epoch in range(100):
    if epoch % 10 == 0:
        print ('Now training epoch %s'%epoch)
    model.train(docs, total_examples = words_count, epochs = model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

KeyboardInterrupt: 

In [85]:
%%time
#training model
model2 = gensim.models.doc2vec.Doc2Vec(dm=0, dbow_words=1, size=100, alpha=0.025, min_alpha=0.025, min_count=1, workers=multiprocessing.cpu_count(), iter=100, window = 30)
model2.build_vocab(docs)
model2.train(docs, total_examples = words_count, epochs = model.iter)

KeyboardInterrupt: 

In [86]:
# shows the similar words
print (model.wv.most_similar('vaccine'))
print() 
# shows the learnt embedding
print (model.wv['cancer'])
print() 
# shows the similar docs with tag = #hpv
print (model.docvecs.most_similar("#vaccine"))
print (model.docvecs.most_similar("#vaccination"))
print (model.docvecs.most_similar("#vaccines"))
print() 
#hashtag embeddings
print(model.docvecs['#cervical'])

[('agenda-driven', 11.459138870239258), ('ssmear', 10.960464477539062), ('donotmakeexcuses', 10.914814949035645), ('billgatesfoundation', 10.402748107910156), ('122,844', 9.931833267211914), ('hpvfirst', 9.367964744567871), ('thinkcurefightscancer', 8.569284439086914), ('gymsm', 8.376276016235352), ('hinduwomen', 7.613397121429443), ('repostwhiz', 7.118128299713135)]

[ 0.11264648 -0.21668991  0.13107809 -0.9405598   0.15381834 -0.05187678
  0.20910028 -0.02861865 -0.01222217 -0.18312216 -0.39389199  0.58489096
  0.47533345  0.0807405  -0.19685933 -0.26060975  0.22280709  0.43311453
 -0.15438248  0.06454793 -0.07205147 -0.19594242  0.11337518  0.27306393
  0.24934128  0.30060491 -0.31004781 -0.44996673  0.20935149  0.16052416
 -0.56223422  0.45934319 -0.01562869 -0.64408451  0.1227384   0.14081834
  0.31652755  0.38460553  0.69540685 -0.015424   -0.10904443 -0.17931001
  0.39052817 -0.15917586 -0.47400111 -0.37571183  0.07580312  0.38070074
 -0.13670345 -0.04284313  0.41090503  0.49555

In [87]:
#assess model
origin = model.docvecs['#hpv'] 
word_sims = [('word', word, score) for word, score in model.most_similar([origin],topn=20)]
tag_sims = [('tag', tag, score) for tag, score in model.docvecs.most_similar([origin],topn=20)]
results = sorted((tag_sims + word_sims),key=lambda tup: -tup[2])
results[:20]

  This is separate from the ipykernel package so we can avoid doing imports until


[('tag', '#nocure', 13.237060546875),
 ('tag', '#arhp', 13.116081237792969),
 ('tag', '#actslikecancer', 13.039281845092773),
 ('word', 'ssmear', 12.815620422363281),
 ('word', 'agenda-driven', 12.494791984558105),
 ('word', 'donotmakeexcuses', 11.706401824951172),
 ('tag', '#canturntocancer', 11.06768798828125),
 ('word', '122,844', 10.25916862487793),
 ('word', 'hpvfirst', 10.030200004577637),
 ('word', 'thinkcurefightscancer', 9.484886169433594),
 ('word', 'gymsm', 9.359027862548828),
 ('word', 'billgatesfoundation', 9.23710823059082),
 ('word', 'hinduwomen', 8.3189697265625),
 ('tag', '#usnews', 7.730393886566162),
 ('word', 'gene-editing', 7.472973823547363),
 ('word', 'rar', 7.298567771911621),
 ('word', 'repostwhiz', 7.176763534545898),
 ('tag', '#500pxrtg', 7.152878761291504),
 ('word', 'ahem', 7.11164665222168),
 ('word', "'india", 5.6226806640625)]

In [88]:
#another evaluation
test_set = np.random.choice(train_raw_data, int(len(train_raw_data)*0.2))
test_tag = [each['orignal_hashtags'] for each in test_set]
test_seq = [each['words'] for each in test_set]
print(test_tag[:2])
print(test_seq[:2])

[['#acip', '#hpv', '#vaccine', '#vaccin'], ['#gardasil', '#vaccine']]
[['rt', ':', 'good', 'news', 'for', 'everyone', ':', 'cdc', 'acip', 'approved', 'a', '2-dose', 'schedule', ',', '6', 'months', 'apart', ',', 'for', 'hpv', 'vaccine', 'for', '9-14', 'year', 'olds', '.', 'vaccin'], ['rt', ':', 'dr.', 'harper', ':', 'gardasil', 'was', 'fast-tracked', 'vaccine', ',', 'so', 'we', 'do', "n't", 'know', 'long-term', 'side', 'effects']]


In [89]:
print(test_set[0]['text'])
inferred_vec = model.infer_vector(test_set[0]['words'])
print(test_set[0]['orignal_hashtags'])
model.docvecs.most_similar([inferred_vec])

rt : good news for everyone: cdc acip approved a 2-dose schedule, 6 months apart, for hpv vaccine for 9-14 year olds. vaccin
['#acip', '#hpv', '#vaccine', '#vaccin']


[('#actslikecancer', 18.694869995117188),
 ('#nocure', 16.895462036132812),
 ('#arhp', 16.645540237426758),
 ('#canturntocancer', 16.18458366394043),
 ('#usnews', 10.804193496704102),
 ('#500pxrtg', 8.893560409545898),
 ('#terrible', 0.9951874017715454),
 ('#acip', 0.8571861386299133),
 ('#sucks', 0.8467624187469482),
 ('#therebelpatient', 0.8244351148605347)]

In [90]:
for i, each in enumerate(test_set):
    if i < 10:
        inferred_vec = model.infer_vector(each['words'])
        print("tweet >> \n {}".format(each['text']))
        print("hashtags of tweet >>\n {}".format(each['orignal_hashtags']))
        inferred_res = model.docvecs.most_similar([inferred_vec], topn=5)
        print("predicated hashtags based on tweet >>\n {}".format(inferred_res))
        print("the top 5 second level infer hashtags >>\n {}".format(model.docvecs.most_similar(inferred_res[0][0], topn=5)))
        print()

tweet >> 
 rt : good news for everyone: cdc acip approved a 2-dose schedule, 6 months apart, for hpv vaccine for 9-14 year olds. vaccin
hashtags of tweet >>
 ['#acip', '#hpv', '#vaccine', '#vaccin']
predicated hashtags based on tweet >>
 [('#actslikecancer', 18.50225257873535), ('#nocure', 16.6639347076416), ('#arhp', 16.425540924072266), ('#canturntocancer', 15.531455993652344), ('#usnews', 10.321876525878906)]
the top 5 second level infer hashtags >>
 [('#nocure', 27.818897247314453), ('#canturntocancer', 26.703472137451172), ('#arhp', 24.8717041015625), ('#usnews', 15.32882022857666), ('#500pxrtg', 13.759662628173828)]

tweet >> 
 rt : dr. harper: gardasil was fast-tracked vaccine, so we don't know long-term side effects
hashtags of tweet >>
 ['#gardasil', '#vaccine']
predicated hashtags based on tweet >>
 [('#actslikecancer', 19.759796142578125), ('#nocure', 19.584598541259766), ('#arhp', 17.96923065185547), ('#canturntocancer', 17.141000747680664), ('#usnews', 9.855619430541992)]


## train a t2v model with a dataset containing only tweets with top 11 freq hashtags 

In [78]:
docs_1 = [TaggedDocument(each['words'], each['hashtags']) for each in md_freq_sample['data']]
np.random.shuffle(docs_1)
docs_1[0]

TaggedDocument(words=['hpv', 'vax', 'does', 'not', 'provide', 'herd', 'immunity', 'for', 'unvaccinated', 'women', 'or', 'cross-protection', 'for', 'nonvaccine', 'hpv', 'types', '[', 'a', ']'], tags=['hpv'])

In [79]:
%%time
#using skip-gram model
model_1 = gensim.models.doc2vec.Doc2Vec(dm=0, dbow_words=1, size=100, alpha=0.025, min_alpha=0.025, min_count=0, workers=multiprocessing.cpu_count(), iter = 50)
model_1.build_vocab(docs_1)

for epoch in range(100):
    if epoch % 10 == 0:
        print ('Now training epoch %s'%epoch)
    model_1.train(docs, total_examples = words_count, epochs = model.iter)
    model_1.alpha -= 0.002  # decrease the learning rate
    model_1.min_alpha = model.alpha  # fix the learning rate, no decay

Wall time: 1h 6min 46s


In [95]:
# shows the similar words
print (model_1.wv.most_similar('vaccine'))
print() 
# shows the learnt embedding
print (model_1.wv['cancer'])
print() 
# print (model_1.docvecs.most_similar("vaccine"))
# print (model_1.docvecs.most_similar("vaccination"))
# print (model_1.docvecs.most_similar("vaccines"))
print() 
#hashtag embeddings
print(model_1.docvecs['hpv'])

[('hpv', 0.9193835854530334), ('...', 0.9155786633491516), ('prevent', 0.914875864982605), ('on', 0.9136378765106201), ('how', 0.9131883382797241), ('for', 0.9126969575881958), ('gardasil', 0.9106822609901428), ('an', 0.9084890484809875), ('japan', 0.9073395729064941), ('rt', 0.907110333442688)]

[ 0.14842507  0.12082995  1.05221498  0.33384553  0.69298142  0.1518153
 -0.06987447 -0.42861989  0.15084557  1.83809054 -0.29634374  1.61741209
  0.40982625  0.68114412  1.62130857 -0.00614449  0.22143015  0.76318103
 -0.87167609 -0.1135737   0.56196189 -0.06929655  0.84938121  0.12623297
  0.61328274  0.03627477  1.02752554  0.12261541 -0.25614911 -0.35393363
 -0.24924704  0.55768597  0.2414221   1.00235426 -0.13558559  0.19496635
 -0.8374896  -1.28433609  0.11568487 -0.78347546  0.72523242  0.19256008
  0.49237818 -0.88311279  0.37306106  0.81254292 -2.28756976 -0.46370482
 -0.20897618  0.52372319  1.20068955  1.03341687 -0.86554331 -0.19607796
  0.72531003 -0.07576013  1.20754457  1.027128

In [92]:
test_set = np.random.choice(train_raw_data, int(len(md_freq_sample['data'])*0.2))

for i, each in enumerate(test_set):
    if i < 10:
        inferred_vec = model_1.infer_vector(each['words'])
        print("tweet >> \n {}".format(each['text']))
        print("hashtags of tweet >>\n {}".format(each['hashtags']))
        inferred_res = model_1.docvecs.most_similar([inferred_vec], topn=5)
        print("predicated hashtags based on tweet >>\n {}".format(inferred_res))
        print("the top 5 second level infer hashtags >>\n {}".format(model_1.docvecs.most_similar(inferred_res[0][0], topn=5)))
        print()

tweet >> 
 rt : tune in to a live stream @4:30 est to hear  addressing questions regarding hpv vaccine safety.
hashtags of tweet >>
 ['hpv', 'vaccin']
predicated hashtags based on tweet >>
 [('scotu', 0.39072296023368835), ('toxin', 0.3800101578235626), ('cancercannot', 0.35010913014411926), ('ldconf', 0.3358132243156433), ('activebeat', 0.33009272813796997)]
the top 5 second level infer hashtags >>
 [('mommyproblem', 0.37642332911491394), ('ye', 0.34130701422691345), ('notrust', 0.33203554153442383), ('naturalmedicin', 0.314901202917099), ('verruciformi', 0.31052085757255554)]

tweet >> 
 rt : . . . stop the medical mandates! children are dying! hpv kills. showupday
hashtags of tweet >>
 ['showupday']
predicated hashtags based on tweet >>
 [('zimbabw', 0.4156832695007324), ('womenshealthca', 0.3363494873046875), ('scotland', 0.32562756538391113), ('vaccinateyourfamili', 0.32155245542526245), ('team', 0.30946558713912964)]
the top 5 second level infer hashtags >>
 [('hcihealthequ', 0.3