# Keras generic implementation of tweetshashtag2vec

In [7]:
#import tensorflow backend
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
import keras
from keras.layers import *
from keras.models import Sequential, Model
from keras.optimizers import *
from keras.preprocessing import sequence
print(keras.__version__)
print(keras.backend.backend())
import numpy as np

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7388235101743159890
]


Using TensorFlow backend.


2.1.2
tensorflow


In [3]:
#load data
with open("temp/neg_sample_data.json", "r", encoding="utf-8") as f:
    md_nfreq_sample = json.load(f)
    
with open("temp/tweets4classification.json", "r", encoding="utf-8") as f:
    md_freq_sample = json.load(f)

In [5]:
md_nfreq_sample['data'][0]

{'hashtag_label': [],
 'hashtags': ['cervicalhealthawarenessmonth', 'obamacar'],
 'id': '418769437261455360',
 'orignal_hashtags': ['#cervicalhealthawarenessmonth', '#obamacare'],
 'raw': 'rt @ppsne: jan. is #cervicalhealthawarenessmonth. thanks to #obamacare, pap tests + the hpv vaccine are covered without a copay. http://t.c…',
 'text': 'rt : jan. is cervicalhealthawarenessmonth. thanks to obamacare, pap tests + the hpv vaccine are covered without a copay.',
 'words': ['rt',
  ':',
  'jan.',
  'is',
  'cervicalhealthawarenessmonth',
  '.',
  'thanks',
  'to',
  'obamacare',
  ',',
  'pap',
  'tests',
  '+',
  'the',
  'hpv',
  'vaccine',
  'are',
  'covered',
  'without',
  'a',
  'copay',
  '.']}

In [6]:
md_freq_sample['data'][0]

{'hashtag_label': [1, 5],
 'hashtags': ['hpv', 'vaccin'],
 'id': '418263863772327936',
 'orignal_hashtags': ['#hpv', '#vaccine'],
 'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
 'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
 'words': ['rt',
  ':',
  'hpv',
  'vax',
  'coverage',
  'could',
  'be',
  '93',
  '%',
  'if',
  'doctors',
  'gave',
  'hpv',
  'vaccine',
  'each',
  'time',
  'a',
  'preteen',
  '/',
  'teen',
  'got',
  'any',
  'other',
  'vaccine',
  '>']}

# gensim implementation of tweetshashtag2vec using doc2vec api

In [1]:
import gensim



In [16]:
#data prepare
train_raw_data = md_freq_sample['data'] +  md_nfreq_sample['data']
token_count = sum(len(each['words']) for each in train_raw_data)
print(token_count)
print(len(train_raw_data))
train_raw_data[0]

1960039
115165


{'hashtag_label': [1, 5],
 'hashtags': ['hpv', 'vaccin'],
 'id': '418263863772327936',
 'orignal_hashtags': ['#hpv', '#vaccine'],
 'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
 'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
 'words': ['rt',
  ':',
  'hpv',
  'vax',
  'coverage',
  'could',
  'be',
  '93',
  '%',
  'if',
  'doctors',
  'gave',
  'hpv',
  'vaccine',
  'each',
  'time',
  'a',
  'preteen',
  '/',
  'teen',
  'got',
  'any',
  'other',
  'vaccine',
  '>']}

In [17]:
from gensim.models.doc2vec import TaggedDocument
docs = [TaggedDocument(each['words'], each['orignal_hashtags']) for each in train_raw_data]
docs[0]

TaggedDocument(words=['rt', ':', 'hpv', 'vax', 'coverage', 'could', 'be', '93', '%', 'if', 'doctors', 'gave', 'hpv', 'vaccine', 'each', 'time', 'a', 'preteen', '/', 'teen', 'got', 'any', 'other', 'vaccine', '>'], tags=['#hpv', '#vaccine'])

In [20]:
#training model
model = gensim.models.Doc2Vec(docs, dm = 0, alpha=0.025, min_alpha=0.025, min_count=1)

for epoch in range(100):
    if epoch % 20 == 0:
        print ('Now training epoch %s'%epoch)
    model.train(docs, total_examples = token_count, epochs = model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

Now training epoch 0
Now training epoch 20
Now training epoch 40
Now training epoch 60
Now training epoch 80


In [23]:
# shows the similar words
print (model.most_similar('hpv'))
 
# shows the learnt embedding
print (model['hpv'])
 
# shows the similar docs with id = 3
print (model.docvecs.most_similar(str(3)))

[('liverhealth', 0.3523634374141693), ('hillarys', 0.35067397356033325), ('usfbulls', 0.3452232778072357), ('screenis', 0.3431748151779175), ('meckbocc', 0.3351533114910126), ('linda', 0.33509790897369385), ('reliably', 0.3346606194972992), ('opposehb', 0.3339211046695709), ('curran', 0.3336401581764221), ('heath', 0.33256030082702637)]
[ -3.93741485e-03  -1.98754738e-03  -2.70795985e-03   2.24512909e-03
   3.73278628e-03  -2.55319849e-03   4.50297352e-03  -2.68983818e-03
  -4.09605261e-03   1.00099901e-03   3.14083928e-03  -4.35443129e-03
   3.29009513e-03   1.52686203e-03   3.23361019e-03   5.49465534e-04
   4.65575280e-03  -2.86946818e-03  -2.41307425e-03  -4.73104045e-03
  -1.65909692e-03  -1.75934541e-03   3.03406874e-03   4.34236415e-03
   2.77704652e-03   3.93071445e-03   1.71775371e-03   4.69600409e-03
   1.46136095e-03   3.92845552e-03  -1.66990375e-03  -3.22868675e-03
   1.73513603e-03  -3.32616572e-03   3.40629416e-03   4.16628597e-03
   4.51601576e-03   2.02062516e-03  -9.8

  
  """


TypeError: '<' not supported between instances of 'str' and 'int'