# Tweets classification by its hashtags and no labeled tweets hashtag predication


**Using top n hashtags as label to build a supervised model for tweets classification and hashtag predication**

[1.1 load data](1.1)

## <a id='1.1'> load packages and modeling data </a>

In [1]:
#import tensorflow backend
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12547262523055465833
]


In [22]:
import keras
from keras.layers import *
from keras.models import Sequential, Model
from keras.optimizers import *
from keras.preprocessing import sequence

print(keras.__version__)
print(keras.backend.backend())

2.1.2
tensorflow


In [3]:
import numpy as np
import bcolz
import pickle

def save_array(fname, arr):
    c=bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
    
def load_array(fname):
    return bcolz.open(fname)[:]

def save_dict(fname, dictionary):
    with open(fname, 'wb') as f:
        pickle.dump(dictionary, f)

def load_dict(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)

In [4]:
#load tweets data
import json
tweets_file = "temp/tweets4classification.json"
with open(tweets_file, "r", encoding="utf-8") as f:
    modeling_data = json.load(f)
modeling_data['data'][0]

{'hashtag_label': [1, 5],
 'hashtags': ['hpv', 'vaccin'],
 'id': '418263863772327936',
 'orignal_hashtags': ['#hpv', '#vaccine'],
 'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
 'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
 'words': ['rt',
  ':',
  'hpv',
  'vax',
  'coverage',
  'could',
  'be',
  '93',
  '%',
  'if',
  'doctors',
  'gave',
  'hpv',
  'vaccine',
  'each',
  'time',
  'a',
  'preteen',
  '/',
  'teen',
  'got',
  'any',
  'other',
  'vaccine',
  '>']}

## process word embeddings 

In [5]:
#load processed word enbeddings
path = 'wordsenbeddings/'
res_path = path + 'results/'

def load_vectors(name):
    loc = res_path + name
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [6]:
def get_glove(name, dim):
    with open(path+ 'glove.' + name + '.txt', 'r', encoding="utf-8") as f:
        vecs = []
        words = []
        
        for i, line in enumerate(f):
            d = line.split()
            word = d[0]
            vec = np.array(d[1:], dtype=np.float32)
            if (len(d) == dim): # this is space
                word = ' '
                vec = np.array(d, dtype=np.float32)
            
            words.append(word)            
            vecs.append(vec)

        wordidx = {o:i for i,o in enumerate(words)}
        save_array(res_path+name+'.dat', vecs)
        pickle.dump(words, open(res_path+name+'_words.pkl','wb'))
        pickle.dump(wordidx, open(res_path+name+'_idx.pkl','wb'))

In [7]:
get_glove('twitter.27B.200d', 200)
get_glove('twitter.27B.25d', 25)
get_glove('twitter.27B.50d', 50)
get_glove('twitter.27B.100d', 100)

## prepare train and test sample

In [6]:
data = np.asarray([each['words'] for each in modeling_data['data']])
label = np.asarray([each['hashtag_label'] for each in modeling_data['data']])

print(data[:2])
print(label[:2])
print(len(data))
print(len(label))

[ list(['rt', ':', 'hpv', 'vax', 'coverage', 'could', 'be', '93', '%', 'if', 'doctors', 'gave', 'hpv', 'vaccine', 'each', 'time', 'a', 'preteen', '/', 'teen', 'got', 'any', 'other', 'vaccine', '>'])
 list(['rt', ':', 'hpv', 'vax', 'coverage', 'could', 'be', '93', '%', 'if', 'doctors', 'gave', 'hpv', 'vaccine', 'each', 'time', 'a', 'preteen', '/', 'teen', 'got', 'any', 'other', 'vaccine', '...', '.'])]
[list([1, 5]) list([1, 5])]
81049
81049


In [7]:
def flat_labels(labels):
    flatted = []
    l = modeling_data['categorical_num']
    for label in labels:
        m = [0.] * l
        for each in label:
            m = list(map(lambda x: x[0] + x[1], zip(m, each)))
        flatted.append(m)
    return np.asarray(flatted)    

In [8]:
from keras.utils.np_utils import to_categorical
categorical_label = list(map(lambda x: to_categorical(x, num_classes=modeling_data['categorical_num']), label))

categorical_label_flatted = flat_labels(categorical_label)

print(len(categorical_label))
categorical_label_flatted[:20]

81049


array([[ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0

In [9]:
from sklearn.model_selection import train_test_split
X_train_0, X_test, y_train_0, y_test = train_test_split(data, categorical_label_flatted, test_size=0.2, random_state=42)

In [10]:
print(X_train_0[:2])
print(X_test[:2])
print(y_train_0[:2])
print(y_test[:2])

[ list(['rt', ':', 'new', 'infographic', 'on', 'how', 'most', 'cases', 'of', 'cervicalcancer', 'can', 'be', 'prevented', 'w', '/', 'tests', '&', 'hpv', 'vaccine', '.', 'vitalsigns'])
 list(['check', 'out', 'the', 'gci', 'team', "'s", 'newest', 'publication', 'on', 'hpv', 'vaccine', 'implementation', 'for', 'cancer', 'prevention', 'in', 'latinamerica', '!'])]
[ list(['two', 'uk', 'girls', 'left', 'paralyzed', 'after', 'hpv', 'jabs', '.', 'authorities', 'still', 'claim', 'it', "'s", 'coincidence', '.'])
 list(['cervicalcancer', 'deaths', 'have', 'decreased', 'dramatically', 'over', 'the', 'past', '40', 'years', ',', 'mostly', 'due', 'to', 'increased', 'screening', '.'])]
[[ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]]
[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]]


In [11]:
#save 5% train samples to predication task
cut = int(len(X_train_0) * 0.95)
X_train = X_train_0[:cut]
y_train = y_train_0[:cut]
X_pred = X_train_0[cut:]
y_pred = y_train_0[cut:]

In [34]:
#!!!!!do not run this, run the third one below this to directly load the dictionary
#create words dictionary for the data
from functools import reduce

from collections import defaultdict
frequency = defaultdict(int)

all_tokens = list(reduce(lambda x, y: x + y, [l['words'] for l in modeling_data['data']]))
print(all_tokens[:5])

KeyboardInterrupt: 

In [None]:
print(len(all_tokens))

for token in all_tokens:
    frequency[token] += 1
    
dictionary = sorted(frequency.items(), key=lambda x:x[1], reverse=True)
dictionary = [k for k,v in dictionary[:vocab_size]]
print(dictionary[:10])
len(dictionary)

In [None]:
save_dict('model/dict.dd', dictionary)

In [12]:
vocab_size = 0
dictionary = load_dict('model/dict.dd')
if (not vocab_size):
    vocab_size = len(dictionary)
dictionary[:20]

[':',
 'hpv',
 'rt',
 'vaccine',
 '.',
 'the',
 ',',
 'of',
 'to',
 'cancer',
 'in',
 'gardasil',
 'for',
 'cervicalcancer',
 'a',
 '&',
 'and',
 'is',
 '!',
 'cervical']

In [13]:
#word to index
X_train_f = [[dictionary.index(word) if word in dictionary else -1 for word in doc] for doc in X_train]
X_test_f = [[dictionary.index(word) if word in dictionary else -1 for word in doc] for doc in X_test]
X_pred_f = [[dictionary.index(word) if word in dictionary else -1 for word in doc] for doc in X_pred]

In [14]:
print(X_train_f[:5])
print(X_test_f[:5])
print(X_pred_f[:5])

[[2, 0, 46, 781, 22, 74, 100, 121, 7, 13, 35, 55, 352, 54, 40, 162, 15, 1, 3, 4, 1126], [364, 93, 5, 10430, 1019, 20, 1770, 3765, 22, 1, 3, 1637, 12, 9, 61, 10, 8726, 18], [1159, 49, 251, 22, 97, 4382, 12, 448, 3863, 8, 20, 1, 25, 493, 18, 400, 13], [2, 0, 122, 312, 90, 662, 57, 8, 369, 33, 13, 4, 529, 26, 967, 16, 1777, 831, 8, 39, 365, 12, 1], [2, 0, 27, 0, 11, 131, 19, 328, 10, 31, 244, 54, 40, 1, 51]]
[[210, 338, 41, 215, 165, 34, 1, 471, 4, 459, 182, 409, 29, 20, 411, 4], [13, 180, 57, 1813, 2238, 296, 5, 1477, 1003, 134, 6, 3875, 616, 8, 448, 67, 4], [2, 0, 27, 0, 14, 262, 7, 346, 362, 10, 14, 509, 110, 118, 250, 34, 168, 1, 3], [726, 12, 8803, 423, 23, 155, 15, 281, 223, 397, 8, 1, 0, 492, 287], [97, 1, 16, 9, 153, 2965, 3326, 3551, 22, 2585, 193, 18, 1077, 125, 8, 159, 0]]
[[3, 1413, 264, 41, 177, 1, 68, 32, 757, 37, 697], [2, 0, 4, 84, 981, 18, 11, 3379, 6, 226, 2928, 10352, 1760, 81, 10353, 6, 4817, 19993, 4], [2, 0, 30, 24, 478, 11, 11792, 21], [1, 25, 68, 569, 145, 917, 41,

## embedding words using GloVe

In [15]:
lens = np.array([len(doc) for doc in X_train_f])
(lens.max(), lens.min(), lens.mean())

(39, 2, 17.012127213987693)

In [16]:
seq_len = 39
embedding_dim = 50
vecs, words, wordidx = load_vectors('twitter.27B.%dd'%(embedding_dim))

In [17]:
X_train_u = sequence.pad_sequences(X_train_f, maxlen=seq_len)
X_test_u = sequence.pad_sequences(X_test_f, maxlen=seq_len)
X_pred_u = sequence.pad_sequences(X_pred_f, maxlen=seq_len)

In [18]:
print(X_train_u[:5])
print(X_test_u[:5])
print(X_pred_u[:5])

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     2     0    46   781    22    74
    100   121     7    13    35    55   352    54    40   162    15     1
      3     4  1126]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0   364    93     5
  10430  1019    20  1770  3765    22     1     3  1637    12     9    61
     10  8726    18]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0  1159    49
    251    22    97  4382    12   448  3863     8    20     1    25   493
     18   400    13]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     2     0   122   312    90   662    57     8
    369    33    13     4   529    26   967    16  1777   831     8    39
    365    12     1]
 [    0     0     0     0   

In [19]:
def create_embedding(dictionary):
    print(vecs.shape)
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i, word in enumerate(dictionary):
        #if word:# and re.match(r"^[a-zA-Z0-9\-]*$", word):
        src_idx = wordidx[word] if word in wordidx else 0
        
        if src_idx:
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [20]:
embedding = create_embedding(dictionary)

(1193517, 50)


## create CNN mode

In [24]:
dth = 0.4

model = Sequential()

#embedding layer
model.add(Embedding(vocab_size, embedding_dim, input_length=seq_len, weights=[embedding], trainable=False))
# model.add(Dropout(dth))

#hidden layer
model.add(Conv1D(embedding_dim, 4, padding='same', activation='relu'))
model.add(Conv1D(embedding_dim, 4, padding='same', activation='relu'))
model.add(Dropout(dth))
# model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(embedding_dim, 4, padding='same', activation='relu'))
model.add(Conv1D(embedding_dim, 4, padding='same', activation='relu'))
model.add(Dropout(dth))
# model.add(MaxPooling1D(pool_size=2))
# crf = ChainCRF()
# model.add(crf)
model.add(Conv1D(embedding_dim, 4, padding='same', activation='relu'))
model.add(Conv1D(embedding_dim, 4, padding='same', activation='relu'))
model.add(Dropout(dth))

#output layer
model.add(Flatten())
# model.add(Dense(512, activation='relu'))
# model.add(Dense(1024, activation='relu'))
# model.add(Dropout(dth))
model.add(Dense(modeling_data['categorical_num'], activation='sigmoid'))

#compile model
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
#also can use SDG as optimizer
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 39, 50)            1165350   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 39, 50)            10050     
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 39, 50)            10050     
_________________________________________________________________
dropout_5 (Dropout)          (None, 39, 50)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 39, 50)            10050     
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 39, 50)            10050     
_________________________________________________________________
dropout_6 (Dropout)          (None, 39, 50)            0         
__________

In [26]:
model.optimizer.lr=1e-4
from keras.callbacks import EarlyStopping, ModelCheckpoint
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

model_chk_path = 'model/cnn_hashtagcls_emb{}_weights.validation.h5'.format(embedding_dim)
mcp = ModelCheckpoint(model_chk_path, monitor="val_loss", verbose=1,
                      save_best_only=True, save_weights_only=False)

In [27]:
#quick test
model.fit(X_train_u, y_train, validation_data=(X_test_u, y_test), epochs=100, batch_size=64,  callbacks=[early_stopping])

Train on 61597 samples, validate on 16210 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100


<keras.callbacks.History at 0x1d2268b0d68>

#### result
- 0.7513
- 0.7524
- 0.8014

## train and save model for future usage

In [55]:
model.fit(X_train_u, y_train, validation_data=(X_test_u, y_test), epochs=100, batch_size=64, callbacks=[early_stopping])

Train on 69296 samples, validate on 8105 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

KeyboardInterrupt: 

In [None]:
model.layers[0].trainable=True
model.optimizer.lr=1e-3
model.fit(X_train_u, y_train, validation_data=(X_test_u, y_test), epochs=100, batch_size=64, callbacks=[early_stopping])

In [None]:
#save the model
model.save_weights('model/cnn_hashtagcls_emb{}_weights.h5'.format(embedding_dim))

In [None]:
#combine train and test data and retrain the model as the final model to use for predication
early_stopping = EarlyStopping(monitor='loss', patience=5)
model.fit(
    np.concatenate((X_train_u, y_train), axis=0),
    np.concatenate((X_test_u, y_test), axis=0),
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping])
model.save_weights(
    "model/all_cnn_hashtagcls_emb{}_weights.h5".format(embedding_dim))

## extract the last hidden layer information
>https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer

# Predication
**The predication will be performed on both no_labeled data set and neg_sample**

## load two data sources