# Tweet classification by hashtag, and hashtag prediction for unlabeled tweets


**Using the top-n hashtags as labels to build a supervised model for tweet classification and hashtag prediction**

[1.1 load data](#1.1)

## <a id='1.1'> load packages and modeling data </a>

In [5]:
# Show which compute devices the TensorFlow backend can see.
from tensorflow.python.client import device_lib
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4736411848066457814
]


In [6]:
import keras
from keras.layers import *
from keras.models import Sequential, Model
from keras.optimizers import *
from keras.preprocessing import sequence

# Report the Keras version and the active backend, one per line.
print(keras.__version__, keras.backend.backend(), sep='\n')

2.1.2
tensorflow


In [7]:
import numpy as np
import bcolz
import pickle

def save_array(fname, arr):
    """Write ``arr`` to an on-disk bcolz carray rooted at ``fname`` (mode 'w')."""
    bcolz.carray(arr, rootdir=fname, mode='w').flush()
    
def load_array(fname):
    """Open the bcolz carray stored at ``fname`` and return its full ``[:]`` slice."""
    stored = bcolz.open(fname)
    return stored[:]

def save_dict(fname, dictionary):
    """Pickle ``dictionary`` into the file ``fname`` (opened in binary write mode)."""
    with open(fname, mode='wb') as handle:
        pickle.dump(dictionary, handle)

def load_dict(fname):
    """Return the object unpickled from the file ``fname``.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(fname, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [8]:
# Read the tweet records (JSON) that the classifier is built from.
import json
tweets_file = "temp/tweets4classification.json"
with open(tweets_file, mode="r", encoding="utf-8") as f:
    modeling_data = json.loads(f.read())
# Show the first record so the available fields are visible.
modeling_data['data'][0]

{'hashtag_label': [1, 5],
 'hashtags': ['hpv', 'vaccin'],
 'id': '418263863772327936',
 'orignal_hashtags': ['#hpv', '#vaccine'],
 'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
 'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
 'words': ['rt',
  ':',
  'hpv',
  'vax',
  'coverage',
  'could',
  'be',
  '93',
  '%',
  'if',
  'doctors',
  'gave',
  'hpv',
  'vaccine',
  'each',
  'time',
  'a',
  'preteen',
  '/',
  'teen',
  'got',
  'any',
  'other',
  'vaccine',
  '>']}

## process word embeddings 

In [9]:
# Folders holding the raw GloVe text files (path) and the processed tables (res_path).
path = 'wordsenbeddings/'
res_path = ''.join((path, 'results/'))

def load_vectors(name):
    """Load one processed embedding table from ``res_path``.

    Args:
        name: table name, e.g. 'twitter.27B.200d'; the files read are
            ``res_path + name`` followed by '.dat', '_words.pkl' and '_idx.pkl'.

    Returns:
        A 3-tuple ``(vectors, words, wordidx)``: the array stored in the
        '.dat' carray and the two unpickled objects.
    """
    loc = res_path + name
    # load_dict opens each pickle in a ``with`` block, so the file handles are
    # closed deterministically (the bare open() calls used before leaked them).
    return (load_array(loc + '.dat'),
            load_dict(loc + '_words.pkl'),
            load_dict(loc + '_idx.pkl'))

In [6]:
def get_glove(name, dim):
    """Parse a raw GloVe text file and store it in processed form.

    Reads ``path + 'glove.' + name + '.txt'`` and writes three files under
    ``res_path``: ``name + '.dat'`` (vectors), ``name + '_words.pkl'`` (word
    list) and ``name + '_idx.pkl'`` (word -> row index).

    Args:
        name: GloVe table name, e.g. 'twitter.27B.200d'.
        dim: number of vector components on every line.

    Raises:
        ValueError: if a non-blank line has fewer than ``dim`` fields.
    """
    vecs = []
    words = []

    with open(path + 'glove.' + name + '.txt', 'r', encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            d = line.split()
            if not d:
                # Blank line: nothing to parse (d[0] used to raise IndexError).
                continue
            if len(d) < dim:
                raise ValueError(
                    'line %d of glove.%s.txt has %d fields, expected at least %d'
                    % (line_no, name, len(d), dim))

            # The vector is always the last `dim` fields. Whatever precedes it
            # is the word; when nothing precedes it the word itself was
            # whitespace (split away), so it is recorded as a single space.
            word = ' '.join(d[:-dim]) or ' '
            vec = np.array(d[-dim:], dtype=np.float32)

            words.append(word)
            vecs.append(vec)

    wordidx = {o: i for i, o in enumerate(words)}
    save_array(res_path + name + '.dat', vecs)
    # save_dict closes the pickle files; the bare open() calls leaked them.
    save_dict(res_path + name + '_words.pkl', words)
    save_dict(res_path + name + '_idx.pkl', wordidx)

In [7]:
# Process every GloVe Twitter table, in the order 200d, 25d, 50d, 100d.
for glove_name, glove_dim in (('twitter.27B.200d', 200),
                              ('twitter.27B.25d', 25),
                              ('twitter.27B.50d', 50),
                              ('twitter.27B.100d', 100)):
    get_glove(glove_name, glove_dim)

## prepare train and test sample

In [10]:
# Tweets have different numbers of words and labels, so both arrays are ragged.
# dtype=object keeps every tweet as a Python list inside a 1-D object array;
# recent NumPy releases raise an error when asked to build a ragged array
# without it.
data = np.asarray([each['words'] for each in modeling_data['data']], dtype=object)
label = np.asarray([each['hashtag_label'] for each in modeling_data['data']], dtype=object)

print(data[:2])
print(label[:2])
print(len(data))
print(len(label))

[ list(['rt', ':', 'hpv', 'vax', 'coverage', 'could', 'be', '93', '%', 'if', 'doctors', 'gave', 'hpv', 'vaccine', 'each', 'time', 'a', 'preteen', '/', 'teen', 'got', 'any', 'other', 'vaccine', '>'])
 list(['rt', ':', 'hpv', 'vax', 'coverage', 'could', 'be', '93', '%', 'if', 'doctors', 'gave', 'hpv', 'vaccine', 'each', 'time', 'a', 'preteen', '/', 'teen', 'got', 'any', 'other', 'vaccine', '...', '.'])]
[list([1, 5]) list([1, 5])]
81049
81049


In [77]:
def flat_labels(labels):
    """Collapse each tweet's group of one-hot rows into a single multi-hot row.

    ``labels`` is an iterable with one entry per tweet; each entry is an
    iterable of one-hot rows. Rows are merged position by position with
    ``or``, starting from ``modeling_data['categorical_num']`` zeros, and
    the merged rows are returned as a NumPy array.
    """
    n_classes = modeling_data['categorical_num']
    merged_rows = []
    for one_hot_rows in labels:
        merged = [0.] * n_classes
        for row in one_hot_rows:
            merged = [so_far or current for so_far, current in zip(merged, row)]
        merged_rows.append(merged)
    return np.asarray(merged_rows)

In [78]:
from keras.utils.np_utils import to_categorical
# One-hot encode every label id of every tweet.
categorical_label = [
    to_categorical(tag_ids, num_classes=modeling_data['categorical_num'])
    for tag_ids in label
]

# Merge each tweet's one-hot rows into one multi-hot target vector.
categorical_label_flatted = flat_labels(categorical_label)

print(len(categorical_label))
categorical_label_flatted[:20]

81049


array([[ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0

In [60]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets as the test set; the fixed seed keeps the split repeatable.
X_train_0, X_test, y_train_0, y_test = train_test_split(
    data,
    categorical_label_flatted,
    random_state=42,
    test_size=0.2,
)

In [61]:
# Show the first two entries of each part of the split.
for split_part in (X_train_0, X_test, y_train_0, y_test):
    print(split_part[:2])

[ list(['rt', ':', 'new', 'infographic', 'on', 'how', 'most', 'cases', 'of', 'cervicalcancer', 'can', 'be', 'prevented', 'w', '/', 'tests', '&', 'hpv', 'vaccine', '.', 'vitalsigns'])
 list(['check', 'out', 'the', 'gci', 'team', "'s", 'newest', 'publication', 'on', 'hpv', 'vaccine', 'implementation', 'for', 'cancer', 'prevention', 'in', 'latinamerica', '!'])]
[ list(['two', 'uk', 'girls', 'left', 'paralyzed', 'after', 'hpv', 'jabs', '.', 'authorities', 'still', 'claim', 'it', "'s", 'coincidence', '.'])
 list(['cervicalcancer', 'deaths', 'have', 'decreased', 'dramatically', 'over', 'the', 'past', '40', 'years', ',', 'mostly', 'due', 'to', 'increased', 'screening', '.'])]
[[0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 1 0 0 0 0]]
[[0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0]]


In [62]:
# Keep the first 95% of the training split for fitting and set the remaining
# 5% aside for the prediction task.
cut = int(len(X_train_0) * 0.95)
X_train, X_pred = X_train_0[:cut], X_train_0[cut:]
y_train, y_pred = y_train_0[:cut], y_train_0[cut:]

In [63]:
# Optional: rebuild the word dictionary from scratch. If model/dict.dd already
# exists, skip ahead to the cell that loads it with load_dict.
#create words dictionary for the data
from functools import reduce
from itertools import chain

from collections import defaultdict
frequency = defaultdict(int)

# chain.from_iterable flattens the per-tweet word lists in one linear pass.
# The earlier reduce(lambda x, y: x + y, ...) copied the growing list on every
# step (quadratic time), which is why this cell had to be interrupted.
all_tokens = list(chain.from_iterable(l['words'] for l in modeling_data['data']))
print(all_tokens[:5])

KeyboardInterrupt: 

In [None]:
print(len(all_tokens))

# Count from scratch so that re-running this cell cannot double the counts.
from collections import Counter
frequency = Counter(all_tokens)

# vocab_size may not exist yet on a fresh kernel, and 0 means "no cap".
# Either way keep every word instead of failing or slicing to an empty list.
max_words = globals().get('vocab_size') or None

# Most frequent words first; the position in this list becomes the word's index.
dictionary = [word for word, _ in frequency.most_common(max_words)]
print(dictionary[:10])
len(dictionary)

In [None]:
# Persist the word dictionary so later sessions can load it instead of rebuilding it.
save_dict(fname='model/dict.dd', dictionary=dictionary)

In [64]:
# A vocab_size of 0 means "use every word in the saved dictionary".
vocab_size = 0
dictionary = load_dict('model/dict.dd')
vocab_size = vocab_size or len(dictionary)
dictionary[:20]

[':',
 'hpv',
 'rt',
 'vaccine',
 '.',
 'the',
 ',',
 'of',
 'to',
 'cancer',
 'in',
 'gardasil',
 'for',
 'cervicalcancer',
 'a',
 '&',
 'and',
 'is',
 '!',
 'cervical']

In [65]:
#word to index
# Build the word -> position lookup once. `word in dictionary` followed by
# `dictionary.index(word)` scanned the whole list twice for every token.
# Iterating in reverse lets the first occurrence win, exactly like list.index.
word_to_index = {word: position
                 for position, word in reversed(list(enumerate(dictionary)))}


def docs_to_indices(docs):
    """Map every word of every document to its dictionary position.

    Words that are not in the dictionary are encoded as -1.
    NOTE(review): -1 is not a valid embedding row and pad_sequences pads with 0,
    which is also the index of the first dictionary word - confirm this is intended.
    """
    return [[word_to_index.get(word, -1) for word in doc] for doc in docs]


X_train_f = docs_to_indices(X_train)
X_test_f = docs_to_indices(X_test)
X_pred_f = docs_to_indices(X_pred)

In [66]:
# Show the first five index-encoded tweets of each set.
for encoded_docs in (X_train_f, X_test_f, X_pred_f):
    print(encoded_docs[:5])

[[2, 0, 46, 781, 22, 74, 100, 121, 7, 13, 35, 55, 352, 54, 40, 162, 15, 1, 3, 4, 1126], [364, 93, 5, 10430, 1019, 20, 1770, 3765, 22, 1, 3, 1637, 12, 9, 61, 10, 8726, 18], [1159, 49, 251, 22, 97, 4382, 12, 448, 3863, 8, 20, 1, 25, 493, 18, 400, 13], [2, 0, 122, 312, 90, 662, 57, 8, 369, 33, 13, 4, 529, 26, 967, 16, 1777, 831, 8, 39, 365, 12, 1], [2, 0, 27, 0, 11, 131, 19, 328, 10, 31, 244, 54, 40, 1, 51]]
[[210, 338, 41, 215, 165, 34, 1, 471, 4, 459, 182, 409, 29, 20, 411, 4], [13, 180, 57, 1813, 2238, 296, 5, 1477, 1003, 134, 6, 3875, 616, 8, 448, 67, 4], [2, 0, 27, 0, 14, 262, 7, 346, 362, 10, 14, 509, 110, 118, 250, 34, 168, 1, 3], [726, 12, 8803, 423, 23, 155, 15, 281, 223, 397, 8, 1, 0, 492, 287], [97, 1, 16, 9, 153, 2965, 3326, 3551, 22, 2585, 193, 18, 1077, 125, 8, 159, 0]]
[[3, 1413, 264, 41, 177, 1, 68, 32, 757, 37, 697], [2, 0, 4, 84, 981, 18, 11, 3379, 6, 226, 2928, 10352, 1760, 81, 10353, 6, 4817, 19993, 4], [2, 0, 30, 24, 478, 11, 11792, 21], [1, 25, 68, 569, 145, 917, 41,

## embedding words using GloVe

In [67]:
# Tweet lengths (in tokens) of the training set: longest, shortest, average.
lens = np.array(list(map(len, X_train_f)))
(lens.max(), lens.min(), lens.mean())

(39, 2, 17.012127213987693)

In [96]:
# seq_len: length every tweet is padded to; embedding_dim: size of the GloVe vectors.
seq_len = 39
embedding_dim = 200
glove_table_name = 'twitter.27B.%dd' % (embedding_dim)
vecs, words, wordidx = load_vectors(glove_table_name)

In [97]:
# Pad every encoded tweet to seq_len (pad_sequences defaults).
X_train_u, X_test_u, X_pred_u = (
    sequence.pad_sequences(encoded, maxlen=seq_len)
    for encoded in (X_train_f, X_test_f, X_pred_f)
)

In [98]:
# Show the first five padded rows of each set.
for padded in (X_train_u, X_test_u, X_pred_u):
    print(padded[:5])

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     2     0    46   781    22    74
    100   121     7    13    35    55   352    54    40   162    15     1
      3     4  1126]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0   364    93     5
  10430  1019    20  1770  3765    22     1     3  1637    12     9    61
     10  8726    18]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0  1159    49
    251    22    97  4382    12   448  3863     8    20     1    25   493
     18   400    13]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     2     0   122   312    90   662    57     8
    369    33    13     4   529    26   967    16  1777   831     8    39
    365    12     1]
 [    0     0     0     0   

In [99]:
def create_embedding(dictionary, vecs=None, wordidx=None, vocab_size=None):
    """Build the initial embedding matrix for the words in ``dictionary``.

    Row ``i`` holds the pretrained vector of ``dictionary[i]`` when the word is
    in ``wordidx``, otherwise a random normal vector (scale 0.6). The last row
    is always re-drawn at random, and the whole matrix is divided by 3.

    Args:
        dictionary: sequence of words; position ``i`` maps to row ``i``.
        vecs: 2-D array of pretrained vectors. Defaults to the global ``vecs``.
        wordidx: dict mapping word -> row in ``vecs``. Defaults to the global
            ``wordidx``.
        vocab_size: number of rows to allocate. Defaults to the global
            ``vocab_size``.

    Returns:
        Array of shape ``(vocab_size, vecs.shape[1])``.
    """
    # Fall back to the notebook-level objects so existing one-argument calls
    # keep working unchanged.
    module_globals = globals()
    if vecs is None:
        vecs = module_globals['vecs']
    if wordidx is None:
        wordidx = module_globals['wordidx']
    if vocab_size is None:
        vocab_size = module_globals['vocab_size']

    print(vecs.shape)
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    # Only the first vocab_size words have a row in the matrix.
    for i, word in enumerate(dictionary[:vocab_size]):
        # Test for None rather than truthiness: row 0 of the pretrained table
        # is a valid hit and used to be treated as "word not found".
        src_idx = wordidx.get(word)

        if src_idx is not None:
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    # NOTE(review): when vocab_size == len(dictionary) this overwrites the
    # vector of the last dictionary word - confirm this is intended.
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb

In [100]:
# Initial weights for the Embedding layer, one row per dictionary word.
embedding = create_embedding(dictionary=dictionary)

(1193517, 200)


## create NN model

In [103]:
#pure cnn model
dth = 0.5        # dropout rate
filter_num = 5   # passed to Conv1D as the kernel size (window width in tokens)

model = Sequential()

#embedding layer: starts from the prepared embedding matrix, frozen for now
model.add(Embedding(vocab_size, embedding_dim, input_length=seq_len, weights=[embedding], trainable=False))

#hidden layers: three identical (Conv1D, Conv1D, Dropout) groups
for _ in range(3):
    model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
    model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
    model.add(Dropout(dth))

#output layer
model.add(Flatten())
# The targets are multi-hot (a tweet can carry several hashtags), so every
# class gets its own independent sigmoid. A softmax forces the outputs to sum
# to 1 and cannot represent two active labels at once.
model.add(Dense(modeling_data['categorical_num'], activation='sigmoid'))

#compile model
# binary_crossentropy scores each label independently; with this loss the
# 'accuracy' metric is computed per label rather than per tweet.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 39, 200)           4661400   
_________________________________________________________________
dropout_18 (Dropout)         (None, 39, 200)           0         
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 39, 200)           200200    
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 39, 200)           200200    
_________________________________________________________________
dropout_19 (Dropout)         (None, 39, 200)           0         
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 39, 200)           200200    
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 39, 200)           200200    
__________

In [102]:
#Bi-LSTM model
dth = 0.5        # dropout rate
filter_num = 5   # passed to Conv1D as the kernel size (window width in tokens)
model = Sequential()

#embedding layer: starts from the prepared embedding matrix, frozen for now
model.add(Embedding(vocab_size, embedding_dim, input_length=seq_len, weights=[embedding], trainable=False))
model.add(Dropout(dth))

#hidden layers: one convolution followed by a bidirectional LSTM
model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dropout(dth))

#output layer
# The targets are multi-hot (a tweet can carry several hashtags), so every
# class gets its own independent sigmoid. A softmax forces the outputs to sum
# to 1 and cannot represent two active labels at once.
model.add(Dense(modeling_data['categorical_num'], activation='sigmoid'))

#compile model
# binary_crossentropy scores each label independently; with this loss the
# 'accuracy' metric is computed per label rather than per tweet.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 39, 200)           4661400   
_________________________________________________________________
dropout_16 (Dropout)         (None, 39, 200)           0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 39, 200)           200200    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 400)               641600    
_________________________________________________________________
dropout_17 (Dropout)         (None, 400)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 11)                4411      
Total params: 5,507,611
Trainable params: 846,211
Non-trainable params: 4,661,400
____________________________________________________________

In [82]:
# Update the optimizer's learning-rate variable in place. Plain assignment
# (model.optimizer.lr = 1e-4) rebinds the attribute to a Python float instead
# of changing the backend variable the optimizer was built with.
keras.backend.set_value(model.optimizer.lr, 1e-4)
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Stop once val_loss has failed to improve for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Checkpoint that keeps only the model with the best val_loss seen so far.
# NOTE(review): mcp is not passed to any fit() call visible in this notebook -
# add it to `callbacks` if checkpoints are wanted.
model_chk_path = 'model/cnn_hashtagcls_emb{}_weights.validation.h5'.format(embedding_dim)
mcp = ModelCheckpoint(model_chk_path, monitor="val_loss", verbose=1,
                      save_best_only=True, save_weights_only=False)

In [93]:
#class_weight
# Hashtag -> class index used in the label vectors.
cw_map = dict(
    cancer=4,
    cervicalcanc=6,
    gardasil=2,
    health=0,
    hpv=9,
    hpvvaccin=3,
    learntherisk=8,
    studi=7,
    vaccin=5,
    vaccineswork=10,
    vax=1,
)

# (hashtag, number) pairs, largest number first.
cw = [
    ('hpv', 52454),
    ('vaccin', 20715),
    ('gardasil', 14558),
    ('cervicalcanc', 13457),
    ('cancer', 12978),
    ('learntherisk', 3738),
    ('health', 2939),
    ('hpvvaccin', 2724),
    ('studi', 2458),
    ('vax', 2368),
    ('vaccineswork', 2196),
]

# Re-key the numbers by class index.
class_weight = {cw_map[tag]: count for tag, count in cw}
class_weight

{0: 2939,
 1: 2368,
 2: 14558,
 3: 2724,
 4: 12978,
 5: 20715,
 6: 13457,
 7: 2458,
 8: 3738,
 9: 52454,
 10: 2196}

In [104]:
#quick test
# Up to 500 epochs, cut short by the early_stopping callback; no class weights.
model.fit(
    X_train_u,
    y_train,
    batch_size=64,
    epochs=500,
    validation_data=(X_test_u, y_test),
    class_weight=None,
    callbacks=[early_stopping],
)

Train on 61597 samples, validate on 16210 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500


<keras.callbacks.History at 0x228143378d0>

#### best result
- 0.8353

#### current best model snapshot:

```python
dth = 0.5
filter_num = 5

model = Sequential()

#embedding layer
model.add(Embedding(vocab_size, embedding_dim, input_length=seq_len, weights=[embedding], trainable=False))

#hidden layer
model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
model.add(Dropout(dth))
model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
model.add(Dropout(dth))
model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
model.add(Conv1D(embedding_dim, filter_num, padding='same', activation='relu'))
model.add(Dropout(dth))

#output layer
model.add(Flatten())
model.add(Dense(modeling_data['categorical_num'], activation='sigmoid'))
```

In [None]:
# Unfreeze the embedding layer so its weights are fine-tuned as well.
model.layers[0].trainable = True
# Keras only honours a changed `trainable` flag after the model is compiled
# again. Recompiling with Adam(lr=1e-4) also sets the learning rate properly;
# assigning a float to model.optimizer.lr does not update the backend variable.
model.compile(loss=model.loss, optimizer=Adam(lr=1e-4), metrics=['accuracy'])
model.fit(X_train_u, y_train, validation_data=(X_test_u, y_test), epochs=100, batch_size=64, callbacks=[early_stopping])

## train and save model for future usage

In [86]:
# Store the trained weights; the file name records the embedding size.
model.save_weights(
    filepath='model/cnn_hashtagcls_emb{}_weights.h5'.format(embedding_dim))

In [None]:
#combine train and test data and retrain the model as the final model to use for prediction
# There is no validation set here, so early stopping watches the training loss.
early_stopping = EarlyStopping(monitor='loss', patience=5)
# Inputs are stacked with inputs and targets with targets. The previous call
# concatenated X_train_u with y_train (and X_test_u with y_test), mixing
# features and labels in both arguments.
X_all = np.concatenate((X_train_u, X_test_u), axis=0)
y_all = np.concatenate((y_train, y_test), axis=0)
model.fit(
    X_all,
    y_all,
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping])
model.save_weights(
    "model/all_cnn_hashtagcls_emb{}_weights.h5".format(embedding_dim))

## extract the last hidden layer information
>https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer

In [38]:
# Display the model's final layer object.
# NOTE(review): layers[-1] is the last layer that was added (the Dense output
# layer in the models built above), not a hidden layer - confirm which layer
# is wanted for feature extraction.
model.layers[-1]

<function Dense.__dir__>

# Prediction
**Prediction is performed on the unlabeled data set, the negative-sample set, and the held-out pred set**

## load two data sources

In [41]:
# Read the two extra JSON data sets used for prediction.
with open("temp/neg_sample_data.json", mode="r", encoding="utf-8") as f:
    neg_sample = json.loads(f.read())

with open("temp/no_labeled_data.json", mode="r", encoding="utf-8") as f:
    nolabel_sample = json.loads(f.read())

In [42]:
# Show the padded hold-out inputs, then the row counts (one per line).
print(X_pred_u)
for row_count in (len(X_pred_u), len(y_pred), len(y_pred)):
    print(row_count)

[[    0     0     0 ...,   757    37   697]
 [    0     0     0 ...,  4817 19993     4]
 [    0     0     0 ...,    11 11792    21]
 ..., 
 [    0     0     0 ...,     5     1     3]
 [    0     0     0 ...,   164    87    75]
 [    0     0     0 ...,    37  1632     0]]
3242
3242
3242


In [106]:
# Predicted class index for the first hold-out tweet.
model.predict_classes(X_pred_u)[0]

0