# Tweets classification by its hashtags and no labeled tweets hashtag predication


**Using top n hashtags as label to build a supervised model for tweets classification and hashtag predication**

## load packages and modeling data

In [1]:
#import tensorflow backend
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1727432216832600299
]


In [2]:
import keras
from keras.layers import *
from keras.models import Sequential, Model
from keras.optimizers import *
from keras.preprocessing import sequence

print(keras.__version__)
print(keras.backend.backend())

Using TensorFlow backend.


2.0.8
tensorflow


In [3]:
import numpy as np
import bcolz
import pickle

def save_array(fname, arr):
    c=bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
    
def load_array(fname):
    return bcolz.open(fname)[:]

def save_dict(fname, dictionary):
    with open(fname, 'wb') as f:
        pickle.dump(dictionary, f)

def load_dict(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)

In [4]:
#load tweets data
import json
tweets_file = "temp/tweets4classification.json"
with open(tweets_file, "r", encoding="utf-8") as f:
    modeling_data = json.load(f)
modeling_data['data'][0]

{'hashtag_label': [1, 5],
 'hashtags': ['hpv', 'vaccin'],
 'id': '418263863772327936',
 'orignal_hashtags': ['#hpv', '#vaccine'],
 'raw': 'rt @cdcstd: #hpv vax coverage could be 93% if doctors gave hpv #vaccine each time a preteen/teen got any other vaccine&gt; http://t.co/xxryga5…',
 'text': 'rt : hpv vax coverage could be 93% if doctors gave hpv vaccine each time a preteen / teen got any other vaccine>',
 'words': ['rt',
  ':',
  'hpv',
  'vax',
  'coverage',
  'could',
  'be',
  '93',
  '%',
  'if',
  'doctors',
  'gave',
  'hpv',
  'vaccine',
  'each',
  'time',
  'a',
  'preteen',
  '/',
  'teen',
  'got',
  'any',
  'other',
  'vaccine',
  '>']}

## process word embeddings 

In [5]:
#load processed word enbeddings
path = 'wordsenbeddings/'
res_path = path + 'results/'

def load_vectors(name):
    loc = res_path + name
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [6]:
def get_glove(name, dim):
    with open(path+ 'glove.' + name + '.txt', 'r', encoding="utf-8") as f:
        vecs = []
        words = []
        
        for i, line in enumerate(f):
            d = line.split()
            word = d[0]
            vec = np.array(d[1:], dtype=np.float32)
            if (len(d) == dim): # this is space
                word = ' '
                vec = np.array(d, dtype=np.float32)
            
            words.append(word)            
            vecs.append(vec)

        wordidx = {o:i for i,o in enumerate(words)}
        save_array(res_path+name+'.dat', vecs)
        pickle.dump(words, open(res_path+name+'_words.pkl','wb'))
        pickle.dump(wordidx, open(res_path+name+'_idx.pkl','wb'))

In [7]:
get_glove('twitter.27B.200d', 200)
get_glove('twitter.27B.25d', 25)
get_glove('twitter.27B.50d', 50)
get_glove('twitter.27B.100d', 100)

## prepare train and test sample

In [8]:
data = np.asarray([each['words'] for each in modeling_data['data']])
label = np.asarray([each['hashtag_label'] for each in modeling_data['data']])

print(data[:2])
print(label[:2])
print(len(data))
print(len(label))

[ list(['rt', ':', 'hpv', 'vax', 'coverage', 'could', 'be', '93', '%', 'if', 'doctors', 'gave', 'hpv', 'vaccine', 'each', 'time', 'a', 'preteen', '/', 'teen', 'got', 'any', 'other', 'vaccine', '>'])
 list(['rt', ':', 'hpv', 'vax', 'coverage', 'could', 'be', '93', '%', 'if', 'doctors', 'gave', 'hpv', 'vaccine', 'each', 'time', 'a', 'preteen', '/', 'teen', 'got', 'any', 'other', 'vaccine', '...', '.'])]
[list([1, 5]) list([1, 5])]
81049
81049


In [30]:
def flat_labels(labels):
    flatted = []
    l = modeling_data['categorical_num']
    for label in labels:
        m = [0.] * l
        for each in label:
            m = list(map(lambda x: x[0] + x[1], zip(m, each)))
        flatted.append(m)
    return np.asarray(flatted)    

In [31]:
from keras.utils.np_utils import to_categorical
categorical_label = list(map(lambda x: to_categorical(x, num_classes=modeling_data['categorical_num']), label))

categorical_label_flatted = flat_labels(categorical_label)

print(len(categorical_label))
categorical_label_flatted[:20]

81049


array([[ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0

In [47]:
from sklearn.model_selection import train_test_split
X_train_0, X_test, y_train_0, y_test = train_test_split(data, categorical_label_flatted, test_size=0.1, random_state=42)

In [48]:
print(X_train_0[:2])
print(X_test[:2])
print(y_train_0[:2])
print(y_test[:2])

[ list(['rt', ':', 'hpv', 'fact', ':', 'hpv', 'is', 'the', 'primary', 'cause', 'of', 'cervicalcancer', ',', 'certain', 'types', 'of', 'head', '&', 'neck', 'cancer', ',', 'in', 'addition', 'to', 'several', 'rare', 'cance'])
 list(['rt', ':', 'study', ':', 'hpv', 'vaccine', 'linked', 'to', 'premature', 'menopause', 'in', 'young', 'girls'])]
[ list(['two', 'uk', 'girls', 'left', 'paralyzed', 'after', 'hpv', 'jabs', '.', 'authorities', 'still', 'claim', 'it', "'s", 'coincidence', '.'])
 list(['cervicalcancer', 'deaths', 'have', 'decreased', 'dramatically', 'over', 'the', 'past', '40', 'years', ',', 'mostly', 'due', 'to', 'increased', 'screening', '.'])]
[[ 0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0.  0.  1.  0.  1.  0.  0.  0.]]
[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]]


In [53]:
#save 5% train samples to predication task
cut = int(len(X_train_0) * 0.95)
X_train = X_train_0[:cut]
y_train = y_train_0[:cut]
X_pred = X_train_0[cut:]
y_pred = y_train_0[cut:]

In [34]:
#!!!!!do not run this, run the third one below this to directly load the dictionary
#create words dictionary for the data
from functools import reduce

from collections import defaultdict
frequency = defaultdict(int)

all_tokens = list(reduce(lambda x, y: x + y, [l['words'] for l in modeling_data['data']]))
print(all_tokens[:5])

KeyboardInterrupt: 

In [None]:
print(len(all_tokens))
vocab_size = 0

for token in all_tokens:
    frequency[token] += 1
    
dictionary = sorted(frequency.items(), key=lambda x:x[1], reverse=True)
if (not vocab_size):
    vocab_size = len(dictionary)
dictionary = [k for k,v in dictionary[:vocab_size]]
print(dictionary[:10])
len(dictionary)

In [None]:
save_dict('model/dict.dd', dictionary)

In [54]:
dictionary = load_dict('model/dict.dd')
dictionary[:20]

[':',
 'hpv',
 'rt',
 'vaccine',
 '.',
 'the',
 ',',
 'of',
 'to',
 'cancer',
 'in',
 'gardasil',
 'for',
 'cervicalcancer',
 'a',
 '&',
 'and',
 'is',
 '!',
 'cervical']

In [55]:
#word to index
X_train_f = [[dictionary.index(word) if word in dictionary else -1 for word in doc] for doc in X_train]
X_test_f = [[dictionary.index(word) if word in dictionary else -1 for word in doc] for doc in X_test]
X_pred_f = [[dictionary.index(word) if word in dictionary else -1 for word in doc] for doc in X_pred]

In [56]:
print(X_train_f[:5])
print(X_test_f[:5])
print(X_pred_f[:5])

[[2, 0, 1, 417, 0, 1, 17, 5, 730, 88, 7, 13, 6, 353, 105, 7, 96, 15, 77, 9, 6, 10, 3236, 8, 347, 755, 1537], [2, 0, 27, 0, 1, 3, 186, 8, 307, 327, 10, 91, 41], [1892, 2084, 3492, 22, 1, 25, 1012], [57, 24, 682, 7, 5, 238, 245, 73, 16, 311, 326, 21], [1, 28, 208, 201, 164, 503, 5388, 126, 66, 1, 3, 63, 592, 53, 8, 117, 119, 69, 4194]]
[[210, 338, 41, 215, 165, 34, 1, 471, 4, 459, 182, 409, 29, 20, 411, 4], [13, 180, 57, 1813, 2238, 296, 5, 1477, 1003, 134, 6, 3875, 616, 8, 448, 67, 4], [2, 0, 27, 0, 14, 262, 7, 346, 362, 10, 14, 509, 110, 118, 250, 34, 168, 1, 3], [726, 12, 8803, 423, 23, 155, 15, 281, 223, 397, 8, 1, 0, 492, 287], [97, 1, 16, 9, 153, 2965, 3326, 3551, 22, 2585, 193, 18, 1077, 125, 8, 159, 0]]
[[2, 0, 1, 76, 872, 12, 175, 229, 78, 17, 14, 174, 38, 35, 226, 8, 353, 47, 16, 354, 648, 10, 140, 4], [2, 0, 13, 5664, 30, 396, 81, 844, 484, 158, 1003, 44, 7, 41, 1808, 499, 30, 98], [596, 40, 714, 24, 6139, 182, 151, 48, 16837, 5, 417, 38, 11, 17, 36, 561, 38, 175, 992, 174, 62

## embedding words using GloVe

In [57]:
lens = np.array([len(doc) for doc in X_train_f])
(lens.max(), lens.min(), lens.mean())

(39, 2, 17.016061533133225)

In [58]:
seq_len = 40
embedding_dim = 50
vecs, words, wordidx = load_vectors('twitter.27B.%dd'%(embedding_dim))

In [59]:
X_train_u = sequence.pad_sequences(X_train_f, maxlen=seq_len)
X_test_u = sequence.pad_sequences(X_test_f, maxlen=seq_len)
X_pred_u = sequence.pad_sequences(X_pred_f, maxlen=seq_len)

In [60]:
print(X_train_u[:5])
print(X_test_u[:5])
print(X_pred_u[:5])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    2
     0    1  417    0    1   17    5  730   88    7   13    6  353  105
     7   96   15   77    9    6   10 3236    8  347  755 1537]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    2
     0   27    0    1    3  186    8  307  327   10   91   41]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 1892 2084 3492   22    1   25 1012]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
    57   24  682    7    5  238  245   73   16  311  326   21]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    1   28  208  201  164  503 5388
   126   66    1    3   63  

In [61]:
def create_embedding(dictionary):
    print(vecs.shape)
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i, word in enumerate(dictionary):
        #if word:# and re.match(r"^[a-zA-Z0-9\-]*$", word):
        src_idx = wordidx[word] if word in wordidx else 0
        
        if src_idx:
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [62]:
wordidx

{'<user>': 0,
 '.': 1,
 ':': 2,
 'rt': 3,
 ',': 4,
 '<repeat>': 5,
 '<hashtag>': 6,
 '<number>': 7,
 '<url>': 8,
 '!': 9,
 'i': 10,
 'a': 11,
 '"': 12,
 'the': 13,
 '?': 14,
 'you': 15,
 'to': 16,
 '(': 17,
 '<allcaps>': 18,
 '<elong>': 19,
 ')': 20,
 'me': 21,
 'de': 22,
 '<smile>': 23,
 '！': 24,
 'que': 25,
 'and': 26,
 '。': 27,
 '-': 28,
 'my': 29,
 'no': 30,
 '、': 31,
 'is': 32,
 'it': 33,
 '…': 34,
 'in': 35,
 'n': 36,
 'for': 37,
 '/': 38,
 'of': 39,
 'la': 40,
 "'s": 41,
 '*': 42,
 'do': 43,
 "n't": 44,
 'that': 45,
 'on': 46,
 'y': 47,
 "'": 48,
 'e': 49,
 'o': 50,
 'u': 51,
 'en': 52,
 'this': 53,
 'el': 54,
 'so': 55,
 'be': 56,
 "'m": 57,
 'with': 58,
 'just': 59,
 '>': 60,
 'your': 61,
 '^': 62,
 'like': 63,
 'have': 64,
 'te': 65,
 'at': 66,
 '？': 67,
 'love': 68,
 'se': 69,
 'are': 70,
 '<': 71,
 'm': 72,
 'r': 73,
 'if': 74,
 'all': 75,
 'b': 76,
 '・': 77,
 'not': 78,
 'but': 79,
 'we': 80,
 'es': 81,
 'ya': 82,
 '&': 83,
 'follow': 84,
 'up': 85,
 'what': 86,
 'get': 87

In [63]:
embedding = create_embedding(dictionary)

(1193517, 50)


## create CNN mode

In [64]:
dropout_threshold = 0.2
model = Sequential()

#embedding layer
model.add(Embedding(vocab_size, embedding_dim, input_length=seq_len, weights=[embedding], trainable=False, dropout=0.2))
# model.add(Dropout(dropout_threshold))
model.add(Conv1D(embedding_dim, 5, padding='same', activation='relu'))
model.add(Dropout(dropout_threshold))
model.add(Dense(11, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
#also can use SDG as optimizer
model.summary()

  """


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 50)            1165350   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 40, 50)            12550     
_________________________________________________________________
dropout_1 (Dropout)          (None, 40, 50)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 40, 11)            561       
Total params: 1,178,461
Trainable params: 13,111
Non-trainable params: 1,165,350
_________________________________________________________________


In [66]:
model.layers[0].trainable=True
model.fit(X_train_u, y_train, validation_data=(X_test_u, y_test), epochs=100, batch_size=64)

ValueError: Error when checking target: expected dense_1 to have 3 dimensions, but got array with shape (69296, 11)