# CNN Dynamic + ELMO 
This notebook is one of the experiments with a CNN (Convolutional Neural Network) that combines randomly initialised, trainable (dynamic) embeddings with ELMo embeddings. Note that the ELMo embeddings for the preprocessed dataset must be computed and saved to a file beforehand.

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Concatenate, Bidirectional
from keras.models import Model, Sequential
from keras.metrics import categorical_accuracy
from keras.metrics import binary_accuracy
from keras.utils import to_categorical
from keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import KFold
import numpy as np
import json
import pandas as pd
import h5py

Using TensorFlow backend.


In [2]:
# --- Experiment configuration and shared state ---

# Fixed seed so that the fold assignment and the NumPy-based batch shuffling
# are repeatable across kernel restarts. Other sources of randomness (e.g. the
# TensorFlow backend) are not controlled here, so results may still vary.
seed = 42
np.random.seed(seed)

# Containers filled in by the data-preparation cells below.
texts = []          # text of each training example
texts_valid = []
labels_index = {}   # label value -> integer id
labels = []         # per-example set of label ids
labels_valid = []
word_index = {}     # word -> token index, set after the tokenizer is fitted
mlb = MultiLabelBinarizer()

max_nb_words = 20000 # vocabulary size
#h5path = '/media/ssd/mydataset.hdf5'
h5path = '/media/ssd/mydataset-preprocessed.hdf5'  # HDF5 file read by the batch generator
embedding_size = 1024      # last dimension of the precomputed embedding input
max_sequence_length = 50   # number of tokens kept per example
# random_state makes the shuffled 10-fold split reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [4]:
# Tab-separated file without a header row. The four columns are named
# user / label / none / text, and only label and text are loaded.
dataset_file = "../data/train-preprocessed.tsv"
#dataset_file = "../data/train.tsv"  # alternative input file
dataset_sample = pd.read_csv(
    dataset_file,
    sep='\t',
    header=None,
    names=['user', 'label', 'none', 'text'],
    usecols=['label', 'text'],
)

In [5]:
# Prepare training data.
# Reset the containers first so that re-running this cell does not append the
# whole dataset a second time.
texts = []
labels = []
labels_index = {}

# Walk the two columns in lockstep instead of DataFrame.iterrows(), which
# builds a Series object for every row.
for label_value, text in zip(dataset_sample["label"], dataset_sample["text"]):
    # The first time a label value is seen it gets the next free integer id;
    # afterwards the stored id is reused.
    label_id = labels_index.setdefault(label_value, len(labels_index))
    texts.append(text)
    # Each example carries a set of label ids (a single id here); the list of
    # sets is later passed to the MultiLabelBinarizer.
    labels.append({label_id})

In [6]:
# Fit a tokenizer on all texts and turn every text into a sequence of
# integer token indices.
tokenizer = Tokenizer(num_words=max_nb_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Vocabulary index restricted to the entries whose index is below max_nb_words.
# word_index = tokenizer.word_index
word_index = {
    word: idx
    for word, idx in tokenizer.word_index.items()
    if idx < max_nb_words
}
print("Found %s unique tokens." % len(word_index))

# Pad or truncate (both at the end) to exactly max_sequence_length entries.
data = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding="post",
    truncating="post",
)
print("Shape of data tensor:", data.shape)

# Binary indicator matrix with one column per label id.
labels_cat = mlb.fit_transform(labels)
print("Shape of label tensor:", labels_cat.shape)


Found 19999 unique tokens.
Shape of data tensor: (500000, 50)
Shape of label tensor: (500000, 2)


# Embeddings

## ELMO

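The generator below streams batches of the precomputed ELMo embeddings together with the matching token sequences, so that both kinds of embeddings can be mixed in the model.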

In [8]:
def gen_elmo_tweets_embeddings(h5path, indices, batchSize, shuffle, data):
    """Endlessly yield ``([embedding_batch, token_batch], label_batch)`` tuples.

    Args:
        h5path: Path of the HDF5 file holding the ``'mydataset'`` (precomputed
            embeddings) and ``'labels'`` datasets.
        indices: Row indices (into both the HDF5 datasets and ``data``) that
            this generator may serve. The sequence is copied and never
            modified.
        batchSize: Maximum number of rows per batch; the last batch of a pass
            is smaller when ``len(indices)`` is not a multiple of it.
        shuffle: If true, the index order is reshuffled with ``np.random``
            before every pass over the data.
        data: Array of token-index sequences, indexed by the same row indices.

    Yields:
        ``([bx, data[batch]], by)`` where ``bx`` holds the embedding rows cut
        to ``max_sequence_length`` positions and ``by`` the label rows.

    Raises:
        ValueError: If ``indices`` is empty (the generator would otherwise
            loop forever without yielding anything).
    """
    # Private copy: shuffling/sorting must not reorder the caller's array.
    order = np.array(indices)
    if len(order) == 0:
        raise ValueError("indices must not be empty")

    # The context manager closes the file when the generator is closed or
    # garbage-collected instead of leaking the handle.
    with h5py.File(h5path, "r") as db:
        while True:
            if shuffle:
                np.random.shuffle(order)
            for start in range(0, len(order), batchSize):
                # h5py index-list selections must be in increasing order, so
                # each batch is served sorted; np.sort returns a new array.
                batch_indices = np.sort(order[start:start + batchSize])

                bx = db['mydataset'][batch_indices, :max_sequence_length, :]
                by = db["labels"][batch_indices, :]

                yield ([bx, data[batch_indices]], by)

# CNN

## Random + ELMO

In [None]:
import time


def build_cnn_model():
    """Build and compile a fresh 3-layer CNN over the concatenated embeddings.

    The model has two inputs, in this order: the precomputed embedding tensor
    of shape ``(max_sequence_length, embedding_size)`` and the token-index
    sequence of length ``max_sequence_length``. This is the order in which
    ``gen_elmo_tweets_embeddings`` yields them. The token indices go through a
    trainable, randomly initialised 300-dimensional embedding that is
    concatenated with the precomputed embeddings before the convolutions.
    """
    embedded_sequences = Input(shape=(max_sequence_length, embedding_size,), dtype="float32")
    sequence_input = Input(shape=(max_sequence_length,), dtype="int32")
    modelRandom = Embedding(len(word_index) + 1, 300, embeddings_initializer="normal",
                            input_length=max_sequence_length, trainable=True)(sequence_input)

    modelMergeEmbeddings = Concatenate()([modelRandom, embedded_sequences])

    x = Conv1D(128, 5, activation="relu")(modelMergeEmbeddings)
    x = MaxPooling1D(2)(x)
    x = Conv1D(128, 5, activation="relu")(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(128, 5, activation="relu")(x)
    x = MaxPooling1D(4)(x)
    x = Flatten()(x)
    #x = Dropout(0.2)(x)
    x = Dense(128, activation="relu")(x)
    #x = Dropout(0.2)(x)
    preds = Dense(2, activation="softmax")(x)

    cnn = Model([embedded_sequences, sequence_input], preds)
    # NOTE(review): binary_crossentropy is paired with a 2-unit softmax here;
    # categorical_crossentropy is the conventional choice. Left unchanged so
    # the numbers stay comparable with earlier runs.
    cnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=[categorical_accuracy])
    return cnn


precisions = []
recalls = []
f1s = []
times = []
datasize = len(data)   # one row per example; avoids a hard-coded dataset size
batchSize = 100

# KFold only needs the number of samples, so an index array is enough.
for train, test in kfold.split(np.arange(datasize)):
    # A new model per fold so no weights leak between folds.
    model = build_cnn_model()
    model.summary()   # summary() prints by itself and returns None

    # Ceiling division: the generator emits a final partial batch when the
    # fold size is not a multiple of batchSize, and one pass must cover it.
    train_steps = -(-len(train) // batchSize)
    test_steps = -(-len(test) // batchSize)

    t0 = time.time()
    model.fit_generator(gen_elmo_tweets_embeddings(h5path, train, batchSize=batchSize, shuffle=True, data=data),
                        validation_data=gen_elmo_tweets_embeddings(h5path, test, batchSize=batchSize, shuffle=False, data=data),
                        steps_per_epoch=train_steps,
                        validation_steps=test_steps,
                        epochs=5)
    t1 = time.time()
    times.append(t1 - t0)

    # Ground-truth labels of the held-out fold, in the index order of `test`.
    with h5py.File(h5path, "r") as db:
        labels_test = db["labels"][test, :]

    # Evaluate the model assigning zeros and ones according to a threshold.
    # With test_steps batches the predictions cover every test row exactly once.
    pred = model.predict_generator(gen_elmo_tweets_embeddings(h5path, test, batchSize=batchSize, shuffle=False, data=data),
                                   steps=test_steps)
    print(pred.shape)
    pred = (pred >= 0.5).astype(pred.dtype)
    print(classification_report(labels_test, pred, digits=4))
    precisions.append(precision_score(labels_test, pred, average="weighted"))
    recalls.append(recall_score(labels_test, pred, average="weighted"))
    f1s.append(f1_score(labels_test, pred, average="weighted"))

# Mean and standard deviation of each metric over the folds.
print("Precision: %.4f (+/- %.4f)" % (np.mean(precisions), np.std(precisions)))
print("Recall: %.4f (+/- %.4f)" % (np.mean(recalls), np.std(recalls)))
print("F1 Score: %.4f (+/- %.4f)" % (np.mean(f1s), np.std(f1s)))
print("Training time: %.4f (+/- %.4f)" % (np.mean(times), np.std(times)))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 300)      6000000     input_4[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 50, 1024)     0                                            
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 50, 1324)     0           embedding_2[0][0]                
                                                                 input_3[0][0]                    
__________

  % delta_t_median)
  % delta_t_median)


Epoch 4/5
Epoch 5/5
   3/4500 [..............................] - ETA: 19:01 - loss: 0.2884 - categorical_accuracy: 0.8800

  % delta_t_median)


(50000, 2)
              precision    recall  f1-score   support

           0     0.8268    0.7065    0.7619     22088
           1     0.7917    0.8828    0.8348     27912

   micro avg     0.8049    0.8049    0.8049     50000
   macro avg     0.8092    0.7947    0.7984     50000
weighted avg     0.8072    0.8049    0.8026     50000
 samples avg     0.8049    0.8049    0.8049     50000

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 50, 300)      6000000     input_6[0][0]                    
__________________________________________________________________________________________________
input_5 (Input

  % delta_t_median)


   5/4500 [..............................] - ETA: 26:59 - loss: 0.4039 - categorical_accuracy: 0.8280

  % delta_t_median)
  % delta_t_median)


Epoch 3/5
   4/4500 [..............................] - ETA: 25:17 - loss: 0.3515 - categorical_accuracy: 0.8525

  % delta_t_median)


Epoch 4/5
  18/4500 [..............................] - ETA: 15:41 - loss: 0.3260 - categorical_accuracy: 0.8533

  % delta_t_median)


Epoch 5/5
   3/4500 [..............................] - ETA: 26:26 - loss: 0.3081 - categorical_accuracy: 0.8700

  % delta_t_median)


   6/4500 [..............................] - ETA: 24:34 - loss: 0.3054 - categorical_accuracy: 0.8733

  % delta_t_median)


   7/4500 [..............................] - ETA: 26:55 - loss: 0.3010 - categorical_accuracy: 0.8757

  % delta_t_median)


   8/4500 [..............................] - ETA: 24:32 - loss: 0.2941 - categorical_accuracy: 0.8800

  % delta_t_median)


  11/4500 [..............................] - ETA: 20:50 - loss: 0.2950 - categorical_accuracy: 0.8791

  % delta_t_median)


(50000, 2)
              precision    recall  f1-score   support

           0     0.8358    0.7193    0.7732     22171
           1     0.7987    0.8874    0.8407     27829

   micro avg     0.8129    0.8129    0.8129     50000
   macro avg     0.8173    0.8033    0.8069     50000
weighted avg     0.8152    0.8129    0.8108     50000
 samples avg     0.8129    0.8129    0.8129     50000

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 50, 300)      6000000     input_8[0][0]                    
__________________________________________________________________________________________________
input_7 (Input

  % delta_t_median)


   4/4500 [..............................] - ETA: 18:36 - loss: 0.4074 - categorical_accuracy: 0.8150

  % delta_t_median)
  % delta_t_median)


Epoch 3/5
Epoch 4/5
Epoch 5/5
   3/4500 [..............................] - ETA: 16:46 - loss: 0.2400 - categorical_accuracy: 0.9000

  % delta_t_median)


   5/4500 [..............................] - ETA: 23:30 - loss: 0.2550 - categorical_accuracy: 0.8860

  % delta_t_median)


   8/4500 [..............................] - ETA: 20:58 - loss: 0.2667 - categorical_accuracy: 0.8800

  % delta_t_median)


(50000, 2)
              precision    recall  f1-score   support

           0     0.8239    0.7163    0.7663     22038
           1     0.7973    0.8793    0.8363     27962

   micro avg     0.8075    0.8075    0.8075     50000
   macro avg     0.8106    0.7978    0.8013     50000
weighted avg     0.8090    0.8075    0.8054     50000
 samples avg     0.8075    0.8075    0.8075     50000

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 50, 300)      6000000     input_10[0][0]                   
__________________________________________________________________________________________________
input_9 (Input

  % delta_t_median)
  % delta_t_median)


Epoch 5/5
   2/4500 [..............................] - ETA: 17:37 - loss: 0.2326 - categorical_accuracy: 0.9000

  % delta_t_median)


   8/4500 [..............................] - ETA: 24:20 - loss: 0.3106 - categorical_accuracy: 0.8687

  % delta_t_median)


(50000, 2)
              precision    recall  f1-score   support

           0     0.8270    0.7093    0.7637     21967
           1     0.7951    0.8837    0.8371     28033

   micro avg     0.8071    0.8071    0.8071     50000
   macro avg     0.8110    0.7965    0.8004     50000
weighted avg     0.8091    0.8071    0.8048     50000
 samples avg     0.8071    0.8071    0.8071     50000

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 50, 300)      6000000     input_12[0][0]                   
__________________________________________________________________________________________________
input_11 (Inpu

  % delta_t_median)
  % delta_t_median)


   5/4500 [..............................] - ETA: 21:54 - loss: 0.4005 - categorical_accuracy: 0.8140

  % delta_t_median)
  % delta_t_median)




   8/4500 [..............................] - ETA: 22:18 - loss: 0.4036 - categorical_accuracy: 0.8125

  % delta_t_median)


Epoch 3/5
   2/4500 [..............................] - ETA: 16:42 - loss: 0.4228 - categorical_accuracy: 0.7900

  % delta_t_median)
  % delta_t_median)


Epoch 4/5
   3/4500 [..............................] - ETA: 19:37 - loss: 0.3289 - categorical_accuracy: 0.8367

  % delta_t_median)
  % delta_t_median)


   7/4500 [..............................] - ETA: 22:12 - loss: 0.3277 - categorical_accuracy: 0.8557

  % delta_t_median)


  11/4500 [..............................] - ETA: 19:42 - loss: 0.3451 - categorical_accuracy: 0.8445

  % delta_t_median)


Epoch 5/5
   2/4500 [..............................] - ETA: 16:15 - loss: 0.2917 - categorical_accuracy: 0.8900

  % delta_t_median)


   3/4500 [..............................] - ETA: 24:43 - loss: 0.2826 - categorical_accuracy: 0.8900

  % delta_t_median)
  % delta_t_median)


   7/4500 [..............................] - ETA: 22:38 - loss: 0.2850 - categorical_accuracy: 0.8814

  % delta_t_median)


(50000, 2)
              precision    recall  f1-score   support

           0     0.8373    0.7090    0.7678     22093
           1     0.7945    0.8910    0.8400     27907

   micro avg     0.8105    0.8105    0.8105     50000
   macro avg     0.8159    0.8000    0.8039     50000
weighted avg     0.8134    0.8105    0.8081     50000
 samples avg     0.8105    0.8105    0.8105     50000

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 50, 300)      6000000     input_14[0][0]                   
__________________________________________________________________________________________________
input_13 (Inpu

  % delta_t_median)


   5/4500 [..............................] - ETA: 23:16 - loss: 0.3741 - categorical_accuracy: 0.8260

  % delta_t_median)
  % delta_t_median)
  % delta_t_median)
  % delta_t_median)


   6/4500 [..............................] - ETA: 20:57 - loss: 0.3707 - categorical_accuracy: 0.8283

  % delta_t_median)


Epoch 4/5
   2/4500 [..............................] - ETA: 21:17 - loss: 0.2881 - categorical_accuracy: 0.8900

  % delta_t_median)


Epoch 5/5
   2/4500 [..............................] - ETA: 16:50 - loss: 0.2475 - categorical_accuracy: 0.9000

  % delta_t_median)
  % delta_t_median)


   4/4500 [..............................] - ETA: 22:40 - loss: 0.2796 - categorical_accuracy: 0.8800

  % delta_t_median)


   6/4500 [..............................] - ETA: 27:28 - loss: 0.2692 - categorical_accuracy: 0.8867

  % delta_t_median)


   9/4500 [..............................] - ETA: 23:02 - loss: 0.2845 - categorical_accuracy: 0.8778

  % delta_t_median)


  10/4500 [..............................] - ETA: 21:39 - loss: 0.2818 - categorical_accuracy: 0.8800

  % delta_t_median)
  % delta_t_median)


(50000, 2)
              precision    recall  f1-score   support

           0     0.8250    0.7146    0.7659     21934
           1     0.7981    0.8816    0.8377     28066

   micro avg     0.8083    0.8083    0.8083     50000
   macro avg     0.8116    0.7981    0.8018     50000
weighted avg     0.8099    0.8083    0.8062     50000
 samples avg     0.8083    0.8083    0.8083     50000

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 50, 300)      6000000     input_16[0][0]                   
__________________________________________________________________________________________________
input_15 (Inpu

  % delta_t_median)
  % delta_t_median)


   6/4500 [..............................] - ETA: 22:24 - loss: 0.3812 - categorical_accuracy: 0.8167

  % delta_t_median)
  % delta_t_median)


  10/4500 [..............................] - ETA: 19:28 - loss: 0.3758 - categorical_accuracy: 0.822

  % delta_t_median)


Epoch 4/5
   3/4500 [..............................] - ETA: 26:01 - loss: 0.3467 - categorical_accuracy: 0.8533

  % delta_t_median)


Epoch 5/5
   3/4500 [..............................] - ETA: 19:16 - loss: 0.2714 - categorical_accuracy: 0.8900

  % delta_t_median)


   6/4500 [..............................] - ETA: 17:46 - loss: 0.2863 - categorical_accuracy: 0.8817

  % delta_t_median)


(50000, 2)
              precision    recall  f1-score   support

           0     0.8680    0.6762    0.7602     21953
           1     0.7839    0.9195    0.8463     28047

   micro avg     0.8127    0.8127    0.8127     50000
   macro avg     0.8260    0.7979    0.8033     50000
weighted avg     0.8208    0.8127    0.8085     50000
 samples avg     0.8127    0.8127    0.8127     50000

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_18 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 50, 300)      6000000     input_18[0][0]                   
__________________________________________________________________________________________________
input_17 (Inpu

  % delta_t_median)


   5/4500 [..............................] - ETA: 23:08 - loss: 0.3968 - categorical_accuracy: 0.8420

  % delta_t_median)


Epoch 4/5
   1/4500 [..............................] - ETA: 7:47 - loss: 0.2700 - categorical_accuracy: 0.8800

  % delta_t_median)


Epoch 5/5
   2/4500 [..............................] - ETA: 17:11 - loss: 0.3294 - categorical_accuracy: 0.8550

  % delta_t_median)


   5/4500 [..............................] - ETA: 19:47 - loss: 0.3045 - categorical_accuracy: 0.8740

  % delta_t_median)
  % delta_t_median)


   8/4500 [..............................] - ETA: 19:01 - loss: 0.2953 - categorical_accuracy: 0.8762

  % delta_t_median)




In [11]:
# Report mean and standard deviation of every collected per-fold metric.
for report_format, fold_values in (
    ("Precision: %.4f (+/- %.4f)", precisions),
    ("Recall: %.4f (+/- %.4f)", recalls),
    ("F1 Score: %.4f (+/- %.4f)", f1s),
    ("Training time: %.4f (+/- %.4f)", times),
):
    print(report_format % (np.mean(fold_values), np.std(fold_values)))

Precision: 0.8125 (+/- 0.0039)
Recall: 0.8097 (+/- 0.0028)
F1 Score: 0.8073 (+/- 0.0027)
Training time: 4079.3108 (+/- 46.7983)
