In [1]:
import json
import time

import h5py
import numpy as np
import pandas as pd
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Concatenate, Bidirectional
from keras.models import Model, Sequential
from keras.metrics import categorical_accuracy
from keras.metrics import binary_accuracy
from keras.utils import to_categorical
from keras.optimizers import Adam
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import KFold

Using TensorFlow backend.


In [2]:
# --- Containers filled by later cells --------------------------------------
texts = []            # raw text of every document
texts_valid = []      # not referenced in the visible cells
labels_index = {}     # label value -> integer id, assigned in order of first appearance
labels = []           # one set of label ids per document
labels_valid = []     # not referenced in the visible cells
word_index = {}       # replaced by the tokenizer vocabulary further down
mlb = MultiLabelBinarizer()

# --- Data locations and model dimensions ------------------------------------
max_nb_words = 20000 # vocabulary size
#h5path = '/media/ssd/mydataset.hdf5'
h5path = '/media/ssd/mydataset-preprocessed.hdf5'
embedding_size = 1024      # width of the pre-computed vectors read from h5path
max_sequence_length = 50   # tokens kept per document (padding / truncation length)

# --- Reproducibility ---------------------------------------------------------
# KFold(shuffle=True) without a random_state draws different folds on every run,
# and the batch generator shuffles with the global NumPy RNG; seed both so the
# fold assignment and batch order can be reproduced.
# NOTE(review): Keras/TensorFlow weight initialisation is not seeded here.
random_seed = 42
np.random.seed(random_seed)
kfold = KFold(n_splits=10, shuffle=True, random_state=random_seed)

In [4]:
# Tab-separated input without a header row. Four columns are named, but only
# 'label' and 'text' are actually loaded.
dataset_file = "../data/train-preprocessed.tsv"
#dataset_file="../data/train.tsv"
dataset_sample = pd.read_csv(
    dataset_file,
    sep='\t',
    header=None,
    names=['user', 'label', 'none', 'text'],
    usecols=['label', 'text'],
)

In [5]:
# Prepare training data.
# Both lists are rebuilt from scratch so that re-running this cell does not
# append the whole corpus a second time. labels_index is reused on purpose:
# a label value that already has an id keeps it.
texts = dataset_sample["text"].tolist()
labels = []
# Iterate the label column directly instead of DataFrame.iterrows(), which
# builds a Series object for every row.
for field in dataset_sample["label"]:
    # Assign the next free integer id the first time a label value is seen.
    if field not in labels_index:
        labels_index[field] = len(labels_index)
    # One set of ids per document; each document contributes a single label.
    labels.append({labels_index[field]})

In [6]:
# Fit the tokenizer on the whole corpus and map every text to a list of word ids
tokenizer = Tokenizer(num_words=max_nb_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Vocabulary restricted to the entries whose id is below max_nb_words
# (the unrestricted mapping is tokenizer.word_index)
word_index = dict(
    entry for entry in tokenizer.word_index.items() if entry[1] < max_nb_words
)
print("Found %s unique tokens." % len(word_index))

# Pad short sequences and cut long ones, both at the end, to max_sequence_length
data = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding="post",
    truncating="post",
)
print("Shape of data tensor:", data.shape)

# Binary indicator matrix with one column per label id
labels_cat = mlb.fit_transform(labels)
print("Shape of label tensor:", labels_cat.shape)


Found 19999 unique tokens.
Shape of data tensor: (500000, 50)
Shape of label tensor: (500000, 2)


# Embeddings

## ELMo

In [8]:
def gen_elmo_tweets_embeddings(h5path, indices, batchSize, shuffle, data):
    """Endlessly yield ``([embeddings, token_ids], labels)`` batches.

    Parameters
    ----------
    h5path : str
        HDF5 file holding a 'mydataset' dataset (indexed here along three
        axes) and a 'labels' dataset (indexed along two axes).
    indices : sequence of int
        Row numbers this generator may draw from. The caller's sequence is
        never modified.
    batchSize : int
        Maximum rows per batch; the last batch of a pass may be smaller.
    shuffle : bool
        If true, the row order is reshuffled with ``np.random.shuffle`` at the
        start of every pass.
    data : array-like
        Token-id matrix, indexed with the same row numbers as the HDF5 file.

    Yields
    ------
    ([bx, data_rows], by)
        ``bx`` is 'mydataset' cut to the first ``max_sequence_length``
        positions of axis 1, ``data_rows`` the matching rows of ``data`` and
        ``by`` the matching rows of 'labels'. Rows inside a batch are in
        increasing index order.

    Raises
    ------
    ValueError
        If ``indices`` is empty (the loop would otherwise spin forever
        without yielding anything).
    """
    if len(indices) == 0:
        raise ValueError("indices must contain at least one row number")

    # Private copy: shuffling and sorting below must not reorder the caller's
    # train/test index arrays.
    indices = np.array(indices, copy=True)

    # The context manager closes the file when the generator is closed or
    # garbage-collected, instead of leaking the handle.
    with h5py.File(h5path, "r") as db:
        embeddings = db['mydataset']
        targets = db["labels"]
        while True:
            if shuffle:
                np.random.shuffle(indices)
            for start in range(0, len(indices), batchSize):
                # h5py index-list selection expects increasing indices, so
                # each batch is sorted (as a new array, not in place).
                batch_indices = np.sort(indices[start:start + batchSize])

                bx = embeddings[batch_indices, :max_sequence_length, :]
                by = targets[batch_indices, :]

                yield ([bx, data[batch_indices]], by)

# BiLSTM

## Randomly initialised embeddings + ELMo

In [23]:
def _build_random_elmo_bilstm():
    """Build and compile the two-input BiLSTM classifier trained in each fold.

    Inputs, in the order the batch generator supplies them:
      1. pre-computed vectors of shape (max_sequence_length, embedding_size);
      2. token ids of shape (max_sequence_length,), looked up in a trainable,
         randomly initialised 300-dimensional embedding table.
    Both representations are concatenated per position, fed to a bidirectional
    LSTM with 50 units per direction, and classified by a softmax layer with
    one output per entry of labels_index.
    """
    embedded_sequences = Input(shape=(max_sequence_length, embedding_size,), dtype="float32")

    sequence_input = Input(shape=(max_sequence_length,), dtype="int32")
    modelRandom = Embedding(len(word_index)+1, 300, embeddings_initializer="normal",
                            input_length=max_sequence_length, trainable=True)(sequence_input)

    modelMergeEmbeddings = Concatenate()([modelRandom, embedded_sequences])
    x = Bidirectional(LSTM(50))(modelMergeEmbeddings)
    preds = Dense(len(labels_index), activation="softmax")(x)

    model = Model([embedded_sequences, sequence_input], preds)
    # NOTE(review): softmax is paired with binary_crossentropy. For a 2-way
    # softmax on one-hot targets this equals categorical_crossentropy, so it is
    # left unchanged; with more than two classes categorical_crossentropy (or
    # sigmoid outputs for multi-label targets) would be the consistent choice.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=[categorical_accuracy])
    return model


def _n_batches(n_samples, batch_size):
    """Batches needed to cover n_samples, counting a final partial batch."""
    return -(-n_samples // batch_size)


precisions = []
recalls = []
f1s = []
times = []   # wall-clock seconds spent in fit_generator, one entry per fold
datasize = 500000   # not referenced in the visible cells
batchSize=100
for train, test in kfold.split(data, labels_cat):
    # Drop the previous fold's graph; otherwise every fold adds another model
    # to the same TensorFlow graph and memory grows across the 10 folds.
    K.clear_session()

    # Define, train and validate the neural network model
    model = _build_random_elmo_bilstm()
    model.summary()   # summary() prints by itself; wrapping it in print() adds "None"

    train_steps = _n_batches(len(train), batchSize)
    test_steps = _n_batches(len(test), batchSize)

    t0 = time.time()
    model.fit_generator(
        gen_elmo_tweets_embeddings(h5path, train, batchSize=batchSize, shuffle=True, data=data),
        validation_data=gen_elmo_tweets_embeddings(h5path, test, batchSize=batchSize, shuffle=False, data=data),
        steps_per_epoch=train_steps,
        validation_steps=test_steps,
        epochs=5)
    t1 = time.time()
    times.append(t1-t0)

    # Predict every held-out row. Rounding the step count up keeps the final
    # partial batch, so pred lines up with labels_cat[test] row for row.
    # NOTE(review): training targets come from the HDF5 'labels' dataset while
    # scoring uses labels_cat -- confirm both use the same column order.
    pred = model.predict_generator(
        gen_elmo_tweets_embeddings(h5path, test, batchSize=batchSize, shuffle=False, data=data),
        steps=test_steps)
    if pred.shape[0] != len(test):
        raise ValueError("got %d predictions for %d test rows" % (pred.shape[0], len(test)))

    # Evaluate the model assigning zeros and ones according to a threshold
    pred = (pred >= 0.5).astype(int)
    print(classification_report(labels_cat[test], pred))
    precisions.append(precision_score(labels_cat[test], pred, average="weighted"))
    recalls.append(recall_score(labels_cat[test], pred, average="weighted"))
    f1s.append(f1_score(labels_cat[test], pred, average="weighted"))
print("Precision: %.4f (+/- %.4f)" % (np.mean(precisions), np.std(precisions)))
print("Recall: %.4f (+/- %.4f)" % (np.mean(recalls), np.std(recalls)))
print("F1 Score: %.4f (+/- %.4f)" % (np.mean(f1s), np.std(f1s)))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_58 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_39 (Embedding)        (None, 50, 300)      6000000     input_58[0][0]                   
__________________________________________________________________________________________________
input_57 (InputLayer)           (None, 50, 1024)     0                                            
__________________________________________________________________________________________________
concatenate_29 (Concatenate)    (None, 50, 1324)     0           embedding_39[0][0]               
                                                                 input_57[0][0]                   
__________

In [24]:
# Mean and standard deviation of each weighted score across the folds
for _template, _fold_scores in (
    ("Precision: %.4f (+/- %.4f)", precisions),
    ("Recall: %.4f (+/- %.4f)", recalls),
    ("F1 Score: %.4f (+/- %.4f)", f1s),
):
    print(_template % (np.mean(_fold_scores), np.std(_fold_scores)))

Precision: 0.8095 (+/- 0.0028)
Recall: 0.8088 (+/- 0.0022)
F1 Score: 0.8074 (+/- 0.0020)


In [25]:
# Mean and standard deviation of the per-fold training time, in seconds
_time_stats = (np.mean(times), np.std(times))
print("Time: %.4f (+/- %.4f)" % _time_stats)

Time: 3863.7398 (+/- 39.0300)
