In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import pandas as pd
import numpy as np

from lib.services import config
import lib.embedding_utils as utils

Using TensorFlow backend.


In [None]:
# Ask the Keras TensorFlow backend which GPU devices it can see and display
# the resulting list as the cell output.
gpu_devices = K.tensorflow_backend._get_available_gpus()
gpu_devices

['/job:localhost/replica:0/task:0/device:GPU:0']

https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout

## Load train and test datasets

In [None]:
# Read the training CSV named in the config, then extract the first configured
# feature column as an array, replacing missing values with "_na_".
train_path = config['dataset']['path']['train']
train = pd.read_csv(train_path)

train_comment_column = config['dataset']['features'][0]
train_commnets = train[train_comment_column].fillna("_na_").values

In [None]:
# Read the test CSV named in the config, then extract the first configured
# feature column as an array, replacing missing values with "_na_".
test_path = config['dataset']['path']['test']
test = pd.read_csv(test_path)

test_comment_column = config['dataset']['features'][0]
test_comments = test[test_comment_column].fillna("_na_").values

In [None]:
# Sanity check: show the raw text of the first training comment.
first_train_comment = train_commnets[0]
print("First comment:\n\t", first_train_comment)

In [None]:
# Sanity check: show the raw text of the test comment at index 1.
second_test_comment = test_comments[1]
print("Second comment: ", second_test_comment)

In [None]:
# Target array: the label columns listed in the config, taken from the
# training frame. The bare name on the last line displays it.
label_columns = config['dataset']['labels']
train_labels = train[label_columns].values
train_labels

Create a tokenizer

In [None]:
# Create the Keras tokenizer, with num_words taken from the embedding row
# count in the config, and fit it on the training comments only.
tokenizer_num_words = config['word_embeding']['rows_count']
tokenizer = Tokenizer(num_words=tokenizer_num_words)

training_texts = list(train_commnets)
tokenizer.fit_on_texts(training_texts)

Tokenization converts each comment into a sequence of word indexes. It amounts to:
* Building a vocabulary of the unique words found across all comments.
* Replacing each word in a comment with its index in that vocabulary.
* As a result, each comment is transformed into an array of word indexes.

In [None]:
# Encode both corpora as sequences of word indexes using the tokenizer that
# was fitted on the training comments.
to_sequences = tokenizer.texts_to_sequences
tokenized_train_comments = to_sequences(train_commnets)
tokenized_test_comments = to_sequences(test_comments)

In [None]:
# Show the first encoded sequence from each corpus.
for caption, sequences in (
    ("First train comment sequence:\n\t", tokenized_train_comments),
    ("First test comment sequence:\n\t", tokenized_test_comments),
):
    print(caption, sequences[0])

Next, each comment sequence must be padded with zeros up to a fixed size (longer sequences are truncated to that size). As a result, all comment sequences have the same length.

In [None]:
# Bring every training sequence to the fixed length configured as
# word_embeding.columns_count.
train_sequence_length = config['word_embeding']['columns_count']
train_set = pad_sequences(
    tokenized_train_comments,
    maxlen=train_sequence_length,
)

In [None]:
# Bring every test sequence to the fixed length configured as
# word_embeding.columns_count.
test_sequence_length = config['word_embeding']['columns_count']
test_set = pad_sequences(
    tokenized_test_comments,
    maxlen=test_sequence_length,
)

In [None]:
# Show the first training sequence after padding to the fixed length.
first_padded_sequence = train_set[0]
print("First train fixed size comment sequence:\n\t", first_padded_sequence)

In [None]:
# Build the embedding matrix with the project helper, using the embedding file
# and dimensions from the config and the tokenizer's word -> index mapping.
# NOTE(review): presumably the result has shape (rows_count, columns_count);
# confirm against lib.embedding_utils.
embedding_settings = config['word_embeding']
embedding_matrix = utils.build_embedding_matrix(
    embedding_file=embedding_settings['path'],
    rows_count=embedding_settings['rows_count'],
    columns_count=embedding_settings['columns_count'],
    word_index=tokenizer.word_index,
)

In [None]:
# Report the dimensions of the embedding matrix built above.
embedding_shape = embedding_matrix.shape
print("Word embedding shape: ", embedding_shape)

In [None]:
# Report how many distinct words the tokenizer indexed.
distinct_word_count = len(tokenizer.word_index)
print("Unrepeated words from all comments: ", distinct_word_count)

In [None]:
# Bidirectional-LSTM classifier built with the Keras functional API:
# embedding -> BiLSTM -> global max pooling -> dense -> dropout -> sigmoid.

# One integer word index per position of the padded comment sequence.
model_input = Input(shape=(config['word_embeding']['columns_count'],))

# Embedding layer initialised from the embedding matrix built earlier.
# NOTE(review): columns_count is used both as the input sequence length and as
# the embedding vector size -- confirm that sharing one value is intentional.
x = Embedding(
    config['word_embeding']['rows_count'],
    config['word_embeding']['columns_count'],
    weights=[embedding_matrix]
)(model_input)

# return_sequences=True keeps the per-timestep outputs so that the pooling
# layer below can reduce over the sequence axis.
x = Bidirectional(
    LSTM(
        50,
        return_sequences=True,
        dropout=0.1,
        recurrent_dropout=0.1)
)(x)

x = GlobalMaxPool1D()(x)

x = Dense(
    50,
    activation="relu"
)(x)
x = Dropout(0.1)(x)

# Six independent sigmoid outputs.
# NOTE(review): 6 presumably matches the number of configured label columns
# in config['dataset']['labels'] -- confirm.
x = Dense(
    6,
    activation="sigmoid"
)(x)

model = Model(inputs=model_input, outputs=x)

# Binary cross-entropy is applied to each sigmoid output independently.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()

In [None]:
# Train on the padded training sequences and their labels. The options are
# gathered in one place so they are easy to tune.
fit_options = {
    'batch_size': 32,
    'epochs': 2,
    'validation_split': 0.1,
    'verbose': 1,
}
model.fit(train_set, train_labels, **fit_options)

In [None]:
# Evaluate the model.
# Uses the arrays this notebook actually defines (train_set / train_labels),
# so the cell survives Restart & Run All.
# NOTE(review): this is a training-set score (it also covers the rows held out
# by validation_split), not a held-out estimate; no labelled test data is
# loaded in this notebook as far as is visible here.
loss, accuracy = model.evaluate(train_set, train_labels, verbose=0)

print('Accuracy: %f' % (accuracy*100))

In [None]:
# Score the padded test comments with the trained model; each row of y_test
# holds the model's sigmoid outputs for one comment.
prediction_inputs = [test_set]
y_test = model.predict(
    prediction_inputs,
    batch_size=1024,
    verbose=1,
)