In [1]:
import csv, codecs
import gc
import json
import os
import re
import sys

import keras.backend
import numpy as np
import pandas as pd
import tensorflow as tf
import unidecode

from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D, MaxPooling1D
from keras.layers import Bidirectional, GlobalMaxPool1D, Bidirectional
from keras.models import Model
from keras.models import load_model
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Build the TensorFlow session with an explicit device_count (GPU: 1, CPU: 4)
# and register it as the session used by the Keras backend.
config = tf.ConfigProto(device_count=dict(GPU=1, CPU=4))
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [3]:
# Output artifact paths (all under the `ascii-3-model/` directory).
model_json_file = 'ascii-3-model/model-ascii.json'  # model architecture, written as JSON
model_h5_file = 'ascii-3-model/model-ascii.h5'  # model weights, written as HDF5
# NOTE: despite its name this is a file path, not a mapping; the tokenizer's
# `word_index` dict is dumped to it as JSON.
char_to_index = 'ascii-3-model/ascii-char-map.json'

max_features = 200  # passed to Tokenizer as `num_words`
maxlen = 500  # `maxlen` for pad_sequences; also the model's input width
embedding_size = 12  # passed to get_model as `embed_size`
dropout = 0.2  # passed to get_model as `dropout`

Before running the next cell, download `train.csv` from
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
and place it in the notebook's working directory (it is read by relative path).

In [5]:
# Target columns: one binary label per toxicity category.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv('train.csv')

# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train[label_columns],
    test_size=0.10, random_state=42
)

# Run every comment through unidecode before tokenizing.
list_sentences_train = X_train['comment_text'].apply(unidecode.unidecode)
list_sentences_test = X_test['comment_text'].apply(unidecode.unidecode)

# Character-level tokenizer, fitted on the training split only so nothing
# from the validation split influences the vocabulary.
tokenizer = Tokenizer(num_words=max_features,char_level=True)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
# NOTE: from this line on `list_sentences_test` holds integer sequences rather
# than text; the name is kept unchanged so existing references keep working.
list_sentences_test = tokenizer.texts_to_sequences(list_sentences_test)

# Persist the tokenizer's index map. Create the destination directory first so
# a fresh checkout does not fail with FileNotFoundError.
char_map_dir = os.path.dirname(char_to_index)
if char_map_dir and not os.path.isdir(char_map_dir):
    os.makedirs(char_map_dir)
with open(char_to_index, 'w') as f:
    json.dump(tokenizer.word_index, f)
    print('write out tokenizer index')

# Bring every sequence to exactly `maxlen` entries for the fixed-width model input.
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_sentences_test, maxlen=maxlen)

def get_model(embed_size,dropout=0.2):
    """Build and compile the character-level Conv1D + bidirectional-GRU classifier.

    Reads the notebook-level `maxlen` (input width) and `tokenizer`
    (vocabulary size) at call time, so both must be defined beforehand.

    Args:
        embed_size: Output dimension of the embedding layer.
        dropout: Rate used for the GRU's input dropout and for the Dropout
            layer that follows the first Dense layer. The GRU's recurrent
            dropout is fixed at 0.2.

    Returns:
        A compiled Keras `Model` taking `(maxlen,)` sequences and producing six
        sigmoid outputs, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    vocab_size = len(tokenizer.word_index)+1
    sequence_input = Input(shape=(maxlen, ))

    # Layers are listed in the order they are applied.
    stack = [
        Embedding(vocab_size, embed_size),
        Conv1D(filters=100, kernel_size=4, padding='same', activation='relu'),
        MaxPooling1D(pool_size=4),
        # The recurrent cell is a GRU; its 'lstm_layer' name is kept as-is
        # because it is part of the serialized model.
        Bidirectional(GRU(60, return_sequences=True,
                          name='lstm_layer', dropout=dropout,
                          recurrent_dropout=0.2)),
        GlobalMaxPool1D(),
        Dense(50, activation="relu"),
        Dropout(dropout),
        Dense(6, activation="sigmoid"),
    ]

    features = sequence_input
    for layer in stack:
        features = layer(features)

    net = Model(inputs=sequence_input, outputs=features)
    net.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return net


# Instantiate the network from the notebook-level hyperparameters, then print
# its layer-by-layer summary.
model = get_model(embedding_size, dropout)
model.summary()

write out tokenizer index
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 500, 12)           864       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 100)          4900      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 125, 100)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 125, 120)          57960     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)           

In [6]:
# Training hyperparameters.
batch_size = 32
epochs = 6

# Fit on the padded training sequences and validate on the held-out split
# after each epoch. (The original call had an unbalanced trailing ')' that
# made this cell a SyntaxError.)
hist = model.fit(X_t, y_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 validation_data=(X_te, y_test))

Train on 143613 samples, validate on 15958 samples
Epoch 1/1


In [8]:
# Make sure the destination directories exist so a finished training run is
# not lost to a missing-folder error at save time.
for artifact_path in (model_json_file, model_h5_file):
    artifact_dir = os.path.dirname(artifact_path)
    if artifact_dir and not os.path.isdir(artifact_dir):
        os.makedirs(artifact_dir)

# serialize model to JSON
model_json = model.to_json()
with open(model_json_file, "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(model_h5_file)
print("Saved model to disk")

Saved model to disk
