# Toxic Comment Classification Challenge

## https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [1]:
import numpy as np
import pandas as pd

from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import ModelCheckpoint
from collections import Counter

Using TensorFlow backend.


### Prepare dataset

In [2]:
# Number of rows to load from each CSV; kept small for quick experiments.
# Set to None to load the complete files (read_csv then reads every row).
N_ROWS = 100

# Download from https://www.kaggle.com/c/8076/download/train.csv.zip
# nrows stops parsing after N_ROWS rows instead of loading the whole file and slicing it.
train = pd.read_csv('data/train.csv', nrows=N_ROWS)
# Download from https://www.kaggle.com/c/8076/download/test.csv.zip
test = pd.read_csv('data/test.csv', nrows=N_ROWS)

# Label columns used as the training targets, one column per class.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_train = train[classes].values

# Replace missing comments with a placeholder string so every row holds text.
train_sentences = train.comment_text.fillna('FILLNA').values
test_sentences = test.comment_text.fillna('FILLNA').values

# Fit the tokenizer on the train and test comments together, then convert
# each comment into a sequence of integer token ids.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(np.concatenate((train_sentences, test_sentences), axis=0))
train_tokenized = tokenizer.texts_to_sequences(train_sentences)
test_tokenized = tokenizer.texts_to_sequences(test_sentences)

# Pad every sequence to the length of the longest comment in either split.
maxlen = max(max(len(l) for l in train_tokenized), max(len(l) for l in test_tokenized))
X_train = sequence.pad_sequences(train_tokenized, maxlen=maxlen)
X_test = sequence.pad_sequences(test_tokenized, maxlen=maxlen)

### Hyperparameters

In [3]:
# Every token id that occurs in the train and test sequences, in one flat list.
flat_list = [
    token_id
    for corpus in (train_tokenized, test_tokenized)
    for sentence in corpus
    for token_id in sentence
]
# Number of distinct token ids, plus one.
vocab_size = len(set(flat_list)) + 1

# Settings read by the model-building and training cells.
hypterparameters = dict(
    validation_split=0.1,
    is_verbose=1,
    batch_size=32,
    epochs=10,
    embedding_size=128,
    keep_probability=0.9,
    lstm_size=50,
    dense_size=50,
)

### Build the network

In [4]:
# Dropout layers take the fraction to drop, i.e. the complement of keep_probability.
dropout_rate = 1 - hypterparameters['keep_probability']

# Embedding -> two stacked LSTMs -> dense hidden layer -> one sigmoid output per class.
model = Sequential([
    Embedding(vocab_size, hypterparameters['embedding_size']),
    # The first LSTM returns its full output sequence so the second LSTM can consume it.
    LSTM(hypterparameters['lstm_size'], return_sequences=True),
    LSTM(hypterparameters['lstm_size']),
    Dropout(dropout_rate),
    Dense(hypterparameters['dense_size'], activation='relu'),
    Dropout(dropout_rate),
    Dense(len(classes), activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

### Train the network

In [5]:
# Train on X_train / y_train, holding out the configured validation split and
# writing the model to 'model.ckpt' through the ModelCheckpoint callback.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=hypterparameters['batch_size'],
    epochs=hypterparameters['epochs'],
    verbose=hypterparameters['is_verbose'],
    validation_split=hypterparameters['validation_split'],
    callbacks=[
        ModelCheckpoint(filepath='model.ckpt', verbose=hypterparameters['is_verbose']),
    ],
)

Train on 90 samples, validate on 10 samples
Epoch 1/10
Epoch 00001: saving model to model.ckpt
Epoch 2/10
Epoch 00002: saving model to model.ckpt
Epoch 3/10
Epoch 00003: saving model to model.ckpt
Epoch 4/10
Epoch 00004: saving model to model.ckpt
Epoch 5/10
Epoch 00005: saving model to model.ckpt
Epoch 6/10
Epoch 00006: saving model to model.ckpt
Epoch 7/10
Epoch 00007: saving model to model.ckpt
Epoch 8/10
Epoch 00008: saving model to model.ckpt
Epoch 9/10
Epoch 00009: saving model to model.ckpt
Epoch 10/10
Epoch 00010: saving model to model.ckpt


<keras.callbacks.History at 0x1827790590>

### Prepare submission file

In [6]:
# Download from https://www.kaggle.com/c/8076/download/sample_submission.csv.zip
# Read exactly as many template rows as there are test samples: this avoids
# parsing the whole file just to slice it, and keeps the row count in step
# with the predictions without a separately hard-coded number.
sample_submission = pd.read_csv('data/sample_submission.csv', nrows=len(X_test))
# Overwrite the class columns with the model's predictions.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
sample_submission[classes] = model.predict(X_test)
sample_submission.to_csv('baseline.csv', index=False)