# Toxic Comment Classification Challenge

Competition page: <https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge>

In [1]:
import numpy as np
import pandas as pd

from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.preprocessing import text, sequence
from keras import regularizers

from collections import Counter

Using TensorFlow backend.


### Prepare dataset

In [2]:
# Download from https://www.kaggle.com/c/8076/download/train.csv.zip
train = pd.read_csv('data/train.csv')
# read_csv turns empty fields (and literal texts such as "NA" or "null") into
# float NaN. The Keras tokenizer calls string methods on every text and fails
# on a float, so missing comments are replaced with the empty string.
train_sentences = train.comment_text.fillna('').values

# Download from https://www.kaggle.com/c/8076/download/test.csv.zip
test = pd.read_csv('data/test.csv')
test_sentences = test.comment_text.fillna('').values

# Names of the label columns in train.csv that are used as prediction targets.
CLASSES = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# A single vocabulary is fitted on train and test together, so every token
# that occurs in either split receives an index.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(np.concatenate((train_sentences, test_sentences), axis=0))
train_tokenized = tokenizer.texts_to_sequences(train_sentences)
test_tokenized = tokenizer.texts_to_sequences(test_sentences)

# Every sequence is padded to the longest tokenized comment of either split.
# NOTE(review): one very long comment therefore sets the width of both
# matrices; consider capping maxlen if memory or training time is a problem.
maxlen = max(max(len(l) for l in train_tokenized), max(len(l) for l in test_tokenized))

# pad_sequences returns one fixed-width integer row per comment.
X_train = sequence.pad_sequences(train_tokenized, maxlen=maxlen)
y_train = train[CLASSES].values
X_test = sequence.pad_sequences(test_tokenized, maxlen=maxlen)

### Hyperparameters

In [3]:
# Input dimension of the embedding: the number of distinct token ids found in
# the tokenized train and test comments, plus one.
# NOTE(review): the extra slot presumably accounts for index 0, the default
# fill value of pad_sequences -- confirm.
vocab_size = 1 + len({
    token_id
    for tokenized_split in (train_tokenized, test_tokenized)
    for comment_ids in tokenized_split
    for token_id in comment_ids
})

# Every tunable value of the notebook lives in this one mapping.
hyper_params = dict(
    # Regularisation strengths handed to keras.regularizers.l1 / l2.
    l1_regularization=0.01,
    l2_regularization=0.01,
    # Arguments forwarded to model.fit.
    validation_split=0.1,
    batch_size=32,
    epochs=3,
    # Layer sizes.
    embedding_size=128,
    # The Dropout layers use 1 - keep_probability as their drop rate.
    keep_probability=0.9,
    lstm_size=50,
    dense_size=50,
)

### Build the network

In [4]:
# Multi-label classifier: embedding -> LSTM -> LSTM -> dense -> sigmoid,
# with dropout after the recurrent stack and after the hidden dense layer.
model = Sequential()

# The tokenizer never assigns index 0 to a word and pad_sequences fills with 0
# by default, so 0 only ever means "padding". mask_zero=True makes the LSTMs
# skip those timesteps instead of processing the padding of every comment
# that is shorter than the longest one.
model.add(Embedding(vocab_size, hyper_params['embedding_size'], mask_zero=True))
# First LSTM returns the full sequence so the second LSTM can consume it;
# the second one returns only its final output.
model.add(LSTM(hyper_params['lstm_size'], return_sequences=True))
model.add(LSTM(hyper_params['lstm_size']))
model.add(Dropout(1 - hyper_params['keep_probability']))
model.add(Dense(hyper_params['dense_size'], activation='relu', 
                kernel_regularizer=regularizers.l2(hyper_params['l2_regularization']), 
                activity_regularizer=regularizers.l1(hyper_params['l1_regularization'])))
model.add(Dropout(1 - hyper_params['keep_probability']))
# One sigmoid unit per label. Only the weights are regularised here: an L1
# activity penalty on this layer would penalise the predicted probabilities
# themselves and pull every prediction toward 0, working against the
# cross-entropy objective.
model.add(Dense(len(CLASSES), activation='sigmoid', 
                kernel_regularizer=regularizers.l2(hyper_params['l2_regularization'])))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

### Train the network

In [None]:
# Train on the padded training matrix. validation_split holds out that
# fraction of the training rows for validation; the returned History object
# is kept so the per-epoch metrics can be inspected afterwards.
history = model.fit(
    X_train,
    y_train,
    validation_split=hyper_params['validation_split'],
    epochs=hyper_params['epochs'],
    batch_size=hyper_params['batch_size'],
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


### Prepare submission file

In [None]:
# Download from https://www.kaggle.com/c/8076/download/sample_submission.csv.zip
# The sample file serves as the template: its label columns are overwritten
# with the model's predictions and the result is saved as the submission.
# NOTE(review): this relies on the sample file listing the comments in the
# same order as data/test.csv -- confirm.
sample_submission = pd.read_csv('data/sample_submission.csv')
test_predictions = model.predict(X_test)
sample_submission[CLASSES] = test_predictions
sample_submission.to_csv('submission.csv', index=False)