In [2]:
import numpy as np
import pandas as pd
from subprocess import check_output
print(check_output(["ls", "./"]).decode("utf8"))

sample_submission.csv
Toxic Benchmark-checkpoint.ipynb
Toxic_Comments_Capstone.ipynb
Toxic_Comments_Sequential_LSTM.ipynb
Toxic_self_benchmarking.ipynb
train.csv
weights_base.best.hdf5



In [3]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.cross_validation import train_test_split

max_features = 30000
maxlen = 200

df = pd.read_csv("./train.csv")

train, test = train_test_split(df, test_size=0.2)

list_sentences_train = train["comment_text"].fillna("cbarcelon").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("cbarcelon").values

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [4]:
def get_model():
    embed_size = 512
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(200, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.7)(x)
    x = Dense(200, activation="relu")(x)
    x = Dropout(0.7)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [6]:
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 512)          15360000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 400)          1140800   
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 400)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               80200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
__________

In [26]:
model = get_model()
batch_size = 500
epochs = 10


file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

early = EarlyStopping(monitor="val_loss", mode="min", patience=20)


callbacks_list = [checkpoint, early] #early
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)


Train on 69012 samples, validate on 7668 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fecb01e7e10>

In [27]:
model.load_weights(file_path)
y_test = model.predict(X_te)

In [28]:
from sklearn.metrics import log_loss
def calc_loss(y_true, y_pred):
    return np.mean([log_loss(y_true[:, i], y_pred[:, i]) 
                    for i in range(y_true.shape[1])])

In [7]:
print(y_test.shape)

(19171, 6)


In [8]:
print(test.shape)
test.head()

(19171, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
95258,993803407001,"Maybe he should leave e.g. to FC Mars, but I'm...",1,0,1,0,0,0
36738,383075159232,Are you defending Peter or not? Since your co...,0,0,0,0,0,0
92937,969939490051,"""\n\n WOW!!!! You edited the comment making me...",1,0,0,0,0,0
64725,674422717410,"""\nI think the location is fine. The material ...",0,0,0,0,0,0
39293,410340345798,Fuck off and die you old timer,1,1,1,0,1,0


In [23]:
score = calc_loss(test[list_classes].values, y_test)

In [29]:
#score after 1 epoch
print(score)


0.0548385984347


In [24]:
#score after 4 epochs
print(score)

0.0548385984347


In [11]:
model.summary

<bound method Container.summary of <keras.engine.training.Model object at 0x7f95f8817c18>>

In [11]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 256)          5120000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 100)          122800    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________