In [1]:
#import needed libraries
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Embedding, Input, Flatten
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, AveragePooling1D, AveragePooling2D
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.cross_validation import train_test_split


Using TensorFlow backend.


In [2]:
#read the input dataset
df = pd.read_csv("./train.csv")
#split the data into train and test sets
train, test = train_test_split(df, test_size=0.2)

#Replace all blank comments with text in training set
#extract training comments 
comments_train = train["comment_text"].fillna("cbarcelon").values
#extract the toxciity ratings
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_ratings = train[classes].values
test_ratings = test[classes].values
#extract test comments
comments_test = test["comment_text"].fillna("cbarcelon").values

#tokenizer the text
#vectorize text
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(comments_train))
tokenized_comments_train = tokenizer.texts_to_sequences(comments_train)
tokenized_comments_test = tokenizer.texts_to_sequences(comments_test)
#pad the text so each comment is uniform in length
X_train = sequence.pad_sequences(tokenized_comments_train, maxlen=100, truncating='post')
X_test = sequence.pad_sequences(tokenized_comments_test, maxlen=100,  truncating='post')

In [32]:
#define LSTM sequential model
lstm = Sequential()
lstm.add(Embedding(134323, output_dim=512, input_length=100))
lstm.add(Bidirectional(LSTM(150, return_sequences=True), merge_mode='sum'))
lstm.add(AveragePooling1D())
lstm.add(Bidirectional(LSTM(150, return_sequences=True), merge_mode='sum'))
lstm.add(AveragePooling1D())
lstm.add(Bidirectional(LSTM(150, return_sequences=True), merge_mode='sum'))
#lstm.add(AveragePooling1D())
lstm.add(Flatten())
lstm.add(Dropout(.5))
lstm.add(Dense(500, activation='relu'))
lstm.add(Dropout(.5))
lstm.add(Dense(6, activation = "sigmoid"))

lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 100, 512)          68773376  
_________________________________________________________________
bidirectional_22 (Bidirectio (None, 100, 150)          795600    
_________________________________________________________________
average_pooling1d_19 (Averag (None, 50, 150)           0         
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 50, 150)           361200    
_________________________________________________________________
average_pooling1d_20 (Averag (None, 25, 150)           0         
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 25, 150)           361200    
_________________________________________________________________
flatten_6 (Flatten)          (None, 3750)              0         
__________

In [33]:
#compile the model
lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
#create checkpoint file
file_path = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

#early stop checkpoint
early = EarlyStopping(monitor='val_loss', mode='min', patience=10)
callbacks_list = [checkpoint, early] 

In [35]:
#train/fit the model
lstm.fit(X_train, train_ratings, batch_size=50, epochs=4, validation_split=0.1, callbacks=callbacks_list)

Train on 69012 samples, validate on 7668 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f5fe8a4bf60>

In [36]:
#load the best weights
lstm.load_weights(file_path)

#make predictions on test set
pred = lstm.predict(X_test)

In [37]:
#log loss score function
from sklearn.metrics import log_loss
def calc_loss(y_true, y_pred):
    return np.mean([log_loss(y_true[:, i], y_pred[:, i]) 
                    for i in range(y_true.shape[1])])

In [38]:
score = calc_loss(test_ratings, pred)
print(score)

0.0520870097858


In [5]:
print(X_train[2])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  478   37 5892  663   10 4526 6989]


In [20]:
print(len(tokenizer.word_counts))

134323
