In [1]:
import pandas as pd
import numpy as np
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, BatchNormalization, AveragePooling1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from Attention import *
import h5py

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
data = pd.read_csv('../data/train.csv')

In [3]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
classes = data.columns[2:].values

In [5]:
train_comments = data['comment_text'].fillna('UNK')
labels = data.loc[:, classes].values

In [6]:
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 100

In [7]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(train_comments)

In [8]:
train_sequences = tokenizer.texts_to_sequences(train_comments)
X_train = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [9]:
y_train = data[classes].values

In [10]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 153188 unique tokens.


In [11]:
! ls ../embeddings

glove.840B.300d.txt  glove.840B.300d.zip


In [12]:
embeddings_index = {}
f = open('../embeddings/glove.840B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print("error reading word", word)
f.close()

print('Found %s word vectors.' % len(embeddings_index))

error reading word .
error reading word at
error reading word .
error reading word to
error reading word .
error reading word .
error reading word email
error reading word or
error reading word contact
error reading word Email
error reading word on
error reading word At
error reading word by
error reading word in
error reading word emailing
error reading word Contact
error reading word at
error reading word •
error reading word at
error reading word is
Found 2195884 word vectors.


In [13]:
########################################
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 68661


In [14]:
embedding_matrix.shape

(153189, 300)

In [15]:
embedding_layer = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)

In [16]:
# comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# x = embedding_layer(comment_input)
# x = Bidirectional(LSTM(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# x = AveragePooling1D()(x)
# x = Bidirectional(LSTM(200, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# x = Attention()(x)
# x = Dense(200, activation='relu')(x)
# x = Dropout(0.1)(x)
# x = BatchNormalization()(x)
# x = Dense(50, activation='relu')(x)
# x = Dropout(0.1)(x)
# x = BatchNormalization()(x)
# predictions = Dense(6, activation='sigmoid')(x)

In [17]:
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = embedding_layer(comment_input)
x = Bidirectional(LSTM(300, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Attention()(x)
x = Dense(200, activation='relu')(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
predictions = Dense(6, activation='sigmoid')(x)

In [23]:
model = Model(comment_input, predictions)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [19]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          45956700  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 600)          1442400   
_________________________________________________________________
attention_1 (Attention)      (None, 600)               700       
_________________________________________________________________
dense_1 (Dense)              (None, 200)               120200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 200)               800       
__________

In [22]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_ckpt = ModelCheckpoint(filepath='../models/bstlm_attention.h5', save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=5)
callbacks = [early_stopping, model_ckpt, reduce_lr]

In [24]:
model.fit(X_train, y_train, batch_size=256, epochs=10, validation_split=0.2, callbacks=callbacks)

Train on 76680 samples, validate on 19171 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6abd64aeb8>

In [25]:
test = pd.read_csv('../data/test.csv')

In [26]:
test_comments = test['comment_text'].fillna('UNK')
test_sequences = tokenizer.texts_to_sequences(test_comments)
X_test = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [27]:
model.load_weights('../models/bstlm_attention.h5')

In [28]:
y_pred = model.predict(X_test)

In [30]:
sub = pd.read_csv('../data/sample_submission.csv')

In [37]:
sub.iloc[:, 1:] = y_pred

In [38]:
sub.to_csv('../submissions/bstlm_attention.csv', index=False)

In [40]:
sub.to_csv('bstlm_attention.csv', index=False)

In [39]:
pd.read_csv('../submissions/bstlm_attention.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.000402,2.5e-05,4.4e-05,4.169984e-06,4.1e-05,4e-06
1,6102620,0.000703,1e-05,0.000268,7.16911e-07,4.2e-05,8e-06
2,14563293,0.000136,5e-06,7.7e-05,4.495609e-07,1e-05,3e-06
3,21086297,0.001266,3.8e-05,0.000113,5.858633e-06,0.000108,9e-06
4,22982444,0.000594,3.3e-05,6.3e-05,6.175662e-06,5.7e-05,6e-06
