In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
import sys
import re
import string
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, confusion_matrix
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, MaxPooling2D, Conv2D, Reshape, concatenate,\
Embedding, BatchNormalization, Activation, Dropout, Bidirectional, LSTM, GRU
from keras.models import Model
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
import tensorflow as tf
from AttentionWithContext import AttentionWithContext

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [4]:
data = pd.read_csv('../data/train.csv')

In [5]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
classes = ['toxic', 'severe_toxic',
           'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
def clean(s):
    pattern = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return pattern.sub(r' \1 ', s)

In [8]:
comments = data['comment_text'].fillna('UNK').apply(clean).values
y = data[classes].values

In [9]:
train_comments, val_comments, y_train, y_val = train_test_split(comments, y, test_size=0.1)

In [10]:
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 150

In [11]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(train_comments)

In [12]:
train_sequences = tokenizer.texts_to_sequences(train_comments)
val_sequences   = tokenizer.texts_to_sequences(val_comments)
X_train         = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)
X_val           = pad_sequences(val_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [13]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 178237 unique tokens.


In [14]:
! ls ../embeddings

glove.840B.300d.txt  glove.840B.300d.zip


In [15]:
embeddings_index = {}
f = open('../embeddings/glove.840B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print("error reading word", word)
f.close()

print('Found %s word vectors.' % len(embeddings_index))

error reading word .
error reading word at
error reading word .
error reading word to
error reading word .
error reading word .
error reading word email
error reading word or
error reading word contact
error reading word Email
error reading word on
error reading word At
error reading word by
error reading word in
error reading word emailing
error reading word Contact
error reading word at
error reading word •
error reading word at
error reading word is
Found 2195884 word vectors.


In [16]:
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 814


In [17]:
embedding_matrix.shape

(20000, 300)

In [22]:
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)(comment_input)
x = Bidirectional(LSTM(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = AttentionWithContext()(x)

x = Dense(100, activation='relu')(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)

# x = Dense(64, activation='relu')(x)
# x = Dropout(0.3)(x)
# x = BatchNormalization()(x)

predictions = Dense(6, activation='sigmoid')(x)

In [23]:
model = Model(comment_input, predictions)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [24]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 150, 300)          6000000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 150, 200)          320800    
_________________________________________________________________
attention_with_context_3 (At (None, 200)               40400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
__________

In [25]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_ckpt = ModelCheckpoint(filepath='../models/blstm_att_context.h5', save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3)
callbacks = [model_ckpt, reduce_lr, early_stopping]

In [26]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=256, epochs=10, callbacks=callbacks) 

Train on 143613 samples, validate on 15958 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2280375208>

In [27]:
model.load_weights('../models/blstm_att_context.h5')

In [28]:
y_pred_val = model.predict(X_val, verbose=1)



In [29]:
y_pred_val.shape

(15958, 6)

In [30]:
log_loss(y_val[:, 0], y_pred_val[:, 0])

0.09389108551392919

In [31]:
y_pred_val[:, 0] = y_pred_val[:, 0]

In [66]:
val_df = pd.DataFrame()
for i, col in enumerate(classes):
    loss = log_loss(y_val[:, i], y_pred_val[:, i])
    auc = roc_auc_score(y_val[:, i], y_pred_val[:, i])
    acc = accuracy_score(y_val[:, i], (y_pred_val[:, i] > .5).astype(int))
    val_df = val_df.append({'class': col, 'auc': auc, 'loss': loss, 'acc': acc}, ignore_index=True)
    print('{:15} log_loss: {:.2f} auc: {:.2f} acc: {:.2f}'.format(col, loss, auc, acc))
    print()
    print(confusion_matrix(y_val[:, i], (y_pred_val[:, i] > .5).astype(int)))
    print()
val_df = val_df.set_index('class')

toxic           log_loss: 0.09 auc: 0.98 acc: 0.97

[[14245   229]
 [  293  1191]]

severe_toxic    log_loss: 0.02 auc: 0.98 acc: 0.99

[[15780    26]
 [  103    49]]

obscene         log_loss: 0.05 auc: 0.99 acc: 0.98

[[14995   146]
 [  119   698]]

threat          log_loss: 0.01 auc: 0.97 acc: 1.00

[[15904     1]
 [   48     5]]

insult          log_loss: 0.06 auc: 0.99 acc: 0.97

[[14962   234]
 [  165   597]]

identity_hate   log_loss: 0.02 auc: 0.98 acc: 0.99

[[15787    16]
 [   98    57]]



In [68]:
val_df.mean()

acc     0.984564
auc     0.980920
loss    0.042378
dtype: float64

In [69]:
test = pd.read_csv('../data/test.csv')

In [70]:
test_comments = test['comment_text'].fillna('UNK')
test_sequences = tokenizer.texts_to_sequences(test_comments)
X_test = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [71]:
model.load_weights('../models/blstm_att_context.h5')

In [72]:
y_pred = model.predict(X_test, verbose=1)



In [73]:
sub = pd.read_csv('../data/sample_submission.csv')

In [74]:
sub.iloc[:, 1:] = y_pred

In [75]:
sub.to_csv('../submissions/blstm_att_context.csv', index=False)

In [76]:
pd.read_csv('../submissions/blstm_att_context.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998951,0.318161,0.962042,0.01398098,0.926105,0.5011047
1,0000247867823ef7,0.000131,3.353382e-07,6e-06,3.450277e-08,6e-06,3.80984e-07
2,00013b17ad220c46,0.00041,9.019499e-06,0.00025,2.879248e-06,0.000231,1.005919e-05
3,00017563c3f7919a,0.00014,1.675939e-06,3.8e-05,2.664927e-06,4.3e-05,1.957815e-06
4,00017695ad8997eb,0.01374,0.0001591392,0.004634,0.0001108757,0.002463,0.0001271297
