In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
import sys

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, confusion_matrix
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, MaxPooling2D, Conv2D, Reshape, concatenate,\
Embedding, BatchNormalization, Activation, Dropout
from keras.models import Model
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [4]:
data = pd.read_csv('../data/train.csv')

In [5]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
classes = ['toxic', 'severe_toxic',
           'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
comments = data['comment_text'].fillna('UNK').values
y = data[classes].values

In [8]:
train_comments, val_comments, y_train, y_val = train_test_split(comments, y, test_size=0.2)

In [9]:
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 150

In [10]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(train_comments)

In [11]:
train_sequences = tokenizer.texts_to_sequences(train_comments)
val_sequences   = tokenizer.texts_to_sequences(val_comments)
X_train         = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)
X_val           = pad_sequences(val_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [12]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 183447 unique tokens.


In [13]:
! ls ../embeddings

glove.840B.300d.txt  glove.840B.300d.zip


In [14]:
embeddings_index = {}
f = open('../embeddings/glove.840B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print("error reading word", word)
f.close()

print('Found %s word vectors.' % len(embeddings_index))

error reading word .
error reading word at
error reading word .
error reading word to
error reading word .
error reading word .
error reading word email
error reading word or
error reading word contact
error reading word Email
error reading word on
error reading word At
error reading word by
error reading word in
error reading word emailing
error reading word Contact
error reading word at
error reading word •
error reading word at
error reading word is
Found 2195884 word vectors.


In [15]:
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 88153


In [16]:
embedding_matrix.shape

(183448, 300)

In [17]:
def conv_block(x, filter_size, sequence_length, embedding_dim):
    x = Conv2D(64, (filter_size, embedding_dim))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    x = MaxPooling2D((sequence_length - filter_size + 1, 1), strides=(1,1))(x)
    x = Flatten()(x)
    return x

In [18]:
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)(comment_input)

x = Reshape((MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, 1))(x)
conv1 = conv_block(x, 4, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
conv2 = conv_block(x, 5, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
conv3 = conv_block(x, 6, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

x = concatenate([conv1, conv2, conv3])

x = Dense(100)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.2)(x)

# x = Dense(64)(x)
# x = BatchNormalization()(x)
# x = Activation('relu')(x)
# x = Dropout(0.2)(x)

predictions = Dense(6, activation='sigmoid')(x)

In [19]:
model = Model(comment_input, predictions)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [20]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     55034400    input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 150, 300, 1)  0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 147, 1, 64)   76864       reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_2 (

In [21]:
#early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_ckpt = ModelCheckpoint(filepath='../models/cnn.h5', save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3)
callbacks = [model_ckpt, reduce_lr]

In [22]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=256,
          epochs=5, callbacks=callbacks) 

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6cfdc94cf8>

In [23]:
y_pred_val = model.predict(X_val, verbose=1)



In [24]:
y_pred_val.shape

(31915, 6)

In [25]:
val_df = pd.DataFrame()
for i, col in enumerate(classes):
    loss = log_loss(y_val[:, i], y_pred_val[:, i])
    auc = roc_auc_score(y_val[:, i], y_pred_val[:, i])
    acc = accuracy_score(y_val[:, i], (y_pred_val[:, i] > .5).astype(int))
    val_df = val_df.append({'class': col, 'auc': auc, 'loss': loss, 'acc': acc}, ignore_index=True)
    print('{:15} log_loss: {:.2f} auc: {:.2f} acc: {:.2f}'.format(col, loss, auc, acc))
    print()
    print(confusion_matrix(y_val[:, i], (y_pred_val[:, i] > .5).astype(int)))
    print()
val_df = val_df.set_index('class')

toxic           log_loss: 0.10 auc: 0.98 acc: 0.96

[[28656   215]
 [ 1000  2044]]

severe_toxic    log_loss: 0.03 auc: 0.99 acc: 0.99

[[31544    49]
 [  246    76]]

obscene         log_loss: 0.05 auc: 0.99 acc: 0.98

[[30121   126]
 [  490  1178]]

threat          log_loss: 0.01 auc: 0.98 acc: 1.00

[[31818    16]
 [   56    25]]

insult          log_loss: 0.08 auc: 0.98 acc: 0.97

[[30135   170]
 [  765   845]]

identity_hate   log_loss: 0.03 auc: 0.98 acc: 0.99

[[31613    29]
 [  212    61]]



In [26]:
val_df.mean()

acc     0.982380
auc     0.983347
loss    0.049142
dtype: float64

In [27]:
test = pd.read_csv('../data/test.csv')

In [28]:
test_comments = test['comment_text'].fillna('UNK')
test_sequences = tokenizer.texts_to_sequences(test_comments)
X_test = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [29]:
model.load_weights('../models/cnn.h5')

In [30]:
y_pred = model.predict(X_test, verbose=1)



In [111]:
sub = pd.read_csv('../data/sample_submission.csv')

In [112]:
sub.iloc[:, 1:] = y_pred

In [113]:
sub.to_csv('../submissions/cnn_300.csv', index=False)

In [114]:
sub.to_csv('cnn.csv', index=False)

In [115]:
pd.read_csv('../submissions/cnn_300.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.899014,0.06560973,0.636133,0.01711239,0.498962,0.04544874
1,0000247867823ef7,0.005194,8.586932e-08,0.000779,9.83027e-09,0.000243,4.422942e-06
2,00013b17ad220c46,0.000398,6.701796e-09,0.000135,3.856752e-10,2.6e-05,3.213974e-07
3,00017563c3f7919a,0.000285,6.2189e-11,1.7e-05,1.383176e-08,1.4e-05,1.032655e-08
4,00017695ad8997eb,0.004565,6.080754e-07,0.000887,1.510305e-06,0.000159,8.54721e-06
