In [98]:
import pandas as pd
import numpy as np
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, MaxPooling2D, Conv2D, Reshape, concatenate,\
Embedding, BatchNormalization, Activation, Dropout, Bidirectional, LSTM
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
import tensorflow as tf
from Attention import *

In [6]:
data = pd.read_csv('../data/train.csv')

In [7]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [8]:
classes = data.columns[2:].values

In [9]:
train_comments = data['comment_text'].fillna('UNK')
labels = data.loc[:, classes].values

In [117]:
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 150

In [13]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(train_comments)

In [118]:
train_sequences = tokenizer.texts_to_sequences(train_comments)
X_train = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [15]:
y_train = data[classes].values

In [16]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 153188 unique tokens.


In [17]:
! ls ../embeddings

glove.840B.300d.txt  glove.840B.300d.zip


In [18]:
embeddings_index = {}
f = open('../embeddings/glove.840B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print("error reading word", word)
f.close()

print('Found %s word vectors.' % len(embeddings_index))

error reading word .
error reading word at
error reading word .
error reading word to
error reading word .
error reading word .
error reading word email
error reading word or
error reading word contact
error reading word Email
error reading word on
error reading word At
error reading word by
error reading word in
error reading word emailing
error reading word Contact
error reading word at
error reading word •
error reading word at
error reading word is
Found 2195884 word vectors.


In [119]:
########################################
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 68661


In [120]:
embedding_matrix.shape

(153189, 300)

In [None]:
def conv_block(x, filter_size, sequence_length, embedding_dim):
    x = Conv2D(64, (filter_size, embedding_dim))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    x = MaxPooling2D((sequence_length - filter_size + 1, 1), strides=(1,1))(x)
    x = Flatten()(x)
    return x

In [144]:
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

with tf.device('/gpu:0'):
    x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)(comment_input)
    
with tf.device('/gpu:1'):
    x = Reshape((MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, 1))(x)
    conv1 = conv_block(x, 4, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
    conv2 = conv_block(x, 5, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
    conv3 = conv_block(x, 6, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
    conv4 = conv_block(x, 7, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

    x = concatenate([conv1, conv2, conv3, conv4])

    x = Dense(256)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    
    x = Dense(64)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
    
    predictions = Dense(6, activation='sigmoid')(x)

In [145]:
adam = Adam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=1e-4)

In [146]:
model = Model(comment_input, predictions)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [147]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_21 (Embedding)        (None, 150, 300)     45956700    input_21[0][0]                   
__________________________________________________________________________________________________
reshape_20 (Reshape)            (None, 150, 300, 1)  0           embedding_21[0][0]               
__________________________________________________________________________________________________
conv2d_54 (Conv2D)              (None, 147, 1, 64)   76864       reshape_20[0][0]                 
__________________________________________________________________________________________________
conv2d_55 

In [148]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_ckpt = ModelCheckpoint(filepath='../models/cnn.h5', save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3)
callbacks = [early_stopping, model_ckpt, reduce_lr]

In [149]:
model.fit(X_train, y_train, batch_size=512, epochs=10, validation_split=0.1, callbacks=callbacks)

Train on 86265 samples, validate on 9586 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.callbacks.History at 0x7f0759a85a58>

In [150]:
test = pd.read_csv('../data/test.csv')

In [151]:
test_comments = test['comment_text'].fillna('UNK')
test_sequences = tokenizer.texts_to_sequences(test_comments)
X_test = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [152]:
model.load_weights('../models/cnn.h5')

In [153]:
y_pred = model.predict(X_test)

In [154]:
sub = pd.read_csv('../data/sample_submission.csv')

In [155]:
sub.iloc[:, 1:] = y_pred

In [156]:
sub.to_csv('../submissions/cnn.csv', index=False)

In [157]:
sub.to_csv('cnn.csv', index=False)

In [158]:
pd.read_csv('../submissions/cnn.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.001792,1.8e-05,0.000174,4.2e-05,0.000131,4.8e-05
1,6102620,0.002217,2.4e-05,0.000651,4.5e-05,0.000334,0.000216
2,14563293,0.000448,3.7e-05,0.000355,0.000284,0.000304,0.000301
3,21086297,0.003724,3.9e-05,0.000699,0.000119,0.000415,0.000135
4,22982444,0.00098,8e-06,0.000123,2e-05,9.3e-05,2e-05
