In [3]:
import pandas as pd
import numpy as np

In [4]:
import os
import sys

# Make the sibling ``src`` directory importable (presumably where the local
# ``Attention`` module imported further down lives -- confirm).
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append so re-running this cell does not stack duplicate entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [5]:
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, MaxPooling2D, Conv2D, Reshape, concatenate,\
Embedding, BatchNormalization, Activation, Dropout, Bidirectional, LSTM
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
import tensorflow as tf
from Attention import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [6]:
# Load the labelled training set from the project's data directory.
data = pd.read_csv('../data/train.csv')

In [7]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
# Everything from the third column onward is treated as a target label;
# the first two columns are skipped (the preview shows them as `id` and `comment_text`).
classes = data.columns[2:].values

In [9]:
# Replace missing comments with the placeholder string 'UNK' so the tokenizer only sees text.
train_comments = data['comment_text'].fillna('UNK')

In [10]:
# Vocabulary cap passed to the Tokenizer; also the upper bound on embedding-matrix rows.
MAX_NB_WORDS = 200000
# Width of each word vector; matches the 200-d GloVe file loaded below.
EMBEDDING_DIM = 200
# Fixed token length that every comment sequence is padded to.
MAX_SEQUENCE_LENGTH = 150

In [11]:
# Fit the vocabulary on the training comments only; the same tokenizer is reused for the test set.
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(train_comments)

In [12]:
# Convert each comment to a list of integer token ids, then bring every list
# to a common length of MAX_SEQUENCE_LENGTH.
train_sequences = tokenizer.texts_to_sequences(train_comments)
X_train = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [13]:
# Label matrix: one column per entry in `classes`.
y_train = data[classes].values

In [14]:
# word_index holds every token seen during fitting, so its size can exceed
# MAX_NB_WORDS (the recorded run found 210337 tokens against a cap of 200000).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 210337 unique tokens.


In [15]:
# List the pre-trained embedding files available locally (shell escape).
! ls ../embeddings

glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip
glove.6B.200d.txt  glove.6B.50d.txt


In [16]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
# Each line is expected to be "<word> <v1> <v2> ... <vN>".
embeddings_index = {}
# `with` closes the handle even if parsing raises; the explicit encoding keeps
# the read independent of the platform's default locale.
with open('../embeddings/glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse (previously an uncaught IndexError).
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Only malformed numeric fields are skipped; any other error should surface.
            print("error reading word", word)
            continue
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400001 word vectors.


In [17]:
print('Preparing embedding matrix')

# One row per usable token id, capped at MAX_NB_WORDS. The "+ 1" presumably
# accounts for token ids starting at 1 so that row 0 stays unused -- confirm.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Ids at or beyond the cap have no row in the matrix.
    if i < MAX_NB_WORDS:
        embedding_vector = embeddings_index.get(word)
        # Tokens missing from the pre-trained lookup keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Report how many rows sum to exactly zero (i.e. were never filled).
row_sums = embedding_matrix.sum(axis=1)
print('Null word embeddings: %d' % np.sum(row_sums == 0))

Preparing embedding matrix
Null word embeddings: 107059


In [18]:
# Sanity check: expect (nb_words, EMBEDDING_DIM).
embedding_matrix.shape

(200000, 200)

In [19]:
def conv_block(x, filter_size, sequence_length, embedding_dim):
    """Apply one convolution branch to ``x`` and return a flattened tensor.

    The branch is Conv2D (64 filters, kernel ``filter_size x embedding_dim``)
    -> BatchNormalization -> ReLU -> Dropout(0.2) -> MaxPooling2D -> Flatten.

    Args:
        x: input tensor fed to the Conv2D layer.
        filter_size: kernel height of the convolution.
        sequence_length: used with ``filter_size`` to size the pooling window.
        embedding_dim: kernel width of the convolution.

    Returns:
        The flattened output of the pooling layer.
    """
    # Pool window height is sequence_length - filter_size + 1, so pooling
    # covers the whole first axis left by the convolution (assuming default
    # 'valid' padding and an input of height sequence_length).
    pool_height = sequence_length - filter_size + 1
    pipeline = [
        Conv2D(64, (filter_size, embedding_dim)),
        BatchNormalization(),
        Activation('relu'),
        Dropout(0.2),
        MaxPooling2D((pool_height, 1), strides=(1,1)),
        Flatten(),
    ]
    for layer in pipeline:
        x = layer(x)
    return x

In [1]:
# Integer token ids, one fixed-length sequence per comment.
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Frozen embedding lookup initialised from the pre-built matrix.
embedding_layer = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)
x = embedding_layer(comment_input)

# Add a trailing axis of size 1 so the 2-D convolutions can consume the embeddings.
x = Reshape((MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, 1))(x)

# Three parallel branches that differ only in filter height (4, 5 and 6).
conv1, conv2, conv3 = [
    conv_block(x, filter_size, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
    for filter_size in (4, 5, 6)
]

x = concatenate([conv1, conv2, conv3])

# Dense head: linear -> batch norm -> ReLU -> dropout.
for head_layer in (Dense(100), BatchNormalization(), Activation('relu'), Dropout(0.2)):
    x = head_layer(x)

# Six independent sigmoid outputs.
predictions = Dense(6, activation='sigmoid')(x)

NameError: name 'Input' is not defined

In [2]:
# Wrap the graph into a trainable model. Binary cross-entropy is applied to the
# six sigmoid outputs defined in the previous cell.
model = Model(comment_input, predictions)
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

NameError: name 'Model' is not defined

In [29]:
# Print layer shapes and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 200)     40000000    input_2[0][0]                    
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 150, 200, 1)  0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 147, 1, 64)   51264       reshape_2[0][0]                  
__________________________________________________________________________________________________
conv2d_5 (

In [30]:
# Stop training once validation loss has failed to improve for 5 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# Write a checkpoint to disk, keeping only the best one seen so far.
model_ckpt = ModelCheckpoint(filepath='../models/cnn.h5', save_best_only=True)
# Lower the learning rate after 3 epochs without validation-loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3)
# early_stopping was previously constructed but never registered, so it had no
# effect. It is now included. NOTE(review): with patience=5 it is not expected
# to trigger in a 5-epoch fit; it matters once `epochs` is raised.
callbacks = [model_ckpt, reduce_lr, early_stopping]

In [31]:
# Train for up to 5 epochs; validation_split=0.1 reserves 10% of the rows for
# validation (143613 train / 15958 validation in the recorded run).
model.fit(X_train, y_train, batch_size=256, epochs=5, validation_split=0.1, callbacks=callbacks)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5

KeyboardInterrupt: 

In [150]:
# Load the unlabelled test set.
test = pd.read_csv('../data/test.csv')

In [151]:
# Apply the same preprocessing as for training: fill missing comments with 'UNK',
# reuse the tokenizer fitted on the training comments, and pad to the same length.
test_comments = test['comment_text'].fillna('UNK')
test_sequences = tokenizer.texts_to_sequences(test_comments)
X_test = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [152]:
# Restore weights from the file the ModelCheckpoint callback writes to (same path).
model.load_weights('../models/cnn.h5')

In [153]:
# Predicted scores for every test comment, one column per model output.
y_pred = model.predict(X_test)

In [154]:
# Use the sample submission as a template for the output file.
sub = pd.read_csv('../data/sample_submission.csv')

In [155]:
# Overwrite every column after the first with the predictions, by position.
# NOTE(review): assumes the template's columns after the first are in the same
# order as `classes` -- confirm against the sample submission header.
sub.iloc[:, 1:] = y_pred

In [156]:
# Save the submission without the DataFrame index.
sub.to_csv('../submissions/cnn.csv', index=False)

In [157]:
# Second copy of the same submission, written to the current working directory.
sub.to_csv('cnn.csv', index=False)

In [158]:
# Read the saved file back to verify what was written.
pd.read_csv('../submissions/cnn.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.001792,1.8e-05,0.000174,4.2e-05,0.000131,4.8e-05
1,6102620,0.002217,2.4e-05,0.000651,4.5e-05,0.000334,0.000216
2,14563293,0.000448,3.7e-05,0.000355,0.000284,0.000304,0.000301
3,21086297,0.003724,3.9e-05,0.000699,0.000119,0.000415,0.000135
4,22982444,0.00098,8e-06,0.000123,2e-05,9.3e-05,2e-05
