In [77]:
import pandas as pd
import numpy as np
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, BatchNormalization, AveragePooling1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from Attention import *

In [90]:
data = pd.read_csv('../data/train.csv')

In [91]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [92]:
classes = data.columns[2:].values

In [94]:
train_comments = data['comment_text'].fillna('UNK')
labels = data.loc[:, classes].values

In [95]:
MAX_NB_WORDS = 10000
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 100

In [96]:
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(train_comments)

In [97]:
train_sequences = tokenizer.texts_to_sequences(train_comments)
X_train = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [104]:
y_train = data[classes].values

In [98]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 153188 unique tokens.


In [83]:
embeddings_index = {}
f = open('../embeddings/glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400001 word vectors.


In [99]:
########################################
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 398


In [100]:
embedding_matrix.shape

(10000, 50)

In [101]:
embedding_layer = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)

In [102]:
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = embedding_layer(comment_input)
x = Bidirectional(LSTM(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = AveragePooling1D()(x)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Attention()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)
x = BatchNormalization()(x)
predictions = Dense(6, activation='sigmoid')(x)

In [87]:
model = Model(comment_input, predictions)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [88]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      multiple                  465200    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 200)          120800    
_________________________________________________________________
average_pooling1d_2 (Average (None, 50, 200)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 50, 100)           100400    
_________________________________________________________________
attention_2 (Attention)      (None, 100)               150       
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
__________

In [105]:
model.fit(X_train, y_train, batch_size=32, epochs=1, validation_split=0.2)

Train on 76680 samples, validate on 19171 samples
Epoch 1/1
 5984/76680 [=>............................] - ETA: 11:29 - loss: 0.3896 - acc: 0.5348

KeyboardInterrupt: 