In [1]:
import numpy as np
import pandas as pd

In [2]:
#read train and test data

In [3]:
# Load the training comments (with label columns) and the test comments.
train_df = pd.read_csv('./Data/train.csv')
test_df = pd.read_csv('./Data/test.csv')
# Show (rows, columns) of each frame, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(159571, 8)
(153164, 2)


In [4]:
FASTTEXT_EMBEDFILE = './fasttext_wordvec/wiki.en.vec'

In [5]:
GLOVE_EMBEDFILE = './fasttext_wordvec/glove.840B.300d.txt'

In [6]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
train_df = train_df.dropna(subset=['comment_text'])

In [8]:
# Raw comment strings used as model input for training.
X_train = train_df['comment_text'].values
# Target columns, in the order the model's output units are interpreted.
label_column_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train_df[label_column_names].values
# Test rows cannot be dropped the way missing training comments are, because
# the predictions must stay row-aligned with test_df. Replace missing
# comments with an empty string so no NaN float reaches the tokenizer.
X_test = test_df['comment_text'].fillna('').values

In [9]:
#preprocess the text

In [10]:
from keras.preprocessing import text,sequence

Using TensorFlow backend.


In [11]:
# Passed to the tokenizer as `num_words`; also sizes the Embedding layers (vocab_size+1 rows).
vocab_size = 40000
# Length every comment is padded/truncated to (`maxlen` of pad_sequences) and the model's input length.
max_len = 200
# Output dimension of each Embedding layer.
embed_size = 300

In [12]:
def text2sequence(vocab_size,max_len,X_train,X_test):
    """Tokenize both corpora with one shared tokenizer and pad them.

    The tokenizer is fitted on the training and test texts together, with
    `num_words=vocab_size`; each corpus is then converted to integer
    sequences and padded with `maxlen=max_len`.

    Returns:
        (X_train, X_test, word_index): the two padded sequence arrays and
        the tokenizer's `word_index` mapping.
    """
    tok = text.Tokenizer(num_words=vocab_size)
    combined_texts = list(X_train) + list(X_test)
    tok.fit_on_texts(combined_texts)

    def encode(texts):
        # texts -> integer sequences -> fixed-length array
        as_ids = tok.texts_to_sequences(texts)
        return sequence.pad_sequences(as_ids, maxlen=max_len)

    train_seq = encode(X_train)
    test_seq = encode(X_test)
    return train_seq, test_seq, tok.word_index

In [13]:
X_train, X_test, word_index = text2sequence(vocab_size,max_len,X_train,X_test)

In [14]:
np.random.seed(42)

In [15]:
# Never use a vocabulary larger than the number of words the tokenizer actually indexed.
vocab_size = min(vocab_size,len(word_index))

In [16]:
embed_mat_file = './Data/embed_mat_40000.npy'
#embedding_matrix.dump(embed_mat_file)

In [17]:
# The commented-out `embedding_matrix.dump(...)` in the previous cell suggests
# this cache was written as a pickle. Recent NumPy versions refuse pickled
# files unless allow_pickle=True; plain .npy files still load with the flag set.
# Only load cache files you created yourself: unpickling can execute code.
embedding_matrix = np.load(embed_mat_file, allow_pickle=True)

In [18]:
glove_mat_file = './Data/glove_mat_.npy'
#glove_matrix.dump(glove_mat_file)

In [19]:
# The commented-out `glove_matrix.dump(...)` in the previous cell suggests this
# cache was written as a pickle. Recent NumPy versions refuse pickled files
# unless allow_pickle=True; plain .npy files still load with the flag set.
# Only load cache files you created yourself: unpickling can execute code.
glove_matrix = np.load(glove_mat_file, allow_pickle=True)

In [20]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Lambda, dot, Activation,Dropout
from keras.layers import GRU ,CuDNNGRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D
from keras.layers.normalization import BatchNormalization
from keras.engine.topology import Layer
from keras import backend as K
from keras.initializers import Constant
from keras import initializers,regularizers,constraints
from keras.callbacks import Callback,EarlyStopping,ModelCheckpoint
from sklearn.metrics import roc_auc_score
from keras.optimizers import adam

In [21]:
class Attention(Layer):
    """Keras layer that collapses the time axis using learned attention weights.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756].
    Every time step gets a scalar score ``tanh(x_t . W + b_t)``. The scores
    are exponentiated, multiplied by the mask (when one is given), normalised
    over the steps and used to form a weighted sum of the inputs.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    # Arguments
        step_dim: number of time steps of the input; must equal `steps`.
        W_regularizer, b_regularizer: optional regularizers for the weight
            vector and the per-step bias.
        W_constraint, b_constraint: optional constraints for the same.
        bias: whether to add a learned per-step bias to the scores.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) created with
    return_sequences=True.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim=timesteps))
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Run the base initialiser first: it sets up the layer's bookkeeping
        # and assigns its own default for `supports_masking`, so the flag has
        # to be set afterwards to take effect.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0

    def build(self, input_shape):
        """Create the score vector W (features,) and the optional bias b (steps,)."""
        assert len(input_shape) == 3
        # call() reshapes the scores to (-1, step_dim); a different real
        # sequence length would only fail later with an obscure shape error.
        if input_shape[1] is not None and input_shape[1] != self.step_dim:
            raise ValueError(
                'Attention was created with step_dim={} but the input has {} '
                'time steps.'.format(self.step_dim, input_shape[1]))

        # Keyword arguments: in Keras 2 the first positional parameter of
        # add_weight is `name`, not `shape`.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zeros',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over its time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (samples*steps, features), project onto W, then fold the
        # scalar scores back to (samples, steps).
        flat_x = K.reshape(x, (-1, features_dim))
        w_column = K.reshape(self.W, (features_dim, 1))
        eij = K.reshape(K.dot(flat_x, w_column), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # (samples, steps) -> (samples, steps, 1) so it broadcasts over features.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # The time axis is summed out; batch and feature sizes are kept.
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and reloaded."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [22]:
K.clear_session()

In [23]:
weights = Constant(embedding_matrix)

In [24]:
g_weights = Constant(glove_matrix)

In [25]:
# Two-branch text classifier. Statement order is kept as-is because Keras
# auto-generates layer names (input_1, embedding_1, ...) in creation order.

# Branch 1: padded token ids -> embedding initialised from `weights`
# (a Constant initializer wrapping `embedding_matrix`).
# NOTE(review): `trainable` is not passed, so the Embedding default applies —
# confirm that fine-tuning the pretrained vectors is intended.
inputs = Input(shape=(max_len,))
embed_layer = Embedding(vocab_size+1,embed_size,embeddings_initializer=weights)(inputs)

# Branch 2: a separate input of the same length -> embedding initialised from
# `g_weights` (Constant initializer wrapping `glove_matrix`). The fit/predict
# cells feed the same array to both inputs.
g_inputs = Input(shape=(max_len,))
glove_embed_layer = Embedding(vocab_size+1,embed_size,embeddings_initializer=g_weights)(g_inputs)

# Join the two embeddings of each token, then apply SpatialDropout1D at rate 0.4.
concat_embed = concatenate([embed_layer,glove_embed_layer])
s_dout = SpatialDropout1D(0.4)(concat_embed)

# Two stacked bidirectional GRUs (40 units per direction), both returning the
# full sequence. CuDNNGRU is the cuDNN-backed GRU, so this presumably requires
# a GPU — confirm for the target environment.
bi_rnn = Bidirectional(CuDNNGRU(40,return_sequences=True))(s_dout)

bi_rnn2 = Bidirectional(CuDNNGRU(40,return_sequences=True))(bi_rnn)

#bi_rnn = Conv1D(32, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(bi_rnn)
# Summarise the sequence with average and max pooling, then concatenate both summaries.
avg_pool = GlobalAveragePooling1D()(bi_rnn2)
max_pool = GlobalMaxPooling1D()(bi_rnn2)
concat = concatenate([avg_pool,max_pool])

# Six independent sigmoid outputs, matching the six entries of label_column_names.
outputs = Dense(6,activation='sigmoid')(concat)

# Adam optimizer with learning rate 0.001.
opt = adam(lr=0.001)

# Binary cross-entropy is applied to the six sigmoid outputs.
model = Model(inputs=[inputs,g_inputs],outputs=outputs)
model.compile(loss='binary_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])

In [26]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     12000300    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 300)     12000300    input_2[0][0]                    
__________________________________________________________________________________________________
concatenat

In [27]:
batch_size = 32
epochs = 5

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Keep 95% of the rows for training and hold out the rest for validation;
# the fixed random_state makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.95,
    random_state=42,
)



In [30]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC after training epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``X_val`` is passed straight
            to ``model.predict`` and ``y_val`` to ``roc_auc_score``. The empty
            default cannot be unpacked, so a pair must be supplied.
        interval: evaluate whenever the zero-based epoch index is a multiple
            of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class, not `Callback`, in super(): super(Callback, self)
        # starts the lookup *after* Callback in the MRO and therefore skips
        # Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print the ROC-AUC for this epoch."""
        # `logs` is unused here; None replaces the shared mutable `{}` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Keras epoch indices are zero-based; print them one-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [31]:
# File that receives the weights of the epoch with the lowest validation loss.
filepath = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# Stop once val_loss has failed to improve for 2 consecutive epochs.
early = EarlyStopping(patience=2, monitor="val_loss", mode="min")
# The model has two inputs, so the validation features are passed twice.
RocAuc = RocAucEvaluation(
    validation_data=([X_val, X_val], y_val),
    interval=1,
)
callback_list = [checkpoint, early, RocAuc]

In [32]:
# Both model inputs receive the same padded sequences.
history = model.fit(
    [X_tra, X_tra],
    y_tra,
    validation_data=([X_val, X_val], y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=callback_list,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.03999, saving model to weights_base.best.hdf5

 ROC-AUC - epoch: 1 - score: 0.986267 

Epoch 2/5

Epoch 00002: val_loss did not improve from 0.03999

 ROC-AUC - epoch: 2 - score: 0.988686 

Epoch 3/5

Epoch 00003: val_loss did not improve from 0.03999

 ROC-AUC - epoch: 3 - score: 0.988687 



In [33]:
model.load_weights(filepath)

In [34]:
predictions = model.predict([X_test,X_test],batch_size=64)

In [35]:
test_labels_df = pd.read_csv('./Data/test_labels.csv')

In [36]:
# Rows whose label value is -1 are not used for scoring; keep only the scored rows.
# np.flatnonzero returns row *positions*, which is what the NumPy arrays
# indexed with count_indices later need, whatever the frame's index labels are.
count_indices = np.flatnonzero(test_labels_df['toxic'].values >= 0)

In [37]:
test_labels = test_labels_df[label_column_names].values

In [38]:
roc_auc_score(test_labels[count_indices],predictions[count_indices])

0.980791268831127

In [None]:
# Start from the sample submission file (presumably it supplies the id column
# and the expected row order — confirm against the file).
submission = pd.read_csv('./Data/sample_submission.csv')
# Replace the six label columns with the model outputs; `predictions` has one
# column per entry of label_column_names.
submission[label_column_names] = predictions
# Write the result without the DataFrame index.
submission.to_csv('submission.csv',index=False)