In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from Attention import *
from AttentionWithContext import *

Using TensorFlow backend.
  from ._conv import register_converters as _register_converters


In [2]:
# Paths and raw data. `train` carries the comment text plus six binary label
# columns; `test` carries the comment text only.
EMBEDDING_FILE = '../data/glove.840B.300d.txt'
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Series.fillna returns a new Series, so the result has to be assigned back.
# Previously the return value was discarded and missing comments stayed NaN.
train["comment_text"] = train["comment_text"].fillna("fillna")
test["comment_text"] = test["comment_text"].fillna("fillna")

# Lower-cased raw text (tokenised in a later cell) and the label matrix with
# one column per toxicity class.
X_train = train["comment_text"].str.lower()
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

X_test = test["comment_text"].str.lower()

In [3]:
# Text-pipeline hyper-parameters shared by the tokenizer, padding and model cells.
max_features = 100000  # vocabulary cap passed to the tokenizer as num_words
maxlen = 150           # padded sequence length fed to the network
embed_size = 300       # width of each embedding vector (the GloVe file name says 300d)

In [4]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC after selected epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. It is unpacked in ``__init__``, so the empty
        default raises ``ValueError``; callers must pass a two-element tuple.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. The previous
        # `super(Callback, self)` started the lookup *after* Callback and
        # therefore skipped Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print the ROC-AUC for this epoch.

        ``logs`` is accepted for API compatibility and is not read; it defaults
        to ``None`` rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # `epoch` is zero-based; print it one-based.
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [5]:
# Build one vocabulary over train + test text, capped at `max_features` words.
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train) + list(X_test))

# NOTE: X_train / X_test are rebound from raw text to integer sequences here,
# so this cell is not safe to run twice without re-running the loading cell.
X_train, X_test = tok.texts_to_sequences(X_train), tok.texts_to_sequences(X_test)

# Fixed-length integer matrices that the model consumes.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [6]:
# Parse the pre-trained embedding file into {token: float32 vector}.
# Each line is split on single spaces: the first field is the token and the
# remaining fields are the vector components.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        word, *coef_strings = line.rstrip().split(' ')
        embeddings_index[word] = np.asarray(coef_strings, dtype='float32')

In [7]:
# Prepare the embedding matrix: row i holds the pre-trained vector of the word
# whose tokenizer index is i.
word_index = tok.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    # Indices at or above the vocabulary cap have no row in the matrix.
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        # Words missing from the embedding index keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [8]:
# Baseline network: frozen pre-trained embedding -> spatial dropout ->
# bidirectional GRU -> 1D convolution -> global average and max pooling
# (concatenated) -> six sigmoid outputs, one per label column.
sequence_input = Input(shape=(maxlen, ))
# Take the vocabulary dimension from the matrix itself. The matrix has
# min(max_features, vocab_size + 1) rows, so hard-coding `max_features` makes
# the initial weights mismatch the layer shape whenever the vocabulary is
# smaller than the cap.
x = Embedding(embedding_matrix.shape[0], embed_size,
              weights=[embedding_matrix], trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
preds = Dense(6, activation="sigmoid")(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [10]:
# Training-loop settings, plus a fixed-seed 90/10 split of the padded training
# matrix into a fitting part (X_tra, y_tra) and a validation part (X_val, y_val).
batch_size = 128
epochs = 10
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.9,
    random_state=233,
)



In [11]:
# Callbacks shared by the fit calls below:
#   * ra_val     - prints validation ROC-AUC after every epoch
#   * checkpoint - keeps only the weights with the best validation accuracy
#   * early      - stops after 5 epochs without a validation-accuracy gain
filepath = "../cache/1_17_LC_weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
callbacks_list = [ra_val, checkpoint, early]

In [11]:
# Train the baseline model, validating on the held-out split each epoch.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10
 ROC-AUC - epoch: 1 - score: 0.986986
Epoch 00000: val_acc improved from -inf to 0.98213, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 2/10
 ROC-AUC - epoch: 2 - score: 0.988146
Epoch 00001: val_acc improved from 0.98213 to 0.98333, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 3/10
 ROC-AUC - epoch: 3 - score: 0.988861
Epoch 00002: val_acc did not improve
Epoch 4/10
 ROC-AUC - epoch: 4 - score: 0.988386
Epoch 00003: val_acc improved from 0.98333 to 0.98403, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 5/10
 ROC-AUC - epoch: 5 - score: 0.988753
Epoch 00004: val_acc did not improve
Epoch 6/10
 ROC-AUC - epoch: 6 - score: 0.988773
Epoch 00005: val_acc did not improve
Epoch 7/10
 ROC-AUC - epoch: 7 - score: 0.988190
Epoch 00006: val_acc did not improve
Epoch 8/10
 ROC-AUC - epoch: 8 - score: 0.988386
Epoch 00007: val_acc did not improve
Epoch 9/10
 ROC-AUC - epoch: 9 - score: 0.

<keras.callbacks.History at 0x7fe535fab9b0>

In [10]:
from Attention import *
# Variant of the baseline: same embedding -> BiGRU -> Conv1D trunk, with the
# output of an `Attention` layer concatenated next to the two pooled vectors.
# NOTE(review): `Attention` comes from the project-local Attention module;
# presumably it reduces the sequence to one vector per sample - confirm.
sequence_input = Input(shape=(maxlen, ))
# Take the vocabulary dimension from the matrix itself. The matrix has
# min(max_features, vocab_size + 1) rows, so hard-coding `max_features` makes
# the initial weights mismatch the layer shape whenever the vocabulary is
# smaller than the cap.
x = Embedding(embedding_matrix.shape[0], embed_size,
              weights=[embedding_matrix], trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
att = Attention()(x)
x = concatenate([avg_pool, max_pool, att])
preds = Dense(6, activation="sigmoid")(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [11]:
# `model` is a freshly built network here, so it needs a fresh checkpoint and
# early-stopping state. ModelCheckpoint keeps its best score between fit calls
# (a later fit in this notebook resumes "from 0.98419"), so reusing the object
# from the previous model could stop this architecture from ever being saved
# and leave the other model's weights in `filepath`.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
callbacks_list = [ra_val, checkpoint, early]

model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
          callbacks = callbacks_list,verbose=1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10
 ROC-AUC - epoch: 1 - score: 0.986314
Epoch 00000: val_acc improved from -inf to 0.98284, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 2/10
 ROC-AUC - epoch: 2 - score: 0.988109
Epoch 00001: val_acc improved from 0.98284 to 0.98346, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 3/10
 ROC-AUC - epoch: 3 - score: 0.988333
Epoch 00002: val_acc did not improve
Epoch 4/10
 ROC-AUC - epoch: 4 - score: 0.988875
Epoch 00003: val_acc improved from 0.98346 to 0.98405, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 5/10
 ROC-AUC - epoch: 5 - score: 0.989228
Epoch 00004: val_acc improved from 0.98405 to 0.98406, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 6/10
 ROC-AUC - epoch: 6 - score: 0.989195
Epoch 00005: val_acc did not improve
Epoch 7/10
 ROC-AUC - epoch: 7 - score: 0.988625
Epoch 00006: val_acc did not improve
Epoch 8/10
 ROC-AUC - epoch: 8 - score: 0.988693
Epoch

KeyboardInterrupt: 

In [8]:

# Variant with two attention heads: same embedding -> BiGRU -> Conv1D trunk,
# with `Attention` and `AttentionWithContext` outputs concatenated next to the
# two pooled vectors.
# NOTE(review): both attention layers come from project-local modules;
# presumably each reduces the sequence to one vector per sample - confirm.
sequence_input = Input(shape=(maxlen, ))
# Take the vocabulary dimension from the matrix itself. The matrix has
# min(max_features, vocab_size + 1) rows, so hard-coding `max_features` makes
# the initial weights mismatch the layer shape whenever the vocabulary is
# smaller than the cap.
x = Embedding(embedding_matrix.shape[0], embed_size,
              weights=[embedding_matrix], trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
att1 = Attention()(x)
att2 = AttentionWithContext()(x)
x = concatenate([avg_pool, max_pool, att1, att2])
preds = Dense(6, activation="sigmoid")(x)
model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [12]:
# `model` is a freshly built network here, so it needs a fresh checkpoint and
# early-stopping state. ModelCheckpoint keeps its best score between fit calls,
# so reusing the object from the previous model could stop this architecture
# from ever being saved and leave the other model's weights in `filepath`,
# which is loaded back right after this fit.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
callbacks_list = [ra_val, checkpoint, early]

# Short run: 4 epochs instead of the notebook-wide `epochs` setting.
model.fit(X_tra, y_tra, batch_size=batch_size, epochs=4, validation_data=(X_val, y_val),
          callbacks = callbacks_list,verbose=1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
 ROC-AUC - epoch: 1 - score: 0.986489
Epoch 00000: val_acc improved from -inf to 0.98319, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 2/4
 ROC-AUC - epoch: 2 - score: 0.988721
Epoch 00001: val_acc did not improve
Epoch 3/4
 ROC-AUC - epoch: 3 - score: 0.989283
Epoch 00002: val_acc improved from 0.98319 to 0.98345, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 4/4
 ROC-AUC - epoch: 4 - score: 0.989312
Epoch 00003: val_acc improved from 0.98345 to 0.98419, saving model to ../cache/1_17_LC_weights_base.best.hdf5


<keras.callbacks.History at 0x7f31754645f8>

In [13]:

# Restore the best checkpointed weights, then score the padded test matrix.
model.load_weights(filepath)
print('Predicting....')
y_pred = model.predict(
    x_test,
    batch_size=1024,
    verbose=1,
)

Predicting....


In [20]:
# Sanity check: one row per test comment, one column per label.
y_pred.shape

(153164, 6)

In [14]:
# Load an earlier submission file; its predicted probabilities are used below
# as pseudo-labels for the test set.
df_sub = pd.read_csv('../submissions/submission_0.986.csv')

In [15]:
# Preview the loaded submission (id plus the six label probabilities).
df_sub.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.971201,0.31695,0.947543,0.112325,0.895078,0.387622
1,0000247867823ef7,0.024167,0.023195,0.023079,0.022111,0.023916,0.023222
2,00013b17ad220c46,0.028507,0.023676,0.026225,0.022412,0.024312,0.023133
3,00017563c3f7919a,0.022757,0.022425,0.022316,0.022343,0.022613,0.022091
4,00017695ad8997eb,0.029917,0.022518,0.024325,0.022156,0.02473,0.022612


In [17]:
# Pseudo-label matrix: the six predicted-probability columns of the earlier
# submission, in the same column order as y_train.
# NOTE(review): assumes df_sub rows are in the same order as test.csv - confirm.
lab_pseudo = df_sub[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [18]:
# Stack the true training labels on top of the pseudo-labels. The validation
# split (y_val) is deliberately not included.
comb_pseudo = np.concatenate([y_tra, lab_pseudo])

In [19]:
# Stack the training sequences on top of the test sequences, in the same order
# as the label stack (training part first, then test part).
comb_feat = np.concatenate([X_tra, x_test])

In [21]:
# Continue training the current model on real + pseudo-labelled data.
# Validation still uses only the genuinely labelled hold-out split.
model.fit(
    comb_feat,
    comb_pseudo,
    batch_size=batch_size,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
    verbose=1,
)

Train on 296777 samples, validate on 15958 samples
Epoch 1/10
 ROC-AUC - epoch: 1 - score: 0.987805
Epoch 00000: val_acc improved from 0.98419 to 0.98430, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 2/10
 ROC-AUC - epoch: 2 - score: 0.988217
Epoch 00001: val_acc improved from 0.98430 to 0.98482, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 3/10
 ROC-AUC - epoch: 3 - score: 0.988472
Epoch 00002: val_acc did not improve
Epoch 4/10
 ROC-AUC - epoch: 4 - score: 0.986954
Epoch 00003: val_acc did not improve
Epoch 5/10
 ROC-AUC - epoch: 5 - score: 0.987136
Epoch 00004: val_acc did not improve
Epoch 6/10
 ROC-AUC - epoch: 6 - score: 0.987571
Epoch 00005: val_acc did not improve
Epoch 7/10
 ROC-AUC - epoch: 7 - score: 0.987549
Epoch 00006: val_acc did not improve
Epoch 8/10
 ROC-AUC - epoch: 8 - score: 0.988832
Epoch 00007: val_acc improved from 0.98482 to 0.98496, saving model to ../cache/1_17_LC_weights_base.best.hdf5
Epoch 9/10
 ROC-AUC - epoch: 9 - score:

<keras.callbacks.History at 0x7f30f8659588>

In [22]:
# Restore the best checkpoint written during pseudo-label training and
# re-score the padded test matrix.
model.load_weights(filepath)
print('Predicting....')
y_pred1 = model.predict(
    x_test,
    batch_size=1024,
    verbose=1,
)

Predicting....


In [23]:
# Eyeball the new predictions (the array repr is abbreviated automatically).
y_pred1

array([[0.9895253 , 0.2886152 , 0.9835381 , 0.1704133 , 0.9536547 ,
        0.51884526],
       [0.01347826, 0.01623414, 0.01314509, 0.01344353, 0.01535578,
        0.01389873],
       [0.01918216, 0.01955657, 0.02060369, 0.01874701, 0.01825291,
        0.01813327],
       ...,
       [0.01332473, 0.0166356 , 0.01674842, 0.01393711, 0.01584494,
        0.01284019],
       [0.0194856 , 0.01672496, 0.01828372, 0.01282756, 0.01627895,
        0.01966052],
       [0.93357766, 0.03650426, 0.75407887, 0.0222304 , 0.5072394 ,
        0.02168871]], dtype=float32)

In [26]:
# Sanity check: one row per test comment, one column per label.
y_pred1.shape

(153164, 6)

In [24]:
# Start the new submission frame from the ids of the earlier submission.
# NOTE(review): assumes those ids are in the same row order as x_test - confirm.
sub = pd.DataFrame({'id': df_sub['id'].values})

In [27]:
# Copy each prediction column into the submission frame under its label name;
# the order matches the label column order used to build y_train.
for col_idx, label in enumerate(
    ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
):
    sub[label] = y_pred1[:, col_idx]