In [1]:
import gc
import os
import pickle
import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from keras.models import Model
from keras.layers import Bidirectional, Input, Embedding, Dense, CuDNNGRU, Conv1D, GlobalAveragePooling1D
from keras.layers import concatenate, Dropout, SpatialDropout1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam
from keras import backend as K

warnings.filterwarnings('ignore')


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def np_rank(array):
    """Rank the values of every column of a 2-D array independently.

    Within each column the smallest value receives rank 0 and the largest
    receives rank ``len(array) - 1``. The returned array has the same shape
    and dtype as ``array`` (ranks are stored in the input's dtype).
    """
    row_positions = np.arange(len(array))
    ranks = np.empty_like(array)
    for col in range(array.shape[1]):
        ascending = np.argsort(array[:, col])
        # The row holding the k-th smallest value of this column gets rank k.
        ranks[ascending, col] = row_positions
    return ranks

def save_oof(train_oof, test_oof, name, sample_submission, train_ids=None):
    """Write test predictions and out-of-fold train predictions to CSV.

    Two files are written, both with ``sample_submission``'s column names:
    ``../output/test/<name>.csv`` and ``../output/train/<name>.csv``.
    The target directories are created if they do not exist.

    Parameters
    ----------
    train_oof : array-like
        Out-of-fold predictions for the training rows, one column per
        prediction column of ``sample_submission``.
    test_oof : array-like
        Predictions for the test rows, same column layout.
    name : str
        Base file name (without extension) used for both CSV files.
    sample_submission : pandas.DataFrame
        Its first column is used as the identifier column of the test file
        and its column names are applied to both outputs.
    train_ids : array-like, optional
        Identifiers for the training rows, in the same order as
        ``train_oof``. When omitted, the first column of
        ``sample_submission`` is used for the train file as well (the
        original behaviour).
    """
    def _with_ids(ids, predictions):
        # Identifier column followed by the prediction columns.
        frame = pd.concat([ids, pd.DataFrame(predictions)], axis=1)
        frame.columns = sample_submission.columns
        return frame

    test_path = "../output/test/{}.csv".format(name)
    train_path = "../output/train/{}.csv".format(name)
    for path in (test_path, train_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # oof test
    _with_ids(sample_submission.iloc[:, 0], test_oof).to_csv(test_path, index=False)

    # oof train
    if train_ids is None:
        # NOTE(review): these are presumably *test* ids; if train and test
        # differ in length, pd.concat aligns on the index and pads the
        # shorter side with NaN. Pass ``train_ids`` to label rows correctly.
        ids = sample_submission.iloc[:, 0]
    else:
        # Fresh 0..n-1 index so the ids line up positionally with train_oof.
        ids = pd.Series(np.asarray(train_ids))
    _with_ids(ids, train_oof).to_csv(train_path, index=False)
    
def oof(X_train, X_test, y, num_folds, seed):
    """Train one model per K-fold split and collect out-of-fold predictions.

    Relies on module-level names defined elsewhere in the notebook:
    ``get_model``, ``RocAucEvaluation``, ``batch_size`` and ``epochs``.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training inputs, indexable by integer index arrays along axis 0.
    X_test : numpy.ndarray
        Test inputs; predicted once by every fold's model.
    y : numpy.ndarray
        2-D training targets, one column per label.
    num_folds : int
        Number of K-fold splits.
    seed : int
        ``random_state`` of the shuffled ``KFold`` splitter.

    Returns
    -------
    scores : list of float
        ROC-AUC of each fold's model on its held-out rows.
    train_predict : numpy.ndarray, shape (n_train, n_labels)
        Out-of-fold predictions: every row is predicted by the model that
        did not see it during fitting.
    test_predict : numpy.ndarray, shape (n_test, n_labels)
        Rank-averaged test predictions. Each fold's predictions are turned
        into per-column ranks, summed over folds and divided by
        ``num_folds * n_test``, so the values lie in [0, 1).
    """
    # Take the label count from the targets instead of hard-coding it.
    n_labels = y.shape[1]

    scores = []
    train_predict = np.zeros((X_train.shape[0], n_labels))
    test_predict = np.zeros((X_test.shape[0], n_labels))

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

    for train_idx, val_idx in kf.split(X_train):

        x_train = X_train[train_idx]
        x_val = X_train[val_idx]
        y_train = y[train_idx]
        y_val = y[val_idx]

        # fit a fresh model on this fold
        model = get_model()
        RocAuc = RocAucEvaluation(validation_data=(x_val, y_val), interval=1)
        model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                  validation_data=(x_val, y_val), callbacks=[RocAuc], verbose=2)

        # predict: raw scores for held-out rows, ranks for the test set
        train_predict[val_idx] = model.predict(x_val, batch_size=batch_size)
        test_predict += np_rank(model.predict(X_test, batch_size=batch_size))

        # save scores
        cv_score = roc_auc_score(y_val, train_predict[val_idx])
        scores.append(cv_score)

        # release memory before building the next fold's model
        del model
        gc.collect()
        K.clear_session()

    # Summed ranks -> average rank scaled by the number of test rows.
    test_predict /= (num_folds * test_predict.shape[0])
    return scores, train_predict, test_predict

In [3]:
EMBEDDING_FILE = '../input/glove.840B.300d.txt'
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample_submission = pd.read_csv('../input/sample_submission.csv')

# Series.fillna returns a new Series rather than modifying the frame, so the
# result has to be used: chain it so missing comments become the placeholder
# string "fillna" before lower-casing.
X_train = train["comment_text"].fillna("fillna").str.lower()
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").str.lower()

In [4]:
# Preprocessing / embedding sizes:
#   max_features - vocabulary cap handed to the tokenizer (num_words)
#   maxlen       - length every token sequence is padded/truncated to
#   embed_size   - width of one embedding vector
max_features, maxlen, embed_size = 100000, 150, 300

In [5]:
# Fit a single vocabulary on train + test text, then convert both corpora to
# integer sequences and pad them to a fixed length.
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train) + list(X_test))

# Raw text -> lists of word indices (X_train / X_test are rebound here).
X_train, X_test = (tok.texts_to_sequences(texts) for texts in (X_train, X_test))

# Index lists -> 2-D arrays with maxlen columns.
x_train, x_test = (
    sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test)
)


# Load the pretrained vectors into a dict: token -> float32 vector.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # Split from the right at most embed_size times: the last embed_size
        # fields are the coefficients and everything before them is the
        # token, which may itself contain spaces. An unbounded split would
        # break such tokens apart and fail the float conversion below.
        values = line.rstrip().rsplit(' ', embed_size)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tok.word_index

# Build the embedding matrix: row i holds the pretrained vector of the word
# whose tokenizer index is i. Rows of words without a pretrained vector stay
# all-zero, and indices at or above max_features are left out entirely.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros(shape=(num_words, embed_size))
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [6]:
# Cache of the preprocessed arrays. If the dump exists it is loaded and
# replaces the values computed in the cells above; otherwise the freshly
# computed values are written so later runs can reuse them.
DUMP_PATH = "../dumps/cnn_bi_lstm.pkl"

if os.path.exists(DUMP_PATH):
    # NOTE(security): unpickling can execute arbitrary code - only load
    # dumps that were produced by this notebook.
    with open(DUMP_PATH, "rb") as f:
        x_train, x_test, y_train, embedding_matrix = pickle.load(file=f)
else:
    os.makedirs(os.path.dirname(DUMP_PATH), exist_ok=True)
    with open(DUMP_PATH, "wb") as f:
        pickle.dump(obj=(x_train, x_test, y_train, embedding_matrix), file=f)

In [7]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair used for scoring. It is unpacked into two
        values, so the empty default raises ``ValueError``.
    interval : int
        The score is computed at the end of every epoch whose zero-based
        index is a multiple of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class in super() so that Callback.__init__ itself runs;
        # super(Callback, self) starts the lookup *after* Callback in the
        # MRO and therefore skips the base-class initialiser.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set and print its ROC-AUC.

        ``logs`` is accepted for API compatibility and is not used; it
        defaults to ``None`` rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [8]:
def get_model():
    """Build and compile the frozen-embedding BiGRU + Conv1D classifier.

    Reads the module-level ``maxlen`` and ``embedding_matrix``.

    Architecture: frozen pretrained embedding -> spatial dropout ->
    bidirectional CuDNNGRU (sequence output) -> dropout -> Conv1D ->
    concatenated global average / max pooling -> 6 sigmoid outputs.

    Returns
    -------
    keras.models.Model
        Compiled with binary cross-entropy loss and Adam (lr=1e-3).
    """
    # Size the embedding layer from the matrix itself: the weights passed in
    # must have exactly (input_dim, output_dim) entries, and the matrix can
    # have fewer than max_features rows when the vocabulary is small.
    vocab_rows, embedding_dim = embedding_matrix.shape

    sequence_input = Input(shape=(maxlen, ))
    x = Embedding(vocab_rows, embedding_dim, weights=[embedding_matrix], trainable=False)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    # x = Dense(128, activation='relu')(x)
    # x = Dropout(0.1)(x)
    preds = Dense(6, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])
    return model


In [9]:
# Run settings. `seed` drives both NumPy's global RNG and the K-fold split;
# `batch_size` and `epochs` are read as globals inside oof().
seed = 42
np.random.seed(seed)
num_folds, batch_size, epochs = 10, 128, 3

scores, train_oof, test_oof = oof(x_train, x_test, y_train, num_folds, seed)

Train on 143613 samples, validate on 15958 samples
Epoch 1/3
 - 49s - loss: 0.0564 - acc: 0.9801 - val_loss: 0.0478 - val_acc: 0.9815

 ROC-AUC - epoch: 1 - score: 0.985954 

Epoch 2/3
 - 47s - loss: 0.0440 - acc: 0.9831 - val_loss: 0.0429 - val_acc: 0.9834

 ROC-AUC - epoch: 2 - score: 0.988063 

Epoch 3/3
 - 47s - loss: 0.0410 - acc: 0.9841 - val_loss: 0.0407 - val_acc: 0.9841

 ROC-AUC - epoch: 3 - score: 0.989020 

Train on 143614 samples, validate on 15957 samples
Epoch 1/3
 - 48s - loss: 0.0551 - acc: 0.9804 - val_loss: 0.0461 - val_acc: 0.9829

 ROC-AUC - epoch: 1 - score: 0.985465 

Epoch 2/3
 - 48s - loss: 0.0435 - acc: 0.9833 - val_loss: 0.0454 - val_acc: 0.9823

 ROC-AUC - epoch: 2 - score: 0.987264 

Epoch 3/3
 - 48s - loss: 0.0409 - acc: 0.9843 - val_loss: 0.0435 - val_acc: 0.9833

 ROC-AUC - epoch: 3 - score: 0.986927 

Train on 143614 samples, validate on 15957 samples
Epoch 1/3
 - 48s - loss: 0.0564 - acc: 0.9800 - val_loss: 0.0470 - val_acc: 0.9823

 ROC-AUC - epoch: 1

In [12]:
# Per-fold validation AUCs, followed by the AUC of the complete
# out-of-fold prediction matrix against the training labels.
(scores, roc_auc_score(y_train, train_oof))

([0.989020135152018,
  0.9869273116282727,
  0.9844613563199487,
  0.9885131450357131,
  0.9875784977456701,
  0.9896771051693563,
  0.9882156645139646,
  0.9892042149556164,
  0.9877735638641374,
  0.9859339148309343],
 0.9869086015152017)

In [13]:
# Persist the out-of-fold train predictions and the rank-averaged test
# predictions under the run name "cnn_bi_lstm".
save_oof(
    train_oof=train_oof,
    test_oof=test_oof,
    name="cnn_bi_lstm",
    sample_submission=sample_submission,
)

In [None]:
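# #### Callbacks (dead code, kept for reference only)
# Alternative single-split training with checkpointing and early stopping.
# It will not run as-is: ModelCheckpoint and EarlyStopping are not imported,
# and X_tra, y_tra, X_val, y_val and model are not defined in this notebook.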
# # filepath="../input/best-model/best.hdf5"
# filepath="weights_base.best.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
# ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)
# callbacks_list = [ra_val,checkpoint, early]

# model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks = callbacks_list,verbose=1)
# #Loading model weights
# model.load_weights(filepath)
# print('Predicting....')
# y_pred = model.predict(x_test,batch_size=1024,verbose=1)