In [2]:
import pickle
import time

import numpy as np
import pandas as pd

import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Concatenate, Conv1D, Activation, TimeDistributed, Flatten, RepeatVector, Permute, multiply
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, GlobalAveragePooling1D, MaxPooling1D, SpatialDropout1D, BatchNormalization
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.optimizers import Adam
from keras import backend as K

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score

import gc

np.set_printoptions(precision=8, suppress=True)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def np_rank(array):
    """Return the column-wise ordinal ranks of a 2-D array.

    Every column is ranked independently: its smallest value receives rank 0
    and its largest receives ``len(array) - 1``. The result has the same
    shape and dtype as ``array`` (it is allocated with ``np.empty_like``).
    """
    positions = np.arange(len(array))
    ranks = np.empty_like(array)
    for col in range(array.shape[1]):
        # argsort gives the row order that sorts this column; scattering
        # 0..n-1 through that order turns it into per-row ranks.
        order = array[:, col].argsort()
        ranks[order, col] = positions
    return ranks

In [4]:
# Preprocessed inputs: X_train / X_test are indexed by the model input names
# ('text', 'num_vars') further down, and embedding_matrix seeds the Embedding layer.
# NOTE: pickle can execute arbitrary code on load; only load dumps you produced yourself.
with open("../dumps/cnn_dump.pkl", "rb") as dump_file:
    X_train, X_test, y, embedding_matrix = pickle.load(dump_file)

# Raw CSV tables; used below for their row counts and for the submission layout.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/train.csv",
        "../input/test.csv",
        "../input/sample_submission.csv",
    )
)

In [5]:
# Run configuration shared by the model builder and the cross-validation loop.
min_count = 10 # minimum required word frequency in the text (not referenced in the cells shown here; presumably used by the preprocessing that built the dump - TODO confirm)
max_features = 27403 # vocabulary size passed to the Embedding layer; value taken from a previous run with min_count=10
maxlen = 150 # padding length; also the length of the "text" model input
num_folds = 10 # number of KFold splits; also the divisor used to average test predictions
batch_size = 512 # batch size used for both fit() and predict()
embed_size = 300 # embeddings dimension; also reused as the number of Conv1D filters
epochs = 4 # training epochs per fold
seed = 42 # random_state of the KFold splitter

In [6]:
def get_model_cnn(X_train):
    """Build and compile the two-input CNN classifier.

    The "text" input (length ``maxlen``) is embedded with weights initialised
    from ``embedding_matrix`` and followed by spatial dropout. Two global
    max-pooled views are taken: one directly over the embeddings and one over
    a width-4 ReLU convolution. Both are concatenated with the "num_vars"
    input, passed through dropout, and mapped to 6 sigmoid outputs.

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``.

    Args:
        X_train: mapping whose ``"num_vars"`` entry exposes ``.shape[1]``;
            only that width is used, to size the second input.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, the Adam
        optimizer and the accuracy metric.
    """
    text_in = Input(shape=(maxlen, ), name="text")
    num_in = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")

    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(text_in)
    embedded = SpatialDropout1D(0.2)(embedded)

    # Branch 1: max over time of the raw (dropped-out) embeddings.
    pooled_embedding = GlobalMaxPool1D()(embedded)
    # Branch 2: max over time of the convolution feature maps.
    conv_features = Conv1D(embed_size, 4, activation="relu")(embedded)
    pooled_conv = GlobalMaxPool1D()(conv_features)

    merged = Concatenate()([pooled_conv, pooled_embedding, num_in])
    merged = Dropout(0.3)(merged)
    output = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=[text_in, num_in], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Args:
        validation_data: a pair ``(X_val, y_val)``. It is unpacked into exactly
            two values, so the empty default raises ``ValueError``; a real
            pair must always be supplied.
        interval: the score is computed when the 0-based epoch index is a
            multiple of ``interval`` (``interval=1`` means every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super(): super(Callback, self)
        # starts the MRO lookup *after* Callback and therefore skips
        # Callback.__init__, leaving the base class uninitialised.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC.

        ``logs`` is accepted for Keras API compatibility and is not used; it
        defaults to ``None`` rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Keras passes a 0-based epoch index; report it 1-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [8]:
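# NOTE: disabled single hold-out split experiment, kept for reference only.
# It uses `model` and `submission`, which none of the cells above define, so
# create them first (e.g. model = get_model_cnn(X_train)) before uncommenting.
# X_tra, X_val = {}, {}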
# X_tra['text'], X_val['text'], y_tra, y_val = train_test_split(X_train['text'], y, 
#                                                               train_size=0.95, random_state=233)
# X_tra['num_vars'], X_val['num_vars'], _, _ = train_test_split(X_train['num_vars'], y, 
#                                                               train_size=0.95, random_state=233)
# RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

# model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
#           callbacks=[RocAuc])

# y_pred = model.predict(X_test, batch_size=1024)
# submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# submission.to_csv("../output/cnn_conv1D_emb_num_NOCV.csv.gz", compression="gzip", index=False)

In [9]:
# Out-of-fold (OOF) training.
# Each fold trains a fresh model, stores its predictions for the held-out rows
# in train_predict, and adds the column-wise ranks of its test predictions to
# test_predict (averaged over the folds after the loop).
scores = []
train_predict = np.zeros((train.shape[0], 6))
test_predict = np.zeros((test.shape[0], 6))
kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

for train_index, valid_index in kf.split(X_train['num_vars']):
    # Slice every model input and the labels with this fold's row indices.
    kfold_X_train = {name: X_train[name][train_index] for name in ('text', 'num_vars')}
    kfold_X_valid = {name: X_train[name][valid_index] for name in ('text', 'num_vars')}
    kfold_y_train = y[train_index]
    kfold_y_valid = y[valid_index]

    model = get_model_cnn(X_train)
    RocAuc = RocAucEvaluation(validation_data=(kfold_X_valid, kfold_y_valid), interval=1)
    model.fit(
        kfold_X_train, kfold_y_train,
        batch_size=batch_size, epochs=epochs, verbose=1,
        validation_data=(kfold_X_valid, kfold_y_valid),
        callbacks=[RocAuc]
    )

    # Held-out predictions fill this fold's rows of the OOF matrix.
    train_predict[valid_index] = model.predict(kfold_X_valid, batch_size=batch_size)
    # Test predictions are accumulated as ranks rather than raw probabilities.
    test_predict += np_rank(model.predict(X_test, batch_size=batch_size))

    cv_score = roc_auc_score(kfold_y_valid, train_predict[valid_index])
    scores.append(cv_score)

    # release memory before the next fold builds a new graph
    del model
    gc.collect()
    K.clear_session()

# Mean rank across folds.
test_predict /= num_folds

Train on 143613 samples, validate on 15958 samples
Epoch 1/4

 ROC-AUC - epoch: 1 - score: 0.971978 

Epoch 2/4

 ROC-AUC - epoch: 2 - score: 0.984528 

Epoch 3/4

 ROC-AUC - epoch: 3 - score: 0.985993 

Epoch 4/4

 ROC-AUC - epoch: 4 - score: 0.986568 

Train on 143614 samples, validate on 15957 samples
Epoch 1/4

 ROC-AUC - epoch: 1 - score: 0.980584 

Epoch 2/4

 ROC-AUC - epoch: 2 - score: 0.985738 

Epoch 3/4

 ROC-AUC - epoch: 3 - score: 0.986806 

Epoch 4/4

 ROC-AUC - epoch: 4 - score: 0.987015 

Train on 143614 samples, validate on 15957 samples
Epoch 1/4

 ROC-AUC - epoch: 1 - score: 0.979061 

Epoch 2/4

 ROC-AUC - epoch: 2 - score: 0.984295 

Epoch 3/4

 ROC-AUC - epoch: 3 - score: 0.985376 

Epoch 4/4

 ROC-AUC - epoch: 4 - score: 0.985585 

Train on 143614 samples, validate on 15957 samples
Epoch 1/4

 ROC-AUC - epoch: 1 - score: 0.978432 

Epoch 2/4

 ROC-AUC - epoch: 2 - score: 0.985116 

Epoch 3/4

 ROC-AUC - epoch: 3 - score: 0.985970 

Epoch 4/4

 ROC-AUC - epoch: 4 

In [10]:
# Rescale the fold-averaged test ranks to [0, 1) by dividing by the number of
# test rows (np_rank assigns ranks 0 .. n-1 per column).
# The division is in place, so it is guarded to keep this cell idempotent:
# averaged ranks have a column mean of (n-1)/2 and therefore a maximum >= 1
# (for n >= 3), while already-rescaled values are all < 1. Re-running the cell
# is then a no-op instead of silently shrinking the predictions again.
if test_predict.size and test_predict.max() >= 1.0:
    test_predict /= test_predict.shape[0]

# Overall OOF ROC-AUC next to the per-fold scores (displayed as the cell output).
roc_auc_score(y, train_predict), scores

(0.9851916984923461,
 [0.9865682690274534,
  0.9870149692251476,
  0.9855850074512719,
  0.9862767180504787,
  0.9837170504234583,
  0.9866728792566021,
  0.9860085196168967,
  0.9837759652301429,
  0.9874957413096629,
  0.9827511342639913])

In [13]:
# Test-set predictions: first column of the sample submission followed by the
# six prediction columns, named after the sample submission's columns.
submission = pd.concat([sample_submission.iloc[:, 0],
                        pd.DataFrame(test_predict)], axis=1)
submission.columns = sample_submission.columns
# submission.to_csv("../output/cnn_conv1D_emb_num_5epochs.gz", compression="gzip", index=False)

# oof test
submission.to_csv("../output/test/cnn_conv1D_num.csv", index=False)

# oof train
# The OOF rows follow the order of `train` (train_predict has train.shape[0]
# rows), so they must be keyed by the train table. Taking the first column of
# sample_submission paired train predictions with test rows and, because
# concat(axis=1) aligns on the index, padded whichever side was shorter with NaN.
# Assumes the first column of train.csv is the row id, mirroring the
# sample submission layout - TODO confirm.
train_oof = pd.concat([train.iloc[:, 0], pd.DataFrame(train_predict)], axis=1)
train_oof.to_csv("../output/train/cnn_conv1D_num.csv", index=False)