In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
# Silences every warning category for the rest of the session, including
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# import os
# os.environ['OMP_NUM_THREADS'] = '4'

from keras import backend as K
# Lists the GPU devices visible to the backend (shown as the cell output).
# NOTE(review): `_get_available_gpus` is an underscore-prefixed (private) helper of
# the Keras TensorFlow backend — presumably not a stable API; confirm it exists in
# the Keras version in use.
K.tensorflow_backend._get_available_gpus()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

In [2]:
def np_rank(array):
    """Replace every value of a 2-D array by its rank within its column.

    Ranks are 0-based and ascending (the smallest value of a column gets 0).
    The result has the same shape and dtype as ``array``. The order of tied
    values is whatever ``argsort`` yields for that column.
    """
    n_rows = len(array)
    n_cols = array.shape[1]
    ranked = np.empty_like(array)
    for col in range(n_cols):
        # order[k] is the row index of the k-th smallest value in this column,
        # so writing 0..n_rows-1 at those rows assigns each row its rank.
        order = np.argsort(array[:, col])
        ranked[order, col] = np.arange(n_rows)
    return ranked

def save_oof(train_oof, test_oof, name, sample_submission, train_ids=None):
    """Write test predictions and out-of-fold train predictions to CSV.

    Two files named ``<name>.csv`` are written (without the index):
    ``../output/test/<name>.csv`` for the test predictions and
    ``../output/train/<name>.csv`` for the out-of-fold train predictions.
    Both frames consist of one id column followed by the prediction columns
    and take their column names from ``sample_submission``.

    Parameters
    ----------
    train_oof : array-like
        Out-of-fold predictions, one row per training sample.
    test_oof : array-like
        Test predictions, one row per row of ``sample_submission``.
    name : str
        Base name of the two CSV files.
    sample_submission : pandas.DataFrame
        Submission template. Its first column supplies the ids of the test
        rows and its column names are reused for both output files.
    train_ids : array-like, optional
        Ids of the training rows, in the same order as ``train_oof``. When
        given, they become the id column of the train file.
        When omitted, the id column of ``sample_submission`` is used for the
        train file as well (legacy behaviour). Because ``pd.concat`` aligns
        on the index, those ids do not correspond to the training rows, and
        rows are padded with NaN wherever the two lengths differ — pass
        ``train_ids`` to get a correctly labelled file.

    Raises
    ------
    ValueError
        If ``train_ids`` is given and its length differs from ``train_oof``.
    """
    columns = sample_submission.columns
    template_ids = sample_submission.iloc[:, 0]
    train_frame = pd.DataFrame(train_oof)

    # Resolve (and validate) the train id column before anything is written,
    # so a bad argument cannot leave only one of the two files on disk.
    if train_ids is None:
        train_id_column = template_ids
    else:
        # Drop any incoming index so the ids line up positionally with train_frame.
        train_id_column = pd.Series(train_ids).reset_index(drop=True)
        if len(train_id_column) != len(train_frame):
            raise ValueError(
                "train_ids has {} entries but train_oof has {} rows".format(
                    len(train_id_column), len(train_frame)))

    # oof test
    submission = pd.concat([template_ids, pd.DataFrame(test_oof)], axis=1)
    submission.columns = columns
    submission.to_csv("../output/test/{}.csv".format(name), index=False)

    # oof train
    submission_train = pd.concat([train_id_column, train_frame], axis=1)
    submission_train.columns = columns
    submission_train.to_csv("../output/train/{}.csv".format(name), index=False)
    
def oof(X_train, X_test, y, num_folds, seed):
    """Train one model per K-fold split and collect out-of-fold predictions.

    For every split of a shuffled ``KFold(num_folds, random_state=seed)`` a
    fresh model from ``get_model()`` is fitted on the training part and used to
    predict both the held-out part and the full test set.

    Relies on names defined elsewhere in the notebook: ``get_model``,
    ``RocAucEvaluation``, ``np_rank`` and the globals ``batch_size`` and
    ``epochs``. The number of output columns is hard-coded to 6.

    Returns
    -------
    (scores, train_predict, test_predict)
        ``scores``: ROC-AUC on the held-out part of each fold.
        ``train_predict``: raw model outputs for every training row, each taken
        from the fold in which that row was held out.
        ``test_predict``: per-fold column-wise ranks of the test predictions,
        summed over folds and divided by ``num_folds * n_test``.

    NOTE(review): train predictions are raw model outputs while test
    predictions are averaged ranks, so the two arrays are on different
    scales — confirm downstream consumers expect that.
    NOTE(review): a new model is built per fold and nothing here releases the
    previous one — memory use may grow across folds; verify on long runs.
    """
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]

    fold_scores = []
    oof_train = np.zeros((n_train, 6))
    test_rank_sum = np.zeros((n_test, 6))

    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fit_rows, holdout_rows in splitter.split(X_train):
        x_fit, y_fit = X_train[fit_rows], y[fit_rows]
        x_holdout, y_holdout = X_train[holdout_rows], y[holdout_rows]
        holdout = (x_holdout, y_holdout)

        # Fresh model per fold; the callback prints ROC-AUC after each epoch.
        fold_model = get_model()
        auc_printer = RocAucEvaluation(validation_data=holdout, interval=1)
        fold_model.fit(x_fit, y_fit, batch_size=batch_size, epochs=epochs,
                       validation_data=holdout, callbacks=[auc_printer], verbose=1)

        # Held-out rows keep the raw outputs; test rows accumulate ranks.
        oof_train[holdout_rows] = fold_model.predict(x_holdout, batch_size=batch_size)
        test_rank_sum += np_rank(fold_model.predict(X_test, batch_size=batch_size))

        fold_scores.append(roc_auc_score(y_holdout, oof_train[holdout_rows]))

    # Average the rank sums over folds and scale by the number of test rows.
    test_rank_sum /= (num_folds * n_test)
    return fold_scores, oof_train, test_rank_sum

In [9]:
max_features = 30000  # vocabulary size: first argument of the Embedding layer in get_model()
maxlen = 100  # length of each input sequence: Input(shape=(maxlen, )) in get_model()
embed_size = 300  # embedding dimension: second argument of the Embedding layer in get_model()

In [19]:
# NOTE: the raw-data preprocessing in this cell is commented out. The arrays it
# would produce (x_train, x_test, y_train, embedding_matrix) are instead read
# from the pickle dump ../dumps/cnn_pooled_gru.pkl in another cell.
# EMBEDDING_FILE = '../input/crawl-300d-2M.vec'
# train = pd.read_csv('../input/train.csv')
# test = pd.read_csv('../input/test.csv')
# Submission template: save_oof() takes its first column as the id column and
# reuses its column names for the files it writes.
sample_submission = pd.read_csv('../input/sample_submission.csv')

# X_train = train["comment_text"].fillna("fillna").values
# y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
# X_test = test["comment_text"].fillna("fillna").values

# tokenizer = text.Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(list(X_train) + list(X_test))
# X_train = tokenizer.texts_to_sequences(X_train)
# X_test = tokenizer.texts_to_sequences(X_test)
# x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
# x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

# def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
# embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding="utf-8"))

# word_index = tokenizer.word_index
# nb_words = min(max_features, len(word_index))
# embedding_matrix = np.zeros((nb_words, embed_size))
# for word, i in word_index.items():
#     if i >= max_features: continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None: embedding_matrix[i] = embedding_vector


In [15]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. It is unpacked into exactly two values, so
        the empty default raises ``ValueError`` — a pair must be supplied.
    interval : int
        The score is computed on epochs whose 0-based index is a multiple of
        ``interval`` (1 = every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class (not Callback) in super(): super(Callback, self)
        # starts the method lookup *after* Callback and would therefore skip
        # Callback.__init__ altogether.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and print its ROC-AUC.

        ``logs`` is accepted for compatibility with the callback interface
        and is not used; it defaults to None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # `epoch` is 0-based; report it 1-based like the Keras progress log.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [4]:
# with open("../dumps/cnn_pooled_gru.pkl", "wb") as f:
#     pickle.dump(obj=(x_train, x_test, y_train, embedding_matrix), file=f)
    
# Load the cached inputs (train/test features, labels and embedding matrix)
# in the same tuple order used by the commented-out pickle.dump above.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only load dump files you created yourself.
with open("../dumps/cnn_pooled_gru.pkl", "rb") as f:
    x_train, x_test, y_train, embedding_matrix = pickle.load(file=f)

In [11]:
def get_model():
    """Build and compile the bidirectional-GRU model with pooled outputs.

    Reads the module-level names ``maxlen``, ``max_features``, ``embed_size``
    and ``embedding_matrix``. The network is: embedding (initial weights taken
    from ``embedding_matrix``) -> spatial dropout (0.2) -> bidirectional
    CuDNNGRU (80 units, full sequence returned) -> global average pooling and
    global max pooling, concatenated -> dense layer with 6 sigmoid outputs.

    Returns the model compiled with binary cross-entropy loss, the Adam
    optimizer and the accuracy metric.
    """
    token_ids = Input(shape=(maxlen, ))

    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(token_ids)
    embedded = SpatialDropout1D(0.2)(embedded)
    encoded = Bidirectional(CuDNNGRU(80, return_sequences=True))(embedded)

    # Summarise the sequence two ways (mean and max over time) and join them.
    pooled = concatenate([
        GlobalAveragePooling1D()(encoded),
        GlobalMaxPooling1D()(encoded),
    ])
    probabilities = Dense(6, activation="sigmoid")(pooled)

    network = Model(inputs=token_ids, outputs=probabilities)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [12]:
batch_size = 32  # read as a global inside oof() for model.fit and model.predict
epochs = 2  # read as a global inside oof() for model.fit
num_folds=10  # number of KFold splits, passed to oof()
seed = 42  # KFold random_state, passed to oof()
# X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
# RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

In [13]:
# np.random.seed(42)
# hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
#                  callbacks=[RocAuc], verbose=1)

In [16]:
# Train one model per fold; returns the per-fold ROC-AUC scores, the
# out-of-fold train predictions and the rank-averaged test predictions.
scores, train_oof, test_oof = oof(x_train, x_test, y_train, num_folds, seed)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.987979 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.988183 

Train on 143614 samples, validate on 15957 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.985894 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.985954 

Train on 143614 samples, validate on 15957 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.983282 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.983465 

Train on 143614 samples, validate on 15957 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.988254 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.987825 

Train on 143614 samples, validate on 15957 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.983365 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.984606 

Train on 143614 samples, validate on 15957 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.988377 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.988772 

Train on 143614 samples, validate on 15957 samples
Epoch 1/2

 ROC-AUC - epoch: 1 

In [17]:
# Display the per-fold scores next to the ROC-AUC computed over all
# out-of-fold train predictions at once.
scores, roc_auc_score(y_train, train_oof)

([0.9881830036311351,
  0.985953525873131,
  0.9834654983720464,
  0.9878251794894345,
  0.9846055111769164,
  0.9887723455318583,
  0.9883455754344013,
  0.9866443686735483,
  0.9877922685281124,
  0.9859181169999981],
 0.9859654488329598)

In [20]:
# Write both prediction sets to ../output/train/ and ../output/test/ as NN_GRU_pooling.csv.
save_oof(train_oof, test_oof, "NN_GRU_pooling", sample_submission)