# Simple GRU network with pretrained vectors for initialization

In [1]:
import sys, os, re, csv, codecs, gc, numpy as np, pandas as pd
import tensorflow as tf
#from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Permute, GRU, Conv1D, LSTM, Embedding, Dropout, Activation, CuDNNLSTM, CuDNNGRU, concatenate, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, BatchNormalization, SpatialDropout1D, Dot
from keras.optimizers import Adam, RMSprop
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras_tqdm import TQDMNotebookCallback
import keras.backend as K
from keras.callbacks import LearningRateScheduler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from functools import reduce
from keras.layers import Layer, PReLU, SpatialDropout1D
from keras import initializers
from sklearn.model_selection import cross_val_predict

from nltk.tokenize import word_tokenize, wordpunct_tokenize, TweetTokenizer, MWETokenizer, ToktokTokenizer
from nltk.corpus import stopwords

import unicodedata
from collections import Counter
import itertools

np.random.seed(786)

from Tokenizer import Tokenizer
from ZeroMaskedLayer import ZeroMaskedLayer
from AttentionLayer import AttentionLayer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path = '../input/'
utility_path = '../utility/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE=f'{utility_path}glove.42B.300d.txt'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

In [3]:
word_tokenize("I won't do this check!!!")

['I', 'wo', "n't", 'do', 'this', 'check', '!', '!', '!']

In [4]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [5]:
def unicodeToAscii(series):
    return series.apply(lambda s: unicodedata.normalize('NFKC', str(s)))


def multiple_replace(text, adict):
    rx = re.compile('|'.join(map(re.escape, adict)))

    def one_xlat(match):
        return adict[match.group(0)]

    return rx.sub(one_xlat, text)

STOP_WORDS = set(stopwords.words( 'english' ))
# Lowercase, trim, and remove non-letter characters
def normalizeString(series):
    series = unicodeToAscii(series)
    series = series.str.lower()
    series = series.str.replace(r"(\n){1,}", " ")
    #series = series.str.replace(r"\'", "")
    #series = series.str.replace(r"\-", "")
    #series = series.str.replace(r"[^0-9a-zA-Z.,!?]+", " ")

    return series


In [6]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

print(train.shape, test.shape)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

#Get validation folds
train['target_str'] = reduce(lambda x,y: x+y, [train[col].astype(str) for col in list_classes])
train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110','000000')
cvlist1 = list(StratifiedKFold(n_splits=10, random_state=786).split(train, train['target_str'].astype('category')))
cvlist2 = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(train, train['target_str'].astype('category')))

(159571, 8) (153164, 2)




In [7]:
for df in train, test:
    df["comment_text"] = normalizeString(df["comment_text"])

In [8]:
train.comment_text.sample(1).values[0]

'i will test alternate summaries for greater clarity.'

In [9]:
#pd.concat([train["comment_text"].astype(str), test["comment_text"].astype(str)]).reset_index(drop=True)[:len(train), :]

In [10]:
MAX_FEATURES = 250000
MAX_LEN = 250

tok = Tokenizer(max_features=MAX_FEATURES, max_len=MAX_LEN, tokenizer=word_tokenize)
X = tok.fit_transform(pd.concat([train["comment_text"].astype(str), test["comment_text"].astype(str)]))
X_train = X[:len(train), :]
X_test = X[len(train):, :]

print(X_train.shape, X_test.shape)

(159571, 250) (153164, 250)


In [11]:
#del train, test

In [19]:
EMBED_SIZE = 300
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

def initialize_embeddings(filename, tokenizer):
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(filename))

    word_index = tokenizer.vocab_idx
    nb_words = min(MAX_FEATURES+1, len(word_index)+1)
    embedding_matrix = np.zeros((nb_words, EMBED_SIZE))
    for word, i in word_index.items():
        if i > MAX_FEATURES: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
embedding_matrix = initialize_embeddings(EMBEDDING_FILE, tok)
print(embedding_matrix.shape)
print(np.mean(embedding_matrix), np.std(embedding_matrix))

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
class GRUClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, gru_dim=150, dense_dim=256, batch_size=128, epochs=2, bidirectional=False, 
                 pool_type='all', initial_weights=None, optimizer='adam' ,verbose=1, out_dim=6, callbacks=None,
                spatial_drop=0.0, dropout=0.0, mask_zero=True, 
                gru_kernel_regularization = 0.0,
                gru_recurrent_regularization = 0.0,
                gru_bias_regularization = 0.0,
                embeddings_regularization = 0.0,
                ):
        
        self.gru_dim = gru_dim
        self.dense_dim = dense_dim
        self.batch_size = batch_size
        self.epochs= epochs
        self.bidirectional = bidirectional
        self.pool_type = pool_type
        self.initial_weights = initial_weights
        self.verbose = verbose
        self.callbacks = callbacks
        self.optimizer = optimizer
        self.out_dim = out_dim
        self.spatial_drop = spatial_drop
        self.dropout = dropout
        self.mask_zero = mask_zero
        self.gru_kernel_regularization = gru_kernel_regularization
        self.gru_recurrent_regularization = gru_recurrent_regularization
        self.gru_bias_regularization = gru_bias_regularization
        self.embeddings_regularization = embeddings_regularization
        
    def _build_model(self):
        inp = Input(shape=(MAX_LEN,))
        emb = Embedding(MAX_FEATURES+1, 
                        EMBED_SIZE,
                        weights=[self.initial_weights],
                        mask_zero=self.mask_zero,
                        #embeddings_regularizer=regularizers.l2(self.embeddings_regularization),
                        trainable=True)(inp)

        if self.mask_zero:
            emb = ZeroMaskedLayer()(emb)
            
        emb = SpatialDropout1D(self.spatial_drop)(emb)
        if self.bidirectional:
            enc = Bidirectional(GRU(int(self.gru_dim), return_sequences=True, return_state=True, stateful=True,
                                         ))(emb)
            x = enc[0]
            state = enc[1]
        else:
            x, state = GRU(int(self.gru_dim), return_sequences=True, return_state=True,
                            kernel_regularizer=regularizers.l2(self.gru_kernel_regularization),
                            recurrent_regularizer=regularizers.l2(self.gru_recurrent_regularization),
                            bias_regularizer=regularizers.l2(self.gru_bias_regularization)
                               )(emb)
            #x = SpatialDropout1D(0.5)(x)
        
        if self.pool_type == 'avg':
            x = GlobalAveragePooling1D()(x)
            x = concatenate([x, state])
            
        elif self.pool_type == 'max':
            x = GlobalMaxPool1D()(x)
            x = concatenate([x, state])
            
        elif self.pool_type == 'attn':
            x = AttentionLayer(MAX_LEN)(x)
            x = concatenate([x, state])
            
        elif self.pool_type == 'all':
            #x1 = GlobalAveragePooling1D()(x)
            x2 = GlobalMaxPool1D()(x)
            x3 = AttentionLayer(MAX_LEN)(x)
            x = concatenate([x2, x3, state])
    
        x = Dropout(self.dropout)(x)
        x = Dense(self.dense_dim)(x)
        x = PReLU()(x)
        
        #x = Dense(self.dense_dim)(x)
        #x = PReLU()(x)

        out = Dense(self.out_dim, activation="sigmoid")(x)
        if self.optimizer == 'adam':
            opt = Adam(lr=0.001, decay=0.0, clipnorm=1.0)
        elif self.optimizer == 'rmsprop':
            opt = RMSprop(clipnorm=1.0)
        model = Model(inputs=inp, outputs=out)
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model = self._build_model()
        
        if self.callbacks:
            self.model.fit(X, y, batch_size=self.batch_size, epochs=self.epochs,
                       verbose=self.verbose,
                       callbacks=self.callbacks,
                       shuffle=True)
        else:
            self.model.fit(X, y, batch_size=self.batch_size, epochs=self.epochs,
                       verbose=self.verbose,
                       shuffle=True)
        return self
    
    def predict(self, X, y=None):
        if self.model:
            y_hat = self.model.predict(X, batch_size=1024)
        else:
            raise ValueError("Model not fit yet")
        return y_hat

In [None]:
def lr_decay(epoch):
    if epoch == 0:
        return 0.0016
    if epoch == 1:
        return 0.00024
    if epoch == 2:
        return 0.001
    if epoch == 3:
        return 0.00001


def shuffle_crossvalidator(model, cvlist, X, y, lr_decay):
    y_trues = []
    y_preds = []
    scores = []
    LRDecay = LearningRateScheduler(lr_decay)

    for tr_index, val_index in cvlist:
        X_tr, y_tr = X[tr_index, :], y[tr_index, :]
        X_val, y_val = X[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

        model.set_params(**{'callbacks':[RocAuc, LRDecay]})
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
        print("ROC AUC for this fold is ", score)
        y_trues.append(y_val)
        y_preds.append(y_pred)
        K.clear_session()
        gc.collect()
        #break
    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    score = roc_auc_score(y_trues, y_preds)
    print("Overall score on 10 fold CV is {}".format(score))
    
    return y_preds, y_trues, scores

def outoffold_crossvalidator(model_params, cvlist, X, y, lr_decay):
    y_preds = np.zeros(y.shape)
    LRDecay = LearningRateScheduler(lr_decay)

    for tr_index, val_index in cvlist:
        X_tr, y_tr = X[tr_index, :], y[tr_index, :]
        X_val, y_val = X[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
        
        model.set_params(**{'callbacks':[RocAuc, LRDecay]})
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        print("ROC AUC for this fold is ", roc_auc_score(y_val, y_pred))
        y_preds[val_idx] = y_pred
        K.clear_session()
        break
    score = roc_auc_score(y, y_preds)
    print("Overall score on 10 fold CV is {}".format(score))
    
    return y_preds, y_trues, score


In [None]:
import tensorflow as tf
K.clear_session()
config = tf.ConfigProto(
        device_count = {'GPU': 0}
    )
sess = tf.Session(config=config)
model = GRUClassifier(gru_dim=300, dense_dim=900, initial_weights=embedding_matrix, bidirectional=False,
                    batch_size=64, epochs=2, optimizer='adam', pool_type='all', dropout=0.2, spatial_drop=0.3, mask_zero=False)

y_preds, y_trues, _ = shuffle_crossvalidator(model, cvlist2, X_train, y, lr_decay)

In [None]:
import gc 
gc.collect()
K.clear_session()

In [None]:
#Pick top 10 parameter settings, Bag models for those settings
#Try linear blending on those settings
NUM_BAGS = 10
cvlist3 = list(StratifiedShuffleSplit(n_splits=NUM_BAGS, test_size=0.05, random_state=786).split(y, y[:,2]))


def shuffle_train_predict(model, cvlist, X, y, X_test, lr_decay):
    y_trues = []
    y_preds = []
    y_test_preds = []
    scores = []
    LRDecay = LearningRateScheduler(lr_decay)

    for tr_index, val_index in cvlist:
        X_tr, y_tr = X[tr_index, :], y[tr_index, :]
        X_val, y_val = X[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

        model.set_params(**{'callbacks':[RocAuc, LRDecay]})
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
        print("ROC AUC for this fold is ", score)
        y_trues.append(y_val)
        y_preds.append(y_pred)
        y_test_preds.append(y_test_pred)
        K.clear_session()
        gc.collect()
        #break
    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    y_test_preds = np.mean(y_test_preds, axis=0)
    print("Shape of test _preds is ", y_test_preds.shape)
    print("Means of val and test preds are {} and {}".format(np.mean(y_preds, axis=1), np.mean(y_test_preds, axis=1)))
    score = roc_auc_score(y_trues, y_preds)
    print("Overall score on 10 fold CV is {}".format(score))
    
    return y_preds, y_trues, y_test_preds

def train_predict(parameter_space):
    
    def lr_decay(epoch):
        if epoch == 0:
            return parameter_space['lr1'][0]
        if epoch == 1:
            return parameter_space['lr2'][0]
    
    model = GRUClassifier(initial_weights=embedding_matrix, bidirectional=[True, False][parameter_space['bidirectional'][0]],
                          gru_dim = int(parameter_space['gru_dim'][0]),
                          dense_dim = int(parameter_space['dense_dim'][0]),
                          mask_zero = [True, False][parameter_space['mask_zero'][0]],
                          pool_type = ['avg', 'max', 'attn', 'all'][parameter_space['pool_type'][0]],
                          batch_size= int(parameter_space['batch_size'][0]), 
                          epochs=2, 
                          optimizer=["adam", "rmsprop"][parameter_space['optimizer'][0]],
                          dropout=parameter_space['dropout'][0],
                          spatial_drop=parameter_space['spatial_drop'][0],
                          gru_kernel_regularization = parameter_space["gru_kernel_reg"][0],
                          gru_recurrent_regularization = parameter_space["gru_recc_reg"][0],
                          gru_bias_regularization = parameter_space["gru_bias_reg"][0],
                          #embeddings_regularization = parameter_space["embeddings_reg"],
                          )

    y_preds, y_trues, y_test_preds = shuffle_train_predict(model, cvlist3, X_train, y, X_test, lr_decay)    
    return y_preds, y_trues, y_test_preds

#####
y_preds_all = []
y_trues_all = []
y_test_preds_all = []
for params in parameter_list:
    y_preds, y_trues, y_test_preds = train_predict(params)
    y_preds_all.append(y_preds)
    y_trues_all.append(y_trues)
    y_test_preds_all.append(y_test_preds)
    


Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.989692 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.990572 

ROC AUC for this fold is  0.9905723708910106
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986022 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988035 

ROC AUC for this fold is  0.9880350331273641
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.988386 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.989426 

ROC AUC for this fold is  0.9894261798411909
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986620 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988654 

ROC AUC for this fold is  0.9886540749170946
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.983526 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.986736 

ROC AUC for this fold is  0.9867360504779404
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.987915 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.989794 

ROC AUC for this fold is  0.9897943899894113
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.988948 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.990137 

ROC AUC for this fold is  0.99

ROC AUC for this fold is  0.99035323100561
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.988255 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.989766 

ROC AUC for this fold is  0.9897663028885799
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.985113 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.987764 

ROC AUC for this fold is  0.9877638950732696
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986432 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988305 

ROC AUC for this fold is  0.9883046565615933
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986178 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.986739 

ROC AUC for this fold is  0.9867389337672714
Shape of test _preds is  (153164, 6)
Means of val and test preds are [1.1591929e-05 1.1925943e-04 1.2761238e-04 ... 5.8704225e-04 3.8671819e-06
 1.6288273e-03] and [6.5117562e-01 2.7441100e-05 2.7715214e-04 ... 4.9246199e-05 6.6319037e-05
 4.0668675e-01]
Overall score on 10 fold CV is 0.9873042146327488
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.989752 

Epoch 2/2
 ROC-AUC - e

 ROC-AUC - epoch: 1 - score: 0.985382 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.987807 

ROC AUC for this fold is  0.9878066676398046
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.988377 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.989610 

ROC AUC for this fold is  0.9896096401234588
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.982843 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.988082 

ROC AUC for this fold is  0.98808178544326
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.983606 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.985411 

ROC AUC for this fold is  0.9854111148666624
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.987102 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.989273 

ROC AUC for this fold is  0.9892729856138169
Epoch 1/2

In [47]:
#Check corelation between different predictions
np.corrcoef(y_preds_all, axis=0)

0

In [None]:
#Try different stacking approaches
from scipy.stats import gmean, hmean

preds_mean = gmean(y_preds_all, axis=0)
print(roc_auc_score(y_trues_all[0], preds_mean))
test_preds_mean = gmean(y_test_preds_all, axis=0)


In [None]:
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[label_cols] = test_preds_mean
sample_submission.to_csv('../input/gru_glove_10bags_submission.csv', index=False)