### HAN's

Implementation based on paper here:
http://aclweb.org/anthology/N/N16/N16-1174.pdf
http://aclweb.org/anthology/D15-1167

Inspiration for code taken from here:
https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py

In [1]:
import sys, os, re, csv, codecs, gc, numpy as np, pandas as pd
import tensorflow as tf
#from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Permute, GRU, Conv1D, LSTM, Embedding, Dropout, Activation, CuDNNLSTM, CuDNNGRU, concatenate, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, BatchNormalization, SpatialDropout1D, Dot
from keras.optimizers import Adam, RMSprop
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras_tqdm import TQDMNotebookCallback
import keras.backend as K
from keras.callbacks import LearningRateScheduler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
from functools import reduce
from keras.layers import Layer, PReLU, SpatialDropout1D, TimeDistributed
from keras import initializers
from sklearn.model_selection import cross_val_predict

from nltk.tokenize import word_tokenize, wordpunct_tokenize, TweetTokenizer, MWETokenizer, ToktokTokenizer, sent_tokenize
from nltk.corpus import stopwords

import unicodedata
from collections import Counter
import itertools

np.random.seed(786)

from SentenceTokenizer import SentenceTokenizer
from ZeroMaskedLayer import ZeroMaskedLayer
from AttentionLayer import AttentionLayer, AttentionWrapper, AttentionWithContext, Attention
from RocAucEvaluation import RocAucEvaluation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
path = '../input/'
utility_path = '../utility/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE=f'{utility_path}glove.42B.300d.txt'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

In [4]:
def unicodeToAscii(series):
    return series.apply(lambda s: unicodedata.normalize('NFKC', str(s)))

STOP_WORDS = set(stopwords.words( 'english' ))

repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    #"m": "am",
    #"r": "are",
    #"u": "you",
    "haha": "ha",
    "hahaha": "ha",
}

#https://stackoverflow.com/questions/15175142/how-can-i-do-multiple-substitutions-using-regex-in-python
def one_xlat(match):
        return repl[match.group(0)]
    
rx = re.compile('|'.join(map(re.escape, repl)))
# Lowercase, trim, and remove non-letter characters
def normalizeString(series):
    series = unicodeToAscii(series)
    series = series.str.lower()
    series = series.str.replace(rx, one_xlat)
    series = series.str.replace(r"(\n){1,}", " ")
    #series = series.str.replace(r"\'", "")
    #series = series.str.replace(r"\-", "")
    series = series.str.replace(r"[^0-9a-zA-Z.,!?]+", " ")
    series = series.str.replace(r"[.]+",".")
    series = series.str.replace(r"[!]+","!")
    series = series.str.replace(r"[?]+",".")
    return series


In [5]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

print(train.shape, test.shape)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

#Get validation folds
train['target_str'] = reduce(lambda x,y: x+y, [train[col].astype(str) for col in list_classes])
train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110','000000')
cvlist1 = list(StratifiedKFold(n_splits=10, random_state=786).split(train, train['target_str'].astype('category')))
cvlist2 = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(train, train['target_str'].astype('category')))

(159571, 8) (153164, 2)




In [6]:
for df in train, test:
    df["comment_text"] = normalizeString(df["comment_text"])

In [7]:
MAX_FEATURES = 200000
MAX_SENTENCE_LEN = 50
MAX_SENTENCES = 10

def custome_tokenizer(text):
    return [TweetTokenizer().tokenize(sent) for sent in  sent_tokenize(text)]

tok = SentenceTokenizer(max_features=MAX_FEATURES, max_sentence_len=MAX_SENTENCE_LEN, max_sentences=MAX_SENTENCES, tokenizer=custome_tokenizer)

In [8]:
#c = Counter()
#s = Counter()
#def cnts(x):
#    toks = custome_tokenizer(x)
#    s.update([len(toks)])
#    c.update([len(sent) for sent in toks])
#train.comment_text.apply(lambda x: cnts(x))

In [9]:
%%time
X_train = tok.fit_transform(train.comment_text)
X_test = tok.transform(test.comment_text)

CPU times: user 2min 12s, sys: 586 ms, total: 2min 12s
Wall time: 2min 12s


In [10]:
print(X_train.shape, X_test.shape)

(159571, 10, 50) (153164, 10, 50)


In [11]:
EMBED_SIZE = 300
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

def initialize_embeddings(filename, tokenizer):
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(filename))

    word_index = tokenizer.vocab_idx
    nb_words = min(MAX_FEATURES+1, len(word_index)+1)
    embedding_matrix = np.zeros((nb_words, EMBED_SIZE))
    for word, i in word_index.items():
        if i > MAX_FEATURES: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [12]:
embedding_matrix = initialize_embeddings(EMBEDDING_FILE, tok)
print(embedding_matrix.shape)
print(np.mean(embedding_matrix), np.std(embedding_matrix))

(200001, 300)
0.002777580891439761 0.22665129958961627


In [25]:
def rnn_wrapper(seq_len, enc_dim, bidirectional=False, attention=True, rnn_cell='gru'):
    #Select cell to use
    if rnn_cell == 'gru':
        rnn_enc = CuDNNGRU(enc_dim, return_sequences=True)
    elif rnn_cell == 'lstm':
        rnn_enc = CuDNNLSTM(enc_dim, return_sequences=True)
        
    #Apply bidirectional is flag in on
    if bidirectional:
        enc = Bidirectional(rnn_enc)
    else:
        enc = rnn_enc
        
    if attention:
        enc = Attention()(enc)
    
    return enc
        

    

In [26]:
from sklearn.base import BaseEstimator, ClassifierMixin
class LSTMGRNN(BaseEstimator, ClassifierMixin):
    def __init__(self, rnn_word_dim=150, rnn_sent_dim=150, dense_dim=256, rnn_type="gru", batch_size=128, epochs=2, bidirectional=False, 
                 pool_type='all', initial_weights=None, optimizer='adam' ,verbose=1, out_dim=6, callbacks=None,
                spatial_drop=0.0, dropout=0.0, mask_zero=True, 
                gru_kernel_regularization = 0.0001,
                gru_recurrent_regularization = 0.0001,
                gru_bias_regularization = 0.0001,
                embeddings_regularization = 0.0,
                 word_cell='gru',
                 word_attention=True,
                 sent_cell='gru',
                 sent_attention=True,
                ):
        
        self.rnn_word_dim = rnn_word_dim
        self.rnn_sent_dim = rnn_sent_dim
        self.word_cell = word_cell
        self.word_attention = word_attention
        self.sent_cell = sent_cell
        self.sent_attention = sent_attention
        self.dense_dim = dense_dim
        self.rnn_type = rnn_type
        self.batch_size = batch_size
        self.epochs= epochs
        self.bidirectional = bidirectional
        self.pool_type = pool_type
        self.initial_weights = initial_weights
        self.verbose = verbose
        self.callbacks = callbacks
        self.optimizer = optimizer
        self.out_dim = out_dim
        self.spatial_drop = spatial_drop
        self.dropout = dropout
        self.mask_zero = mask_zero
        self.gru_kernel_regularization = gru_kernel_regularization
        self.gru_recurrent_regularization = gru_recurrent_regularization
        self.gru_bias_regularization = gru_bias_regularization
        self.embeddings_regularization = embeddings_regularization
        
    def _build_model(self):
        inp = Input(shape=(MAX_SENTENCES, MAX_SENTENCE_LEN))
        
        sent_input = Input(shape=(MAX_SENTENCE_LEN,))
        word_emb = Embedding(MAX_FEATURES+1, 
                        EMBED_SIZE,
                        weights=[self.initial_weights],
                        mask_zero=self.mask_zero,
                        #embeddings_regularizer=regularizers.l2(self.embeddings_regularization),
                        trainable=True)(sent_input)
    
        if self.mask_zero:
            word_emb = ZeroMaskedLayer()(word_emb)
        word_emb = SpatialDropout1D(self.spatial_drop)(word_emb)
        
        l_rnn = rnn_wrapper(MAX_SENTENCE_LEN, self.rnn_word_dim, bidirectional=self.bidirectional, 
                           attention=self.word_attention, rnn_cell=self.word_cell)(word_emb)
        l_rnn = GlobalAveragePooling1D()(l_rnn)
        sentEncoder = Model(sent_input, l_rnn)
            
        
        emb = TimeDistributed(sentEncoder)(inp)
        print(emb.shape)
        if self.bidirectional:
            enc = Bidirectional(CuDNNGRU(int(self.rnn_sent_dim), return_sequences=True, return_state=True, stateful=True,
                                         ))(emb)
            x = enc[0]
            state = enc[1]
        else:
            x, state = GRU(int(self.rnn_sent_dim), return_sequences=True, return_state=True,
                            kernel_regularizer=regularizers.l2(self.gru_kernel_regularization),
                            recurrent_regularizer=regularizers.l2(self.gru_recurrent_regularization),
                            bias_regularizer=regularizers.l2(self.gru_bias_regularization)
                               )(emb)
            #x = SpatialDropout1D(0.5)(x)
        
        if self.pool_type == 'avg':
            x = GlobalAveragePooling1D()(x)
            x = concatenate([x, state])
            
        elif self.pool_type == 'max':
            x = GlobalMaxPool1D()(x)
            x = concatenate([x, state])
            
        elif self.pool_type == 'attn':
            x = AttentionLayer(MAX_SENTENCES)(x)
            x = concatenate([x, state])
            
        elif self.pool_type == 'all':
            x1 = GlobalAveragePooling1D()(x)
            x2 = GlobalMaxPool1D()(x)
            x3 = AttentionLayer(MAX_SENTENCES)(x)
            x = concatenate([x2, x3, state])
    
        x = Dropout(self.dropout)(x)
        x = Dense(self.dense_dim)(x)
        x = PReLU()(x)
        
        #x = Dense(self.dense_dim)(x)
        #x = PReLU()(x)

        out = Dense(self.out_dim, activation="sigmoid")(x)
        if self.optimizer == 'adam':
            opt = Adam(lr=0.001, decay=0.0, clipnorm=1.0)
        elif self.optimizer == 'rmsprop':
            opt = RMSprop(clipnorm=1.0)
        model = Model(inputs=inp, outputs=out)
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model = self._build_model()
        
        if self.callbacks:
            self.model.fit(X, y, batch_size=self.batch_size, epochs=self.epochs,
                       verbose=self.verbose,
                       callbacks=self.callbacks,
                       shuffle=True)
        else:
            self.model.fit(X, y, batch_size=self.batch_size, epochs=self.epochs,
                       verbose=self.verbose,
                       shuffle=True)
        return self
    
    def predict(self, X, y=None):
        if self.model:
            y_hat = self.model.predict(X, batch_size=1024)
        else:
            raise ValueError("Model not fit yet")
        return y_hat

In [27]:
def lr_decay(epoch):
    if epoch == 0:
        return 0.0015
    if epoch == 1:
        return 0.0001
    if epoch == 2:
        return 0.001
    if epoch == 3:
        return 0.00001


def shuffle_crossvalidator(model, cvlist, X, y, lr_decay):
    y_trues = []
    y_preds = []
    scores = []
    LRDecay = LearningRateScheduler(lr_decay)

    for tr_index, val_index in cvlist2:
        X_tr, y_tr = X[tr_index, :], y[tr_index, :]
        X_val, y_val = X[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

        model.set_params(**{'callbacks':[RocAuc, LRDecay]})
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
        print("ROC AUC for this fold is ", score)
        y_trues.append(y_val)
        y_preds.append(y_pred)
        K.clear_session()
        gc.collect()
        #break
    y_trues = np.concatenate(y_trues)
    y_preds = np.concatenate(y_preds)
    score = roc_auc_score(y_trues, y_preds)
    print("Overall score on 10 fold CV is {}".format(score))
    
    return y_preds, y_trues, scores

def outoffold_crossvalidator(model_params, cvlist, X, y, lr_decay):
    y_preds = np.zeros(y.shape)
    LRDecay = LearningRateScheduler(lr_decay)

    for tr_index, val_index in cvlist2:
        X_tr, y_tr = X[tr_index, :], y[tr_index, :]
        X_val, y_val = X[val_index, :], y[val_index, :]
        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
        
        model.set_params(**{'callbacks':[RocAuc, LRDecay]})
        model.fit(X_tr, y_tr)

        y_pred = model.predict(X_val)
        print("ROC AUC for this fold is ", roc_auc_score(y_val, y_pred))
        y_preds[val_idx] = y_pred
        K.clear_session()
        break
    score = roc_auc_score(y, y_preds)
    print("Overall score on 10 fold CV is {}".format(score))
    
    return y_preds, y_trues, score


In [28]:
import tensorflow as tf
config = tf.ConfigProto(
        device_count = {'GPU': 1}
    )
sess = tf.Session(config=config)

model = LSTMGRNN(rnn_word_dim=300, rnn_sent_dim=100, dense_dim=900, initial_weights=embedding_matrix, bidirectional=False,
                    batch_size=100, epochs=2, optimizer='adam', mask_zero=False, pool_type='all')

y_preds, y_trues, _ = shuffle_crossvalidator(model, cvlist2, X_train, y, lr_decay)

ValueError: Layer attention_1 was called with an input that isn't a symbolic tensor. Received type: <class 'keras.layers.cudnn_recurrent.CuDNNGRU'>. Full input: [<keras.layers.cudnn_recurrent.CuDNNGRU object at 0x7f45bb3420f0>]. All inputs to the layer should be tensors.

In [16]:
ToktokTokenizer().tokenize("I won't do this!!")

['I', 'won', "'", 't', 'do', 'this', '!', '!']