In [1]:
import gc
import re
import numpy as np
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from keras.layers import Input, Embedding, SpatialDropout1D, CuDNNGRU, CuDNNLSTM, Bidirectional, \
    Dense, Dropout, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras.preprocessing.text import Tokenizer as KerasTokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from attention import Attention
from f1_early_stopping import F1_EarlyStopping, find_threshold
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from cyclic_lr import CyclicLR

Using TensorFlow backend.


In [2]:
# Load the training and test tables from the competition input directory.
# Later cells read the 'question_text' column of both frames and the
# 'target' column of `train`.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
class Tokenizer(KerasTokenizer):
    """Keras tokenizer that cleans and pre-tokenizes every text first.

    Each text is stripped of non-ASCII and ASCII control characters, split
    into tokens by ``tokenizer`` (``nltk.word_tokenize`` by default) and
    re-joined with single spaces before it is handed to the parent Keras
    ``Tokenizer``.

    Parameters
    ----------
    tokenizer : callable
        Function mapping a string to an iterable of token strings.
    *args, **kwargs
        Forwarded unchanged to ``keras.preprocessing.text.Tokenizer``.
    """

    # Anything outside the 7-bit ASCII range; replaced by a space.
    _NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7f]')
    # ASCII control characters 0x01-0x1f; removed from the text.
    _SPECIAL_ASCII_CHARACTERS = re.compile(r'[\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f]')
    # Tab, LF, VT, FF and CR act as word separators, so they must become a
    # space rather than be deleted (deleting them fuses the adjacent words).
    _WHITESPACE_CONTROL_CHARACTERS = re.compile(r'[\x09-\x0d]')

    def __init__(self, tokenizer=word_tokenize, *args, **kwargs):
        super(Tokenizer, self).__init__(*args, **kwargs)
        self.tokenizer = tokenizer

    def _preprocess(self, text):
        """Return ``text`` cleaned, tokenized and re-joined with spaces."""
        text = Tokenizer._NON_ASCII_PATTERN.sub(' ', text)
        # Must run before the removal step below, whose pattern also matches
        # these characters.
        text = Tokenizer._WHITESPACE_CONTROL_CHARACTERS.sub(' ', text)
        text = Tokenizer._SPECIAL_ASCII_CHARACTERS.sub('', text)
        return ' '.join(self.tokenizer(text))

    def fit_on_texts(self, texts):
        """Fit the vocabulary on the preprocessed version of ``texts``."""
        return super(Tokenizer, self).fit_on_texts([
            self._preprocess(text)
            for text in texts
        ])

    def texts_to_sequences(self, texts):
        """Convert ``texts`` to index sequences after preprocessing them."""
        return super(Tokenizer, self).texts_to_sequences([
            self._preprocess(text)
            for text in texts
        ])

In [4]:
# Labels as a plain array so they can be indexed with the fold index arrays
# produced by StratifiedKFold in the training cell.
train_y = train['target'].values

In [5]:
# Vocabulary size limit handed to the tokenizer as `num_words`.
MAX_FEATURES = 95000
# Use the constant instead of repeating the literal, so changing
# MAX_FEATURES actually changes the tokenizer.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# The vocabulary is fitted on the training questions only; the test
# questions are encoded with that same vocabulary.
tokenizer.fit_on_texts(train['question_text'])
train_X = tokenizer.texts_to_sequences(train['question_text'])
test_X = tokenizer.texts_to_sequences(test['question_text'])

In [6]:
# 99.5% quantile of the tokenized question lengths (guides the MAXLEN choice).
pd.Series([len(sequence) for sequence in train_X]).quantile(0.995)

44.0

In [7]:
# Fixed sequence length fed to the network. The 99.5% length quantile
# computed in the previous cell is 44 tokens, so 70 leaves generous headroom.
MAXLEN = 70
# Bring every sequence to exactly MAXLEN entries (pad_sequences defaults
# apply). Note: this rebinds train_X / test_X from lists of lists to arrays.
train_X = pad_sequences(train_X, MAXLEN)
test_X = pad_sequences(test_X, MAXLEN)

In [8]:
def read_glove(fname, encoding='utf-8', errors='ignore'):
    """Read a GloVe-style text embedding file.

    Every non-blank line is expected to be ``word v1 v2 ... vN`` with fields
    separated by a single space.

    Parameters
    ----------
    fname : str
        Path to the embedding file.
    encoding : str
        Text encoding used to open the file.
    errors : str
        Decoding error policy passed to ``open``.

    Returns
    -------
    word2index : dict
        Maps each word to its row in the returned matrix. If a word occurs
        more than once, the last occurrence wins.
    vectors : numpy.ndarray
        ``float32`` matrix with one row per parsed line.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    word2index = {}
    vectors = []
    with open(fname, 'r', encoding=encoding, errors=errors) as src:
        for line in src:
            # Lines yielded by the file iterator keep their newline, so a
            # blank line is "\n" and never falsy; strip before testing.
            line = line.rstrip('\r\n')
            if not line:
                continue
            # Split on a single space only: tokens may contain other
            # whitespace characters.
            word, *values = line.split(' ')
            # Index by the number of stored rows so indices stay aligned with
            # the matrix even when blank lines were skipped.
            word2index[word] = len(vectors)
            vectors.append(np.array(values).astype(np.float32))
    return word2index, np.array(vectors)


def lowercase_word_index(word2index):
    """Return a copy of ``word2index`` extended with lowercased keys.

    Every word is additionally registered under its lowercase form. Original
    entries take precedence over lowercased ones; when several words share a
    lowercase form, the one iterated last supplies the index.
    """
    merged = {}
    for word, index in word2index.items():
        merged[word.lower()] = index
    # Original spellings overwrite any lowercased alias with the same key.
    merged.update(word2index)
    return merged


def vocab_oov_split(word2index, tokenizer):
    """Split the tokenizer's words by whether ``word2index`` contains them.

    Returns a pair of sorted lists ``(covered, missing)``: the words of
    ``tokenizer.word_index`` that are keys of ``word2index`` and those that
    are not.
    """
    tokenizer_words = set(tokenizer.word_index)
    covered = sorted(word for word in tokenizer_words if word in word2index)
    missing = sorted(word for word in tokenizer_words if word not in word2index)
    return covered, missing


def vocabulary_embedding(word2index, embeddings, tokenizer, vocabulary):
    """Build an embedding matrix indexed by the tokenizer's word ids.

    The result has ``len(tokenizer.word_index) + 1`` rows and as many columns
    as ``embeddings``. For every word in ``vocabulary`` the row at the
    tokenizer's id receives the vector stored at ``word2index[word]``; all
    other rows stay zero.
    """
    row_count = len(tokenizer.word_index) + 1
    embeddings_final = np.zeros([row_count, embeddings.shape[1]])
    # (destination row, source row) for every requested word.
    row_pairs = [
        (tokenizer.word_index[word], word2index[word])
        for word in vocabulary
    ]
    if row_pairs:
        target_rows, source_rows = zip(*row_pairs)
        # One fancy-indexed copy instead of a Python-level loop.
        embeddings_final[list(target_rows)] = embeddings[list(source_rows), :]
    return embeddings_final

In [9]:
# Paragram vectors: read the file, add lowercase aliases for its words, split
# the tokenizer's words into covered / out-of-vocabulary, and build a matrix
# whose rows are indexed by the tokenizer's word ids.
paragram_word2index, paragram_embeddings = read_glove('../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt')
paragram_word2index_lowercased = lowercase_word_index(paragram_word2index)
paragram_vocabulary, paragram_oov = vocab_oov_split(paragram_word2index_lowercased, tokenizer)
paragram_embeddings_cutten = vocabulary_embedding(paragram_word2index_lowercased, paragram_embeddings, tokenizer, paragram_vocabulary)

In [10]:
# GloVe vectors: same four steps as for the paragram vectors — read, add
# lowercase aliases, split covered / out-of-vocabulary words, and build the
# matrix indexed by the tokenizer's word ids.
glove_word2index, glove_embeddings = read_glove('../input/embeddings/glove.840B.300d/glove.840B.300d.txt')
glove_word2index_lowercased = lowercase_word_index(glove_word2index)
glove_vocabulary, glove_oov = vocab_oov_split(glove_word2index_lowercased, tokenizer)
glove_embeddings_cutten = vocabulary_embedding(glove_word2index_lowercased, glove_embeddings, tokenizer, glove_vocabulary)

In [11]:
# The full embedding matrices are no longer needed once the tokenizer-indexed
# copies exist; drop the references and run a garbage-collection pass.
del paragram_embeddings, glove_embeddings
gc.collect()

0

In [12]:
# Element-wise mean of the two tokenizer-indexed embedding matrices.
# NOTE(review): rows are zero for words a source does not cover, so a word
# found in only one source ends up with half of its vector — confirm intended.
embeddings = (paragram_embeddings_cutten + glove_embeddings_cutten) / 2.0

In [13]:
def get_model(embeddings):
    """Build and compile the binary question classifier.

    Architecture: frozen embedding layer initialised from ``embeddings`` ->
    spatial dropout -> bidirectional LSTM -> bidirectional GRU. An
    ``Attention`` summary of the LSTM output is concatenated with an
    ``Attention`` summary, global average pooling and global max pooling of
    the GRU output, then passed through a small dense head ending in one
    sigmoid unit.

    Parameters
    ----------
    embeddings : array
        Matrix used both for the embedding layer's dimensions and as its
        (non-trainable) weights.

    Returns
    -------
    A Keras ``Model`` compiled with binary cross-entropy and Adam.
    """
    vocab_size, embedding_dim = embeddings.shape[:2]

    token_ids = Input(shape=(MAXLEN,))
    embedded = Embedding(vocab_size, embedding_dim,
                         weights=[embeddings], trainable=False)(token_ids)
    embedded = SpatialDropout1D(0.05)(embedded)

    # First recurrent layer; its attention summary skips past the GRU.
    lstm_states = Bidirectional(CuDNNLSTM(40, return_sequences=True))(embedded)
    lstm_attention = Attention(MAXLEN)(lstm_states)

    # Second recurrent layer stacked on the LSTM states.
    gru_states = Bidirectional(CuDNNGRU(40, return_sequences=True))(lstm_states)
    gru_attention = Attention(MAXLEN)(gru_states)

    gru_average = GlobalAveragePooling1D()(gru_states)
    gru_maximum = GlobalMaxPooling1D()(gru_states)

    features = concatenate([lstm_attention, gru_attention, gru_average, gru_maximum])

    hidden = Dense(16, activation="relu")(features)
    hidden = Dropout(0.2)(hidden)
    probability = Dense(1, activation="sigmoid")(hidden)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return classifier

In [14]:
# 4-fold stratified cross-validation: train one model per fold, keep every
# fitted model and the best validation F1 reported by its early-stopping
# callback.
RANDOM_STATE = 42
# Seed NumPy as well as TensorFlow: standalone Keras takes the seeds of its
# initializers / dropout layers and its batch shuffling from NumPy's global
# RNG, so seeding TensorFlow alone does not make runs repeatable.
np.random.seed(RANDOM_STATE)
tf.set_random_seed(RANDOM_STATE)

models = []
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)
scores = []
for idx_train, idx_val in kfold.split(train_X, train_y):
    # Slice each fold once and share the arrays between the callback and
    # fit(); indexing twice would materialise two copies of every fold.
    fold_train_X, fold_train_y = train_X[idx_train], train_y[idx_train]
    fold_val_X, fold_val_y = train_X[idx_val], train_y[idx_val]

    model = get_model(embeddings)
    f1_callback = F1_EarlyStopping(fold_train_X, fold_train_y,
                                   fold_val_X, fold_val_y,
                                   batch_size=1024)
    model.fit(fold_train_X, fold_train_y,
              validation_data=(fold_val_X, fold_val_y),
              epochs=8,
              verbose=True,
              batch_size=256,
              callbacks=[
                  CyclicLR(base_lr=0.001, max_lr=0.002,
                           step_size=300., mode='exp_range',
                           gamma=0.99994),
                  f1_callback,
              ])
    scores.append(f1_callback.best_score)
    models.append(model)
    print('*' * 80)

scores = np.array(scores)
print('Mean score: {0}'.format(np.mean(scores)))
print('Score std: {0}'.format(np.std(scores)))

Train on 979591 samples, validate on 326531 samples
Epoch 1/8
Epoch 0 train finished. Checking classification quality.
Val F1: 0.6611072823340225
Updated model
Epoch 2/8
Epoch 1 train finished. Checking classification quality.
Val F1: 0.6746587370343488
Updated model
Epoch 3/8
Epoch 2 train finished. Checking classification quality.
Val F1: 0.6799371882505081
Updated model
Epoch 4/8
Epoch 3 train finished. Checking classification quality.
Val F1: 0.6788297627274821
Finished training. Returned to best state.
********************************************************************************
Train on 979591 samples, validate on 326531 samples
Epoch 1/8
Epoch 0 train finished. Checking classification quality.
Val F1: 0.6640594265614514
Updated model
Epoch 2/8
Epoch 1 train finished. Checking classification quality.
Val F1: 0.6769407178075909
Updated model
Epoch 3/8
Epoch 2 train finished. Checking classification quality.
Val F1: 0.6809817367412435
Updated model
Epoch 4/8
Epoch 3 train finish

In [15]:
# Persist every fold's model to the working directory as model-<fold>.h5.
for i, model in enumerate(models):
    checkpoint_path = 'model-{0}.h5'.format(i)
    model.save(checkpoint_path)

In [16]:
# Preview only the first out-of-vocabulary words for GloVe; displaying the
# whole list floods the notebook output.
glove_oov[:100]

["''how",
 "''kate",
 "''national",
 "''religion",
 "''that",
 "''why",
 "'0",
 "'00000000000000000021e800",
 "'00s",
 "'1",
 "'1'54'am",
 "'100",
 "'102",
 "'10x",
 "'110",
 "'1967",
 "'200",
 "'2009",
 "'2099",
 "'2nd",
 "'2r",
 "'3",
 "'301",
 "'4",
 "'4'",
 "'4'it",
 "'403",
 "'4k",
 "'5",
 "'500",
 "'539",
 "'6",
 "'666",
 "'7",
 "'777888999",
 "'8",
 "'a",
 "'a'units",
 "'a2a",
 "'aa",
 "'aaji",
 "'aajkal",
 "'aap",
 "'aazadi",
 "'ab",
 "'abdullah",
 "'ability",
 "'abode",
 "'abortion",
 "'about",
 "'above",
 "'abracadabra",
 "'abuse",
 "'acceleration",
 "'accept",
 "'acceptable",
 "'access",
 "'accha",
 "'accident",
 "'accidental",
 "'accidentally",
 "'acclaimed",
 "'accomplishments",
 "'accurate",
 "'accuser",
 "'achievement",
 "'act",
 "'acting",
 "'actioning",
 "'active",
 "'actively",
 "'actor",
 "'actors",
 "'actually",
 "'adam",
 "'adapted",
 "'adaption",
 "'adat",
 "'add",
 "'addams",
 "'addicting",
 "'addictive",
 "'additional",
 "'adds",
 "'adiye",
 "'adjust",
 "'admini

In [17]:
# Preview only the first out-of-vocabulary words for paragram; displaying the
# whole list floods the notebook output.
paragram_oov[:100]

["''how",
 "''kate",
 "''national",
 "''religion",
 "''that",
 "''why",
 "'0",
 "'00000000000000000021e800",
 "'00s",
 "'1",
 "'1'54'am",
 "'100",
 "'102",
 "'10x",
 "'110",
 "'1967",
 "'200",
 "'2009",
 "'2099",
 "'2nd",
 "'2r",
 "'3",
 "'301",
 "'4",
 "'4'",
 "'4'it",
 "'403",
 "'4k",
 "'5",
 "'500",
 "'539",
 "'6",
 "'666",
 "'7",
 "'777888999",
 "'8",
 "'a",
 "'a'units",
 "'a2a",
 "'aa",
 "'aaji",
 "'aajkal",
 "'aap",
 "'aazadi",
 "'ab",
 "'abdullah",
 "'ability",
 "'abode",
 "'abortion",
 "'about",
 "'above",
 "'abracadabra",
 "'abuse",
 "'acceleration",
 "'accept",
 "'acceptable",
 "'access",
 "'accha",
 "'accident",
 "'accidental",
 "'accidentally",
 "'acclaimed",
 "'accomplishments",
 "'accurate",
 "'accuser",
 "'achievement",
 "'act",
 "'acting",
 "'actioning",
 "'active",
 "'actively",
 "'actor",
 "'actors",
 "'actually",
 "'adam",
 "'adapted",
 "'adaption",
 "'adat",
 "'add",
 "'addams",
 "'addicting",
 "'addictive",
 "'additional",
 "'adds",
 "'adiye",
 "'adjust",
 "'admini