In [1]:
import operator
import tensorflow as tf
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.utils import class_weight
from sklearn.metrics import roc_curve, precision_recall_curve
# import dask.dataframe as dd
# import dask.multiprocessing
# import dask.threaded
# import dask
# import timeit
# from time import time

from nltk.tokenize import word_tokenize
# import spacy
# nlp = spacy.load('en')
# tqdm.pandas()

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, Activation, CuDNNGRU, CuDNNLSTM, Layer
from tensorflow.keras.layers import Bidirectional, Flatten, SpatialDropout1D
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras import optimizers, layers
from keras.callbacks import *

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [3]:
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(train_df['target'].values),
                                                 train_df['target'].values)

In [4]:
def tokenize(x):
#     x = x.lower()
    x = x.replace("'","'")
    x = ' '.join(word_tokenize(x))
    return str(x)

In [5]:
#restore
train_df["question_text"] = train_df["question_text"].str.lower().apply(tokenize)
test_df["question_text"] = test_df["question_text"].str.lower().apply(tokenize)
# sentences = train_df["question_text"].apply(lambda x: x.split()).values

In [6]:
#restore
mispell_dict = {' .net':' dotnet',
                ' ^2':' square',
                'bhakts':'followers',
#                 'l&t':'larsen & toubro',
                '/math':'math',
                'demonetisation':'demonetization',
                'cryptocurrencies':'cryptocurrency',
                'quorans':'quoran',
                ' an':' ',
                ' a ':' ',
                ' the ':' ',
                ' this ':' ',
                ' that ':' ',
                ' then ':' ',
                ' ..': '',
                '+':'',
                '\\': '',
                '^': '',
                '“':'"',
                '”':'"',
                '？':'?',
                '£':'pound',
                '€':'euro',
                "pokémon": "pokemon",
                '…':'',
                '—':'-',
                '°':'',
                'non':'non'
}
# mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    text = text.replace('’', "'")
    text = text.replace('‘', "'")
    for k,v in mispell_dict.items():
        if k in text:
            text = text.replace(k,v)
    return text

In [7]:
#restore
train_df["question_text"] = train_df["question_text"].apply(replace_typical_misspell)
test_df["question_text"] = test_df["question_text"].apply(replace_typical_misspell)

In [8]:
class CyclicLR(Callback):
    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())
    

def f1(y_true, y_pred, plot=False):
    '''
    metric from here 
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras
    '''
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [9]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 40000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 50 # max number of words in a question to use

def load_data_kfold(k):
    train_X = train_df["question_text"].values
    train_y = train_df["target"].values
    test_X = test_df["question_text"].values
    
    folds = list(StratifiedKFold(n_splits=k, shuffle=True, random_state=1).split(train_X, train_y))
    
    #tokenize
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)
    
    #padding
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    return folds, train_X, train_y, test_X, tokenizer

k = 5
folds, train_X, train_y, test_X, tokenizer = load_data_kfold(k)

In [10]:
del train_df
import gc; gc.collect()
time.sleep(1)

We have four different types of embeddings.
 * GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
 * glove.840B.300d - https://nlp.stanford.edu/projects/glove/
 * paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
 * wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html
 
 A very good explanation for different types of embeddings are given in this [kernel](https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge). Please refer the same for more details..

**Glove Embeddings:**

In [11]:
#restore
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix_1 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None: embedding_matrix_1[i] = embedding_vector
    
    if embedding_vector is not None:
        embedding_matrix_1[i] = embedding_vector
    elif embeddings_index.get(word.capitalize()) is not None:
        embedding_matrix_1[i] = embeddings_index.get(word.capitalize())
    elif embeddings_index.get(word.upper()) is not None:
        embedding_matrix_1[i] = embeddings_index.get(word.upper())

del word_index, embeddings_index, all_embs
import gc; gc.collect()
time.sleep(1)

**Wiki News FastText Embeddings:**

In [12]:
EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix_2 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None: embedding_matrix_2[i] = embedding_vector

    if embedding_vector is not None:
        embedding_matrix_2[i] = embedding_vector
    elif embeddings_index.get(word.capitalize()) is not None:
        embedding_matrix_2[i] = embeddings_index.get(word.capitalize())
    elif embeddings_index.get(word.upper()) is not None:
        embedding_matrix_2[i] = embeddings_index.get(word.upper())
        
del word_index, embeddings_index, all_embs
import gc; gc.collect()
time.sleep(1) 

**Paragram Embeddings:**

In [13]:
EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix_3 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None: embedding_matrix_3[i] = embedding_vector
    
    if embedding_vector is not None:
        embedding_matrix_3[i] = embedding_vector
    elif embeddings_index.get(word.capitalize()) is not None:
        embedding_matrix_3[i] = embeddings_index.get(word.capitalize())
    elif embeddings_index.get(word.upper()) is not None:
        embedding_matrix_3[i] = embeddings_index.get(word.upper())

del word_index, embeddings_index, all_embs
import gc; gc.collect()
time.sleep(1)  

**Word2vec Embeddings:**

In [14]:
from gensim.models import KeyedVectors

EMBEDDING_FILE = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix_4 = (np.random.rand(nb_words, embed_size) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= max_features: continue
    if word in embeddings_index:
        embedding_vector = embeddings_index.get_vector(word)
        embedding_matrix_4[i] = embedding_vector
    elif word.capitalize() in embeddings_index:
        embedding_matrix_4[i] = embeddings_index.get_vector(word.capitalize())
    elif word.upper() in embeddings_index:
        embedding_matrix_4[i] = embeddings_index.get_vector(word.upper())
        
del word_index, embeddings_index
import gc; gc.collect()
time.sleep(1)

**Combine :**

In [15]:
embedding_matrix = np.concatenate((embedding_matrix_1, embedding_matrix_2, embedding_matrix_3, embedding_matrix_4), axis=1)  
del embedding_matrix_1, embedding_matrix_2, embedding_matrix_3, embedding_matrix_4
gc.collect()

0

In [16]:
def best_f1(y_true, y_pred, plot=False):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    thresholds = np.append(thresholds, 1.001) 
    F = 2 / (1/precision + 1/recall)
    best_score = np.max(F)
    best_th = thresholds[np.argmax(F)]
    if plot:
        plt.plot(thresholds, F, '-b')
        plt.plot([best_th], [best_score], '*r')
        plt.show()
    search_result = {'threshold': best_th , 'f1': best_score}
    return search_result

**convolution model**

filter_sizes = [1,2,3,5]
num_filters = 36
from keras.layers import Conv1D, MaxPool1D, BatchNormalization
import keras.backend as K
def get_cmodel():    
    inp = Input(shape=(maxlen, ))
#     x = Lambda(lambda x: K.reverse(x,axes=-1))(inp)
    x = Embedding(max_features, embed_size * 4, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    #x = Reshape((maxlen, embed_size, 1))(x)
    
    conv_0 = Conv1D(num_filters, kernel_size=(filter_sizes[0], embed_size * 4),
                                 kernel_initializer='he_normal', activation='elu')(x)
    conv_1 = Conv1D(num_filters, kernel_size=(filter_sizes[1], embed_size * 4),
                                 kernel_initializer='he_normal', activation='elu')(x)
    conv_2 = Conv1D(num_filters, kernel_size=(filter_sizes[2], embed_size * 4), 
                                 kernel_initializer='he_normal', activation='elu')(x)
    conv_3 = Conv1D(num_filters, kernel_size=(filter_sizes[3], embed_size * 4),
                                 kernel_initializer='he_normal', activation='elu')(x)
    
    maxpool_0 = MaxPool1D(pool_size=(maxlen - filter_sizes[0] + 1))(conv_0)
    maxpool_1 = MaxPool1D(pool_size=(maxlen - filter_sizes[1] + 1))(conv_1)
    maxpool_2 = MaxPool1D(pool_size=(maxlen - filter_sizes[2] + 1))(conv_2)
    maxpool_3 = MaxPool1D(pool_size=(maxlen - filter_sizes[3] + 1))(conv_3)
        
    z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])   
    z = Flatten()(z)
    z = BatchNormalization()(z)
        
    outp = Dense(1, activation="sigmoid")(z)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

**capsule model**

In [17]:
def squash(x, axis=-1):
    # s_squared_norm is really small
    # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm)
    # return scale * x
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = K.sqrt(s_squared_norm + K.epsilon())
    return x / scale

# A Capsule Implement with Pure Keras
class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, int(input_dim_capsule),
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)

In [18]:
def get_model():
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size* 4, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.1)(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
#     x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Capsule(num_capsule=30, dim_capsule=20, routings=4, share_weights=True)(x)
#     x = GlobalAveragePooling1D()(x)
    x = Flatten()(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    return Model(inputs=inp, outputs=x)

In [19]:
# model = get_model()
# print(model.summary())

In [20]:
pred_test_y = []
pred_val_score = []
pred_val_y = []

In [21]:
# for i in range(4):
for j, (train_idx, val_idx) in enumerate(folds):
    if j > 0:
        break
    print('\nFold ',j)
    X_train_cv = train_X[train_idx]
    y_train_cv = train_y[train_idx]
    X_valid_cv = train_X[val_idx]
    y_valid_cv = train_y[val_idx]
    model = get_model()
    model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(lr=0.002), metrics=['accuracy'])
#     if "model1_"+str(j)+".h5" in os.listdir('.') and i != 0:
#         model.load_weights("model1_"+str(j)+".h5")
    model.fit(X_train_cv, y_train_cv, batch_size=512, epochs=1, validation_data=(X_valid_cv, y_valid_cv), class_weight=class_weights)
#     model.save_weights("model1_"+str(j)+".h5")
    predict_val_y = model.predict([X_valid_cv], batch_size=1024, verbose=1)
    best = best_f1(y_valid_cv, predict_val_y)
    pred_val_score.append(best)
    predict_test_y = model.predict([test_X], batch_size=1024, verbose=1)
    pred_test_y.append(predict_test_y)
    print(pred_val_score)
    print('*****************************************')


Fold  0
Train on 1044897 samples, validate on 261225 samples
Epoch 1/1
 1024/56370 [..............................] - ETA: 5s

  after removing the cwd from sys.path.


[{'threshold': 0.28004756569862366, 'f1': 0.6705398894714468}]
*****************************************


In [26]:
thres = [i['threshold'] for i in pred_val_score]
test_y = [i for i in pred_test_y]
p_test_y = np.zeros_like(np.array(test_y))
for i in range(5):
    p_test_y[i] = (test_y[i]>thres[i]).astype(int)

In [28]:
p = (p_test_y.reshape((5, test_df.shape[0])).sum(axis=0)> 2).astype(int)

#restore
#model.save_weights("model.h5")
model.load_weights("model.h5")

In [None]:
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = p
out_df.to_csv("submission.csv", index=False)