In [1]:
import gc
import re
import os
import pandas as pd
import numpy as np
from unidecode import unidecode
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import string
import re
import math
import operator
from pyphen import Pyphen
import time
print(os.listdir("../input"))

['train-test-result-analysis', 'quora-insincere-questions-classification']


In [2]:
from keras.models import Model, Sequential
from keras import layers
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras import backend as K
from keras import optimizers

Using TensorFlow backend.


In [3]:
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

In [4]:
exclude = list(string.punctuation)
dic = Pyphen(lang='en_US')

def legacy_round(number, points=0):
    p = 10 ** points
    return float(math.floor((number * p) + math.copysign(0.5, number))) / p

def char_count(text, ignore_spaces=True):
        if ignore_spaces:
            text = text.replace(" ", "")
        return len(text)

def lexicon_count(text):
        count = len(text.split())
        return count
    
def syllable_count(text):
        text = text.lower()
        text = "".join(x for x in text if x not in exclude)
        if not text:
            return 0
        count = 0
        vowels = 'aeiouy'
        for word in text.split(' '):
            #word_hyphenated = dic.inserted(word)
            #count += max(1, word_hyphenated.count("-") + 1)
            word = word.strip(".:;?!")
            if len(word) < 1:
                continue
            if word[0] in vowels:
                count +=1
            for index in range(1,len(word)):
                if word[index] in vowels and word[index-1] not in vowels:
                    count +=1
            if word.endswith('e'):
                count -= 1
            if word.endswith('le'):
                count+=1
            if count == 0:
                count +=1
        return count

def sentence_count(text):
        ignore_count = 0
        sentences = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
        for sentence in sentences:
            if lexicon_count(sentence) <= 2:
                ignore_count += 1
        return max(1, len(sentences) - ignore_count)
    
def avg_sentence_length(text):
        try:
            asl = float(lexicon_count(text) / sentence_count(text))
            return legacy_round(asl, 1)
        except ZeroDivisionError:
            return 0.0

def avg_syllables_per_word(text):
        syllable = syllable_count(text)
        words = lexicon_count(text)
        try:
            syllables_per_word = float(syllable) / float(words)
            return legacy_round(syllables_per_word, 1)
        except ZeroDivisionError:
            return 0.0

def avg_letter_per_word(text):
        try:
            letters_per_word = float(
                char_count(text) / lexicon_count(text))
            return legacy_round(letters_per_word, 2)
        except ZeroDivisionError:
            return 0.0


def avg_sentence_per_word(text):
        try:
            sentence_per_word = float(
                sentence_count(text) / lexicon_count(text))
            return legacy_round(sentence_per_word, 2)
        except ZeroDivisionError:
            return 0.0
        
def flesch_reading_ease(text):
        sentence_length = avg_sentence_length(text)
        syllables_per_word = avg_syllables_per_word(text)
        flesch = (
            206.835
            - float(1.015 * sentence_length)
            - float(84.6 * syllables_per_word)
        )
        return legacy_round(flesch, 2)

def flesch_kincaid_grade(text):
        sentence_lenth = avg_sentence_length(text)
        syllables_per_word = avg_syllables_per_word(text)
        flesch = (
            float(0.39 * sentence_lenth)
            + float(11.8 * syllables_per_word)
            - 15.59)
        return legacy_round(flesch, 1)

def polysyllabcount(text):
        count = 0
        for word in text.split():
            wrds = syllable_count(word)
            if wrds >= 3:
                count += 1
        return count

def smog_index(text):
        sentences = sentence_count(text)
        if sentences >= 3:
            try:
                poly_syllab = polysyllabcount(text)
                smog = (
                    (1.043 * (30 * (poly_syllab / sentences)) ** .5)
                    + 3.1291)
                return legacy_round(smog, 1)
            except ZeroDivisionError:
                return 0.0
        else:
            return 0.0

def coleman_liau_index(text):
        letters = legacy_round(avg_letter_per_word(text)*100, 2)
        sentences = legacy_round(avg_sentence_per_word(text)*100, 2)
        coleman = float((0.058 * letters) - (0.296 * sentences) - 15.8)
        return legacy_round(coleman, 2)

def automated_readability_index(text):
        chrs = char_count(text)
        words = lexicon_count(text)
        sentences = sentence_count(text)
        try:
            a = float(chrs)/float(words)
            b = float(words) / float(sentences)
            readability = (
                (4.71 * legacy_round(a, 2))
                + (0.5 * legacy_round(b, 2))
                - 21.43)
            return legacy_round(readability, 1)
        except ZeroDivisionError:
            return 0.0

def linsear_write_formula(text):
        easy_word = 0
        difficult_word = 0
        text_list = text.split()[:100]
        for word in text_list:
            if syllable_count(word) < 3:
                easy_word += 1
            else:
                difficult_word += 1
        text = ' '.join(text_list)
        number = float(
            (easy_word * 1 + difficult_word * 3)
            / sentence_count(text))
        if number <= 20:
            number -= 2
        return number / 2


In [5]:
text_function_list = [char_count, lexicon_count, syllable_count, sentence_count, avg_letter_per_word,
                     avg_sentence_length, avg_sentence_per_word, avg_syllables_per_word, flesch_kincaid_grade, flesch_reading_ease,
                     polysyllabcount, smog_index, coleman_liau_index, automated_readability_index, linsear_write_formula]

In [6]:
def f1_score(true,pred): #considering sigmoid activation, threshold = 0.5
    pred = K.cast(K.greater(pred,0.5), K.floatx())

    groundPositives = K.sum(true) + K.epsilon()
    correctPositives = K.sum(true * pred) + K.epsilon()
    predictedPositives = K.sum(pred) + K.epsilon()

    precision = correctPositives / predictedPositives
    recall = correctPositives / groundPositives

    m = (2 * precision * recall) / (precision + recall)

    return m

In [7]:
train_df = pd.read_csv('../input/quora-insincere-questions-classification/train.csv', usecols=['question_text', 'target'])
val_df = pd.read_csv('../input/train-test-result-analysis/val_data.csv')

In [8]:
val_idx = val_df['column_index'].values
bad_df = train_df.index.isin(val_idx)

In [9]:
val_df = train_df[bad_df]
train_df = train_df[~bad_df]

print(train_df.shape, val_df.shape)

(1249752, 2) (56370, 2)


In [10]:
def clean_text(x):
    special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)
    x_ascii = unidecode(x)
    x_clean = special_character_removal.sub('',x_ascii)
    return x_clean

In [11]:
train_df['question_text'] = train_df['question_text'].apply(lambda x: clean_text(str(x)))
val_df['question_text'] = val_df['question_text'].apply(lambda x: clean_text(str(x)))

In [12]:
train_sentences = train_df['question_text']
train_labels = train_df['target']
val_sentences = val_df['question_text']
val_labels = val_df['target']

In [13]:
def add_features(df):
    print('Processing features for dataframe!\n')
    df['question_text'] = df['question_text'].apply(lambda x:str(x))
    for text_function in text_function_list:
        start_time = time.time()
        df[text_function.__name__] = df['question_text'].apply(lambda x: text_function(str(x)))
        print('Processing features for function {} took {} minutes'.format(text_function.__name__, (time.time()-start_time)/60))
    df['total_length'] = df['question_text'].apply(len)
    df['capitals'] = df['question_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/(float(row['total_length'])+1),
                                axis=1)
    df['num_words'] = df['question_text'].str.count('\S+')
    df['num_unique_words'] = df['question_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / (df['num_words']+1)
    del df['num_unique_words'], df['num_words'], df['capitals'], df['total_length']
    gc.collect()
    print('Done!\n')
    return df

In [14]:
start = time.time()
train_df = add_features(train_df)
print('\nTook {} minutes to process train\n'.format((time.time()-start)/60))
val_df = add_features(val_df)
print('\nTook {} minutes to process val\n'.format((time.time()-start)/60))
print('\n\nTook {} minutes total'.format((time.time()-start)/60))

Processing features for dataframe!

Processing features for function char_count took 0.029163781801859537 minutes
Processing features for function lexicon_count took 0.030443596839904784 minutes
Processing features for function syllable_count took 1.333863115310669 minutes
Processing features for function sentence_count took 0.12348668575286866 minutes
Processing features for function avg_letter_per_word took 0.07547036806742351 minutes
Processing features for function avg_sentence_length took 0.17072641849517822 minutes
Processing features for function avg_sentence_per_word took 0.16872075001398723 minutes
Processing features for function avg_syllables_per_word took 1.4330856919288635 minutes
Processing features for function flesch_kincaid_grade took 1.5839717229207357 minutes
Processing features for function flesch_reading_ease took 1.6255208015441895 minutes
Processing features for function polysyllabcount took 1.471647580464681 minutes
Processing features for function smog_index to

In [15]:
val_df.head()

Unnamed: 0,question_text,target,char_count,lexicon_count,syllable_count,sentence_count,avg_letter_per_word,avg_sentence_length,avg_sentence_per_word,avg_syllables_per_word,flesch_kincaid_grade,flesch_reading_ease,polysyllabcount,smog_index,coleman_liau_index,automated_readability_index,linsear_write_formula,caps_vs_length,words_vs_unique
13,Can we use our external hard disk as a OS as w...,0,77,20,26,1,3.85,20.0,0.05,1.3,7.6,76.56,3,0.0,5.05,6.7,13.0,0.030928,0.809524
19,How many baronies might exist within a county ...,0,47,9,17,1,5.22,9.0,0.11,1.9,10.3,36.96,2,0.0,11.22,7.7,5.5,0.017857,0.9
30,Which babies are more sweeter to their parents...,1,72,15,20,2,4.8,7.5,0.13,1.3,2.7,89.24,0,0.0,8.19,4.9,2.75,0.022989,0.8125
36,"Why my package still is ISC since May , and I ...",0,51,14,16,1,3.64,14.0,0.07,1.1,2.9,99.57,1,0.0,3.24,2.7,7.0,0.092308,0.933333
76,Can we get ITC on charges levied by banks?,0,34,9,10,1,3.78,9.0,0.11,1.1,0.9,104.64,0,0.0,2.87,0.9,3.5,0.093023,0.9


In [16]:
train_features = train_df.drop(['question_text', 'target'], axis=1)
val_features = val_df.drop(['question_text', 'target'], axis=1)

In [17]:
ss = StandardScaler()
ss.fit(np.vstack((train_features, val_features)))
train_features = ss.transform(train_features)
val_features = ss.transform(val_features)

In [18]:
gc.collect()

119

In [19]:
max_features = 20000
maxlen = 100

In [20]:
tokenizer = text.Tokenizer(num_words=max_features)

In [21]:
tokenizer.fit_on_texts(list(train_sentences) + list(val_sentences))

In [22]:
tokenized_train = tokenizer.texts_to_sequences(train_sentences)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

In [23]:
tokenized_val = tokenizer.texts_to_sequences(val_sentences)
X_val = sequence.pad_sequences(tokenized_val, maxlen=maxlen)

In [24]:
gc.collect()

0

In [26]:
batch_size = 1024
epochs = 2
embed_size = 300

In [25]:
def dnn_model(features, train_flag = True, embedding_weights=None):
    features_input = layers.Input(shape=(features.shape[1],))
    inp = layers.Input(shape=(maxlen, ))
    if embedding_weights is None:
        x = layers.Embedding(max_features, embed_size, trainable=train_flag)(inp)
    else:
        x = layers.Embedding(max_features, embedding_weights.shape[1], weights=[embedding_weights], trainable=train_flag)(inp)
    x = layers.Bidirectional(layers.CuDNNLSTM(64, kernel_initializer='glorot_normal', return_sequences = True))(x)
    x, x_h, x_c = layers.Bidirectional(layers.CuDNNGRU(64, kernel_initializer='glorot_normal', return_sequences=True, return_state = True))(x)
    avg_pool = layers.GlobalAveragePooling1D()(x)
    max_pool = layers.GlobalMaxPooling1D()(x)
    x = layers.concatenate([avg_pool, x_h, max_pool, features_input])
    x = layers.Dense(32, activation="tanh", kernel_initializer='glorot_normal')(x)
    x = layers.Dense(1, activation="sigmoid", kernel_initializer='glorot_normal')(x)
    model = Model(inputs=[inp,features_input], outputs=x)
    adam = optimizers.adam(clipvalue=1.0)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=[f1_score])

    return model

# GLOVE

In [None]:
weight_path="early_weights.hdf5"
checkpoint = ModelCheckpoint(weight_path, monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
early_stopping = EarlyStopping(monitor="val_f1_score", mode="max", patience=4)
#clr = CyclicLR(base_lr=0.0003, max_lr=0.005, step_size=2000.)
callbacks = [checkpoint, early_stopping]

In [27]:
EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt'
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
#change below line if computing normal stats is too slow
embedding_matrix_1 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) #embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix_1[i] = embedding_vector

del word_index, embeddings_index, all_embs, nb_words
gc.collect()

In [29]:
model = dnn_model(train_features, False, embedding_matrix_1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     6000000     input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 128)     187392      embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) [(None, 100, 128), ( 74496       bidirectional_1[0][0]            
__________________________________________________________________________________________________
global_ave

In [30]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from -inf to 0.60642, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score improved from 0.60642 to 0.63370, saving model to early_weights.hdf5


<keras.callbacks.History at 0x7fca19a217b8>

In [32]:
model = dnn_model(train_features, True, embedding_matrix_1)

In [33]:
model.load_weights(weight_path)

In [34]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from 0.63370 to 0.64944, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score did not improve from 0.64944


<keras.callbacks.History at 0x7fca0b505390>

In [37]:
model.load_weights(weight_path)

In [38]:
val_preds_1 = model.predict([X_val, val_features], batch_size=1024, verbose=1)



In [39]:
max_f1_score = 0
max_f1_threshold = ''
for thresh in np.arange(0.1, 0.901, 0.01):
    thresh = np.round(thresh, 2)
    f1_at_threshold = metrics.f1_score(val_labels, (val_preds_1>thresh).astype(int))
    if f1_at_threshold > max_f1_score:
        max_f1_score = f1_at_threshold
        max_f1_threshold = thresh
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))

Max threshold is 0.39 with f1 score of 0.672875131164743


In [40]:
del model, weight_path, checkpoint, early_stopping, callbacks, max_f1_score, max_f1_threshold, embedding_matrix_1
gc.collect()

# WIKI

In [42]:
weight_path="early_weights.hdf5"
checkpoint = ModelCheckpoint(weight_path, monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
early_stopping = EarlyStopping(monitor="val_f1_score", mode="max", patience=4)
#clr = CyclicLR(base_lr=0.0003, max_lr=0.005, step_size=2000.)
callbacks = [checkpoint, early_stopping]

In [43]:
EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix_2 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix_2[i] = embedding_vector
        
del word_index, embeddings_index, all_embs, nb_words
gc.collect()

0

In [44]:
model = dnn_model(train_features, False, embedding_matrix_2)

In [45]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from -inf to 0.58726, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score did not improve from 0.58726


<keras.callbacks.History at 0x7fcb469f9cf8>

In [46]:
model = dnn_model(train_features, True, embedding_matrix_2)

In [47]:
model.load_weights(weight_path)

In [48]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from 0.58726 to 0.60876, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score improved from 0.60876 to 0.61689, saving model to early_weights.hdf5


<keras.callbacks.History at 0x7fcb456c94e0>

In [49]:
model.load_weights(weight_path)

In [50]:
val_preds_2 = model.predict([X_val, val_features], batch_size=1024, verbose=1)



In [51]:
max_f1_score = 0
max_f1_threshold = ''
for thresh in np.arange(0.1, 0.901, 0.01):
    thresh = np.round(thresh, 2)
    f1_at_threshold = metrics.f1_score(val_labels, (val_preds_2>thresh).astype(int))
    if f1_at_threshold > max_f1_score:
        max_f1_score = f1_at_threshold
        max_f1_threshold = thresh
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))

Max threshold is 0.32 with f1 score of 0.6609453054950685


In [52]:
del model, weight_path, checkpoint, early_stopping, callbacks, max_f1_score, max_f1_threshold, embedding_matrix_2
gc.collect()

15

# Paragram

In [53]:
weight_path="early_weights.hdf5"
checkpoint = ModelCheckpoint(weight_path, monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
early_stopping = EarlyStopping(monitor="val_f1_score", mode="max", patience=4)
#clr = CyclicLR(base_lr=0.0003, max_lr=0.005, step_size=2000.)
callbacks = [checkpoint, early_stopping]

In [54]:
EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix_3 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix_3[i] = embedding_vector
del word_index, embeddings_index, all_embs, nb_words
gc.collect()

0

In [55]:
model = dnn_model(train_features, False, embedding_matrix_3)

In [56]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from -inf to 0.57321, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score improved from 0.57321 to 0.61473, saving model to early_weights.hdf5


<keras.callbacks.History at 0x7fcb4425e6d8>

In [57]:
model = dnn_model(train_features, True, embedding_matrix_3)

In [58]:
model.load_weights(weight_path)

In [59]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from 0.61473 to 0.63365, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score improved from 0.63365 to 0.63852, saving model to early_weights.hdf5


<keras.callbacks.History at 0x7fcb37bb9390>

In [60]:
model.load_weights(weight_path)

In [61]:
val_preds_3 = model.predict([X_val, val_features], batch_size=1024, verbose=1)



In [62]:
max_f1_score = 0
max_f1_threshold = ''
for thresh in np.arange(0.1, 0.901, 0.01):
    thresh = np.round(thresh, 2)
    f1_at_threshold = metrics.f1_score(val_labels, (val_preds_3>thresh).astype(int))
    if f1_at_threshold > max_f1_score:
        max_f1_score = f1_at_threshold
        max_f1_threshold = thresh
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))

Max threshold is 0.34 with f1 score of 0.6659809719722294


In [63]:
del model, weight_path, checkpoint, early_stopping, callbacks, max_f1_score, max_f1_threshold, embedding_matrix_3
gc.collect()

15

# Word2Vec

In [66]:
weight_path="early_weights.hdf5"
checkpoint = ModelCheckpoint(weight_path, monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
early_stopping = EarlyStopping(monitor="val_f1_score", mode="max", patience=4)
#clr = CyclicLR(base_lr=0.0003, max_lr=0.005, step_size=2000.)
callbacks = [checkpoint, early_stopping]

In [67]:
from gensim.models import KeyedVectors

EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix_4 = (np.random.rand(nb_words, embed_size) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= max_features: continue
    if word in embeddings_index:
        embedding_vector = embeddings_index.get_vector(word)
        embedding_matrix_4[i] = embedding_vector

del word_index, embeddings_index, nb_words
gc.collect()

0

In [68]:
model = dnn_model(train_features, False, embedding_matrix_4)

In [69]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from -inf to 0.60065, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score improved from 0.60065 to 0.62457, saving model to early_weights.hdf5


<keras.callbacks.History at 0x7fcb0a0825f8>

In [70]:
model = dnn_model(train_features, True, embedding_matrix_4)

In [71]:
model.load_weights(weight_path)

In [72]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from 0.62457 to 0.62826, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score improved from 0.62826 to 0.65362, saving model to early_weights.hdf5


<keras.callbacks.History at 0x7fcb09c2e2e8>

In [73]:
model.load_weights(weight_path)

In [74]:
val_preds_4 = model.predict([X_val, val_features], batch_size=1024, verbose=1)



In [75]:
max_f1_score = 0
max_f1_threshold = ''
for thresh in np.arange(0.1, 0.901, 0.01):
    thresh = np.round(thresh, 2)
    f1_at_threshold = metrics.f1_score(val_labels, (val_preds_4>thresh).astype(int))
    if f1_at_threshold > max_f1_score:
        max_f1_score = f1_at_threshold
        max_f1_threshold = thresh
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))

Max threshold is 0.41 with f1 score of 0.666921508664628


In [76]:
del model, weight_path, checkpoint, early_stopping, callbacks, max_f1_score, max_f1_threshold, embedding_matrix_4
gc.collect()

15

# Evaluate on 3!

In [77]:
final_preds_3 = 0.34*val_preds_1 + 0.33*val_preds_2 + 0.33*val_preds_3

In [78]:
max_f1_score = 0
max_f1_threshold = ''
for thresh in np.arange(0.1, 0.901, 0.01):
    thresh = np.round(thresh, 2)
    f1_at_threshold = metrics.f1_score(val_labels, (final_preds_3>thresh).astype(int))
    if f1_at_threshold > max_f1_score:
        max_f1_score = f1_at_threshold
        max_f1_threshold = thresh
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))

Max threshold is 0.34 with f1 score of 0.6812451762284538


# Evaluate on 4!

In [79]:
final_preds_4 = 0.25*val_preds_1 + 0.25*val_preds_2 + 0.25*val_preds_3 + 0.25*val_preds_4

In [80]:
max_f1_score = 0
max_f1_threshold = ''
for thresh in np.arange(0.1, 0.901, 0.01):
    thresh = np.round(thresh, 2)
    f1_at_threshold = metrics.f1_score(val_labels, (final_preds_4>thresh).astype(int))
    if f1_at_threshold > max_f1_score:
        max_f1_score = f1_at_threshold
        max_f1_threshold = thresh
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))

Max threshold is 0.34 with f1 score of 0.6807594936708862
