In [2]:
import gc
import re
import os
import pandas as pd
import numpy as np
from unidecode import unidecode
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import string
import re
import math
import operator
from pyphen import Pyphen
import time
print(os.listdir("../input"))

['train-test-result-analysis', 'quora-insincere-questions-classification']


In [3]:
from keras.models import Model, Sequential
from keras import layers
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras import backend as K
from keras import optimizers

Using TensorFlow backend.


In [4]:
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

In [5]:
exclude = list(string.punctuation)
dic = Pyphen(lang='en_US')

def legacy_round(number, points=0):
    p = 10 ** points
    return float(math.floor((number * p) + math.copysign(0.5, number))) / p

def char_count(text, ignore_spaces=True):
        if ignore_spaces:
            text = text.replace(" ", "")
        return len(text)

def lexicon_count(text):
        count = len(text.split())
        return count
    
def syllable_count(text):
        text = text.lower()
        text = "".join(x for x in text if x not in exclude)
        if not text:
            return 0
        count = 0
        vowels = 'aeiouy'
        for word in text.split(' '):
            #word_hyphenated = dic.inserted(word)
            #count += max(1, word_hyphenated.count("-") + 1)
            word = word.strip(".:;?!")
            if len(word) < 1:
                continue
            if word[0] in vowels:
                count +=1
            for index in range(1,len(word)):
                if word[index] in vowels and word[index-1] not in vowels:
                    count +=1
            if word.endswith('e'):
                count -= 1
            if word.endswith('le'):
                count+=1
            if count == 0:
                count +=1
        return count

def sentence_count(text):
        ignore_count = 0
        sentences = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
        for sentence in sentences:
            if lexicon_count(sentence) <= 2:
                ignore_count += 1
        return max(1, len(sentences) - ignore_count)
    
def avg_sentence_length(text):
        try:
            asl = float(lexicon_count(text) / sentence_count(text))
            return legacy_round(asl, 1)
        except ZeroDivisionError:
            return 0.0

def avg_syllables_per_word(text):
        syllable = syllable_count(text)
        words = lexicon_count(text)
        try:
            syllables_per_word = float(syllable) / float(words)
            return legacy_round(syllables_per_word, 1)
        except ZeroDivisionError:
            return 0.0

def avg_letter_per_word(text):
        try:
            letters_per_word = float(
                char_count(text) / lexicon_count(text))
            return legacy_round(letters_per_word, 2)
        except ZeroDivisionError:
            return 0.0


def avg_sentence_per_word(text):
        try:
            sentence_per_word = float(
                sentence_count(text) / lexicon_count(text))
            return legacy_round(sentence_per_word, 2)
        except ZeroDivisionError:
            return 0.0
        
def flesch_reading_ease(text):
        sentence_length = avg_sentence_length(text)
        syllables_per_word = avg_syllables_per_word(text)
        flesch = (
            206.835
            - float(1.015 * sentence_length)
            - float(84.6 * syllables_per_word)
        )
        return legacy_round(flesch, 2)

def flesch_kincaid_grade(text):
        sentence_lenth = avg_sentence_length(text)
        syllables_per_word = avg_syllables_per_word(text)
        flesch = (
            float(0.39 * sentence_lenth)
            + float(11.8 * syllables_per_word)
            - 15.59)
        return legacy_round(flesch, 1)

def polysyllabcount(text):
        count = 0
        for word in text.split():
            wrds = syllable_count(word)
            if wrds >= 3:
                count += 1
        return count

def smog_index(text):
        sentences = sentence_count(text)
        if sentences >= 3:
            try:
                poly_syllab = polysyllabcount(text)
                smog = (
                    (1.043 * (30 * (poly_syllab / sentences)) ** .5)
                    + 3.1291)
                return legacy_round(smog, 1)
            except ZeroDivisionError:
                return 0.0
        else:
            return 0.0

def coleman_liau_index(text):
        letters = legacy_round(avg_letter_per_word(text)*100, 2)
        sentences = legacy_round(avg_sentence_per_word(text)*100, 2)
        coleman = float((0.058 * letters) - (0.296 * sentences) - 15.8)
        return legacy_round(coleman, 2)

def automated_readability_index(text):
        chrs = char_count(text)
        words = lexicon_count(text)
        sentences = sentence_count(text)
        try:
            a = float(chrs)/float(words)
            b = float(words) / float(sentences)
            readability = (
                (4.71 * legacy_round(a, 2))
                + (0.5 * legacy_round(b, 2))
                - 21.43)
            return legacy_round(readability, 1)
        except ZeroDivisionError:
            return 0.0

def linsear_write_formula(text):
        easy_word = 0
        difficult_word = 0
        text_list = text.split()[:100]
        for word in text_list:
            if syllable_count(word) < 3:
                easy_word += 1
            else:
                difficult_word += 1
        text = ' '.join(text_list)
        number = float(
            (easy_word * 1 + difficult_word * 3)
            / sentence_count(text))
        if number <= 20:
            number -= 2
        return number / 2


In [6]:
text_function_list = [char_count, lexicon_count, syllable_count, sentence_count, avg_letter_per_word,
                     avg_sentence_length, avg_sentence_per_word, avg_syllables_per_word, flesch_kincaid_grade, flesch_reading_ease,
                     polysyllabcount, smog_index, coleman_liau_index, automated_readability_index, linsear_write_formula]

In [7]:
def f1_score(true,pred): #considering sigmoid activation, threshold = 0.5
    pred = K.cast(K.greater(pred,0.5), K.floatx())

    groundPositives = K.sum(true) + K.epsilon()
    correctPositives = K.sum(true * pred) + K.epsilon()
    predictedPositives = K.sum(pred) + K.epsilon()

    precision = correctPositives / predictedPositives
    recall = correctPositives / groundPositives

    m = (2 * precision * recall) / (precision + recall)

    return m

In [8]:
train_df = pd.read_csv('../input/quora-insincere-questions-classification/train.csv', usecols=['question_text', 'target'])
val_df = pd.read_csv('../input/train-test-result-analysis/val_data.csv')

In [9]:
val_idx = val_df['column_index'].values
bad_df = train_df.index.isin(val_idx)

In [10]:
val_df = train_df[bad_df]
train_df = train_df[~bad_df]

print(train_df.shape, val_df.shape)

(1249752, 2) (56370, 2)


In [11]:
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)

def clean_text(x):
    x_ascii = unidecode(x)
    x_clean = special_character_removal.sub('',x_ascii)
    return x_clean

In [12]:
train_df['question_text'] = train_df['question_text'].apply(lambda x: clean_text(str(x)))
val_df['question_text'] = val_df['question_text'].apply(lambda x: clean_text(str(x)))

In [13]:
train_sentences = train_df['question_text']
train_labels = train_df['target']
val_sentences = val_df['question_text']
val_labels = val_df['target']

In [14]:
def add_features(df):
    print('Processing features for dataframe!\n')
    df['question_text'] = df['question_text'].apply(lambda x:str(x))
    for text_function in text_function_list:
        start_time = time.time()
        df[text_function.__name__] = df['question_text'].apply(lambda x: text_function(str(x)))
        print('Processing features for function {} took {} minutes'.format(text_function.__name__, (time.time()-start_time)/60))
    df['total_length'] = df['question_text'].apply(len)
    df['capitals'] = df['question_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/(float(row['total_length'])+1),
                                axis=1)
    df['num_words'] = df['question_text'].str.count('\S+')
    df['num_unique_words'] = df['question_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / (df['num_words']+1)
    del df['num_unique_words'], df['num_words'], df['capitals'], df['total_length']
    gc.collect()
    print('Done!\n')
    return df

In [15]:
start = time.time()
train_df = add_features(train_df)
print('\nTook {} minutes to process train\n'.format((time.time()-start)/60))
val_df = add_features(val_df)
print('\nTook {} minutes to process val\n'.format((time.time()-start)/60))
print('\n\nTook {} minutes total'.format((time.time()-start)/60))

Processing features for dataframe!

Processing features for function char_count took 0.029738124211629233 minutes
Processing features for function lexicon_count took 0.031287304560343426 minutes
Processing features for function syllable_count took 1.323587973912557 minutes
Processing features for function sentence_count took 0.12240836620330811 minutes
Processing features for function avg_letter_per_word took 0.07474201917648315 minutes
Processing features for function avg_sentence_length took 0.1651384393374125 minutes
Processing features for function avg_sentence_per_word took 0.16396253903706867 minutes
Processing features for function avg_syllables_per_word took 1.3883389075597128 minutes
Processing features for function flesch_kincaid_grade took 1.6005149682362874 minutes
Processing features for function flesch_reading_ease took 1.5901175697644552 minutes
Processing features for function polysyllabcount took 1.4625470360120139 minutes
Processing features for function smog_index to

In [16]:
val_df.head()

Unnamed: 0,question_text,target,char_count,lexicon_count,syllable_count,sentence_count,avg_letter_per_word,avg_sentence_length,avg_sentence_per_word,avg_syllables_per_word,flesch_kincaid_grade,flesch_reading_ease,polysyllabcount,smog_index,coleman_liau_index,automated_readability_index,linsear_write_formula,caps_vs_length,words_vs_unique
13,Can we use our external hard disk as a OS as w...,0,77,20,26,1,3.85,20.0,0.05,1.3,7.6,76.56,3,0.0,5.05,6.7,13.0,0.030928,0.809524
19,How many baronies might exist within a county ...,0,47,9,17,1,5.22,9.0,0.11,1.9,10.3,36.96,2,0.0,11.22,7.7,5.5,0.017857,0.9
30,Which babies are more sweeter to their parents...,1,72,15,20,2,4.8,7.5,0.13,1.3,2.7,89.24,0,0.0,8.19,4.9,2.75,0.022989,0.8125
36,"Why my package still is ISC since May , and I ...",0,51,14,16,1,3.64,14.0,0.07,1.1,2.9,99.57,1,0.0,3.24,2.7,7.0,0.092308,0.933333
76,Can we get ITC on charges levied by banks?,0,34,9,10,1,3.78,9.0,0.11,1.1,0.9,104.64,0,0.0,2.87,0.9,3.5,0.093023,0.9


In [17]:
train_features = train_df.drop(['question_text', 'target'], axis=1)
val_features = val_df.drop(['question_text', 'target'], axis=1)

In [18]:
ss = StandardScaler()
ss.fit(np.vstack((train_features, val_features)))
train_features = ss.transform(train_features)
val_features = ss.transform(val_features)

In [19]:
del ss, train_df, val_df
gc.collect()

141

In [20]:
max_features = 20000
maxlen = 100

In [21]:
tokenizer = text.Tokenizer(num_words=max_features)

In [22]:
tokenizer.fit_on_texts(list(train_sentences) + list(val_sentences))

In [23]:
tokenized_train = tokenizer.texts_to_sequences(train_sentences)
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)

In [24]:
tokenized_val = tokenizer.texts_to_sequences(val_sentences)
X_val = sequence.pad_sequences(tokenized_val, maxlen=maxlen)

In [25]:
del tokenized_val, tokenized_train, train_sentences, val_sentences
gc.collect()

0

In [26]:
EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt'

In [27]:
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

In [28]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
#change below line if computing normal stats is too slow
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) #embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [29]:
del word_index, embeddings_index, all_embs, tokenizer, nb_words
gc.collect()

0

In [30]:
batch_size = 1024
epochs = 2
embed_size = 300

In [31]:
def dnn_model(features, train_flag = True):
    features_input = layers.Input(shape=(features.shape[1],))
    inp = layers.Input(shape=(maxlen, ))
    x = layers.Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=train_flag)(inp)
    x = layers.Bidirectional(layers.CuDNNLSTM(64, kernel_initializer='glorot_normal', return_sequences = True))(x)
    x, x_h, x_c = layers.Bidirectional(layers.CuDNNGRU(64, kernel_initializer='glorot_normal', return_sequences=True, return_state = True))(x)
    avg_pool = layers.GlobalAveragePooling1D()(x)
    max_pool = layers.GlobalMaxPooling1D()(x)
    x = layers.concatenate([avg_pool, x_h, max_pool, features_input])
    x = layers.Dense(32, activation="tanh", kernel_initializer='glorot_normal')(x)
    x = layers.Dense(1, activation="sigmoid", kernel_initializer='glorot_normal')(x)
    model = Model(inputs=[inp,features_input], outputs=x)
    adam = optimizers.adam(clipvalue=1.0)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=[f1_score])

    return model

In [32]:
model = dnn_model(train_features, False)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     6000000     input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 128)     187392      embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) [(None, 100, 128), ( 74496       bidirectional_1[0][0]            
__________________________________________________________________________________________________
global_ave

In [33]:
weight_path="early_weights.hdf5"
checkpoint = ModelCheckpoint(weight_path, monitor='val_f1_score', verbose=1, save_best_only=True, mode='max')
early_stopping = EarlyStopping(monitor="val_f1_score", mode="max", patience=4)
#clr = CyclicLR(base_lr=0.0003, max_lr=0.005, step_size=2000.)
callbacks = [checkpoint, early_stopping]

In [34]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from -inf to 0.62529, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score did not improve from 0.62529


<keras.callbacks.History at 0x7f51b29e0b38>

In [35]:
model = dnn_model(train_features)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 100, 300)     6000000     input_4[0][0]                    
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 100, 128)     187392      embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) [(None, 100, 128), ( 74496       bidirectional_3[0][0]            
__________________________________________________________________________________________________
global_ave

In [36]:
model.load_weights(weight_path)

In [37]:
model.fit([X_train, train_features], train_labels, batch_size=batch_size, epochs=epochs, shuffle = True, validation_data=([X_val, val_features], val_labels), callbacks=callbacks)

Train on 1249752 samples, validate on 56370 samples
Epoch 1/2

Epoch 00001: val_f1_score improved from 0.62529 to 0.65739, saving model to early_weights.hdf5
Epoch 2/2

Epoch 00002: val_f1_score did not improve from 0.65739


<keras.callbacks.History at 0x7f51a0254160>

In [38]:
model.load_weights(weight_path)

In [39]:
val_preds = model.predict([X_val, val_features], batch_size=1024, verbose=1)
max_f1_score = 0
max_f1_threshold = ''
for thresh in np.arange(0.1, 0.901, 0.01):
    thresh = np.round(thresh, 2)
    f1_at_threshold = metrics.f1_score(val_labels, (val_preds>thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_threshold))
    if f1_at_threshold > max_f1_score:
        max_f1_score = f1_at_threshold
        max_f1_threshold = thresh

F1 score at threshold 0.1 is 0.5699185583087854
F1 score at threshold 0.11 is 0.5806794996895236
F1 score at threshold 0.12 is 0.5881288454578357
F1 score at threshold 0.13 is 0.5949285384970032
F1 score at threshold 0.14 is 0.6025725284010892
F1 score at threshold 0.15 is 0.610441767068273
F1 score at threshold 0.16 is 0.6147723963894012
F1 score at threshold 0.17 is 0.6214109521460286
F1 score at threshold 0.18 is 0.626638647052937
F1 score at threshold 0.19 is 0.6320284697508897
F1 score at threshold 0.2 is 0.6371316711312589
F1 score at threshold 0.21 is 0.6401835053696173
F1 score at threshold 0.22 is 0.6421610169491525
F1 score at threshold 0.23 is 0.6452720832886122
F1 score at threshold 0.24 is 0.6485782504883874
F1 score at threshold 0.25 is 0.6520307354555435
F1 score at threshold 0.26 is 0.6543113971812229
F1 score at threshold 0.27 is 0.6560645089035726
F1 score at threshold 0.28 is 0.6600090785292783
F1 score at threshold 0.29 is 0.6630696820112502
F1 score at threshold 0.

### With val_acc as metric

In [40]:
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))

Max threshold is 0.4 with f1 score of 0.6745788667687596


### With f1 metric as val_check

In [None]:
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))

### With CLR

In [None]:
print('Max threshold is {} with f1 score of {}'.format(max_f1_threshold, max_f1_score))