In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras import backend as K

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU,CuDNNLSTM,Reshape, Flatten,Conv2D
from keras.layers import Bidirectional, GlobalMaxPool1D,SpatialDropout1D,GlobalAvgPool1D,concatenate,Concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import Callback

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Both competition CSVs live in the same Kaggle input directory.
input_dir = "../input/quora-insincere-questions-classification"
train_df = pd.read_csv(input_dir + "/train.csv")
test_df = pd.read_csv(input_dir + "/test.csv")

# Report (rows, columns) of each frame as a quick sanity check.
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(split_name + " shape : ", frame.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


### Check vocab coverage

In [3]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often every token occurs across a tokenized corpus.

    :param sentences: iterable of token lists (one list per sentence)
    :param verbose: when False the tqdm progress bar is disabled
    :return: dict mapping each token to its number of occurrences
    """
    counts = {}
    progress = tqdm(sentences, disable = (not verbose))
    for tokens in progress:
        for token in tokens:
            # First sighting starts from 0, later sightings increment.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [4]:
# Whitespace-tokenize every training question and count the raw tokens.
sentences = train_df["question_text"].apply(lambda question: question.split()).values
vocab = build_vocab(sentences)
# Peek at the first five (token, count) pairs in insertion order.
print(dict(list(vocab.items())[:5]))

100%|██████████| 1306122/1306122 [00:05<00:00, 249273.32it/s]

{'How': 261930, 'did': 33489, 'Quebec': 97, 'nationalists': 91, 'see': 9003}





In [5]:
from gensim.models import KeyedVectors

# Path to the binary GoogleNews word2vec vectors shipped with the competition data.
news_path = '../input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
# `embeddings_index` is used below for membership tests and vector lookup by word;
# note that the same name is rebound to a plain dict of GloVe vectors later in the notebook.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [6]:
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Measure how much of a vocabulary is covered by a set of embeddings.

    Prints the share of distinct words that have a vector and the share of
    all token occurrences those words account for.

    :param vocab: dict mapping word -> occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]`` that
        raises ``KeyError`` for unknown words (e.g. a dict or gensim KeyedVectors)
    :return: list of (word, count) pairs for out-of-vocabulary words,
        most frequent first
    """
    known = {}
    oov = {}
    covered = 0
    missed = 0
    for word in tqdm(vocab):
        try:
            known[word] = embeddings_index[word]
            covered += vocab[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # (including KeyboardInterrupt) must propagate instead of being
            # silently counted as OOV.
            oov[word] = vocab[word]
            missed += vocab[word]

    # Guard the ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    vocab_share = len(known) / len(vocab) if vocab else 0.0
    total_tokens = covered + missed
    text_share = covered / total_tokens if total_tokens else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    # Ascending sort followed by a reversal keeps the original tie ordering.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [7]:
# Baseline coverage of the raw, whitespace-split vocabulary.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 508823/508823 [00:01<00:00, 299411.48it/s]

Found embeddings for 24.31% of vocab
Found embeddings for  78.75% of all text





In [8]:
# Ten most frequent out-of-vocabulary tokens (check_coverage returns them sorted by count, descending).
oov[:10]

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

In [9]:
# Check whether individual punctuation marks have their own vector;
# clean_text below strips exactly those that do not.
print('?' in embeddings_index)
print('&' in embeddings_index)

False
True


In [10]:
def clean_text(x):
    """
    Remove punctuation characters that have no entry in the module-level
    `embeddings_index`; characters that do have an entry are left in place.

    :param x: value to clean (converted with ``str`` first)
    :return: the cleaned string
    """
    text = str(x)
    candidates = '&?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'
    for symbol in candidates:
        if symbol in embeddings_index:
            # The embedding knows this symbol, so keep it in the text.
            continue
        text = text.replace(symbol, '')
    return text

In [11]:
# Strip unsupported punctuation from the training questions, then recount tokens.
train_df["question_text"] = train_df["question_text"].apply(clean_text)
sentences = train_df["question_text"].apply(lambda question: question.split())
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:05<00:00, 249320.25it/s]


In [12]:
# Coverage after punctuation removal.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 302295/302295 [00:01<00:00, 278892.90it/s]


Found embeddings for 48.56% of vocab
Found embeddings for  89.38% of all text


In [13]:
import re

def clean_numbers(x):
    """
    Mask runs of digits with '#' placeholders of matching length.

    Runs of five or more digits collapse to '#####'; runs of four, three and
    two digits become '####', '###' and '##'. Single digits are left alone.

    :param x: input string
    :return: the string with digit runs masked
    """
    # Order matters: the longest runs must be replaced before shorter patterns
    # get a chance to split them up.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, placeholder in masks:
        x = re.sub(pattern, placeholder, x)
    return x

In [14]:
# Mask digit runs in the training questions, then recount tokens.
train_df["question_text"] = train_df["question_text"].apply(clean_numbers)
sentences = train_df["question_text"].apply(lambda question: question.split())
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:05<00:00, 252764.74it/s]


In [15]:
# Coverage after masking numbers.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 288730/288730 [00:01<00:00, 281314.80it/s]

Found embeddings for 51.32% of vocab
Found embeddings for  90.09% of all text





In [16]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Spellings to rewrite before tokenizing: British -> American spellings,
# contractions written without an apostrophe, and a few app names that are
# collapsed to the generic phrase 'social medium'.
# NOTE(review): the key 'citicise' looks like it may be a typo for 'criticise' — confirm.
# NOTE(review): all keys are lower-case and the pattern is compiled without
# re.IGNORECASE, so capitalised occurrences are not rewritten — confirm this is intended.
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
# Module-level lookup table and compiled pattern used by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """
    Rewrite every substring of `text` matched by the module-level
    `mispellings_re` pattern with its entry in the `mispellings` table.

    :param text: input string
    :return: the string with all matched spellings replaced
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

In [17]:
# Normalise the listed spellings in the training questions.
train_df["question_text"] = train_df["question_text"].apply(replace_typical_misspell)
sentences = train_df["question_text"].apply(lambda question: question.split())
# Drop these four tokens from the coverage statistics only; the question text
# itself is left unchanged.
to_remove = ['a','to','of','and']
sentences = [
    [word for word in sentence if word not in to_remove]
    for sentence in tqdm(sentences)
]
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:05<00:00, 230837.69it/s]
100%|██████████| 1306122/1306122 [00:04<00:00, 275967.51it/s]


In [18]:
# Coverage after spelling normalisation and stop-word removal.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 288659/288659 [00:01<00:00, 279486.85it/s]

Found embeddings for 51.33% of vocab
Found embeddings for  98.42% of all text





### Split, tokenize and pad data

In [19]:
# Hold out 10% of the (already cleaned) training questions for validation.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

## some config values
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use
threshold = 0.35 # probability cut-off used by the per-epoch F1 callback

## fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
# The training frame (and therefore the validation split) went through
# clean_text -> clean_numbers -> replace_typical_misspell above, but the test
# questions never did. Apply the same pipeline here so the tokenizer sees
# identically preprocessed text at train and inference time. The result goes
# into a new Series so test_df itself stays untouched.
# NOTE: clean_text consults the global `embeddings_index`, which is still the
# word2vec index at this point in the notebook, exactly as it was for train.
test_text = test_df["question_text"].apply(
    lambda question: replace_typical_misspell(clean_numbers(clean_text(question)))
)
test_X = test_text.fillna("_na_").values

## Tokenize the sentences
# The vocabulary is fitted on the training split only, then reused for val/test.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [20]:
# Shuffle the train and validation rows with a fixed seed; features and labels
# of a split are indexed with the same permutation so they stay aligned.
np.random.seed(2018)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))

train_X, train_y = train_X[trn_idx], train_y[trn_idx]
val_X, val_y = val_X[val_idx], val_y[val_idx]

In [21]:
class F1Evaluation(Callback):
    """
    Keras callback that prints the validation F1 score at the end of epochs.

    Predictions are binarised with the module-level `threshold` before scoring.

    :param validation_data: ``(X_val, y_val)`` pair to score against
    :param interval: score every `interval`-th epoch (epochs are 0-indexed)
    """
    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. The previous
        # super(Callback, self) started the lookup *after* Callback in the MRO
        # and therefore skipped Callback.__init__ entirely.
        super(F1Evaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None rather than a shared mutable dict; it is not
        # read here.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            y_pred = (y_pred > threshold).astype(int)
            score = f1_score(self.y_val, y_pred)
            print("\n F1 Score - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [22]:
# Baseline network: the embedding layer gets no pretrained weights and is
# learned from scratch.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
# Max over the time axis collapses the sequence to one vector per question.
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 256)          440320    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          164864    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
__________

In [23]:
# One callback instance, bound to the validation split, is reused by every model.fit call in this notebook.
F1_Score = F1Evaluation(validation_data=(val_X, val_y), interval=1)
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y),callbacks=[F1_Score], verbose=2)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
 - 194s - loss: 0.1252 - acc: 0.9518 - val_loss: 0.1100 - val_acc: 0.9554

 F1 Score - epoch: 1 - score: 0.634384 

Epoch 2/2
 - 186s - loss: 0.1011 - acc: 0.9594 - val_loss: 0.1080 - val_acc: 0.9570

 F1 Score - epoch: 2 - score: 0.651665 



<keras.callbacks.History at 0x7fa02441e128>

In [24]:
# Keep this model's validation and test probabilities for the final blend.
pred_noemb = model.predict([val_X], batch_size=1024, verbose=1)
pred_noemb_test = model.predict([test_X], batch_size=1024, verbose=1)



In [25]:
# Drop the references to the first model and its graph tensors, then force a
# garbage-collection pass before the next model is built.
del model, inp, x
import gc; gc.collect()
# NOTE(review): the 10 s pause presumably gives the runtime time to release memory — confirm it is needed.
time.sleep(10)

In [26]:
class Attention(Layer):
    """
    Attention pooling over the time axis of a 3-D input.

    A weight is computed per timestep from a learned vector `W` (plus an
    optional per-timestep bias `b`), the weights are normalised to sum to one
    along axis 1, and the layer returns the weighted sum of the input over
    that axis, i.e. a tensor of shape ``(batch, features_dim)``.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        :param step_dim: number of timesteps the scores are reshaped to in `call`
            (presumably must equal the input's second dimension — the bias built
            from ``input_shape[1]`` is added to a ``(-1, step_dim)`` tensor)
        :param W_regularizer: regularizer spec for `W`, resolved via ``regularizers.get``
        :param b_regularizer: regularizer spec for `b`, resolved via ``regularizers.get``
        :param W_constraint: constraint spec for `W`, resolved via ``constraints.get``
        :param b_constraint: constraint spec for `b`, resolved via ``constraints.get``
        :param bias: whether to create and add the per-timestep bias
        :param kwargs: forwarded to ``Layer.__init__``
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Placeholder; the real value is taken from the input shape in build().
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create `W` (one weight per feature) and, if enabled, `b` (one bias per timestep)."""
        assert len(input_shape) == 3

        # Shape is passed positionally here while `name` is a keyword.
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to downstream layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (-1, features_dim), project every row onto W, then fold
        # the resulting scores back into (-1, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        # exp followed by the division below is a softmax over axis 1 that
        # also lets masked positions be zeroed out in between.
        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite if every weight in a row is zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the weights broadcast against x's feature axis.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first input dimension and has `features_dim` features."""
        return input_shape[0],  self.features_dim

In [27]:
# List the embedding directories shipped with the competition data.
!ls ../input/quora-insincere-questions-classification/embeddings/

GoogleNews-vectors-negative300	paragram_300_sl999
glove.840B.300d			wiki-news-300d-1M


In [28]:
EMBEDDING_FILE = '../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt'
def get_coefs(word,*arr):
    """Turn the fields of one embedding line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')

# Read inside a context manager so the file handle is closed deterministically,
# and pin the encoding instead of relying on the platform default.
with open(EMBEDDING_FILE, encoding='utf8') as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file)

# np.stack expects a real sequence; passing the dict view directly is
# deprecated by NumPy and emits a warning.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
# Rows without a pretrained vector keep a random draw that matches the overall
# mean/std of the pretrained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # The matrix has nb_words rows, so bound the index by nb_words. This is
    # identical to max_features whenever the vocabulary is at least that
    # large, and avoids an IndexError when it is smaller.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

  if (await self.run_code(code, result,  async_=asy)):


In [29]:
# Same architecture as the baseline, but the embedding layer is initialised
# from `embedding_matrix` and stays trainable (no trainable=False).
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
# Max over the time axis collapses the sequence to one vector per question.
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 256)          440320    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100, 128)          164864    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
__________

In [30]:
# Train the GloVe-initialised model; F1_Score is the callback instance created for the first model.
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y),callbacks=[F1_Score], verbose=2)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
 - 190s - loss: 0.1142 - acc: 0.9549 - val_loss: 0.1007 - val_acc: 0.9592

 F1 Score - epoch: 1 - score: 0.673047 

Epoch 2/2
 - 187s - loss: 0.0944 - acc: 0.9624 - val_loss: 0.0998 - val_acc: 0.9602

 F1 Score - epoch: 2 - score: 0.678911 



<keras.callbacks.History at 0x7fa02445f7b8>

In [31]:
# Keep this model's validation and test probabilities for the final blend.
pred_glove = model.predict([val_X], batch_size=1024, verbose=1)
pred_glove_test = model.predict([test_X], batch_size=1024, verbose=1)



In [32]:
# Attention model: frozen GloVe-initialised embeddings feeding two stacked
# bidirectional LSTMs. `x` is the first LSTM's output, `y` the second's.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
x = SpatialDropout1D(0.1)(x)
x = Bidirectional(CuDNNLSTM(40, return_sequences=True))(x)
y = Bidirectional(CuDNNLSTM(40, return_sequences=True))(x)

# Four sequence summaries: attention over each LSTM output, plus average and
# max pooling over the second LSTM's output.
atten_1 = Attention(maxlen)(x) # skip connect
atten_2 = Attention(maxlen)(y)
avg_pool = GlobalAvgPool1D()(y)
max_pool = GlobalMaxPool1D()(y)

conc = concatenate([atten_1, atten_2, avg_pool, max_pool])
conc = Dense(16, activation="relu")(conc)
conc = Dropout(0.1)(conc)
outp = Dense(1, activation="sigmoid")(conc)    

model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])

In [33]:
# Train the attention model with the same settings and callback as the earlier models.
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y),callbacks=[F1_Score], verbose=2)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
 - 118s - loss: 0.1238 - acc: 0.9523 - val_loss: 0.1062 - val_acc: 0.9576

 F1 Score - epoch: 1 - score: 0.656083 

Epoch 2/2
 - 115s - loss: 0.1074 - acc: 0.9579 - val_loss: 0.1018 - val_acc: 0.9584

 F1 Score - epoch: 2 - score: 0.668099 



<keras.callbacks.History at 0x7fa214b26c88>

In [34]:
# Keep this model's validation and test probabilities for the final blend.
pred_glove_attntn = model.predict([val_X], batch_size=1024, verbose=1)
pred_glove_attntn_test = model.predict([test_X], batch_size=1024, verbose=1)



In [35]:
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D

# Text CNN: four parallel convolution branches, one per n-gram width.
filter_sizes = [1,2,3,5]
num_filters = 42

inp = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = SpatialDropout1D(0.4)(x)
# Add a trailing channel axis so the embedded sequence can be fed to Conv2D.
x = Reshape((maxlen, embed_size, 1))(x)

# Every kernel spans the full embedding width, so each branch slides only
# along the token axis over windows of filter_sizes[k] tokens.
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size),
                             kernel_initializer='he_normal', activation='tanh')(x)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size),
                             kernel_initializer='he_normal', activation='tanh')(x)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), 
                             kernel_initializer='he_normal', activation='tanh')(x)
conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[3], embed_size),
                             kernel_initializer='he_normal', activation='tanh')(x)

# The pool height (maxlen - filter_size + 1) is presumably the full height of
# each conv output with Conv2D's default padding, making this max-over-time.
maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1))(conv_0)
maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1))(conv_1)
maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1))(conv_2)
maxpool_3 = MaxPool2D(pool_size=(maxlen - filter_sizes[3] + 1, 1))(conv_3)

# Join the four branches and flatten them into one feature vector.
z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])   
z = Flatten()(z)
z = Dropout(0.1)(z)

outp = Dense(1, activation="sigmoid")(z)

model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


In [36]:
# Train the CNN with the same settings and callback as the earlier models.
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y),callbacks=[F1_Score], verbose=2)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
 - 116s - loss: 0.1206 - acc: 0.9524 - val_loss: 0.1067 - val_acc: 0.9574

 F1 Score - epoch: 1 - score: 0.646580 

Epoch 2/2
 - 113s - loss: 0.1023 - acc: 0.9592 - val_loss: 0.1042 - val_acc: 0.9586

 F1 Score - epoch: 2 - score: 0.661499 



<keras.callbacks.History at 0x7fa189eaa198>

In [37]:
# Keep this model's validation and test probabilities for the final blend.
pred_CNN = model.predict([val_X], batch_size=1024, verbose=1)
pred_cnn_test = model.predict([test_X], batch_size=1024, verbose=1)



In [38]:
# Equal-weight blend of the four models' validation probabilities.
pred_final = 0.25*pred_noemb+0.25*pred_glove+0.25*pred_glove_attntn+0.25*pred_CNN
# Report validation F1 for every cut-off from 0.10 to 0.50 in steps of 0.01.
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    blended_labels = (pred_final>thresh).astype(int)
    blended_f1 = metrics.f1_score(val_y, blended_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, blended_f1))

F1 score at threshold 0.1 is 0.6091982910278965
F1 score at threshold 0.11 is 0.6203341780851338
F1 score at threshold 0.12 is 0.6287645974185618
F1 score at threshold 0.13 is 0.6362456940902788
F1 score at threshold 0.14 is 0.6418325060339726
F1 score at threshold 0.15 is 0.6483103879849813
F1 score at threshold 0.16 is 0.6520181063749528
F1 score at threshold 0.17 is 0.657686592205551
F1 score at threshold 0.18 is 0.6617732840754423
F1 score at threshold 0.19 is 0.6661411115490737
F1 score at threshold 0.2 is 0.6689641394466087
F1 score at threshold 0.21 is 0.6713945595461224
F1 score at threshold 0.22 is 0.6752193770205779
F1 score at threshold 0.23 is 0.6778216586988152
F1 score at threshold 0.24 is 0.6795635443571768
F1 score at threshold 0.25 is 0.6802793623713813
F1 score at threshold 0.26 is 0.682046030291597
F1 score at threshold 0.27 is 0.6826366910400524
F1 score at threshold 0.28 is 0.6842628382109333
F1 score at threshold 0.29 is 0.6846544885760573
F1 score at threshold 0.

In [39]:
# Can also try doing regression to get weights
# Equal-weight blend of the four models' test probabilities (same weights as
# the validation blend `pred_final`).
pred_final_test = 0.25*pred_noemb_test+0.25*pred_glove_test+0.25*pred_glove_attntn_test+0.25*pred_cnn_test

# Select the best threshold programmatically instead of copying a number from
# the printed sweep: a hard-coded value silently goes stale whenever the models
# are retrained. The candidate grid is the same one the sweep uses.
candidate_thresholds = np.round(np.arange(0.1, 0.501, 0.01), 2)
val_f1_scores = [metrics.f1_score(val_y, (pred_final > cutoff).astype(int))
                 for cutoff in candidate_thresholds]
best_threshold = float(candidate_thresholds[int(np.argmax(val_f1_scores))])
print("Using threshold {0} (validation F1 {1})".format(best_threshold, max(val_f1_scores)))

pred_final_test = (pred_final_test > best_threshold).astype(int)
out_df = pd.DataFrame({"qid":test_df["qid"].values})
# model.predict returns an (N, 1) array; flatten it so the column receives a
# plain 1-D vector.
out_df['prediction'] = pred_final_test.ravel()
out_df.to_csv("submission.csv", index=False)