# Quora

https://www.kaggle.com/c/quora-insincere-questions-classification

#### Libraries

In [1]:
# Native
import os
import string
import timeit
import gc
import re

# Data
import pandas as pd
import numpy as np
import scipy as sp

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import gensim

# modeling
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD

# RNN
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Bidirectional, Dropout
from keras.callbacks import EarlyStopping
import math


# XGB
import xgboost as xgb
import lightgbm as lgb

# cython
import Cython
%load_ext Cython

# Viz
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


#### Load data

In [2]:
BASEDIR = '/Users/brandonshurick/Kaggles/quora-insincere-questions-classification'

In [3]:
train = pd.read_csv(os.path.join(BASEDIR, 'train.csv.zip'))
test = pd.read_csv(os.path.join(BASEDIR, 'test.csv.zip'))

In [4]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


#### Word2Vec model

#### Load embeddings

In [None]:
from tqdm import tqdm 
embeddings_index = {}
embeddings_index2 = {}
embeddings_index3 = {}

f = open(os.path.join(BASEDIR,'glove.840B.300d/glove.840B.300d.txt'))
f2 = open(os.path.join(BASEDIR,'paragram_300_sl999/paragram_300_sl999.txt'), encoding='latin') 
f3 = open(os.path.join(BASEDIR,'wiki-news-300d-1M/wiki-news-300d-1M.vec'), encoding='latin')

# # glove
for line in tqdm(f):
    values = line.split(" ")
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

# paragram
for line in tqdm(f2):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index2[word] = coefs
        
# wikinews
for line in tqdm(f3):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    if len(coefs) == 300:
        embeddings_index3[word] = coefs

f.close()
f2.close()
f3.close()

2196017it [02:12, 16536.49it/s]
1703756it [01:44, 16381.83it/s]
999995it [00:57, 17288.25it/s]


#### Concatenate all embeddings

In [None]:
x = pd.DataFrame(embeddings_index).T
x2 = pd.DataFrame(embeddings_index2).T
x3 = pd.DataFrame(embeddings_index3).T

In [None]:
x2.columns = np.arange(300,600)
x3.columns = np.arange(600,900)

In [None]:
embeddings = pd.concat((x,x2,x3),axis=1,sort=False).fillna(0)

In [None]:
embeddings.shape

(3144144, 900)

In [None]:
embeddings_index = dict(zip(embeddings.index, embeddings.values))

In [None]:
del x; del x2; del x3
del embeddings

In [None]:
gc.collect()

21

#### Custom tokenize function

In [None]:
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

def tokenize(text):
    
    # Split up words
    trans_table = {ord(c): None for c in string.punctuation + string.digits}  
    tokens = [
        word 
        for word in nltk.word_tokenize(text.translate(trans_table)) 
        if len(word) > 1
        and word not in stop_words
    ]
    
    # Stem
    stems = [ stemmer.stem(item) for item in tokens if item not in stop_words ]
    return stems

In [None]:
tokenize('This is only a test sentence and it is horrible crap.')

['thi', 'test', 'sentenc', 'horribl', 'crap']

#### TDIDF matrix

### Stacked modeling with kfold

In [None]:
%%cython
cimport numpy as np
import numpy as np
import re

# embedding first 30 words 
cpdef np.ndarray text_to_array(str text, dict embeddings_index):
    cdef np.ndarray empyt_emb
    cdef list text_block
    empyt_emb = np.zeros(900)
    text_block = re.sub('([.,!?()\'"])', r' \1', text).split()[:30]
    embeds = [ embeddings_index.get(x, empyt_emb) for x in text_block ]
    embeds+= [empyt_emb] * (30 - len(embeds))
    return np.array(embeds)

In [None]:
batch_size = 2**6
def batch_gen(X, y):
    n_batches = math.ceil(len(X) / batch_size)
    while True: 
        X = X.sample(frac=1.)  # Shuffle the data.
        y = y[X.index]
        for i in range(n_batches):
            texts = X.iloc[i*batch_size:(i+1)*batch_size]
            text_arr = np.array([text_to_array(text, embeddings_index) for text in texts])
            yield text_arr, np.array(y[i*batch_size:(i+1)*batch_size])

In [None]:
kfd = KFold(n_splits=8, random_state=22)



class NN:
    def __init__(self, layers=64, shape=(30, 300), model_type='GRU'):
        
        if model_type == 'LSTM':
            
            model = Sequential()
            model.add(Bidirectional(LSTM(layers, return_sequences=True),
                                    input_shape=shape))
            model.add(Bidirectional(LSTM(layers)))
            model.add(Dense(1, activation="sigmoid"))
            model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
            self.model = model
        
        elif model_type == 'GRU':
            
            model = Sequential()
            model.add(Bidirectional(GRU(layers, return_sequences=True),
                                    input_shape=shape))
            model.add(Bidirectional(GRU(layers)))
            model.add(Dense(1, activation="sigmoid"))
            model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
            self.model = model


# XGB params
param = {
    'max_depth':4, 
    'eta':0.05, 
    'silent':1, 
    'objective':'binary:logistic' ,
    'colsample_bytree': 0.85 ,
    'subsample': 0.85 ,
    'lambda': 0.01 ,
    'alpha': 0.01 ,
    'eval_metric': 'error',
}
num_round = 1500


# prepare values to split up
X = train.question_text
y = train.target
y_preds = np.zeros((X.shape[0], 6))

model_components = {}

# Loop through K folds and run models
for i, (train_index, test_index) in enumerate(kfd.split(X)):
    # models
    nb = BernoulliNB()
    sgd = SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-2)
    svm = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-2)
    lr = SGDClassifier(loss='log', max_iter=1000, tol=1e-2)
    
    print('\nRunning fold {}\n'.format(i+1))
    
    # kfold splits
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # word vectors for NN 
    print('\tConvert word vectors to set of arrays')
    test_vects = np.array([text_to_array(X_text, embeddings_index) for X_text in X_test])
    
    # RNN
    print('\tModel {}, RNN'.format(6)) 
    rnn = NN(layers=32, shape=(30, 900), model_type='GRU')
    mg = batch_gen(X_train, y_train)
    rnn.model.fit_generator(
        mg, 
        epochs=20,
        shuffle=True,
        steps_per_epoch=1000,
        validation_data=(test_vects, y_test),
        validation_steps=100,
        verbose=True,
#         callbacks=[EarlyStopping()],
    )
    y_preds[test_index, 5] = rnn.model.predict(test_vects).ravel()
#     model_components['rnn'] = rnn
    del rnn
    del test_vects
    del mg
    
    print('TFIDF transform')
    # transform tfidf
    tf = TfidfVectorizer(
        max_features=5000, 
        ngram_range=(1,2), 
        tokenizer=tokenize,
    )
    X_train = tf.fit_transform(X_train)
    X_test = tf.transform(X_test)
#     model_components['tf'] = tf
    if i < 3:
        del tf
    
    # NB
    print('\tModel {}, Naive Bayes'.format(1)) 
    nb.fit(X_train, y_train)
    y_preds[test_index, 0] = nb.predict_proba(X_test)[:, 1]
#     model_components['nb'] = nb
    del nb
    
    # Huber
    print('\tModel {}, Huber'.format(2))
    sgd.fit(X_train, y_train)
    y_preds[test_index, 1] = sgd.predict(X_test)
#     model_components['sgd'] = sgd
    del sgd
    
    # SVM
    print('\tModel {}, SVM'.format(3))
    svm.fit(X_train, y_train)
    y_preds[test_index, 2] = svm.predict(X_test)
#     model_components['svm'] = svm
    del svm
    
    # PCA projection into smaller space
#     print('\tPCA transform')
#     pca = TruncatedSVD(n_components=1000)
#     X_train = pca.fit_transform(X_train)
#     X_test = pca.transform(X_test)
#     model_components['pca'] = pca
    
    # LR
    print('\tModel {}, Logistic Regression'.format(4))
    lr.fit(X_train, y_train)
    y_preds[test_index, 3] = lr.predict_proba(X_test)[:, 1]
#     model_components['lr'] = lr
    del lr
    
    # XGB
#     print('\tModel {}, XGB'.format(5)) 
#     param['scale_pos_weight'] = np.sum(y_train == 0)*1. / np.sum(y_train == 1)
#     dtrain = xgb.DMatrix(X_train, label=y_train)
#     dtest = xgb.DMatrix(X_test, label=y_test)
#     eval_set = [
#         (dtrain,'train'), 
#         (dtest,'test'),
#     ]
#     bst = xgb.train(
#         param, 
#         dtrain, 
#         num_round, 
#         evals=eval_set, 
#         verbose_eval=200, 
#         early_stopping_rounds=50
#     )
#     y_preds[test_index, 4] = bst.predict(dtest)
#     model_components['xgb'] = xgb

    # LGB
    print('\tModel {}, LGB'.format(5)) 
    dtrain = lgb.Dataset(X_train, label=y_train)
    dtest = lgb.Dataset(X_test, label=y_test)
    param = {
        'num_leaves':35, 
        'num_trees':1000, 
        'objective':'binary',
        'min_data_in_leaf': 50 ,
        'bagging_fraction': 0.85 ,
    }
    param['metric'] = 'binary_logloss'
    num_round = 100
    bst = lgb.train(
        param, 
        dtrain, 
        num_round, 
        valid_sets=[dtest], 
        valid_names=['eval'],
        early_stopping_rounds=50,
        verbose_eval=100,
    )
    y_preds[test_index, 4] = bst.predict(X_test)
#     model_components['lgb'] = lgb
    del bst
    
    # gc
    gc.collect()


Running fold 1

	Convert word vectors to set of arrays
	Model 6, RNN
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
TFIDF transform
	Model 1, Naive Bayes
	Model 2, Huber
	Model 3, SVM
	Model 4, Logistic Regression
	Model 5, LGB




Training until validation scores don't improve for 50 rounds.
[100]	eval's binary_logloss: 0.137765
[200]	eval's binary_logloss: 0.131845
[300]	eval's binary_logloss: 0.129138
[400]	eval's binary_logloss: 0.127661
[500]	eval's binary_logloss: 0.126708
[600]	eval's binary_logloss: 0.126074
[700]	eval's binary_logloss: 0.125628
[800]	eval's binary_logloss: 0.125213
[900]	eval's binary_logloss: 0.124961
[1000]	eval's binary_logloss: 0.124781
Did not meet early stopping. Best iteration is:
[999]	eval's binary_logloss: 0.12478

Running fold 2

	Convert word vectors to set of arrays
	Model 6, RNN
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
TFIDF transform
	Model 1, Naive Bayes
	Model 2, Huber
	Model 3, SVM
	Model 4, Logistic Regression
	Model 5, LGB




Training until validation scores don't improve for 50 rounds.
[100]	eval's binary_logloss: 0.140893
[200]	eval's binary_logloss: 0.134888
[300]	eval's binary_logloss: 0.13217
[400]	eval's binary_logloss: 0.130659
[500]	eval's binary_logloss: 0.12971
[600]	eval's binary_logloss: 0.129081
[700]	eval's binary_logloss: 0.128599
[800]	eval's binary_logloss: 0.128179
[900]	eval's binary_logloss: 0.127988
[1000]	eval's binary_logloss: 0.127815
Did not meet early stopping. Best iteration is:
[999]	eval's binary_logloss: 0.127815

Running fold 3

	Convert word vectors to set of arrays
	Model 6, RNN
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
TFIDF transform
	Model 1, Naive Bayes
	Model 2, Huber
	Model 3, SVM
	Model 4, Logistic Regression
	Model 5, LGB




Training until validation scores don't improve for 50 rounds.
[100]	eval's binary_logloss: 0.139349
[200]	eval's binary_logloss: 0.133431
[300]	eval's binary_logloss: 0.130684
[400]	eval's binary_logloss: 0.129209
[500]	eval's binary_logloss: 0.128287
[600]	eval's binary_logloss: 0.127603
[700]	eval's binary_logloss: 0.127113
[800]	eval's binary_logloss: 0.126772
[900]	eval's binary_logloss: 0.126454
[1000]	eval's binary_logloss: 0.126283
Did not meet early stopping. Best iteration is:
[998]	eval's binary_logloss: 0.126277

Running fold 4

	Convert word vectors to set of arrays
	Model 6, RNN
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
TFIDF transform
	Model 1, Naive Bayes
	Model 2, Huber
	Model 3, SVM
	Model 4, Logistic Regression
	Model 5, LGB




Training until validation scores don't improve for 50 rounds.
[100]	eval's binary_logloss: 0.13686
[200]	eval's binary_logloss: 0.131123
[300]	eval's binary_logloss: 0.128528
[400]	eval's binary_logloss: 0.127057
[500]	eval's binary_logloss: 0.12616
[600]	eval's binary_logloss: 0.125454
[700]	eval's binary_logloss: 0.125009
[800]	eval's binary_logloss: 0.124695
[900]	eval's binary_logloss: 0.124377
[1000]	eval's binary_logloss: 0.124132
Did not meet early stopping. Best iteration is:
[1000]	eval's binary_logloss: 0.124132

Running fold 5

	Convert word vectors to set of arrays
	Model 6, RNN
Epoch 1/20
Epoch 2/20

In [None]:
f1_score(y, y_preds[:, 0] > 0.5)

In [None]:
f1_score(y, y_preds[:, 1] > 0.5)

In [None]:
f1_score(y, y_preds[:, 2] > 0.5)

In [None]:
f1_score(y, y_preds[:, 3] > 0.5)

In [None]:
f1_score(y, y_preds[:, 4] > 0.5)

In [None]:
f1_score(y, y_preds[:, 5] > 0.5)

In [None]:
0.653256236576904

### Stack models

In [None]:
kfd2 = KFold(n_splits=4, random_state=23)
y_val = np.zeros(X.shape[0])

param = {
    'max_depth':4 , 
    'eta':0.05 , 
    'silent':1 , 
    'objective':'binary:logistic' ,
    'colsample_bytree': 0.9 ,
    'subsample': 0.9 ,
    'lambda': 0.01 ,
    'alpha': 0.01 ,
    'eval_metric': 'error',
}
num_round = 1500

for i, (train_index, test_index) in enumerate(kfd2.split(X)):
    print('Running fold {}'.format(i+1))

    # kfold splits
    X_train, X_test = X[train_index], X[test_index]
    X_train_preds, X_test_preds = y_preds[train_index, :], y_preds[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    # transform tfidf
    X_train = tf.fit_transform(X_train)
    X_test = tf.transform(X_test)
    
    # append predictions
    X_train = sp.sparse.hstack((X_train, sp.sparse.csr_matrix(X_train_preds))).tocsr()
    X_test = sp.sparse.hstack((X_test, sp.sparse.csr_matrix(X_test_preds))).tocsr()
    
    # stacked model
    param['scale_pos_weight'] = np.sum(y_train == 0)*1. / np.sum(y_train == 1)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    eval_set = [
        (dtrain,'train'), 
        (dtest,'test'),
    ]
    bst = xgb.train(
        param, 
        dtrain, 
        num_round, 
        evals=eval_set, 
        verbose_eval=200,
        early_stopping_rounds=100,
        
    )
    y_val[test_index] = bst.predict(dtest)

In [None]:
roc_auc_score(y, y_val)

In [None]:
f1_score(y, y_val > 0.50) # top leaderboard has f1 score ~0.7+