# Quora Duplicate Questions classification
###  Kaggle competition

## Exploration & Data preparation

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import string
import nltk
from nltk.metrics import *
from time import time

## Import dataset

In [2]:
df_train_base = pd.read_csv('data/train.csv')
df_train_base.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


### Clean the dataset

In [3]:
%%time
#punctable =str.maketrans({key: "" for key in string.punctuation})
stopwords = nltk.corpus.stopwords.words('english')
for qcol in ['question1','question2']:
    df_train_base['act_len_' + qcol ] = df_train_base[qcol].apply(lambda x: len(str(x).split()))
    #df_train_base[qcol] = df_train_base[qcol].apply(lambda x: str(x).lower().translate(punctable))
    df_train_base[qcol] = df_train_base[qcol].apply(lambda x: str(x).lower().translate(None, string.punctuation))
    df_train_base[qcol] = df_train_base[qcol].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
    df_train_base['imp_len_' + qcol ] = df_train_base[qcol].apply(lambda x: len(str(x).split()))
    #Binary encoding for the type of question - {'what','how','why','where','which'}
    for qtype in ['what','why','where','how','which']:
        df_train_base[qtype+'_'+qcol] = df_train_base[qcol].apply(lambda x: (qtype in str(x).lower())*1)



CPU times: user 1min 19s, sys: 976 ms, total: 1min 20s
Wall time: 1min 24s


In [4]:
%%time
question1 = df_train_base[['qid1','question1']].drop_duplicates().rename(columns={'qid1':'qid','question1':'question'})
question1['type'] = 1
question2 = df_train_base[['qid2','question2']].drop_duplicates().rename(columns={'qid2':'qid','question2':'question'})
question2['type'] = 2
allquestions = pd.concat([question1,question2])
print (allquestions.shape)
print (allquestions.head())

(590018, 3)
   qid                                           question  type
0    1          step step guide invest share market india     1
1    3                    story kohinoor kohinoor diamond     1
2    5       increase speed internet connection using vpn     1
3    7                              mentally lonely solve     1
4    9  one dissolve water quikly sugar salt methane c...     1
CPU times: user 442 ms, sys: 115 ms, total: 558 ms
Wall time: 571 ms


### tf-idf vectorizer

In [5]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.9,max_features=2000,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(allquestions.question)

CPU times: user 10.3 s, sys: 307 ms, total: 10.6 s
Wall time: 11.3 s


In [6]:
def sparse_max_row(csr_mat):
    ret = np.maximum.reduceat(csr_mat.data, csr_mat.indptr[:-1])
    ret[np.diff(csr_mat.indptr) == 0] = 0
    return ret

def min_sparse(X):
    if len(X.data) == 0:
        return 0
    m = X.data.min()
    return m if X.getnnz() == X.size else min(m, 0)

allquestions['tfidf_score'] = tfidf.sum(axis=1)
allquestions['tfidf_max'] =sparse_max_row(tfidf)
allquestions['tfidf_min'] =[min_sparse(tfidf.getrow(i)) for i in range(tfidf.shape[0])]

In [7]:
allquestions.head()

Unnamed: 0,qid,question,type,tfidf_score,tfidf_max,tfidf_min
0,1,step step guide invest share market india,1,1.998898,0.788155,0.208347
1,3,story kohinoor kohinoor diamond,1,1.0,1.0,1.0
2,5,increase speed internet connection using vpn,1,2.22027,0.537069,0.372623
3,7,mentally lonely solve,1,1.0,1.0,1.0
4,9,one dissolve water quikly sugar salt methane c...,1,1.985718,0.540787,0.393676


## Feature engineering

In [77]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def get_dist(sent1, sent2, func):
    # to take care of division by zero errors
    #print(sent1)
    try:
        res = func(set(sent1.split()), set(sent2.split()))
    except:
        res=0
    return res

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def get_word_match_share(sent1, sent2, get_stems = True):
    res = 0
    words1 = sent1.split()
    words2 = sent2.split()
    if (len(words1) + len(words2)) >0:
        res = 2*len(set(words1).intersection(set(words2)))/(len(words1) + len(words2))
    if get_stems:
        try:
            words1 = stem_tokens(words1, stemmer)
            words2 = stem_tokens(words2, stemmer)    
            if (len(words1) + len(words2)) >0:
                res = 2*len(set(words1).intersection(set(words2)))/(len(words1) + len(words2))
        except:
            pass
    return res

In [78]:
features = ['act_len_diff','imp_len_diff','tfidf_score_diff','word_match_share'
            ,'jaccard_dist','masi_dist'
            ,'what_diff','why_diff','where_diff','how_diff','which_diff']

questions = ['question1','question2']
fields = ['tfidf_score','tfidf_max','tfidf_min','what','why','where','how','which']
features = features + [field + '_' + question for field in fields for question in questions]
y_col = 'is_duplicate'

print(features)

['act_len_diff', 'imp_len_diff', 'tfidf_score_diff', 'word_match_share', 'jaccard_dist', 'masi_dist', 'what_diff', 'why_diff', 'where_diff', 'how_diff', 'which_diff', 'tfidf_score_question1', 'tfidf_score_question2', 'tfidf_max_question1', 'tfidf_max_question2', 'tfidf_min_question1', 'tfidf_min_question2', 'what_question1', 'what_question2', 'why_question1', 'why_question2', 'where_question1', 'where_question2', 'how_question1', 'how_question2', 'which_question1', 'which_question2']


In [79]:
df_train = pd.merge(df_train_base, allquestions[allquestions.type==1], left_on='qid1',right_on='qid')
df_train = pd.merge(df_train, allquestions[allquestions.type==2], left_on='qid2',right_on='qid', suffixes=('_question1','_question2'))
df_train['jaccard_dist'] = df_train.apply(lambda row: get_dist(row['question_question1'], 
                                                        row['question_question2'], jaccard_distance)
                                                         , axis=1)
df_train['masi_dist'] = df_train.apply(lambda row: get_dist(row['question_question1'], 
                                                        row['question_question2'], masi_distance)
                                                         , axis=1)
df_train['word_match_share'] = df_train.apply(lambda row: get_word_match_share(row['question_question1'], row['question_question2']), axis=1)

for col in ['act_len','imp_len','tfidf_score']:
    df_train[col+'_diff'] = df_train[col+'_question1'] - df_train[col+'_question2']
for qtype in ['what','why','where','how','which']:
     df_train[qtype + '_'  + 'diff'] = (df_train[qtype+'_question1'] != df_train[qtype + '_question2']).astype(int)
print (df_train[features + [y_col]].head())

   act_len_diff  imp_len_diff  tfidf_score_diff  word_match_share  \
0             2             1          0.168171          0.769231   
1            -5            -5         -0.727128          0.307692   
2             1             1          0.000000          0.285714   
3             0             0         -0.413921          0.500000   
4             2            -1         -0.387095          0.222222   

   jaccard_dist  masi_dist  what_diff  why_diff  where_diff  how_diff  \
0      0.166667   0.441667          0         0           0         0   
1      0.777778   0.926667          0         0           0         0   
2      1.000000   1.000000          0         0           0         0   
3      0.600000   0.868000          0         0           0         0   
4      0.857143   0.952857          0         0           0         0   

       ...       what_question2  why_question1  why_question2  \
0      ...                    0              0              0   
1      ...      

### Train - test dataset split
70:30 split for train and test. Test set will only be used for validation. Train set may be further split for further cross validation

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
seed= 32
X_train, X_test, Y_train, Y_test = train_test_split(df_train[features], df_train[y_col], test_size=0.3,random_state=seed)

### Run a test classifier to establish baseline performance

In [81]:
%%time
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, Y_train)

CPU times: user 1min 36s, sys: 1.57 s, total: 1min 38s
Wall time: 1min 40s


In [82]:
preds = clf.predict(X_test)
pred_proba = clf.predict_proba(X_test)
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
print('Accuracy = %1.5f' % accuracy_score(Y_test, preds))
print('ROC AUC = %1.5f' % roc_auc_score(Y_test, preds))
print('Log Loss = %1.5f' % log_loss(Y_test, pred_proba))

Accuracy = 0.74513
ROC AUC = 0.72612
Log Loss = 0.50675


In [83]:
sorted(zip(clf.feature_importances_, features), reverse=True)[:10]

[(0.12446123074336882, 'jaccard_dist'),
 (0.10845770101023272, 'masi_dist'),
 (0.093708665726030885, 'word_match_share'),
 (0.083858870213419257, 'tfidf_score_diff'),
 (0.081898587281253779, 'tfidf_min_question1'),
 (0.081159800417174133, 'tfidf_max_question1'),
 (0.080938690931960072, 'tfidf_min_question2'),
 (0.08093734183512559, 'tfidf_score_question1'),
 (0.080915737851055769, 'tfidf_score_question2'),
 (0.080483083124224641, 'tfidf_max_question2')]

## Save the training data into a pickle for next stage

In [84]:
X_train.to_pickle('data/X_train.pkl')
Y_train.to_pickle('data/Y_train.pkl')
X_test.to_pickle('data/X_test.pkl')
Y_test.to_pickle('data/Y_test.pkl')

In [85]:
df_train[features + [y_col]].to_pickle('data/df_train.pkl')

## Prepare the validation set with the same transformations

In [95]:
df_test_base = pd.read_csv('data/test.csv')
df_test_base.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [98]:
%%time
#punctable =str.maketrans({key: "" for key in string.punctuation})
stopwords = nltk.corpus.stopwords.words('english')
for qcol in ['question1','question2']:
    df_test_base['act_len_' + qcol ] = df_test_base[qcol].apply(lambda x: len(str(x).split()))
    #df_test_base[qcol] = df_test_base[qcol].apply(lambda x: str(x).lower().translate(punctable))
    df_test_base[qcol] = df_test_base[qcol].apply(lambda x: str(x).lower().translate(None, string.punctuation))
    df_test_base[qcol] = df_test_base[qcol].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
    df_test_base['imp_len_' + qcol ] = df_test_base[qcol].apply(lambda x: len(str(x).split()))
    for qtype in ['what','why','where','how','which']:
        df_test_base[qtype+'_'+qcol] = df_test_base[qcol].apply(lambda x: (qtype in str(x).lower())*1)



CPU times: user 6min 2s, sys: 4.92 s, total: 6min 7s
Wall time: 6min 8s


In [88]:
%%time
question1 = df_test_base[['test_id','question1']].drop_duplicates().rename(columns={'test_id':'qid','question1':'question'})
question1['type'] = 1
question2 = df_test_base[['test_id','question2']].drop_duplicates().rename(columns={'test_id':'qid','question2':'question'})
question2['type'] = 2
allquestions_test = pd.concat([question1,question2])
print (allquestions_test.shape)
print (allquestions_test.head())

(4691592, 3)
   qid                                question  type
0    0          surface pro 4 compare ipad pro     1
1    1  hair transplant age 24 much would cost     1
2    2            best way send money china us     1
3    3                        food emulsifiers     1
4    4               aberystwyth start reading     1
CPU times: user 3.32 s, sys: 664 ms, total: 3.98 s
Wall time: 3.99 s


In [89]:
tfidf = tfidf_vectorizer.transform(allquestions_test.question)

In [90]:
allquestions_test['tfidf_score'] = tfidf.sum(axis=1)
allquestions_test['tfidf_max'] = sparse_max_row(tfidf)
allquestions_test['tfidf_min'] = [min_sparse(tfidf.getrow(i)) for i in range(tfidf.shape[0])]

In [91]:
allquestions.head()

Unnamed: 0,qid,question,type,tfidf_score,tfidf_max,tfidf_min
0,1,step step guide invest share market india,1,1.998898,0.788155,0.208347
1,3,story kohinoor kohinoor diamond,1,1.0,1.0,1.0
2,5,increase speed internet connection using vpn,1,2.22027,0.537069,0.372623
3,7,mentally lonely solve,1,1.0,1.0,1.0
4,9,one dissolve water quikly sugar salt methane c...,1,1.985718,0.540787,0.393676


In [99]:
%%time
df_test = pd.merge(df_test_base, allquestions_test[allquestions_test.type==1], left_on='test_id',right_on='qid', how='left')
df_test = pd.merge(df_test, allquestions_test[allquestions_test.type==2], left_on='test_id',right_on='qid', suffixes=('_question1','_question2'), how='left')
df_test['jaccard_dist'] = df_test.apply(lambda row: get_dist(row['question_question1'], 
                                                        row['question_question2'], jaccard_distance)
                                                         , axis=1)
df_test['masi_dist'] = df_test.apply(lambda row: get_dist(row['question_question1'], 
                                                        row['question_question2'], masi_distance)
                                                         , axis=1)
df_test['word_match_share'] = df_test.apply(lambda row: get_word_match_share(row['question_question1'], row['question_question2']), axis=1)

for col in ['act_len','imp_len','tfidf_score']:
    df_test[col+'_diff'] = df_test[col+'_question1'] - df_test[col+'_question2']
for qtype in ['what','why','where','how','which']:
     df_test[qtype + '_'  + 'diff'] = (df_test[qtype+'_question1'] != df_test[qtype + '_question2']).astype(int)

df_test[features].head()

CPU times: user 12min 40s, sys: 18.1 s, total: 12min 58s
Wall time: 12min 55s


In [104]:
df_test[features].to_pickle('data/df_test.pkl')