## Quora Kaggle competition

Welcome to the Quora Question Pairs competition! Here, our goal is to identify which questions asked on Quora, a quasi-forum website with over 100 million visitors a month, are duplicates of questions that have already been asked. This could be useful, for example, to instantly provide answers to questions that have already been answered. We are tasked with predicting whether a pair of questions are duplicates or not, and submitting a binary prediction against the logloss metric.

In [43]:
import numpy as np
import pandas as pd
import os,re
import seaborn as sns
import gensim as gn
import logging
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from gensim.models.word2vec import Word2Vec
import nltk
import scipy.sparse as sparse
from nltk.data import load
from fuzzywuzzy import fuzz
from sklearn import linear_model
import keras.layers as lyr
from keras.models import Model
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
pd.set_option('display.max_columns', 200)

### Training data

In [44]:
df_train = pd.read_csv('./data/train.csv').sample(10000,random_state=44)
df_test = pd.read_csv('./data/test.csv').sample(10000,random_state=44)
df_train.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
273872,273872,392405,392406,What are the pros & cons of democracy?,What are the pros and cons of a democracy?,1
342308,342308,438846,462606,How will Brexit impact the flow of goods and p...,Can a post-Brexit Britain really survive witho...,1
353135,353135,482139,482140,If I got a Buddhist tattoo would I be disrespe...,"Do many Buddhists actually ""throw away the raft""?",0


In [45]:
print('Total number of question pairs for training: {}'.format(len(df_train)))
print('Total number of question pairs for test data: {}'.format(len(df_test)))
print('Duplicate pairs : {} %'.format(round(df_train['is_duplicate'].mean()*100,2)))

Total number of question pairs for training: 10000
Total number of question pairs for test data: 10000
Duplicate pairs : 37.46 %


## Generate features

### Text handcrafted features (fs_1)

In [46]:
def generate_features(df_train):
    df_train['len_q1'] = df_train['question1'].apply(lambda x:len(str(x)))
    df_train['len_q2'] = df_train['question2'].apply(lambda x:len(str(x)))
    df_train['diff_len'] = df_train.len_q1-df_train.len_q2
    df_train['len_char_q1'] = df_train.question1.apply(lambda x:len(''.join(set(str(x).replace(' ','')))))
    df_train['len_char_q2'] = df_train.question2.apply(lambda x:len(''.join(set(str(x).replace(' ','')))))
    df_train['len_word_q1'] = df_train.question1.apply(lambda x:len(str(x).split()))
    df_train['len_word_q2'] = df_train.question2.apply(lambda x:len(str(x).split()))
    df_train['common_words'] = df_train.apply(lambda x:len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))),axis=1)

    df_train['fuzzy_qratio'] = df_train.apply(lambda x: fuzz.QRatio(str(x['question1']),str(x['question2'])),axis=1)
    df_train['fuzzy_wratio'] = df_train.apply(lambda x:fuzz.WRatio(str(x['question1']),str(x['question2'])),axis=1)
    df_train['fuzzy_partial_ratio'] = df_train.apply(lambda x:fuzz.partial_ratio(str(x['question1']),str(x['question2'])),axis=1)
    return df_train

In [47]:
df_train = generate_features(df_train)
df_train.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzzy_qratio,fuzzy_wratio,fuzzy_partial_ratio
273872,273872,392405,392406,What are the pros & cons of democracy?,What are the pros and cons of a democracy?,1,38,42,-4,17,16,8,9,7,92,95,87
342308,342308,438846,462606,How will Brexit impact the flow of goods and p...,Can a post-Brexit Britain really survive witho...,1,105,145,-40,26,30,18,22,7,62,71,71
353135,353135,482139,482140,If I got a Buddhist tattoo would I be disrespe...,"Do many Buddhists actually ""throw away the raft""?",0,85,49,36,22,21,15,8,1,44,86,49


### LDA (Lattent Dirichlet Allocation) features 

In [48]:
# Steaming 
p_stemmer = PorterStemmer()
STOP_WORDS = nltk.corpus.stopwords.words()

def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")
    
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)  
            
    sentence = " ".join(sentence)
    return sentence

def clean_dataframe(data):
    "drop nans, then apply 'clean_sentence' function to question1 and 2"
    data = data.dropna(how="any")
    
    for col in ['question1', 'question2']:
        data[col] = data[col].apply(clean_sentence)
    
    return data

# Function to vuild a corpus
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for col in ['question1', 'question2']:
        for sentence in data[col].iteritems():
            word_list = sentence[1].split(" ")
            corpus.append(word_list)
    return corpus

In [7]:
data = clean_dataframe(df_train)
corpus = build_corpus(data)
dictionary = corpora.Dictionary(corpus)
corpus = [dictionary.doc2bow(text) for text in corpus]
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary)

In [8]:
def common_lda_topic(sentence1,sentence2,dictionary,ldamodel,min_proba):
    "find #common topic based on lattent dirichlet allocation model"
    sentence1 = sentence1.split()
    sentence2 = sentence2.split()

    sentence1 = dictionary.doc2bow(sentence1)
    sentence2 = dictionary.doc2bow(sentence2)
    
    topic_a = ldamodel.get_document_topics(sentence1,minimum_probability=min_proba)
    topic_b = ldamodel.get_document_topics(sentence2,minimum_probability=min_proba)
    
    topic_a = list(sorted(topic_a, key=lambda x: x[1]))
    topic_b = list(sorted(topic_b, key=lambda x: x[1]))
    common_topic = set([x[0] for x in topic_a]).intersection(x[0] for x in topic_b)
    return(len(common_topic))

In [9]:
df_train['common_topics'] = df_train.apply(lambda x:common_lda_topic(str(x['question1']),str(x['question2']),dictionary,ldamodel,0.1),axis=1)
df_train.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzzy_qratio,fuzzy_wratio,fuzzy_partial_ratio,common_topics
273872,273872,392405,392406,What are the pros & cons of democracy?,What are the pros and cons of a democracy?,1,38,42,-4,17,16,8,9,7,92,95,87,1
342308,342308,438846,462606,How will Brexit impact the flow of goods and p...,Can a post-Brexit Britain really survive witho...,1,105,145,-40,26,30,18,22,7,62,71,71,0
353135,353135,482139,482140,If I got a Buddhist tattoo would I be disrespe...,"Do many Buddhists actually ""throw away the raft""?",0,85,49,36,22,21,15,8,1,44,86,49,1


### POS-Tagging features

In [50]:
#Generate all pos-tag columns
tagdict = load('help/tagsets/upenn_tagset.pickle')
pos_tag = tagdict.keys()
for tag in pos_tag:
    df_train[tag] = 0

In [51]:
def common_pos_tagging(question1,question2):
    question1 = nltk.word_tokenize(question1)
    question2 = nltk.word_tokenize(question2)
    pos_question1 = nltk.pos_tag(question1)
    pos_question2 = nltk.pos_tag(question2)

    pos_1_array = [x[1] for x in pos_question1]
    pos_2_array = [x[1] for x in pos_question2]
    return(len(set(pos_1_array).intersection(pos_2_array)))

In [52]:
for index, row in df_train.iterrows():
    question1 = row.question1.decode('utf-8')
    question1 = nltk.word_tokenize(question1)
    pos_question1 = nltk.pos_tag(question1)
    pos_question1 = [x[1] for x in pos_question1]
    
    for tag in pos_question1:
        if(tag != "#"):
            df_train.set_value(index,tag,row[tag]+1)

In [53]:
df_train['common_pos_count'] = df_train.apply(lambda x:common_pos_tagging(str(x['question1']).decode('utf-8'),str(x['question2']).decode('utf-8')),axis=1)
df_train.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzzy_qratio,fuzzy_wratio,fuzzy_partial_ratio,PRP$,VBG,VBD,``,VBN,",",'',VBP,WDT,JJ,WP,VBZ,DT,RP,$,NN,),(,FW,POS,.,TO,LS,RB,:,NNS,NNP,VB,WRB,CC,PDT,RBS,RBR,CD,PRP,EX,IN,WP$,MD,NNPS,--,JJS,JJR,SYM,UH,common_pos_count
273872,273872,392405,392406,What are the pros & cons of democracy?,What are the pros and cons of a democracy?,1,38,42,-4,17,16,8,9,7,92,95,87,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,8
342308,342308,438846,462606,How will Brexit impact the flow of goods and p...,Can a post-Brexit Britain really survive witho...,1,105,145,-40,26,30,18,22,7,62,71,71,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,8
353135,353135,482139,482140,If I got a Buddhist tattoo would I be disrespe...,"Do many Buddhists actually ""throw away the raft""?",0,85,49,36,22,21,15,8,1,44,86,49,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,6


In [12]:
query = 'How will Brexit impact the flow of goods and'
query2 = 'How will Brexit impact the flow of goods and'
query = query.split()
query2 = query2.split()

query = dictionary.doc2bow(query)
query2 = dictionary.doc2bow(query2)

topic_a = ldamodel.get_document_topics(query,minimum_probability=0.1)
topic_b = ldamodel.get_document_topics(query2,minimum_probability=0.1)

topic_a = list(sorted(topic_a, key=lambda x: x[1]))
topic_b = list(sorted(topic_b, key=lambda x: x[1]))
common_topic = set([x[0] for x in topic_a]).intersection(x[0] for x in topic_b)

print("------ ******* SENTENCES TOPIC DISTRIBUTIONS ******* -------")
print(topic_a)
print("\n")
print(topic_b)
print("\n")
print("------ ******* WORD TOPIC DISTRIBUTIONS ******* -------")
print("topic : {} ======> {} \n".format(topic_a[-1][0],ldamodel.print_topic(topic_a[0][0])))
print("topic : {} ======> {} \n".format(topic_b[-1][0],ldamodel.print_topic(topic_b[0][0])))
print("------ ******* COMMON TOPICS ******* -------")
print("#common topic : {} ======> {}".format(common_topic,len(common_topic)))


------ ******* SENTENCES TOPIC DISTRIBUTIONS ******* -------
[(21, 0.25833306419026431), (23, 0.2583332845892552), (28, 0.25833365122047008)]


[(21, 0.25833306129282602), (23, 0.25833328458926808), (28, 0.25833365411779446)]


------ ******* WORD TOPIC DISTRIBUTIONS ******* -------


------ ******* COMMON TOPICS ******* -------


In [22]:
question1 = nltk.word_tokenize("Messi is the best player in the word, he is wonderful")
question2 = nltk.word_tokenize("What are the pros and cons of a democracy?")
pos_question1 = nltk.pos_tag(question1)
pos_question2 = nltk.pos_tag(question2)

pos_1_array = [x[1] for x in pos_question1]
pos_2_array = [x[1] for x in pos_question2]

print("POS Question for 1 is {} : {}".format(len(pos_1_array),pos_1_array))
print("POS Question for 2 is {} : {}".format(len(pos_2_array),pos_2_array))


POS Question for 1 is 12 : ['NNP', 'VBZ', 'DT', 'JJS', 'NN', 'IN', 'DT', 'NN', ',', 'PRP', 'VBZ', 'JJ']
POS Question for 2 is 10 : ['WP', 'VBP', 'DT', 'NNS', 'CC', 'NNS', 'IN', 'DT', 'NN', '.']


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,...,EX,IN,WP$,MD,NNPS,--,JJS,JJR,SYM,UH
273872,273872,392405,392406,What are the pros & cons of democracy?,What are the pros and cons of a democracy?,1,38,42,-4,17,...,0,0,0,0,0,0,0,0,0,0
342308,342308,438846,462606,How will Brexit impact the flow of goods and p...,Can a post-Brexit Britain really survive witho...,1,105,145,-40,26,...,0,0,0,0,0,0,0,0,0,0
353135,353135,482139,482140,If I got a Buddhist tattoo would I be disrespe...,"Do many Buddhists actually ""throw away the raft""?",0,85,49,36,22,...,0,0,0,0,0,0,0,0,0,0
332098,332098,338155,31995,When did I create my Instagram account?,How can I track down who created an Instagram ...,0,39,54,-15,18,...,0,0,0,0,0,0,0,0,0,0
176156,176156,271084,271085,Is Hulu Plus Free Trial really free?,How long is the Hulu Plus free trial?,0,36,37,-1,15,...,0,0,0,0,0,0,0,0,0,0
199069,199069,25819,171795,Which smartphone would be best under 15000? (2...,Which is the best mobile under 15000,1,50,36,14,26,...,0,0,0,0,0,0,0,0,0,0
18553,18553,35142,35143,Which one is better moto g4 plus or nexus 5x?,I'm 20 years old . My voice is relatively low....,0,45,110,-65,20,...,0,0,0,0,0,0,0,0,0,0
287990,287990,93468,372977,Is it true you get paid to answer Quora Questi...,Will people get paid to answer here?,1,50,36,14,17,...,0,0,0,0,0,0,0,0,0,0
68316,68316,98705,118083,What will happen to my body if I do 100 push-u...,I've been doing 100 push ups twice a week for ...,0,83,136,-53,24,...,0,0,0,0,0,0,0,0,0,0
386994,386994,519283,519284,What is the best financial advice?,What is the best financial advice in their 21's?,0,34,48,-14,15,...,0,0,0,0,0,0,0,0,0,0


### Word2vec features
Generate word2vec based on pre-trained model on Google News corpus (3 billion running words) word vector model (3 million 300-dimension English word vectors).

## Logistic Regression 

In [12]:
logreg = linear_model.LogisticRegression(C=1e5)
X = df_train.ix[:, 6:,]
Y = df_train.is_duplicate

In [13]:
X.head()

Unnamed: 0,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzzy_qratio,fuzzy_wratio,fuzzy_partial_ratio,common_topics
273872,38,42,-4,17,16,8,9,7,92,95,87,1
342308,105,145,-40,26,30,18,22,7,62,71,71,0
353135,85,49,36,22,21,15,8,1,44,86,49,2
332098,39,54,-15,18,18,7,10,3,70,70,77,0
176156,36,37,-1,15,17,7,8,4,65,83,67,0


In [14]:
logreg.fit(X, Y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [15]:
test_data = generate_features(df_test)

In [16]:
test_data['common_topics'] = test_data.apply(lambda x:common_lda_topic(str(x['question1']),str(x['question2']),dictionary,ldamodel,0.1),axis=1)

test_data = df_train.ix[:, 6:,]

In [18]:
proba_replicated = logreg.predict_proba(test_data)
proba = proba_replicated[:,1]
len(proba)
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = proba
sub.to_csv('simple_xgb.csv', index=False)

### Train Word2vec model

In [87]:
STOP_WORDS = nltk.corpus.stopwords.words()
def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")
    
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)  
            
    sentence = " ".join(sentence)
    return sentence

def clean_dataframe(data):
    "drop nans, then apply 'clean_sentence' function to question1 and 2"
    data = data.dropna(how="any")
    
    for col in ['question1', 'question2']:
        data[col] = data[col].apply(clean_sentence)
    
    return data

data = clean_dataframe(df_train)
data.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzzy_qratio,fuzzy_wratio,fuzzy_partial_ratio
273872,273872,392405,392406,pros cons democracy,pros cons democracy,1,38,42,-4,17,16,8,9,7,92,95,87
342308,342308,438846,462606,brexit impact flow goods people northern irela...,postbrexit britain really survive without land...,1,105,145,-40,26,30,18,22,7,62,71,71
353135,353135,482139,482140,got buddhist tattoo would disrespecting people...,many buddhists actually throw away raft,0,85,49,36,22,21,15,8,1,44,86,49
332098,332098,338155,31995,create instagram account,track created instagram account,0,39,54,-15,18,18,7,10,3,70,70,77
176156,176156,271084,271085,hulu plus free trial really free,long hulu plus free trial,0,36,37,-1,15,17,7,8,4,65,83,67


### Create submission file

In [325]:
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]