## Quora Kaggle competition 

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import gensim as gn
from gensim.models import word2vec
import nltk
from fuzzywuzzy import fuzz
from sklearn import linear_model
%matplotlib inline

### Training data

In [2]:
df_train = pd.read_csv('./data/train.csv').sample(10000,random_state=44)
df_test = pd.read_csv('./data/test.csv')
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
273872,273872,392405,392406,What are the pros & cons of democracy?,What are the pros and cons of a democracy?,1
342308,342308,438846,462606,How will Brexit impact the flow of goods and p...,Can a post-Brexit Britain really survive witho...,1
353135,353135,482139,482140,If I got a Buddhist tattoo would I be disrespe...,"Do many Buddhists actually ""throw away the raft""?",0
332098,332098,338155,31995,When did I create my Instagram account?,How can I track down who created an Instagram ...,0
176156,176156,271084,271085,Is Hulu Plus Free Trial really free?,How long is the Hulu Plus free trial?,0


In [3]:
print('Total number of question pairs for training: {}'.format(len(df_train)))
print('Total number of question pairs for test data: {}'.format(len(df_test)))
print('Duplicate pairs : {} %'.format(round(df_train['is_duplicate'].mean()*100,2)))

Total number of question pairs for training: 10000
Total number of question pairs for test data: 2345796
Duplicate pairs : 37.46 %


### Data features

In [4]:
def generate_features(df_train):
    df_train['len_q1'] = df_train['question1'].apply(lambda x:len(str(x)))
    df_train['len_q2'] = df_train['question2'].apply(lambda x:len(str(x)))
    df_train['diff_len'] = df_train.len_q1-df_train.len_q2
    df_train['len_char_q1'] = df_train.question1.apply(lambda x:len(''.join(set(str(x).replace(' ','')))))
    df_train['len_char_q2'] = df_train.question2.apply(lambda x:len(''.join(set(str(x).replace(' ','')))))
    df_train['len_word_q1'] = df_train.question1.apply(lambda x:len(str(x).split()))
    df_train['len_word_q2'] = df_train.question2.apply(lambda x:len(str(x).split()))
    df_train['common_words'] = df_train.apply(lambda x:len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))),axis=1)

    df_train['fuzzy_qratio'] = df_train.apply(lambda x: fuzz.QRatio(str(x['question1']),str(x['question2'])),axis=1)
    df_train['fuzzy_wratio'] = df_train.apply(lambda x:fuzz.WRatio(str(x['question1']),str(x['question2'])),axis=1)
    df_train['fuzzy_partial_ratio'] = df_train.apply(lambda x:fuzz.partial_ratio(str(x['question1']),str(x['question2'])),axis=1)
    return df_train
    
    
    
    
    

In [5]:
df_train = generate_features(df_train)
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzzy_qratio,fuzzy_wratio,fuzzy_partial_ratio
273872,273872,392405,392406,What are the pros & cons of democracy?,What are the pros and cons of a democracy?,1,38,42,-4,17,16,8,9,7,92,95,87
342308,342308,438846,462606,How will Brexit impact the flow of goods and p...,Can a post-Brexit Britain really survive witho...,1,105,145,-40,26,30,18,22,7,62,71,71
353135,353135,482139,482140,If I got a Buddhist tattoo would I be disrespe...,"Do many Buddhists actually ""throw away the raft""?",0,85,49,36,22,21,15,8,1,44,86,49
332098,332098,338155,31995,When did I create my Instagram account?,How can I track down who created an Instagram ...,0,39,54,-15,18,18,7,10,3,70,70,77
176156,176156,271084,271085,Is Hulu Plus Free Trial really free?,How long is the Hulu Plus free trial?,0,36,37,-1,15,17,7,8,4,65,83,67


### Logistic Regression 

In [6]:
logreg = linear_model.LogisticRegression(C=1e5)
X = df_train.ix[:, 6:,]
Y = df_train.is_duplicate

In [7]:
X.head()

Unnamed: 0,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzzy_qratio,fuzzy_wratio,fuzzy_partial_ratio
273872,38,42,-4,17,16,8,9,7,92,95,87
342308,105,145,-40,26,30,18,22,7,62,71,71
353135,85,49,36,22,21,15,8,1,44,86,49
332098,39,54,-15,18,18,7,10,3,70,70,77
176156,36,37,-1,15,17,7,8,4,65,83,67


In [8]:
logreg.fit(X, Y)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [None]:
test_data = generate_features(df_test)

In [10]:
test_data = df_train.ix[:, 6:,]

In [13]:
proba_replicated = logreg.predict_proba(test_data)
proba = proba_replicated[:,1]

### Create submission file

In [16]:
sub = pd.DataFrame({'test_id': df_test['test_id'], 'is_duplicate': proba})

In [17]:
sub

Unnamed: 0,is_duplicate,test_id
298227,0.682079,298227
74551,0.155582,74551
829639,0.167017,829639
2200959,0.374958,2200959
394329,0.461047,394329
1521381,0.379355,1521381
1956073,0.143516,1956073
70243,0.399537,70243
388073,0.116967,388073
1076836,0.608091,1076836
