In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory,
# then print them one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


In [2]:
# Shell escape: list the contents of the current working directory.
!ls

__notebook_source__.ipynb


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the labelled training question pairs from the zipped CSV in the Kaggle input directory.
data = pd.read_csv('/kaggle/input/quora-question-pairs/train.csv.zip')

In [5]:
# Display the loaded frame (the notebook shows only the first and last rows).
data

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [6]:
# Ref: https://hub.packtpub.com/use-tensorflow-and-nlp-to-detect-duplicate-quora-questions-tutorial/
# adding new features

def feature_qlen(data):
    """Add string-length features for both questions to ``data`` in place.

    Columns written:
      * ``len_q1`` / ``len_q2`` -- length of ``str(question)``; missing values
        are stringified first, so they contribute the length of their repr.
      * ``diff_len`` -- ``len_q1 - len_q2``.

    Returns nothing; the frame passed in is mutated.
    """
    def _text_length(value):
        return len(str(value))

    for source_col, target_col in (('question1', 'len_q1'),
                                   ('question2', 'len_q2')):
        data[target_col] = data[source_col].apply(_text_length)

    data['diff_len'] = data['len_q1'] - data['len_q2']

def feature_qlen_char(data):
    """Add character- and word-level features to ``data`` in place.

    Columns written, in this order:
      * ``len_char_q1`` / ``len_char_q2`` -- number of distinct characters in
        the question once spaces are removed (case-sensitive).
      * ``len_word_q1`` / ``len_word_q2`` -- number of pieces obtained by
        splitting the question on single spaces (consecutive spaces therefore
        produce empty pieces that are counted).
      * ``common_words`` -- number of distinct lower-cased whitespace tokens
        shared by both questions.

    Every question is passed through ``str()`` first. Returns nothing.
    """
    def _distinct_chars(text):
        # A set already holds each character once, so its size is the count.
        return len(set(str(text).replace(' ', '')))

    def _space_pieces(text):
        return len(str(text).split(' '))

    def _tokens(text):
        return set(str(text).lower().split())

    def _shared_tokens(row):
        return len(_tokens(row['question1']) & _tokens(row['question2']))

    data['len_char_q1'] = data['question1'].apply(_distinct_chars)
    data['len_char_q2'] = data['question2'].apply(_distinct_chars)
    data['len_word_q1'] = data['question1'].apply(_space_pieces)
    data['len_word_q2'] = data['question2'].apply(_space_pieces)

    # common words
    data['common_words'] = data.apply(_shared_tokens, axis=1)

# Feature set 1: the columns produced by feature_qlen and feature_qlen_char.
fs_1 = [
    # written by feature_qlen
    'len_q1',
    'len_q2',
    'diff_len',
    # written by feature_qlen_char
    'len_char_q1',
    'len_char_q2',
    'len_word_q1',
    'len_word_q2',
    'common_words',
]

In [7]:
# Add len_q1 / len_q2 / diff_len to the training frame (mutates `data` in place).
feature_qlen(data)

In [8]:
# Add the distinct-character, word-count and common-word columns to the training frame (in place).
feature_qlen_char(data)

In [9]:
# use the fuzzywuzzy package to compute Levenshtein-based similarity scores between question pairs
from fuzzywuzzy import fuzz

In [10]:
def apply_fuzz(data, column_name, function):
    """Score every question pair with ``function`` and store it in ``data``.

    Args:
        data: frame with ``question1`` and ``question2`` columns; mutated in place.
        column_name: name of the column that receives the scores.
        function: callable taking the two questions (both converted with
            ``str()``) and returning the value to store for that row.
    """
    def _score_row(row):
        first = str(row['question1'])
        second = str(row['question2'])
        return function(first, second)

    data[column_name] = data.apply(_score_row, axis=1)

def feature_fuzz(data):
    """Add one column per fuzzywuzzy scorer to ``data`` in place.

    Each scorer is applied to every (question1, question2) pair through
    ``apply_fuzz``; a progress line is printed after each column is done.
    """
    # Output column -> fuzzywuzzy scorer; insertion order is the processing order.
    scorers = {
        'fuzz_ratio': fuzz.QRatio,
        'fuzz_WRatio': fuzz.WRatio,
        'fuzz_partial_ratio': fuzz.partial_ratio,
        'fuzz_partial_token_set_ratio': fuzz.partial_token_set_ratio,
        'fuzz_partial_token_sort_ratio': fuzz.partial_token_sort_ratio,
        'fuzz_token_set_ratio': fuzz.token_set_ratio,
        'fuzz_token_sort_ratio': fuzz.token_sort_ratio,
    }

    for param, scorer in scorers.items():
        apply_fuzz(data, param, scorer)
        print(f"Finished calculating {param}")

# Feature set 2: the columns produced by feature_fuzz.
fs_2 = [
    'fuzz_ratio',
    'fuzz_WRatio',
    'fuzz_partial_ratio',
    'fuzz_partial_token_set_ratio',
    'fuzz_partial_token_sort_ratio',
    'fuzz_token_set_ratio',
    'fuzz_token_sort_ratio',
]

In [11]:
# Add the seven fuzz_* score columns to the training frame (in place; prints progress per column).
feature_fuzz(data)

Finished calculating fuzz_ratio
Finished calculating fuzz_WRatio
Finished calculating fuzz_partial_ratio
Finished calculating fuzz_partial_token_set_ratio
Finished calculating fuzz_partial_token_sort_ratio
Finished calculating fuzz_token_set_ratio
Finished calculating fuzz_token_sort_ratio


In [12]:
# tf-idf implementation

# from sklearn.feature_extraction.text import TfidfVectorizer
# from copy import deepcopy

# tfv_q1 = TfidfVectorizer(
#     min_df=3, 
#     max_features=None, 
#     strip_accents='unicode', 
#     analyzer='word', 
#     token_pattern=r'\w{1,}',
#     ngram_range=(1, 2), 
#     use_idf=1, 
#     smooth_idf=1, 
#     sublinear_tf=1,
#     stop_words='english')

# tfv_q2 = TfidfVectorizer(
#     min_df=3, 
#     max_features=None, 
#     strip_accents='unicode', 
#     analyzer='word', 
#     token_pattern=r'\w{1,}',
#     ngram_range=(1, 2), 
#     use_idf=1, 
#     smooth_idf=1, 
#     sublinear_tf=1,
#     stop_words='english')


In [13]:
# q1_tfidf = tfv_q1.fit_transform(data.question1.fillna(""))
# q2_tfidf = tfv_q2.fit_transform(data.question2.fillna(""))

In [14]:
# from sklearn.decomposition import TruncatedSVD
# svd_q1 = TruncatedSVD(n_components=180)
# svd_q2 = TruncatedSVD(n_components=180)

In [15]:
# question1_vectors = svd_q1.fit_transform(q1_tfidf)
# question2_vectors = svd_q2.fit_transform(q2_tfidf)

In [16]:
# from scipy import sparse
# # obtain features by stacking the sparse matrices together
# fs3_1 = sparse.hstack((q1_tfidf, q2_tfidf))

In [17]:
# tfv = TfidfVectorizer(min_df=3, 
#                       max_features=None, 
#                       strip_accents='unicode', 
#                       analyzer='word', 
#                       token_pattern=r'\w{1,}',
#                       ngram_range=(1, 2), 
#                       use_idf=1, 
#                       smooth_idf=1, 
#                       sublinear_tf=1,
#                       stop_words='english')

In [18]:
# # combine questions and calculate tf-idf
# q1q2 = data.question1.fillna("") 
# q1q2 += " " + data.question2.fillna("")
# fs3_2 = tfv.fit_transform(q1q2)

In [19]:
# fs3_3 = np.hstack((question1_vectors, question2_vectors))

In [20]:
# import gensim.downloader as api

# model = api.load('word2vec-google-news-300')


In [21]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

In [22]:
# from nltk.corpus import stopwords
# from nltk import word_tokenize
# stop_words = set(stopwords.words('english'))

# def sent2vec(s, model): 
#     M = []
#     words = word_tokenize(str(s).lower())
#     for word in words:
#     #It shouldn't be a stopword
#         if word not in stop_words:
#             #nor contain numbers
#             if word.isalpha():
#                 #and be part of word2vec
#                 if word in model:
#                     M.append(model[word])
#     M = np.array(M)
#     if len(M) > 0:
#         v = M.sum(axis=0)
#         return v / np.sqrt((v ** 2).sum())
#     else:
#         return np.zeros(300)

In [23]:
# w2v_q1 = np.array([sent2vec(q, model) 
#                    for q in data.question1])
# w2v_q2 = np.array([sent2vec(q, model) 
#                    for q in data.question2])

In [24]:
# from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

In [25]:
# data['cosine_distance'] = [cosine(x,y) 
# for (x,y) in zip(w2v_q1, w2v_q2)]
# data['cityblock_distance'] = [cityblock(x,y) 
# for (x,y) in zip(w2v_q1, w2v_q2)]
# data['jaccard_distance'] = [jaccard(x,y) 
# for (x,y) in zip(w2v_q1, w2v_q2)]
# data['canberra_distance'] = [canberra(x,y) 
# for (x,y) in zip(w2v_q1, w2v_q2)]
# data['euclidean_distance'] = [euclidean(x,y) 
# for (x,y) in zip(w2v_q1, w2v_q2)]
# data['minkowski_distance'] = [minkowski(x,y,3) 
# for (x,y) in zip(w2v_q1, w2v_q2)]
# data['braycurtis_distance'] = [braycurtis(x,y) 
# for (x,y) in zip(w2v_q1, w2v_q2)]

In [26]:
# fs4_1 = ['cosine_distance', 'cityblock_distance', 
#          'jaccard_distance', 'canberra_distance', 
#          'euclidean_distance', 'minkowski_distance',
#          'braycurtis_distance']


In [27]:
# w2v = np.hstack((w2v_q1, w2v_q2))

In [28]:
# def wmd(s1, s2, model):
#     s1 = str(s1).lower().split()
#     s2 = str(s2).lower().split()
#     stop_words = stopwords.words('english')
#     s1 = [w for w in s1 if w not in stop_words]
#     s2 = [w for w in s2 if w not in stop_words]
#     return model.wmdistance(s1, s2)

In [29]:
# data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2'], model), axis=1)
# model.init_sims(replace=True) 
# data['norm_wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2'], model), axis=1)
# fs4_2 = ['wmd', 'norm_wmd']

In [30]:
import gc
import psutil
# Cleanup of intermediates from the TF-IDF / word2vec experiments; disabled because
# those cells are commented out above.
# del([tfv_q1, tfv_q2, tfv, q1q2, 
#      question1_vectors, question2_vectors, svd_q1, 
#      svd_q2, q1_tfidf, q2_tfidf])
# del([w2v_q1, w2v_q2])
# del([model])
# Force a garbage-collection pass, then display the current system memory statistics.
gc.collect()
psutil.virtual_memory()

svmem(total=16787144704, available=14896041984, percent=11.3, used=1607577600, free=9261756416, active=1350983680, inactive=5570461696, buffers=1048616960, cached=4869193728, shared=1003520, slab=491917312)

In [31]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [32]:
def feature_clean(data, scaler=None, fit=True):
    """Return a cleaned, standardised copy of a numeric feature frame.

    Infinite values are turned into NaN, every NaN is replaced by 0, and the
    result is passed through the scaler. The input frame is not modified.

    The previous version only rebound its local variable and returned nothing,
    so calling it had no effect. Callers must use the returned frame, e.g.
    ``X = feature_clean(X)``.

    Args:
        data: DataFrame of numeric feature columns.
        scaler: optional scaler exposing ``fit_transform`` / ``transform``.
            When omitted, a new ``StandardScaler`` is created and fitted on
            ``data``.
        fit: when True (default) the scaler is fitted on ``data``; pass False
            together with an already fitted ``scaler`` to apply the training
            statistics to held-out data. Ignored when ``scaler`` is None,
            because a new scaler always has to be fitted.

    Returns:
        DataFrame with the same index and columns as ``data`` holding the
        scaled values.
    """
    if scaler is None:
        scaler = StandardScaler()
        fit = True  # an unfitted scaler cannot transform

    cleaned = data.replace([np.inf, -np.inf], np.nan).fillna(0)
    values = cleaned.values
    scaled = scaler.fit_transform(values) if fit else scaler.transform(values)

    # Keep labels so positional/label indexing downstream keeps working.
    return pd.DataFrame(scaled, index=cleaned.index, columns=cleaned.columns)

In [33]:
# Feature matrix: the length/overlap columns (fs_1) plus the fuzzy-match scores (fs_2).
X = data[fs_1+fs_2]
# NOTE(review): the result of feature_clean is not assigned, so X is used below exactly as selected.
feature_clean(X)
# Target: is_duplicate as a float32 column vector of shape (n_rows, 1).
y = data.is_duplicate.values
y = y.astype('float32').reshape(-1, 1)
# X = np.hstack((X, fs3_3))

In [34]:
# Reproducible 90/10 train/validation split over shuffled row positions.
np.random.seed(42)
n_all, _ = y.shape
idx = np.arange(n_all)
np.random.shuffle(idx)

# The first tenth of the shuffled positions is held out for validation,
# everything after it is used for training.
n_split = n_all // 10
idx_val, idx_train = np.split(idx, [n_split])

x_train, x_val = X.iloc[idx_train], X.iloc[idx_val]
# Labels are flattened from (n, 1) to (n,).
y_train, y_val = y[idx_train].ravel(), y[idx_val].ravel()

In [35]:
# logres = linear_model.LogisticRegression(C=0.1, 
#                                  solver='sag', max_iter=1000)
# logres.fit(x_train, y_train)
# lr_preds = logres.predict(x_val)
# log_res_accuracy = np.sum(lr_preds == y_val) / len(y_val)
# print("Logistic regr accuracy: %0.3f" % log_res_accuracy)

In [36]:
# Train a gradient-boosted binary classifier on the hand-crafted features.
params = {
    'objective': 'binary:logistic',
    'eval_metric': ['logloss', 'error'],
    'eta': 0.02,
    'max_depth': 4,
}
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_val, label=y_val)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
# `evals` is declared keyword-only in xgboost.train (passing it positionally is
# deprecated), so the round count and the watchlist are given by keyword.
# Up to 5000 rounds, stopping once the validation metric has not improved for
# 50 rounds; progress is logged every 100 rounds.
bst = xgb.train(params, d_train, num_boost_round=5000, evals=watchlist,
                early_stopping_rounds=50, verbose_eval=100)
# Threshold the predicted probabilities at 0.5 and measure validation accuracy.
xgb_preds = (bst.predict(d_valid) >= 0.5).astype(int)
xgb_accuracy = np.mean(xgb_preds == y_val)
# print("Xgb accuracy: %0.3f" % xgb_accuracy)



[0]	train-logloss:0.68786	train-error:0.30518	valid-logloss:0.68786	valid-error:0.30510
[100]	train-logloss:0.52635	train-error:0.29646	valid-logloss:0.52787	valid-error:0.29560
[200]	train-logloss:0.50768	train-error:0.29134	valid-logloss:0.50968	valid-error:0.29095
[300]	train-logloss:0.50083	train-error:0.28722	valid-logloss:0.50299	valid-error:0.28668
[400]	train-logloss:0.49617	train-error:0.28455	valid-logloss:0.49867	valid-error:0.28413
[500]	train-logloss:0.49214	train-error:0.28102	valid-logloss:0.49506	valid-error:0.28126
[600]	train-logloss:0.48939	train-error:0.27917	valid-logloss:0.49264	valid-error:0.28010
[700]	train-logloss:0.48668	train-error:0.27705	valid-logloss:0.49026	valid-error:0.27856
[800]	train-logloss:0.48457	train-error:0.27540	valid-logloss:0.48855	valid-error:0.27710
[900]	train-logloss:0.48291	train-error:0.27374	valid-logloss:0.48722	valid-error:0.27592
[1000]	train-logloss:0.48130	train-error:0.27221	valid-logloss:0.48593	valid-error:0.27522
[1100]	trai

In [37]:
# Persist the trained booster to a file in the current working directory.
# NOTE(review): recent xgboost versions recommend a .json/.ubj extension for save_model — confirm against the installed version.
bst.save_model('xgbook-duplicate-question.model')

In [38]:
# Load the test question pairs from the zipped CSV.
# NOTE(review): the truncated output below this cell looks like a pandas parsing warning — confirm.
test_df = pd.read_csv("/kaggle/input/quora-question-pairs/test.csv.zip")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [39]:
# Add len_q1 / len_q2 / diff_len to the test frame (in place), as was done for training.
feature_qlen(test_df)

In [40]:
# Add the distinct-character, word-count and common-word columns to the test frame (in place).
feature_qlen_char(test_df)

In [42]:
# Add the seven fuzz_* score columns to the test frame (in place; prints progress per column).
feature_fuzz(test_df)

Finished calculating fuzz_ratio
Finished calculating fuzz_WRatio
Finished calculating fuzz_partial_ratio
Finished calculating fuzz_partial_token_set_ratio
Finished calculating fuzz_partial_token_sort_ratio
Finished calculating fuzz_token_set_ratio
Finished calculating fuzz_token_sort_ratio


In [None]:
# !wget https://www.soundjay.com/buttons/beep-09.wav
# from IPython.display import Audio
# Audio('./beep-09.wav', autoplay=True)

In [44]:
# Select the same feature columns (fs_1 + fs_2) that the training matrix X was built from.
test_df1 = test_df[fs_1+fs_2]
# NOTE(review): the result of feature_clean is not assigned, so test_df1 is passed on exactly as selected.
feature_clean(test_df1)

In [46]:
# Wrap the test features in a DMatrix, the same container used for training and validation.
test_dm = xgb.DMatrix(test_df1)

In [47]:
# Score every test pair with the trained booster (no thresholding is applied here).
predictions = bst.predict(test_dm)

In [54]:
# Attach the scores to the test frame as a 'predictions' column (mutates test_df).
test_df['predictions'] = predictions

In [57]:
# The Quora Question Pairs submission format is `test_id,is_duplicate`, so the
# predicted probability is exposed under that column name instead of the
# internal 'predictions' name. test_df itself is left unchanged.
submission_df = test_df[['test_id', 'predictions']].rename(
    columns={'predictions': 'is_duplicate'})

In [61]:
# Write the submission file to the working directory without the row index.
submission_df.to_csv('submission.csv', index=False)

In [62]:
# Shell escape: preview the first lines of the written submission file.
!head submission.csv

test_id,predictions
0,0.035313632
1,0.4844765
2,0.5462187
3,0.0017051517
4,0.43695524
5,0.072966434
6,0.5528069
7,0.2206572
8,0.58935547
