In [1]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Show every file mounted under the Kaggle input directory so the dataset
# paths used below can be read straight off the output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


In [2]:
# Load the training question pairs directly from the zipped CSV.
df = pd.read_csv('/kaggle/input/quora-question-pairs/train.csv.zip')

In [3]:
# Preview the first rows: pair id, the two question ids, both question texts
# and the is_duplicate label.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# (rows, columns) of the full training frame.
df.shape

(404290, 6)

In [5]:
# Work on a 60,000-pair subsample to keep the dense bag-of-words features manageable.
# Rows with a missing question are dropped first: the full frame contains a few,
# and the vectorizer cannot process a NaN document. The fixed seed makes the
# sample (and every number derived from it) reproducible across runs.
new_df = df.dropna(subset=['question1', 'question2']).sample(60000, random_state=1)

In [6]:
# Missing values per column in the sampled frame.
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [7]:
# Number of fully duplicated rows in the sample.
new_df.duplicated().sum()

0

In [8]:
# Missing values per column in the full frame; a random sample may or may not
# include the few rows that lack a question.
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [9]:
# Keep only the two text columns; the original row index is retained.
ques_df = new_df.loc[:, ['question1', 'question2']]
ques_df.head()

Unnamed: 0,question1,question2
388304,Why doesn't honey spoil?,Does honey go off/spoil?
225528,What is the way to get a job in Yash Raj Films?,Whats the fastest way to get a job in retail?
219164,Are fundamental particles singularities? If ye...,Are black holes fundamental particles?
373622,If Hitler had quit aggressions after a few sma...,Elon Musk: “If you take this year’s revenue or...
351343,What would you want to know about yourself on ...,"As a woman, if you successfully defended yours..."


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Stack all question1 texts followed by all question2 texts so that a single
# vocabulary (capped at 5000 terms) is learned over both columns.
questions = ques_df['question1'].tolist() + ques_df['question2'].tolist()

cv = CountVectorizer(max_features=5000)
bow = cv.fit_transform(questions).toarray()

# The first half of the rows belongs to question1, the second half to question2.
n_pairs = len(ques_df)
q1_arr, q2_arr = bow[:n_pairs], bow[n_pairs:]


In [11]:
# Wrap each half of the count matrix in a frame keyed by the sampled rows'
# index, then put them side by side (axis=1). Both halves carry the default
# integer column labels, so the combined frame has each label twice.
temp_df1, temp_df2 = (
    pd.DataFrame(counts, index=ques_df.index) for counts in (q1_arr, q2_arr)
)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(60000, 10000)

In [12]:
# Display the combined feature frame (question1 counts followed by question2 counts).
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
388304,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
225528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
219164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
373622,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
351343,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
285119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
126828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33355,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Attach the target as the last column. The assignment aligns on the row index,
# which temp_df shares with new_df (it was built from ques_df.index).
temp_df['is_duplicate'] = new_df['is_duplicate']

In [14]:
# Shape after adding the label column (one more column than before).
temp_df.shape

(60000, 10001)

In [15]:

from sklearn.model_selection import train_test_split

# The label sits in the last column; everything before it is a feature.
# .values converts each selection to a NumPy array.
features = temp_df.iloc[:, 0:-1].values
labels = temp_df.iloc[:, -1].values

# Hold out 20% of the pairs for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=1
)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: random forest with default hyper-parameters. The seed is fixed
# because the forest's bootstrapping and feature sub-sampling are random;
# without it the reported accuracy changes from run to run.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7644166666666666

In [17]:
from xgboost import XGBClassifier

# Second baseline: gradient-boosted trees with library defaults, fitted on the
# same split. Note that y_pred is overwritten with this model's predictions.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
accuracy_score(y_test, y_pred)

0.7436666666666667

In [23]:
def check_duplicate(q1, q2, vectorizer, model):
    """Classify a single question pair as duplicate or not.

    Parameters
    ----------
    q1, q2 : str
        The two question texts to compare.
    vectorizer : object
        Fitted vectorizer exposing ``transform(list_of_str)``; it is applied to
        each question separately.
    model : object
        Fitted classifier exposing ``predict``; it receives one row made of the
        q1 features followed by the q2 features.

    Returns
    -------
    str
        ``"Duplicate"`` if the first prediction equals 1, otherwise
        ``"Not Duplicate"``.
    """
    vectors = []
    for question in (q1, q2):
        vec = vectorizer.transform([question])
        # Densify so the row has the same representation as dense training
        # features. Some learners (e.g. XGBoost) treat absent entries of a
        # sparse matrix as missing values rather than zeros, which would make
        # predictions inconsistent with a model trained on dense arrays.
        vectors.append(vec.toarray() if hasattr(vec, "toarray") else np.asarray(vec))

    # Side-by-side layout: [q1 features | q2 features].
    X_new = np.hstack(vectors)
    pred = model.predict(X_new)
    return "Duplicate" if pred[0] == 1 else "Not Duplicate"


In [21]:
# Recompute the random-forest predictions on the held-out split (y_pred was
# last overwritten by the XGBoost cell). The previous argument, X_new, only
# exists as a local inside check_duplicate, so the cell raised NameError on a
# fresh Restart & Run All.
y_pred = rf.predict(X_test)

In [27]:
# Sanity check: two phrasings of the same question.
q1, q2 = "How to learn Python programming?", "Tips to learn Python programming"

check_duplicate(q1, q2, vectorizer=cv, model=rf)

'Duplicate'

In [31]:
# Sanity check: same opening words but a different topic.
q1, q2 = "How to learn Python programming?", "Tips to learn car"

check_duplicate(q1, q2, vectorizer=cv, model=rf)

'Not Duplicate'