### Problem statement

    It is a binary classification problem, for a given pair of questions we need to predict if they are duplicate or not. 

### Import Laibraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from nltk.corpus import stopwords
from fuzzywuzzy import fuzz

from bs4 import BeautifulSoup
import re
import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Questions.csv')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
df.isna().sum()
df.dropna(inplace = True)
df.duplicated().sum()

0

In [4]:
new_df = df.sample(60000)
print('shape of new_df : ',new_df.shape)
new_df.head()

shape of new_df :  (60000, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
366314,366314,496495,133148,Is space time infinite?,Is the universe finite or infinite? Is there a...,0
230210,230210,339688,339689,Is it possible to export my location history f...,Is it possible to merge a foursquare venue int...,0
263639,263639,22835,120154,How do I get a man to ask me out?,How do you ask a girl out on a date?,0
245468,245468,199417,175569,How can I learn English well in a short time?,How can I learn english quickly and well?,1
163512,163512,254313,254314,What is the best way to prepare for all MBA ex...,"Where are we? Let's assume our earth a ball, k...",0


In [5]:
ques_df = new_df[['question1','question2']]
print('shape of new_df : ',ques_df.shape)
ques_df.head()

shape of new_df :  (60000, 2)


Unnamed: 0,question1,question2
366314,Is space time infinite?,Is the universe finite or infinite? Is there a...
230210,Is it possible to export my location history f...,Is it possible to merge a foursquare venue int...
263639,How do I get a man to ask me out?,How do you ask a girl out on a date?
245468,How can I learn English well in a short time?,How can I learn english quickly and well?
163512,What is the best way to prepare for all MBA ex...,"Where are we? Let's assume our earth a ball, k..."


In [6]:
def preprocess(q):
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    from contractions import contraction
    q_decontracted = []

    for word in q.split():
        if word in contraction:
            word = contraction[word]
        q_decontracted.append(word)
            
    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    
    # Removing HTML tags
    
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q

In [7]:
preprocess("I've already! wasn't <b>done</b>?")

'i have already  was not done'

In [8]:
new_df['question1'] = new_df['question1'].apply(preprocess)
new_df['question2'] = new_df['question2'].apply(preprocess)

In [10]:
questions = list(new_df['question1']) + list(new_df['question2'])

cv = CountVectorizer(max_features=3000)
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray().astype(np.uint8),2)

# used astype(np.uint8) to avoid MemoryError: Unable to allocate 283. GiB for an array with shape (156816, 36, 53806)

In [11]:
temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)

temp_df = pd.concat([temp_df1, temp_df2], axis=1)

print(temp_df.shape)
temp_df.head()

(60000, 6000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
366314,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
263639,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
245468,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
163512,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
x_train,x_test,y_train,y_test = train_test_split(temp_df,new_df['is_duplicate'],test_size=0.2)

In [14]:
start_time = time.time()

rf = RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
print(accuracy_score(y_test,y_pred))

print('confusion_matrix \n',confusion_matrix(y_test,y_pred))

end_time = time.time()

print(f"Time taken for execution : {end_time - start_time} Seconds")

0.7574166666666666
confusion_matrix 
 [[6903  662]
 [2249 2186]]
Time taken for execution : 295.8309214115143 Seconds


In [15]:
start_time = time.time()

mnb = MultinomialNB()
mnb.fit(x_train,y_train)
y_pred = mnb.predict(x_test)
print(accuracy_score(y_test,y_pred))

print('confusion_matrix \n',confusion_matrix(y_test,y_pred))

end_time = time.time()

print(f"Time taken for execution : {end_time - start_time} Seconds")

0.7121666666666666
confusion_matrix 
 [[5972 1593]
 [1861 2574]]
Time taken for execution : 12.389833688735962 Seconds


In [13]:
start_time = time.time()

bnb = BernoulliNB()
bnb.fit(x_train,y_train)
y_pred = bnb.predict(x_test)
print(accuracy_score(y_test,y_pred))

print('confusion_matrix \n',confusion_matrix(y_test,y_pred))

end_time = time.time()

print(f"Time taken for execution : {end_time - start_time} Seconds")

0.7024166666666667
confusion_matrix 
 [[5669 1896]
 [1675 2760]]
Time taken for execution : 18.408474922180176 Seconds


### Step 1 : Analysis of data, preprocessing, Model Training, Evaluation

        RandomForestClassifier = 0.7574
        MultinomialNB = 0.7121
        BernoulliNB = 0.7024