# Naive Bayes Classifier for Quora Question Pairs

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
nltk.download()
path = "C:/Users/aksha/Downloads/7610 Final"

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


### Import Training & Testing Dataset

In [3]:
train = pd.read_csv(path+"/train.csv")
print("Total samples:",len(train))
train.head(10)

Total samples: 404290


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [4]:
print(train.isnull().sum(axis=0))#dropping null values
train.dropna(axis=0,inplace=True)

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64


### Preprocessing the text

#### Lemmatize & Remove Stopwords

In [5]:
#preprocessing
def preprocess(series):
  #remove characters other than alphabets & numerics
  words = re.sub("[^A-Za-z0-9]"," ",series).lower().split()

  #lemmatize words
  lemm = WordNetLemmatizer()
  stpwords = stopwords.words('english')
  lemmitized = [lemm.lemmatize(word) for word in words if word not in stpwords]
  sent = ' '.join(lemmitized)
  return sent

In [6]:
train['question1'] =train['question1'].apply(preprocess)#Apply preprocessing
train['question2'] =train['question2'].apply(preprocess)

#### Concat Question 1 and Question 2

In [7]:
def concat(ser):#concatenate Question 1 & Question 2
  print(ser['question1'])
  return 1
train['combine'] = train.apply(lambda ser: ser['question1'] + " " + ser['question2'],axis=1)
train.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,combine
0,0,1,2,step step guide invest share market india,step step guide invest share market,0,step step guide invest share market india step...
1,1,3,4,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...,0,story kohinoor koh noor diamond would happen i...
2,2,5,6,increase speed internet connection using vpn,internet speed increased hacking dns,0,increase speed internet connection using vpn i...
3,3,7,8,mentally lonely solve,find remainder math 23 24 math divided 24 23,0,mentally lonely solve find remainder math 23 2...
4,4,9,10,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0,one dissolve water quikly sugar salt methane c...
5,5,11,12,astrology capricorn sun cap moon cap rising say,triple capricorn sun moon ascendant capricorn say,1,astrology capricorn sun cap moon cap rising sa...
6,6,13,14,buy tiago,keep childern active far phone video game,0,buy tiago keep childern active far phone video...
7,7,15,16,good geologist,great geologist,1,good geologist great geologist
8,8,17,18,use instead,use instead,0,use instead use instead
9,9,19,20,motorola company hack charter motorolla dcx3400,hack motorola dcx3400 free internet,0,motorola company hack charter motorolla dcx340...


#### Vectorizing using TF-IDF (Converting Words to Vectors) and Splitting Dataset

In [8]:
cv = TfidfVectorizer(max_features=50000)#Word to Vectors using Tf-Idf

#Take combine questions data as X
X = cv.fit_transform(train['combine'])
y = np.array(train['is_duplicate'])
print(X.shape)

#Tarin-Test Spilt
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.05)
print(X_train.shape,X_test.shape)

(404287, 50000)
(384072, 50000) (20215, 50000)


### Naive Bayes Modeling

In [9]:
naive_model = MultinomialNB()#Training
naive_model.fit(X_train,y_train)

#Predictions
y_pred_train = naive_model.predict(X_train)
y_pred_test = naive_model.predict(X_test)

#### Fitting the model on testing set

In [10]:
accuracy_train = sum((y_pred_train == y_train).astype(int))/len(y_train)
accuracy_test = sum((y_pred_test == y_test).astype(int))/len(y_test)
print(accuracy_train,accuracy_test)

0.7519241183944677 0.7406381399950531
