## Identifying Duplicate Questions

In [19]:
import pandas as pd
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm.auto import tqdm
import gensim
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import resample
import warnings
warnings.filterwarnings('ignore')

In [2]:
#loading the data
message = pd.read_csv("./data/train.csv")
message

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


### Exploration

In [3]:
#checking for null values
message.isnull().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [4]:
#checking for type and shape of data
message.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [5]:
#exploring the entries having null values
message[message.question1.isnull() | message.question2.isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [6]:
#checking for balance of data
message['is_duplicate'].value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

### Cleaning

- Tokenization
- Stopwords cleaning
- Removing punctuation
- Normalizing

In [7]:
#dropping the null values and useless columns
message = message.drop(['id','qid1', 'qid2'], axis=1)
message.columns = ['question1', 'question2', 'label']
message = message.dropna()

message.head()

Unnamed: 0,question1,question2,label
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [8]:
#double checking
message.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404287 entries, 0 to 404289
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   question1  404287 non-null  object
 1   question2  404287 non-null  object
 2   label      404287 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 12.3+ MB


In [9]:
#removing punctuation
punctuation = string.punctuation
message['question1'] = message['question1'].apply(lambda x: "".join([i for i in x if i not in punctuation]))
message['question2'] = message['question2'].apply(lambda x: "".join([i for i in x if i not in punctuation]))

message.head(5)

Unnamed: 0,question1,question2,label
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor KohiNoor Diamond,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely How can I solve it,Find the remainder when math2324math is divide...,0
4,Which one dissolve in water quikly sugar salt ...,Which fish would survive in salt water,0


In [10]:
#tokenizing
message['question1'] = message['question1'].apply(lambda x: x.lower().split())
message['question2'] = message['question2'].apply(lambda x: x.lower().split())
message.head(5)

Unnamed: 0,question1,question2,label
0,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv...",0
1,"[what, is, the, story, of, kohinoor, kohinoor,...","[what, would, happen, if, the, indian, governm...",0
2,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,...",0
3,"[why, am, i, mentally, very, lonely, how, can,...","[find, the, remainder, when, math2324math, is,...",0
4,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]",0


In [11]:
#removing stopwords
common_words = stopwords.words('english')
message['question1'] = message['question1'].apply(lambda x: [i for i in x if i not in common_words])
message['question2'] = message['question2'].apply(lambda x: [i for i in x if i not in common_words])
message.head(5)

Unnamed: 0,question1,question2,label
0,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0
1,"[story, kohinoor, kohinoor, diamond]","[would, happen, indian, government, stole, koh...",0
2,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",0
3,"[mentally, lonely, solve]","[find, remainder, math2324math, divided, 2423]",0
4,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",0


In [12]:
#lemmatizing on adjectives
lemmatizer = WordNetLemmatizer()
message['question1'] = message['question1'].apply(lambda x: [lemmatizer.lemmatize(i, pos ="a") for i in x])
message['question2'] = message['question2'].apply(lambda x: [lemmatizer.lemmatize(i, pos ="a") for i in x])
message.head(5)

Unnamed: 0,question1,question2,label
0,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0
1,"[story, kohinoor, kohinoor, diamond]","[would, happen, indian, government, stole, koh...",0
2,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",0
3,"[mentally, lonely, solve]","[find, remainder, math2324math, divided, 2423]",0
4,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",0


In [13]:
#joining the tokens
message['question1'] = message['question1'].apply(lambda x: ' '.join(x))
message['question2'] = message['question2'].apply(lambda x: ' '.join(x))
message.head(5)

Unnamed: 0,question1,question2,label
0,step step guide invest share market india,step step guide invest share market,0
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0
2,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,mentally lonely solve,find remainder math2324math divided 2423,0
4,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0


In [14]:
#saving the cleaned data
message.to_csv('./data/cleaned_data.csv', index=False)

### Feature Engineering
- word2vec

In [14]:
#loading the cleaned data
message = pd.read_csv("./data/cleaned_data.csv") 
message = message.dropna().reset_index(drop=True)
message

Unnamed: 0,question1,question2,label
0,step step guide invest share market india,step step guide invest share market,0
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0
2,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,mentally lonely solve,find remainder math2324math divided 2423,0
4,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0
...,...,...,...
404160,many keywords racket programming language late...,many keywords perl programming language late v...,0
404161,believe life death,true life death,1
404162,one coin,whats coin,0
404163,approx annual cost living studying uic chicago...,little hairfall problem want use hair styling ...,0


In [15]:
#double checking
message.isnull().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404165 entries, 0 to 404164
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   question1  404165 non-null  bool 
 1   question2  404165 non-null  bool 
 2   label      404165 non-null  bool 
dtypes: bool(3)
memory usage: 1.2 MB


In [16]:
#tokenizing
message['question1'] = message['question1'].apply(lambda x: x.split())
message['question2'] = message['question2'].apply(lambda x: x.split())
message.head(5)


Unnamed: 0,question1,question2,label
0,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0
1,"[story, kohinoor, kohinoor, diamond]","[would, happen, indian, government, stole, koh...",0
2,"[increase, speed, internet, connection, using,...","[internet, speed, increased, hacking, dns]",0
3,"[mentally, lonely, solve]","[find, remainder, math2324math, divided, 2423]",0
4,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]",0


In [17]:
#preparing the data for word2vec training
training_data = pd.concat([message['question1'], message['question2']], ignore_index=True).reset_index(drop=True)
training_data

0         [step, step, guide, invest, share, market, india]
1                      [story, kohinoor, kohinoor, diamond]
2         [increase, speed, internet, connection, using,...
3                                 [mentally, lonely, solve]
4         [one, dissolve, water, quikly, sugar, salt, me...
                                ...                        
808325    [many, keywords, perl, programming, language, ...
808326                                  [true, life, death]
808327                                        [whats, coin]
808328    [little, hairfall, problem, want, use, hair, s...
808329                                  [like, sex, cousin]
Length: 808330, dtype: object

In [45]:
#trainig word2vec model
model_word2vec = gensim.models.Word2Vec(training_data, vector_size = 50, window = 4, min_count = 0)


In [46]:
#vectorizing each document
for i in tqdm(range(len(message)), desc='Line progress'):
    message['question1'][i] = np.mean([model_word2vec.wv[j] for j in message['question1'][i]], axis=0)
    message['question2'][i] = np.mean([model_word2vec.wv[j] for j in message['question2'][i]], axis=0)

message

Line progress:   0%|          | 0/404165 [00:00<?, ?it/s]

Unnamed: 0,question1,question2,label
0,"[0.084315024, -0.43867692, 0.16768922, 1.41204...","[0.13600636, -0.11791914, 0.17393465, 1.426967...",0
1,"[0.044232555, -0.20270082, 0.31274968, 0.28486...","[-0.04752487, -0.09834049, -0.6486826, 0.65928...",0
2,"[-1.0890497, 1.6613846, -1.9789753, 0.7882204,...","[-0.3092385, 0.701918, -0.8519165, 0.6879722, ...",0
3,"[-0.46515092, -0.100840546, -1.0132018, -0.709...","[-0.47147432, -0.07619099, -1.1264488, -0.1881...",0
4,"[-0.27265447, 0.2602761, -0.3034411, -0.370914...","[-0.13271591, 0.7018069, -0.119340345, -0.2957...",0
...,...,...,...
404160,"[-0.21498217, 0.7691367, -0.8485753, 0.6892931...","[-0.19659114, 0.8102891, -0.86834186, 0.681570...",0
404161,"[0.59661853, 0.11590024, 0.9140854, -0.3295170...","[0.38637877, 0.25045493, 0.27857238, -0.124994...",1
404162,"[-0.26028737, -0.2893486, -1.3648713, -0.14995...","[-0.3367908, 1.2499323, -0.3206843, -0.4281272...",0
404163,"[0.3810428, -0.28028515, -0.44077843, 0.755348...","[-0.03575552, -0.6857369, -0.9841931, 0.289953...",0


In [47]:
# Separate majority and minority classes
max_class = message[message.label == 0]
min_class = message[message.label == 1]
 
# Downsample majority class
tab_max = resample(max_class, 
                   replace = False,   
                   n_samples = 149259,
                   random_state=123)
 
# Combine minority class with downsampled majority class
message = pd.concat([tab_max, min_class])
message = message.reset_index(drop=True)

#checking output
message['label'].value_counts()

0    149259
1    149259
Name: label, dtype: int64

In [48]:
#making the data ready for training model
message = pd.concat([pd.DataFrame(message['question1'].to_list()), 
                     pd.DataFrame(message['question2'].to_list()), 
                     message['label']], axis=1)
message['label'] = message['label'] == 1
message

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,label
0,-0.213509,-1.082804,-0.693351,0.369103,-0.108251,-1.111832,0.830733,0.677242,0.098630,-0.697507,...,1.117449,0.046870,0.530342,1.320442,0.356326,-0.325528,-1.063512,-0.556652,0.347261,False
1,-0.463997,-0.850641,-0.884004,-0.475247,-0.393655,0.099382,0.904130,2.170671,0.437404,-0.478747,...,-0.213898,0.479381,0.396462,0.327342,1.050491,0.242397,-0.947426,-1.054893,-0.125640,False
2,-0.199463,0.831412,-0.310542,0.353159,0.235259,0.180069,0.562740,-0.496421,-0.510517,0.391568,...,-0.238208,-0.205090,0.247873,0.790015,-0.143626,-0.238548,0.104901,0.519135,-0.335472,False
3,-1.166725,0.536482,-0.912262,0.425048,0.759891,0.448969,1.531494,1.977282,1.131591,-0.865472,...,0.220038,-0.452904,-1.067953,1.092840,-0.288255,-0.943807,-0.390419,-1.068165,1.629557,False
4,0.042892,-0.889975,0.012821,0.270956,1.034804,-0.876800,1.064468,0.592614,0.723141,-0.303773,...,-0.025413,-1.136871,-0.595781,0.578811,0.842442,-0.227120,-0.776430,-0.646797,0.314845,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298513,-0.430804,-1.403104,0.036221,0.461148,0.245892,0.082112,-0.542974,0.979220,-1.723915,-0.366360,...,0.098844,0.030471,-0.828767,0.478655,0.501471,-0.456402,0.345943,0.029242,-0.189989,True
298514,-1.432746,-0.258690,0.713670,-0.780895,0.010248,0.397989,-0.509905,2.193705,-0.950716,-0.393788,...,-1.014094,-0.424181,1.658149,2.480246,-0.504542,-0.202235,-0.693232,1.390310,0.423398,True
298515,1.887982,-0.114261,-0.883198,-0.222273,-0.442446,0.048275,1.549740,2.666224,-0.216451,-1.656290,...,0.517772,0.958617,0.883227,0.584391,-0.199153,-0.150757,-2.186577,-0.853170,0.286194,True
298516,-0.055591,0.214790,-0.343749,-0.070566,-0.043207,0.413206,-0.832587,0.527157,-0.725689,-0.509928,...,-0.062420,-0.575232,0.613737,0.619815,-0.421812,-0.464425,-0.248014,-0.414195,0.196849,True


In [49]:
#splitting the data into train and test
X_train, y_train = message.drop(['label'], axis = 1), message['label']

train_ratio = 0.7
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, shuffle=True, 
                                                    train_size=train_ratio,
                                                    random_state=42)

print(f'{len(X_train)} training samples and {len(X_test)} test samples')

208962 training samples and 89556 test samples


In [51]:
#train random forest
model = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=150)
model = model.fit(X_train, y_train)

print('Model score on training data:', model.score(X_train, y_train))
print('Model score on testing data:', model.score(X_test, y_test))


Model score on training data: 0.746504149079737
Model score on testing data: 0.7168587252668721


In [52]:
#evaluating test data
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.69      0.80      0.74     44676
        True       0.76      0.64      0.69     44880

    accuracy                           0.72     89556
   macro avg       0.72      0.72      0.72     89556
weighted avg       0.72      0.72      0.72     89556



In [58]:
#confusion matrix
confusion_matrix(y_test, y_pred, labels=[True, False])

array([[28632, 16248],
       [ 9109, 35567]], dtype=int64)

In [59]:
#saving model
import pickle
pickle.dump(model, open('model_NV.sav', 'wb'))