In [189]:
from IPython.display import display, HTML

In [None]:
pip install pandas numpy nltk sklearn gensim pyemd keras

# Data cleaning

We do some basic data cleaning including stemming (i.e. removing suffixes) and removing common words, tagging parts of speech, and finding the duplicate words between given question pairs

In [177]:
import pandas as pd
import numpy as np

from nltk.stem.porter import *
from nltk.tokenize import *
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

data_file_location = "./quora_duplicate_questions.tsv"
data = pd.read_csv(
    data_file_location,
    sep='\t',
)

stemmed_q1s = []
tagged_q1s = []
stemmed_q2s = []
tagged_q2s = []
dups_all = []
cs_all = []


common_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

for index, question in data.iterrows():
    question1 = question.question1
    question2 = question.question2

    try:
        tokens1 = [token for token in wordpunct_tokenize(question1) if token not in common_words]
        stemmed1 = [stemmer.stem(word) for word in tokens1]
        tagged1 = nltk.pos_tag(stemmed1)

        tokens2 = [token for token in wordpunct_tokenize(question2) if token not in common_words]
        stemmed2 = [stemmer.stem(word) for word in tokens2]
        tagged2 = nltk.pos_tag(stemmed2)
        
        dups = [word for word in stemmed1 if word in stemmed2]
        try:
            cs = word_vectors.n_similarity(stemmed1,stemmed2)
        except ValueError as er:
            print(dups)
            print(er)
            cs = 0
        
        stemmed_q1s.append(stemmed1)
        tagged_q1s.append(tagged1)
        stemmed_q2s.append(stemmed2)
        tagged_q2s.append(tagged2)
        dups_all.append(len(dups))
        cs_all.append(cs)

    except Exception as e:
        data.drop([index],inplace=True)
        print(e)

        print("Ran into problem with data, removing question:")
        print(question)
        continue
data.insert(4,'q1_stems',stemmed_q1s)
data.insert(5,'q1_tags',tagged_q1s)
data.insert(7,'q2_stems',stemmed_q2s)
data.insert(8,'q2_tags',tagged_q2s)
data.insert(9,'duplicates',dups_all)
data.insert(10,'cosine_similarity',cs_all)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/annguilinger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/annguilinger/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annguilinger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


expected string or bytes-like object
Ran into problem with data, removing question:
id                                      105780
qid1                                    174363
qid2                                    174364
question1       How can I develop android app?
question2                                  NaN
is_duplicate                                 0
Name: 105780, dtype: object
At least one of the passed list is empty.
Ran into problem with data, removing question:
id                                           108978
qid1                                         178936
qid2                                         178937
question1                                         i
question2       What questions to ask any drdummer?
is_duplicate                                      0
Name: 108978, dtype: object
At least one of the passed list is empty.
Ran into problem with data, removing question:
id                                                       115347
qid1                    

In [178]:
print(data.head())

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                            q1_stems  \
0  [what, step, step, guid, invest, share, market...   
1  [what, stori, kohinoor, (, koh, -, -, noor, ),...   
2  [how, i, increas, speed, internet, connect, us...   
3         [whi, i, mental, lone, ?, how, i, solv, ?]   
4  [which, one, dissolv, water, quikli, sugar, ,,...   

                                             q1_tags  \
0  [(what, WP), (step, VB), (step, NN), (guid, NN...   
1  [(what, WP), (stori, VBD), (kohinoor, NN), ((,...   
2  [(how, WRB), (i, JJ), (increas, VBP), (speed, ...   
3  [(w

In [179]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1],data.iloc[:,-1])

In [180]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer=Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data.loc[:,['q1_stems','q2_stems']].to_numpy().flatten())

In [182]:
def tokenandpad(dat):
    tokened = tokenizer.texts_to_sequences(dat.to_numpy())
    return pad_sequences(tokened,maxlen=100,padding='post')

q1s_train = tokenandpad(X_train.loc[:,'q1_stems'])
q2s_train = tokenandpad(X_train.loc[:,'q2_stems'])

q1s_test = tokenandpad(X_test.loc[:,'q1_stems'])
q2s_test = tokenandpad(X_test.loc[:,'q2_stems'])

In [217]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense,Embedding,Bidirectional,LSTM,concatenate
from tensorflow.keras import Input
from tensorflow.keras.metrics import Precision, Recall

text_input1           = Input(shape = (None,), dtype = 'int32')
embedding1            = Embedding(100,64)(text_input1)
encoded_text1_forward = LSTM(32)
encoded_text1_back    = LSTM(32, go_backwards=True)
encoded_text1         = Bidirectional(encoded_text1_forward, backward_layer=encoded_text1_back)(embedding1)

text_input2           = Input(shape = (None,), dtype = 'int32')
embedding2            = Embedding(100,64)(text_input2)
encoded_text2_forward = LSTM(32)
encoded_text2_back    = LSTM(32, go_backwards=True)
encoded_text2         = Bidirectional(encoded_text2_forward, backward_layer=encoded_text2_back)(embedding2)


input3   = Input(shape = (None,), dtype = 'float32')
input4   = Input(shape = (None,), dtype = 'float32')

concatenated = concatenate([encoded_text1, encoded_text2], axis = -1)
output = Dense(64, activation = 'relu')(concatenated)
output = Dense(1,  activation = 'sigmoid')(output)

model = Model([text_input1, text_input2, input3, input4], output)
model.compile(optimizer = 'adam', 
              loss = 'binary_crossentropy', 
              metrics = ['accuracy', Precision(), Recall()])

model.summary()

Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_137 (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 input_138 (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 embedding_90 (Embedding)       (None, None, 64)     6400        ['input_137[0][0]']              
                                                                                                  
 embedding_91 (Embedding)       (None, None, 64)     6400        ['input_138[0][0]']              
                                                                                           

In [184]:
print(X_train.head())

            id    qid1    qid2  \
281837  281837  401653  177660   
187348  187348  285555   54803   
102770  102770  169988  169989   
346916  346916  475313  475314   
201186  201186  303128  303129   

                                                question1  \
281837  What is the responsibility of a boater to prot...   
187348  Somebody sent me an inappropriate snapchat. Wh...   
102770    How do the Japanese feel about pre-marital sex?   
346916              What are some of you favorite dreams?   
201186  For what values of x, y, and n is (x+y)^n>x^n+...   

                                                 q1_stems  \
281837      [what, respons, boater, protect, shorelin, ?]   
187348  [somebodi, sent, inappropri, snapchat, ., what...   
102770        [how, japanes, feel, pre, -, marit, sex, ?]   
346916                          [what, favorit, dream, ?]   
201186  [for, valu, x, ,, ,, n, (, x, +, )^, n, >, x, ...   

                                                  q1_tags  \


In [218]:
res = model.fit([q1s_train, q2s_train, X_train['cosine_similarity'], X_train['duplicates']], y_train.to_numpy(),
           batch_size=50,
           epochs=12,
           validation_data=[[q1s_test, q2s_test, X_test['cosine_similarity'], X_test['duplicates']], y_test.to_numpy()])

Epoch 1/12


2022-08-17 18:44:05.185695: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:44:07.525617: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:44:07.535982: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:44:07.595403: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:44:07.595453: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:44:10.738241: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:44:10.774073: I tensorflow/core/grappler/optimizers/cust



2022-08-17 18:59:39.140770: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:59:39.453416: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:59:39.453728: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:59:39.477427: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-17 18:59:39.480937: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
