In [2]:
from sklearn.model_selection import train_test_split
import csv

import pandas as pd
import numpy as np

# Read data file

In [3]:
data = pd.read_csv("data/quora/quora_duplicate_questions.tsv", sep='\t')

In [4]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404289 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [6]:
data[data.isnull().any(axis=1)]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [7]:
data.dropna(inplace=True)


In [8]:
len(data)

404287

In [9]:
data['question1'] = data['question1'].apply(lambda x: (str(x)))
data['question2'] = data['question2'].apply(lambda x: (str(x)))

In [10]:
labels = np.array(data['is_duplicate'], dtype=int)


In [11]:
question1 = list(data['question1'])
question2 = list(data['question2'])

# Cleaning Data (Remove question mark and punctuations)

In [14]:
import string,re
question1 = [s.translate(string.punctuation) for s in question1]
question2 = [s.translate(string.punctuation) for s in question2]

question1 = [re.sub(r'[^\x00-\x7f]',r' ',s) for s in question1]
question2 = [re.sub(r'[^\x00-\x7f]',r' ',s) for s in question2]

In [15]:
#split words 
question1_splitted = [s.split() for s in question1]
question2_splitted = [s.split() for s in question2]

In [16]:
#make all words lower case
question1_lower = [[w.lower() for w in s] for s in question1_splitted]
question2_lower = [[w.lower() for w in s] for s in question2_splitted]

In [17]:
tokens = [item for sublist in (question1_lower + question2_lower) for item in sublist]

In [20]:
words = list(set(tokens))
words.insert(0,"<PAD>")
words_index = dict([(idx, word) for (word, idx) in enumerate(words)])
print('Number of tokens {}'.format(len(tokens)))
print('Number of words {}'.format(len(words)))

Number of tokens 8951229
Number of words 197432


# Transfer sentences into list of indexes with fixed length

In [21]:
max_length = 30
def sentToIndex(sent):
    sents_index = np.zeros( max_length)
    for i in range(min(max_length,len(sent))):
        sents_index[i] = words_index[sent[i]]
    return sents_index

In [23]:
q1_index = np.zeros((len(question1_lower),max_length))
q2_index = np.zeros((len(question2_lower),max_length))

for i,sent in enumerate(question1_lower):
    q1_index[i] = sentToIndex(sent)
    
for i,sent in enumerate(question2_lower):
    q2_index[i] = sentToIndex(sent)

print('shape of q1s {}'.format(q1_index.shape))
print('shape of q2s {}'.format(q2_index.shape))

shape of q1s (404287, 30)
shape of q2s (404287, 30)


In [42]:
acc = 0
for i in range(q1_index.shape[0]):
    union = np.union1d(q1_index[i], q2_index[i])
    intersection = np.intersect1d(q1_index[i], q2_index[i])
    iou = float(len(intersection))/len(union)
    if iou > 0.95:
        pred = 1
    else:
        pred = 0
    if pred == labels[i]:
        acc += 1
    

In [43]:
data_len = labels.shape[0]
float(acc)/data_len

0.6319495803723593

# Word Embedding (Glove)

In [45]:
embeddings_index = {}
f = open('data/glove/glove.6B.300d.txt',encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [47]:
embedding_matrix = np.zeros((len(words_index) + 1, 300))
unknown_words = []
for word, i in words_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.random.randn(1,300) * 0.25
        unknown_words.append(word)
print('Embedding Shape {}'.format(embedding_matrix.shape))
print('Number of unknown words = {}'.format(len(unknown_words)))

Embedding Shape (197433, 300)
Number of unknown words = 140675


In [49]:
print(unknown_words[10:50])

['presently.com', 'pound.?', 'student/doctor?', 'iisc/top', 'wrinkly?', 'swagger"', 'body),', 'ghostbathacker', 'oppurtunity?', 'xl,but', 'chimp?', "'to'", 'codefights.org?', 'miserable?', '48,000.', '4.2.2', "f'(x)?", 'unsubcribe', 'signature?', '"funny"?', 'steroids,', 'ames,', 'angeles,', 'triathlon?', 'cesses?', 'giver"?', 'cement?', 'invertebrates?', 'cfo,', 'photographer,', 'greek"?', 'else;', 'oops,', 'savings.com', 'commonwealth,', 'words,', 'holism?', 'headache.how', 'commonplace,', '10^14']


# Save all data

In [21]:
import os

directory = 'data/processed'

if not os.path.exists(directory):
    os.makedirs(directory)
    
np.save(open('data/processed/q1_processed.npy', 'wb'), q1_index)
np.save(open('data/processed/q2_processed.npy', 'wb'), q2_index)

np.save(open('data/processed/label_processed.npy', 'wb'), labels)
np.save(open('data/processed/glove_word_embedding_matrix.npy', 'wb'), embedding_matrix)