In [9]:
import tensorflow as tf
import numpy as np
import re
import pandas as pd
import spacy

**Getting input data ready**

In [10]:
# Read the tab-separated question-pair file into a DataFrame.
filePath = "/home/ubuntu/adi_aws/data/quora_duplicate_questions.tsv"
df = pd.read_csv(filePath, sep="\t")

# Clean both question columns the same way: missing values become empty
# strings, then every question is lower-cased.
for questionCol in ("question1", "question2"):
    df[questionCol] = df[questionCol].fillna("").apply(str.lower)

***Find the unique words in the dataset to create the vocabulary***

In [None]:
def tokenize(s,nlp):
    """Run ``nlp`` on ``s`` and return the ``text`` of every resulting token.

    Args:
        s: the sentence to tokenize.
        nlp: a callable that turns ``s`` into an iterable of tokens, each
            exposing a ``text`` attribute.

    Returns:
        A list with one token text per token, in order.
    """
    return [token.text for token in nlp(s)]

# Collect every token that occurs in either question column into `words`.
nlp = spacy.load('en')

# question1: tokenize each distinct question once, keeping the per-question
# token lists in tokenizedQns and the flattened tokens in words.
uniqueQuestions = df.question1.unique()
tokenizedQns = []
words = []
for sentence in uniqueQuestions:
    tokens = tokenize(unicode(sentence,'utf8'),nlp)
    tokenizedQns.append(tokens)
    words.extend(tokens)

# question2: same procedure; words2 ends up as the flat token list, which is
# then appended to words.
words2 = []
for sentence in df.question2.unique():
    words2.extend(tokenize(unicode(sentence,'utf8'),nlp))
words.extend(words2)

Adding PAD as a filler for normalizing sentence length and UNK for unknown tokens

In [None]:
# Build the token -> integer index mapping.
words = set(words)

# Reserved entries: PAD (index 0) is the filler used to bring sentences to a
# common length, UNK (index 1) stands in for tokens missing from the vocabulary.
vocabulary = {'PAD': 0, 'UNK': 1}

# Walk the words in sorted order so every word receives the same index on every
# run (iteration order of a set is not guaranteed to be stable). setdefault
# keeps the reserved indices intact and the index range gap-free even if the
# corpus itself contains a token spelled 'PAD' or 'UNK'.
for word in sorted(words):
    vocabulary.setdefault(word, len(vocabulary))

# Format explicitly: print("...", x) shows a tuple under Python 2.
print("Vocabulary Size including PAD and UNK: {}".format(len(vocabulary)))

Each question is represented as a list of indices into the vocabulary

In [None]:
def loadWordVectors(filePath,vocab,vecSize=300):
    """Load pretrained vectors for the words of ``vocab`` from a text file.

    Every line of the file is expected to hold a word followed by its
    space-separated vector components. A line is skipped when its word is not
    in ``vocab`` or when it does not carry exactly ``vecSize`` components
    (for example a leading "<count> <dim>" header line).

    Args:
        filePath: path of the vector file to read.
        vocab: dict mapping word -> row index in the returned matrix.
        vecSize: number of components per vector (default 300).

    Returns:
        A float array of shape (len(vocab), vecSize). Rows of words that were
        not found in the file are left as zeros.
    """
    # Local import keeps this cell self-contained. io.open yields decoded text
    # on both Python 2 and Python 3, so no per-word unicode() call is needed.
    import io

    wordVecs = np.zeros((len(vocab),vecSize),dtype=float)
    # The with-block guarantees the file handle is closed, even on error.
    with io.open(filePath, encoding='utf8') as txt:
        for line in txt:
            # rstrip() removes the newline and any trailing space, so the split
            # works whether or not lines end with a space before the newline.
            splitData = line.rstrip().split(" ")
            word = splitData[0]
            if word not in vocab:
                continue
            vector = splitData[1:]
            if len(vector) != vecSize:
                # Header or malformed line: assigning it would fail to broadcast.
                continue
            wordVecs[vocab[word]] = np.array(vector,dtype=float)
    return wordVecs
wordVecSize = 300
# Pass the path that is actually read; previously the argument was ignored and
# 'data/wiki.en.vec' was hard-coded inside the function.
wordVecs = loadWordVectors('data/wiki.en.vec',vocabulary,wordVecSize)

In [None]:
# Words without a pretrained vector still have the all-zero row that
# loadWordVectors allocated for them. Rows of a NumPy array are never None, so
# missing vectors are detected by checking for rows with no non-zero entry.
missingRows = ~wordVecs.any(axis=1)

# Number of words that had no pretrained vector.
count = int(missingRows.sum())

# Give each missing word a random vector drawn uniformly from [-1, 1).
# NOTE(review): this also randomises the PAD / UNK rows when the vector file
# has no entry for them - confirm that is intended.
wordVecs[missingRows] = 2 * np.random.random_sample((count, wordVecSize)) - 1

In [None]:

def tokenizeAndIndex(sentence):
    """Tokenize ``sentence`` and translate each token into its vocabulary index.

    Uses the module-level ``nlp`` and ``vocabulary``. Tokens that are not keys
    of ``vocabulary`` are replaced by the index stored under 'UNK'.

    Returns:
        A list of integer indices, one per token.
    """
    tokens = tokenize(unicode(sentence,'utf8'),nlp)
    unkIndex = vocabulary['UNK']
    return [vocabulary.get(token, unkIndex) for token in tokens]
# Add an index-encoded version of each question column to the frame.
for targetCol, sourceCol in (('Q1Indexed', 'question1'), ('Q2Indexed', 'question2')):
    df[targetCol] = df[sourceCol].apply(tokenizeAndIndex)

Keep only the questions whose total word count is <= 50

In [None]:

seqLength = 50

# Keep only the pairs in which both questions have at most seqLength tokens.
withinLimit = (df.Q1Indexed.apply(len) <= seqLength) & (df.Q2Indexed.apply(len) <= seqLength)

# .copy() turns the filtered result into an independent frame, so assigning to
# its columns afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[withinLimit].copy()

def normalizeSequenceLength(sequence, length=None, padValue=None):
    """Return ``sequence`` right-padded with ``padValue`` up to ``length`` items.

    Args:
        sequence: list of token indices.
        length: target length; when omitted, the module-level ``seqLength``.
        padValue: filler value; when omitted, ``vocabulary['PAD']``.

    Returns:
        A new list. The input list is never modified, so the values stored in
        the source column stay untouched. A sequence that already has
        ``length`` or more items is returned as an unchanged copy.
    """
    if length is None:
        length = seqLength
    if padValue is None:
        padValue = vocabulary['PAD']
    missing = max(0, length - len(sequence))
    return list(sequence) + [padValue] * missing
# Pad both index-encoded question columns to the common sequence length.
for indexedCol in ('Q1Indexed', 'Q2Indexed'):
    df[indexedCol] = df[indexedCol].apply(normalizeSequenceLength)

**Building the Network**

Creating sentence embedding

In [None]:
# Clear TensorFlow's default graph before the cells below add their ops to it.
tf.reset_default_graph()

In [None]:
vocab_size = len(vocabulary)
embedding_size = wordVecSize

# Embedding matrix, initialised from the word vectors prepared above.
W = tf.Variable(wordVecs,name="W")

# Question 1: a batch of index sequences [batch, seqLength] is looked up in W
# and the token embeddings are summed over the sequence axis, giving one
# vector per question.
q1Input = tf.placeholder(tf.int32, [None, seqLength], name="q1Input")
q1Embeddings = tf.nn.embedding_lookup(W, q1Input)
q1Embeddings = tf.reduce_sum(q1Embeddings, 1)

# Question 2: same construction. The placeholder carries its own name
# ("q2Input") so both inputs can be told apart when addressed by name.
q2Input = tf.placeholder(tf.int32, [None, seqLength], name="q2Input")
q2Embeddings = tf.nn.embedding_lookup(W, q2Input)
q2Embeddings = tf.reduce_sum(q2Embeddings, 1)

# Concatenate the two question vectors along the feature axis.
sentenceEmbedding = tf.concat([q1Embeddings,q2Embeddings],axis=1,name='sentenceEmbedding')

Dense layers and output

In [None]:
# Three tanh hidden layers, each embedding_size*2 units wide, on top of the
# concatenated sentence embedding.
dense1 = tf.layers.dense(inputs=sentenceEmbedding, units=embedding_size*2, activation=tf.nn.tanh,name='dense1')
dense2 = tf.layers.dense(inputs=dense1, units=embedding_size*2, activation=tf.nn.tanh,name='dense2')
dense3 = tf.layers.dense(inputs=dense2, units=embedding_size*2, activation=tf.nn.tanh,name='dense3')

# Two output units, one per class; the prediction is the arg-max of the softmax.
logits = tf.layers.dense(inputs=dense3, units=2,name='logits')
predictions = tf.argmax(input=tf.nn.softmax(logits=logits,dim=-1,name='softmax'),axis=1,name='output')

# The layers are defined exactly once: building them a second time under the
# same names in the same graph fails because the layer variables already exist.