In [None]:
import pandas as pd
import numpy as np
import re
from autocorrect import spell

# Absolute, machine-specific input paths for the training CSV and the GloVe vectors.
# NOTE(review): these will not resolve on another machine — consider making them configurable.
train_file = '/home/abhilash/Kaggle/Quora/Data/train.csv'
glove_file = '/home/abhilash/MajorProject/MajorProject/glove.6B/glove.6B.300d.txt'
# Width of each GloVe vector; used as the column count of the embedding matrix.
glove_dim = 300
# Examples per minibatch; also the fixed first dimension of the TensorFlow placeholders.
batch_size = 32
# Fraction of the examples that split_data keeps for training; the remainder is validation.
split_ratio = 0.99
# Overwritten by pad_data (through `global`) with the longest sequence length of the
# batch it padded most recently.
max_len = 0
# Number of hidden units in each LSTM cell.
n_hidden = 128 
# Number of output classes, i.e. the width of the output layer.
n_classes = 2 
# Learning rate handed to the Adam optimizer.
learning_rate = 0.01

In [None]:
def clean_question(question):
    """Normalise a raw question into lower-case, space-separated alphanumeric tokens.

    Steps:
      1. Missing values (``None`` or a float NaN, which is how pandas represents
         an empty cell) become the empty string instead of the literal words
         "none" / "nan".
      2. The text is lower-cased and every character other than ASCII letters,
         digits and spaces is dropped (so "what's" becomes "whats").
      3. A space is inserted at every letter/digit boundary
         ("iphone6s" -> "iphone 6 s").
      4. Runs of spaces are collapsed and leading/trailing spaces removed, so a
         caller doing ``split(' ')`` never receives empty-string tokens from
         punctuation that was surrounded by spaces.

    Args:
        question: The raw question; any object, converted with ``str``.

    Returns:
        The cleaned question as a ``str`` (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if question is None or (isinstance(question, float) and question != question):
        return ''

    text = str(question).lower()
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    # Zero-width match between a letter and a digit, in either order.
    text = re.sub(r'(?<=[a-z])(?=[0-9])|(?<=[0-9])(?=[a-z])', ' ', text)
    # Only plain spaces can be left at this point; collapse and trim them.
    return ' '.join(text.split())

In [None]:
# Words that do not appear in the GloVe file keep their random initialisation;
# there is no dedicated unknown-word handling yet.
def generate_embeddings(word_dict, vocab_size, glove_path=None, dim=None):
    """Build the embedding matrix for the vocabulary from a GloVe text file.

    Every row starts as a uniform random vector in [-1, 1). Rows whose word is
    present in the GloVe file are then overwritten with the pretrained vector.

    Args:
        word_dict: Mapping from word to its row index in the matrix.
        vocab_size: Number of rows to allocate.
        glove_path: Path of the GloVe text file (one "word v1 v2 ..." entry per
            line). Defaults to the module-level ``glove_file``.
        dim: Length of each vector. Defaults to the module-level ``glove_dim``.

    Returns:
        A ``(vocab_size, dim)`` NumPy array.
    """
    if glove_path is None:
        glove_path = glove_file
    if dim is None:
        dim = glove_dim

    embedding_matrix = np.random.uniform(low=-1, high=1, size=(vocab_size, dim))
    num_embeddings = 0
    # The context manager closes the file even if parsing raises; the explicit
    # encoding avoids depending on the platform default for non-ASCII tokens.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            word = values[0]
            if word in word_dict:
                coefs = np.asarray(values[1:], dtype='float32')
                embedding_matrix[word_dict[word]] = coefs
                num_embeddings += 1
    print("Number of words ", vocab_size)
    print("Number of embeddings ", num_embeddings)
    return embedding_matrix


In [None]:
# Build the vocabulary and the list of cleaned (question1, question2, label)
# training tuples in a single pass over the CSV.
word_dict = {}
count = 1  # next free word index; index 0 is reserved for padding
train_tuples = []

train_df = pd.read_csv(train_file)
# Iterating the three columns directly avoids building a Series per row,
# which is what makes iterrows slow.
for raw_q1, raw_q2, label in zip(train_df['question1'],
                                 train_df['question2'],
                                 train_df['is_duplicate']):
    q1 = clean_question(raw_q1)
    q2 = clean_question(raw_q2)

    train_tuples.append((q1, q2, label))

    # Words of question1 are indexed before those of question2, in order of
    # first appearance.
    for word in q1.split(' ') + q2.split(' '):
        if word not in word_dict:
            word_dict[word] = count
            count += 1

In [None]:
# Word indices run from 1 to len(word_dict) and index 0 is the padding row, so
# the matrix needs exactly len(word_dict) + 1 rows. (`count` already points one
# past the last assigned index, so `count + 1` allocated a row nothing used.)
vocab_size = len(word_dict) + 1
embedding_matrix = generate_embeddings(word_dict, vocab_size)

TensorFlow Code

In [None]:
import tensorflow as tf
import random
from tensorflow.contrib import rnn


In [None]:
def vectorize(data, word_dict):
    """Turn cleaned question pairs into sequences of word indices.

    Args:
        data: Iterable of tuples whose items 0 and 1 are space-separated
            question strings and whose item 2 is the label.
        word_dict: Mapping from word to integer index; a word that is not a
            key raises ``KeyError``.

    Returns:
        Three parallel lists: index sequences for the first questions, index
        sequences for the second questions, and the labels.
    """
    def to_indices(sentence):
        return [word_dict[token] for token in sentence.split(' ')]

    first_seqs, second_seqs, targets = [], [], []
    for example in data:
        first_seqs.append(to_indices(example[0]))
        second_seqs.append(to_indices(example[1]))
        targets.append(example[2])
    return first_seqs, second_seqs, targets
    

In [None]:
def get_batches(size, batch_size, shuffle=False):
    """Split the index range ``[0, size)`` into consecutive minibatches.

    Args:
        size: Total number of examples.
        batch_size: Maximum number of indices per minibatch; the last one may
            be shorter.
        shuffle: When true, the order of the minibatches is randomised with
            ``np.random.shuffle`` (indices inside a minibatch stay contiguous).

    Returns:
        A list of 1-D NumPy index arrays, one per minibatch.
    """
    starts = np.arange(0, size, batch_size)
    if shuffle:
        np.random.shuffle(starts)
    return [np.arange(start, min(start + batch_size, size)) for start in starts]

In [None]:
def pad_data(seqs):
    """Right-pad integer sequences with zeros to a common length.

    Side effect: stores the longest sequence length in the module-level
    ``max_len``.

    Args:
        seqs: List of integer sequences.

    Returns:
        An ``int32`` array of shape ``(len(seqs), longest length)`` where row
        ``i`` starts with ``seqs[i]`` and is zero afterwards.
    """
    global max_len
    max_len = np.max([len(seq) for seq in seqs])

    padded = np.zeros((len(seqs), max_len)).astype('int32')
    # Each `row` is a view into `padded`, so writing to it fills the matrix.
    for row, seq in zip(padded, seqs):
        row[:len(seq)] = seq
    return padded

In [None]:
def gen_batch_data(q1, q2, labels, batch_size):
    """Group vectorised examples into padded minibatches.

    Args:
        q1: Index sequences of the first questions.
        q2: Index sequences of the second questions.
        labels: Labels parallel to ``q1``/``q2``.
        batch_size: Number of examples per minibatch (the last may be smaller).

    Returns:
        A list of ``(padded_q1, padded_q2, one_hot_labels)`` tuples. The two
        question matrices are padded independently of each other; a label
        equal to 0 becomes ``[1, 0]`` and any other label ``[0, 1]``.
    """
    examples = []
    for indices in get_batches(len(q1), batch_size):
        batch_q1 = [q1[i] for i in indices]
        batch_q2 = [q2[i] for i in indices]
        one_hot = [[1, 0] if labels[i] == 0 else [0, 1] for i in indices]
        examples.append((pad_data(batch_q1), pad_data(batch_q2), one_hot))
    return examples

In [None]:
def split_data(q1, q2, labels, split_ratio):
    """Cut three parallel sequences into a leading and a trailing part.

    The cut position is ``int(len(q1) * split_ratio)``; the trailing part ends
    at ``len(q1)`` for all three sequences.

    Returns:
        A 6-tuple: the leading parts of ``q1``, ``q2`` and ``labels`` followed
        by their trailing parts.
    """
    total = len(q1)
    cut = int(total * split_ratio)
    columns = (q1, q2, labels)
    leading = tuple(col[:cut] for col in columns)
    trailing = tuple(col[cut:total] for col in columns)
    return leading + trailing

In [None]:
#Could be optimized shuffling can be performed after vectorization
def generate_data():
    """Shuffle, vectorise, split and batch the module-level training tuples.

    Shuffles ``train_tuples`` in place, so every call produces a different
    train/validation partition.

    Returns:
        ``(train_data, val_data)``, each a list of minibatch tuples as built
        by ``gen_batch_data``.
    """
    random.shuffle(train_tuples)
    q1_all, q2_all, labels_all = vectorize(train_tuples, word_dict)
    parts = split_data(q1_all, q2_all, labels_all, split_ratio)
    # parts[0:3] is the training portion, parts[3:6] the validation portion.
    train_data = gen_batch_data(parts[0], parts[1], parts[2], batch_size)
    val_data = gen_batch_data(parts[3], parts[4], parts[5], batch_size)
    return train_data, val_data

In [None]:
# Embedding table as a trainable float64 variable, created zero-filled with the
# same shape as embedding_matrix.
W = tf.Variable(tf.constant(0.0, shape=[embedding_matrix.shape[0], embedding_matrix.shape[1]], dtype=tf.float64), trainable=True, name="W", dtype=tf.float64)
# Placeholder of the same shape, through which the NumPy matrix is fed in.
embedding_placeholder = tf.placeholder(tf.float64, [embedding_matrix.shape[0], embedding_matrix.shape[1]])
# Op that copies the fed matrix into W; it only takes effect when run in a session.
embedding_init = W.assign(embedding_placeholder)

In [None]:
# NOTE(review): n_steps is not referenced by any other code visible in this
# notebook — confirm whether it can be removed.
n_steps = 28 

In [None]:
# Word-index inputs for the two questions. The first dimension is fixed to
# batch_size, so only batches with exactly batch_size rows can be fed; the
# second (sequence length) dimension is left open.
x1 = tf.placeholder('int32', [batch_size, None])
x2 = tf.placeholder('int32', [batch_size, None])


# One-hot labels, two columns per example.
y_train = tf.placeholder('int32', [batch_size, 2])

# Output layer parameters, randomly initialised. The input width 4*n_hidden
# matches the concatenation in BiRNN of two question vectors of 2*n_hidden each.
weights = {
    'out': tf.Variable(tf.random_normal([4*n_hidden, n_classes], dtype=tf.float64))
}
biases = {
    'out': tf.Variable(tf.random_normal([n_classes], dtype=tf.float64))
}

In [None]:
def BiRNN(x1_emb, x2_emb, weights, biases):
    """Encode both questions with separate bidirectional LSTMs and classify the pair.

    The previous implementation passed the per-example list straight to
    ``static_bidirectional_rnn``, which interprets list elements as time
    steps. The recurrence therefore ran across the examples of a batch rather
    than across the words of each question. Here the inputs are put back into
    ``[batch, time, dim]`` layout and unrolled over the time axis with
    ``tf.nn.bidirectional_dynamic_rnn``.

    Args:
        x1_emb: Embedded first questions, either one ``[batch, time, dim]``
            tensor or a list with one ``[time, dim]`` tensor per example.
        x2_emb: Embedded second questions, same layout as ``x1_emb``.
        weights: Dict whose ``'out'`` entry is the ``[4*n_hidden, n_classes]``
            output matrix.
        biases: Dict whose ``'out'`` entry is the ``[n_classes]`` output bias.

    Returns:
        Logits tensor of shape ``[batch, n_classes]``.
    """
    def _encode(inputs, scope):
        # One question encoder: final forward and backward hidden states, concatenated.
        if isinstance(inputs, (list, tuple)):
            # Per-example list -> single [batch, time, dim] tensor.
            inputs = tf.stack(inputs, axis=0)
        with tf.variable_scope(scope):
            fw_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
            bw_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
            _, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, inputs, dtype=tf.float64)
        # No sequence lengths are available here, so padded positions are
        # processed like ordinary tokens.
        return tf.concat([fw_state.h, bw_state.h], 1)

    # Separate scopes keep independent LSTM weights for the two questions.
    q1_final_emb = _encode(x1_emb, 'op1')
    q2_final_emb = _encode(x2_emb, 'op2')

    return tf.matmul(tf.concat([q1_final_emb, q2_final_emb], 1), weights['out']) + biases['out']

In [None]:
# Look up the embedding row of every word index in both question batches.
x1_embedding = tf.nn.embedding_lookup(W, x1)
x2_embedding = tf.nn.embedding_lookup(W, x2)

# tf.unstack(axis=0) splits along the first axis, which is the batch axis of
# x1/x2, so BiRNN receives a Python list with one tensor per example.
pred = BiRNN(tf.unstack(x1_embedding, axis=0), tf.unstack(x2_embedding, axis=0), weights, biases)

# Mean softmax cross-entropy between the logits and the one-hot labels,
# minimised with Adam.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y_train))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# Fraction of examples in the batch whose highest-scoring class matches the label.
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y_train,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float64))

# Created after all variables above so that it initialises every one of them.
init = tf.global_variables_initializer()

In [None]:
with tf.Session() as sess:
    sess.run(init)
    # Copy the pretrained vectors into the embedding variable W.
    sess.run(embedding_init, feed_dict={embedding_placeholder: embedding_matrix})

    # Build the split once. generate_data() reshuffles and re-splits on every
    # call, so calling it per epoch would move earlier validation examples
    # into the training set.
    train_data, val_data = generate_data()
    # The placeholders have a fixed first dimension of batch_size, so a
    # trailing incomplete batch cannot be fed and is dropped.
    train_batches = [tup for tup in train_data if len(tup[0]) == batch_size]
    val_batches = [tup for tup in val_data if len(tup[0]) == batch_size]

    n_epochs = 2
    for epoch in range(1, n_epochs + 1):
        # Vary the order in which batches are seen from epoch to epoch.
        random.shuffle(train_batches)
        for ct, tup in enumerate(train_batches, start=1):
            feed = {x1: tup[0], x2: tup[1], y_train: tup[2]}
            # Fetch the accuracy in the same call as the update step instead
            # of running a second forward pass over the batch.
            _, ac = sess.run([optimizer, accuracy], feed_dict=feed)
            print("In ct " + str(ct) + " Training Accuracy is " + "{:.6f}".format(ac))
            if ct % 10 == 0 and val_batches:
                # Average in NumPy: building tf.reduce_mean here would add new
                # ops to the graph on every evaluation.
                val_acc = np.mean([
                    sess.run(accuracy, feed_dict={x1: val_tup[0], x2: val_tup[1], y_train: val_tup[2]})
                    for val_tup in val_batches
                ])
                print("In Epoch " + str(epoch) + " Validation Accuracy is " + "{:.6f}".format(val_acc))
    print("Optimization Finished!")
