# LSTM Model with Supervised Word Embeddings
No pretrained embeddings were used in this notebook

In [1]:
import pandas as pd
import re
import os
import unicodedata
import string
import numpy as np
import tensorflow as tf

In [2]:
class Lang:
    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "pad"}
        self.n_words = 3  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.lower().split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1



#returns object which has word2index mapping, index2word, vocalbulary size and vocabulary
def get_question_metadata(question_list, reverse=False):
    question_repo = Lang()
    print("Read %s questions " % len(question_list))
    print("Counting words...")
    for question in question_list:
        question_repo.addSentence(question)

    print("Counted words:")
    print(question_repo.n_words)

    return question_repo

#make all the input text of the same size as size max_length input senetnce, padding with word "PAD"(zero padding)
def make_input(sentence):
    sent_len = len(sentence.split(' '))
    if sent_len < MAX_LEN:
        padded_sentence = sentence + (MAX_LEN - len(sentence.split(' '))) * " PAD"
    else:
        padded_sentence = sentence

    return padded_sentence,sent_len

#
# def question_index_vector(question):
#     input_question = indexesFromSentence(processed_input, question)
#     return (input_question)

#function to return batch of data
def get_sentence_batch(batch_size,data_x,data_y,data_seqlens,input_metadata):
    
    #shuffling and creating training batch data of batch_size
    instance_indices = list(range(len(data_x)))
    np.random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    
    #converting sentence to index vector using word2index dictionary
    x = [[input_metadata.word2index[word] for word in data_x[i].lower().split(' ')]for i in batch]
    y = [data_y[i] for i in batch]
    seqlens = [data_seqlens[i] for i in batch]
    return x,y,seqlens

def get_test_batch(batch_size,data_x, data_seqlens, input_metadata):
    
    #shuffling and creating training batch data of batch_size
    instance_indices = list(range(len(data_x)))
    np.random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    
    #converting sentence to index vector using word2index dictionary
    x = [[input_metadata.word2index[word] for word in data_x[i].lower().split(' ')]for i in batch]
    seqlens = [data_seqlens[i] for i in batch]
    return x, seqlens


In [3]:


#loading data
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")




In [4]:

#extracting data required for training
X_train = train_data["question_text"]
Y_train = train_data["target"]

X_test = test_data["question_text"]
# Y_test = test_data["target"]

In [None]:
#computing maximum length of question available in train.csv
MAX_LEN = X_train.map(lambda x: len(x.split(' '))).max()

In [None]:

batch_size = 128;embedding_dimension = 64;num_classes = 2
hidden_layer_size = 32;time_steps = 122; element_size =1


test_seqlens = []
train_seqlens = []
new_x_train = []
new_y_train = []
new_x_test = []
# new_y_test = []


In [None]:


question_metadata = get_question_metadata(X_train)
question_metadata_test = get_question_metadata(X_test)

Read 1306122 questions 
Counting words...
Counted words:
450693
Read 56370 questions 
Counting words...
Counted words:
64222


In [None]:

#representing output in the form of one-hot format
for i in range(len(Y_train)):
    label = Y_train[i]
    one_hot_encoding = [0]*2
    one_hot_encoding[label] = 1
    new_y_train.append(one_hot_encoding)

for ques in X_train:
    padded_sentence, sent_len = make_input(ques)
    new_x_train.append(padded_sentence)
    train_seqlens.append(sent_len)
    
# for i in range(len(Y_test)):
#     label = Y_test[i]
#     one_hot_encoding = [0]*2
#     one_hot_encoding[label] = 1
#     new_y_test.append(one_hot_encoding)
#
for ques in X_test:
    padded_sentence, sent_len = make_input(ques)
    new_x_test.append(padded_sentence)
    test_seqlens.append(sent_len)


# question_vectors = []
# for question in X_train:
#     question_vectors.append(indexesFromSentence(processed_input,question))

In [None]:
#creating placeholders for data
_inputs = tf.placeholder(tf.int32,shape=[None, time_steps], name="inputs" )
_labels = tf.placeholder(tf.float32, shape=[batch_size, num_classes], name="labels" )

#seqlens for dynamic calculations
_seqlens = tf.placeholder(tf.int32, shape=[None], name="seqs" )
_inputs_test = tf.placeholder(tf.int32,shape=[None, time_steps], name="inputs_test" )

In [None]:

#to obtain word's vector, tf.nn.embedding_lookup is used
with tf.name_scope("embeddings"):
    embeddings = tf.Variable(tf.random_uniform([question_metadata.n_words,embedding_dimension],
                                               -1.0,1.0,name='embedding'))
    embed = tf.nn.embedding_lookup(embeddings,_inputs)

with tf.variable_scope("lstm"):
    
    lstm_cell = tf.nn.rnn_cell.LSTMCell(hidden_layer_size,forget_bias=1.0)
    
    #resolving the issue of adding noise as the form "PAD", by passing actual sequence length to tf.nn.dynamic_rnn() 
    output, states = tf.nn.dynamic_rnn(lstm_cell, embed, sequence_length = _seqlens,dtype=tf.float32)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


In [None]:
weights = {
    'linear_layer': tf.Variable(tf.truncated_normal([hidden_layer_size,num_classes],mean=0,stddev=.01))
}

biases = {
    'linear_layer': tf.Variable(tf.truncated_normal([num_classes],mean=0,stddev=.01))
}

In [None]:
#Extract the last relevant output and use in a linear  layer

final_output = tf.matmul(states[1],weights['linear_layer'])+ biases['linear_layer']
softmax = tf.nn.softmax_cross_entropy_with_logits(logits=final_output,labels=_labels)
cross_entropy = tf.reduce_mean(softmax)

train_step = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(_labels,1),tf.argmax(final_output,1))
accuracy = (tf.reduce_mean(tf.cast(correct_prediction,tf.float32)))*100

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for step in range(1000):
        x_batch, y_batch, seqlen_batch = get_sentence_batch(batch_size,new_x_train,new_y_train,train_seqlens,question_metadata)

        sess.run(train_step,feed_dict={_inputs:x_batch,_labels:y_batch,_seqlens:seqlen_batch})

        if step % 100 == 0:
            acc = sess.run(accuracy,feed_dict={_inputs:x_batch,_labels:y_batch,_seqlens:seqlen_batch})
            print("Accuracy at %d: %.5f" % (step,acc))

Accuracy at 0: 19.53125
Accuracy at 100: 93.75000
Accuracy at 200: 94.53125
Accuracy at 300: 95.31250
Accuracy at 400: 93.75000
Accuracy at 500: 92.18750
Accuracy at 600: 94.53125
Accuracy at 700: 94.53125
Accuracy at 800: 92.96875
Accuracy at 900: 93.75000
