In [1]:
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
import tensorflow as tf

In [2]:
max_sent_length = 0
def read_data(filename):
    """Read a ``LABEL:question`` file into tokenised questions and labels.

    The text before the first colon of each line becomes the label. The text
    after it is lower-cased and split on whitespace to form the question.
    Note that this keeps the word directly after the colon (e.g. 'manner')
    as the first token of the question.

    Side effect: the module-level ``max_sent_length`` is raised to the largest
    token count seen so far, across all calls.

    Args:
        filename: path of the file to read (decoded as latin-1).

    Returns:
        (questions, labels): a list of token lists and the parallel list of
        label strings.

    Raises:
        ValueError: if a non-blank line has no ':' separator.
    """
    global max_sent_length
    questions = []
    labels = []
    # Context manager guarantees the handle is closed, even if parsing fails.
    with open(filename,'r',encoding='latin-1') as fptr:
        for line_no,line in enumerate(fptr,start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the FIRST colon only, so a question that itself
            # contains ':' is not silently truncated.
            lb,sep,q = line.partition(":")
            if not sep:
                raise ValueError("%s:%d: expected 'LABEL:question', got %r"%(filename,line_no,line))
            tokens = q.lower().split()
            labels.append(lb)
            questions.append(tokens)
            max_sent_length = max(max_sent_length,len(tokens))
    return questions,labels

In [3]:
# Load both splits. read_data() also updates max_sent_length as a side effect,
# so after the two calls it covers the questions of both files.
train_path = os.path.join('question-classif-data','trec-train-1000.txt')
train_questions,train_labels = read_data(train_path)
assert len(train_labels)==len(train_questions)

test_path = os.path.join('question-classif-data','trec-test.txt')
test_questions,test_labels = read_data(test_path)
assert len(test_labels)==len(test_questions)

# Print the first five training examples as a sanity check.
for sample_id in range(5):
    sample_question,sample_label = train_questions[sample_id],train_labels[sample_id]
    print('\tQuestion %d: %s' %(sample_id,sample_question))
    print('\tLabel %d: %s\n'%(sample_id,sample_label))

print('Max Sentence Length: %d'%max_sent_length)
print('\nNormalizing all sentences to same length now.....')

	Question 0: ['manner', 'how', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'russia', '?']
	Label 0: DESC

	Question 1: ['cremat', 'what', 'films', 'featured', 'the', 'character', 'popeye', 'doyle', '?']
	Label 1: ENTY

	Question 2: ['manner', 'how', 'can', 'i', 'find', 'a', 'list', 'of', 'celebrities', "'", 'real', 'names', '?']
	Label 2: DESC

	Question 3: ['animal', 'what', 'fowl', 'grabs', 'the', 'spotlight', 'after', 'the', 'chinese', 'year', 'of', 'the', 'monkey', '?']
	Label 3: ENTY

	Question 4: ['exp', 'what', 'is', 'the', 'full', 'form', 'of', '.com', '?']
	Label 4: ABBR

Max Sentence Length: 33

Normalizing all sentences to same length now.....


In [19]:
def pad_questions(questions,target_length,pad_token='PAD'):
    """Right-pad every token list in `questions`, in place, to `target_length`.

    Lists that are already `target_length` long are left untouched, so calling
    this again on padded data is harmless. Returns the same `questions` list.
    """
    for que in questions:
        que.extend([pad_token]*(target_length-len(que)))
        assert len(que)==target_length
    return questions

pad_questions(train_questions,max_sent_length)
print('\tTrain questions normalized')
pad_questions(test_questions,max_sent_length)
print('\tTest questions normalized')
# Apply the format with % -- passing the list as a second print() argument
# left the literal '%s' in the output.
print('\t\tSample test question: %s'%test_questions[0])

	Train questions normalized
	Test questions normalized
		Sample test question: %s ['dist', 'how', 'far', 'is', 'it', 'from', 'denver', 'to', 'aspen', '?', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


In [5]:
# Make data Numerical

def build_dataset(questions):
    """Replace every token in `questions` by an integer ID.

    IDs follow descending token frequency: the most frequent token gets ID 0.

    Args:
        questions: list of token lists.

    Returns:
        data_list: `questions` with each token replaced by its ID.
        count: list of (token, frequency) pairs, most frequent first.
        dictionary: token -> ID.
        reverse_dictionary: ID -> token.
    """
    words = [word for que in questions for word in que]
    print('%d Words found.'%len(words))
    # Count once and reuse the result for both the report and the vocabulary.
    count = collections.Counter(words).most_common()
    print('Found %d words in the vocabulary. '%len(count))
    dictionary = {word: idx for idx,(word,_) in enumerate(count)}
    data_list = [[dictionary[word] for word in que] for que in questions]
    reverse_dictionary = {idx: word for word,idx in dictionary.items()}
    return data_list, count, dictionary, reverse_dictionary
    
# Build a single vocabulary over train and test together, so that every
# token of either split has an entry in `dictionary`.
all_questions = train_questions + test_questions
all_question_ind, count, dictionary, reverse_dictionary = build_dataset(all_questions)

49500 Words found.
Found 3369 words in the vocabulary. 


In [6]:
# Printing a few values (to check)

print('All words (count)', count[:5])
# Apply the format with % -- passing the value as a second print() argument
# left the literal '%s' in the output.
print('0th entry in dictionary: %s'%reverse_dictionary[0])
print('\nSample data')
print(all_question_ind[0])
print(all_question_ind[1])

print('\nVocabulary size: ',len(dictionary))
# Used by later cells as the width of the one-hot word encoding.
vocabulary_size = len(dictionary)

print('\nTrain size: ',len(train_questions))
print('Test size: ',len(test_questions))

All words (count) [('PAD', 34407), ('?', 1454), ('the', 999), ('what', 963), ('is', 587)]
0th entry in dictionary: %s PAD

Sample data
[38, 12, 19, 2995, 1454, 6, 28, 2886, 2164, 850, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[44, 3, 827, 3120, 2, 175, 1597, 1413, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Vocabulary size:  3369

Train size:  1000
Test size:  500


In [7]:
batch_size = 16
# Derive the padded sentence length from the data instead of hard-coding the
# value that happened to be printed for this particular dataset.
sent_length = max_sent_length
all_labels = ['NUM','LOC','HUM','DESC','ENTY','ABBR']
# Keep the class count tied to the label list so the two cannot drift apart.
num_classes = len(all_labels)

class BatchGenerator(object) :
    """Cycles over (questions, labels) and emits fixed-size one-hot batches.

    Args:
        batch_size: number of questions per batch.
        questions: list of token lists. Each must have at most `seq_length`
            tokens, all of them present in `word_to_id`.
        labels: list of label strings, parallel to `questions`.
        word_to_id: optional token -> ID mapping. Defaults to the module-level
            `dictionary`, looked up when a batch is generated.
        label_names: optional ordered list of class names. Defaults to the
            module-level `all_labels`.
        seq_length: optional padded sentence length. Defaults to the
            module-level `sent_length`.
    """
    def __init__(self,batch_size,questions,labels,word_to_id=None,label_names=None,seq_length=None):
        self.questions = questions
        self.labels = labels
        self.text_size = len(questions)
        self.batch_size = batch_size
        self.data_index = 0
        self.word_to_id = word_to_id
        self.label_names = label_names
        self.seq_length = seq_length

        assert len(self.questions)==len(self.labels)

    def generate_batch(self) :
        """Return the next batch as (inputs, labels_ohe).

        inputs has shape [batch_size, seq_length, vocabulary size] and
        labels_ohe has shape [batch_size, number of classes]; both are float32
        one-hot arrays. When fewer than `batch_size` questions remain, the
        generator restarts from the first question, so a trailing partial
        batch is never emitted. If the whole dataset is smaller than
        `batch_size`, the unused rows stay all-zero.
        """
        # Fall back to the notebook-level globals only when nothing was injected.
        word_to_id = dictionary if self.word_to_id is None else self.word_to_id
        label_names = all_labels if self.label_names is None else self.label_names
        seq_length = sent_length if self.seq_length is None else self.seq_length

        inputs = np.zeros((self.batch_size,seq_length,len(word_to_id)),dtype=np.float32)
        labels_ohe = np.zeros((self.batch_size,len(label_names)),dtype=np.float32)

        # Strict '>' so that a final batch ending exactly at the end of the
        # data is still served ('>=' skipped it).
        if (self.data_index + self.batch_size) > self.text_size :
            self.data_index = 0
        start = self.data_index
        # Slice with this generator's own batch size, not the global one.
        batch_questions = self.questions[start:start+self.batch_size]
        for qi,que in enumerate(batch_questions) :
            for wi,word in enumerate(que) :
                inputs[qi,wi,word_to_id[word]] = 1.0

            labels_ohe[qi,label_names.index(self.labels[start+qi])] = 1.0

        self.data_index = (start + self.batch_size)%self.text_size

        return inputs,labels_ohe

    def return_index(self):
        """Return the position of the first question of the next batch."""
        return self.data_index
            

In [8]:
# Draw two consecutive batches and show how the read position advances.
sample_gen = BatchGenerator(batch_size,train_questions,train_labels)

sample_batch_inputs,sample_batch_labels = sample_gen.generate_batch()
print(sample_gen.return_index())

sample_batch_inputs_2,sample_batch_labels_2 = sample_gen.generate_batch()
print(sample_gen.return_index())

print("#############")
# First example of the first batch: one-hot words and one-hot label.
first_inputs,first_label = sample_batch_inputs[0],sample_batch_labels[0]
print(first_inputs,first_label)
print(len(sample_batch_inputs),len(sample_batch_labels))


16
32
#############
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]] [0. 0. 0. 1. 0. 0.]
16 16


In [9]:
# Taking the argmax over the one-hot axis turns the first batch row back into
# word IDs, which can be compared with the first training question by eye.
first_question = train_questions[0]
first_question_ids = np.argmax(sample_batch_inputs[0],axis=1)
print(first_question)
print(len(first_question))
print(first_question_ids)
print(len(first_question_ids))

['manner', 'how', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'russia', '?', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
33
[  38   12   19 2995 1454    6   28 2886 2164  850    1    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]
33


In [10]:
# Direct dictionary lookup of the first training question, for comparison
# with the IDs recovered from the batch.
first_question_lookup = np.asarray([dictionary[w] for w in train_questions[0]],dtype=np.int32)
print(first_question_lookup)
print(len(first_question_lookup))

[  38   12   19 2995 1454    6   28 2886 2164  850    1    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]
33


In [11]:
# One-hot matrix of the first example, then the number of examples in the batch.
first_sample = sample_batch_inputs[0]
print(first_sample)
print(sample_batch_inputs.shape[0])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
16


In [12]:
# Class indices (positions in all_labels) of the two sampled batches.
print('Sample batch labels')
for batch_labels in (sample_batch_labels,sample_batch_labels_2):
    print(np.argmax(batch_labels,axis=1))

Sample batch labels
[3 4 3 4 5 2 2 2 3 2 0 3 2 2 4 1]
[3 0 3 3 0 4 2 3 3 4 2 1 4 1 5 4]


In [13]:
# Stand-alone look at the shape of one convolution filter:
# 3 words wide, spanning the full one-hot vocabulary, one output channel.
w1_init = tf.truncated_normal([3,vocabulary_size,1],stddev=0.02,dtype=tf.float32)
w1 = tf.Variable(w1_init,name='weights_1')
print(w1.shape)

(3, 3369, 1)


In [14]:
# CNN Model

batch_size = 16

def conv_tanh_layer(inputs,width,weights_name,bias_name):
    """Create one filter of the given width and apply conv1d (stride 1, SAME) + tanh.

    Returns the (weights, bias, activation) triple.
    """
    weights = tf.Variable(tf.truncated_normal([width,vocabulary_size,1],stddev=0.02,dtype=tf.float32),name=weights_name)
    bias = tf.Variable(tf.random_uniform([1],0,0.01,dtype=tf.float32),name=bias_name)
    activation = tf.nn.tanh(tf.nn.conv1d(inputs,weights,stride=1,padding='SAME') + bias)
    return weights,bias,activation

# Inputs (one-hot encoded sentences) and one-hot labels
sent_inputs = tf.placeholder(tf.float32,[batch_size,sent_length,vocabulary_size],name='sentence_inputs')
sent_labels = tf.placeholder(tf.float32,[batch_size,num_classes],name='sentence_labels')

# Three filters with context window sizes 3, 5 and 7
w1,b1,h1_1 = conv_tanh_layer(sent_inputs,3,'weights_1','bias_1')
w2,b2,h1_2 = conv_tanh_layer(sent_inputs,5,'weights_2','bias_2')
w3,b3,h1_3 = conv_tanh_layer(sent_inputs,7,'weights_3','bias_3')

# Max pooling along the sentence axis, one value per filter
h2_1,h2_2,h2_3 = [tf.reduce_max(h,axis=1) for h in (h1_1,h1_2,h1_3)]

h2 = tf.concat([h2_1,h2_2,h2_3],axis=1)
h2_shape = h2.get_shape().as_list()
print(h2_shape)

# Fully connected output layer
w_fc1 = tf.Variable(tf.truncated_normal([h2_shape[1],num_classes],stddev=0.005,dtype=tf.float32),name='weights_fulcon_1')
b_fc1 = tf.Variable(tf.random_uniform([num_classes],0,0.01,dtype=tf.float32),name='bias_fulcon_1')

logits = tf.matmul(h2,w_fc1) + b_fc1

predictions = tf.argmax(tf.nn.softmax(logits),axis=1)

# Loss (cross-entropy, averaged over the batch)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sent_labels,logits=logits))

# Momentum optimizer
optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,momentum=0.9).minimize(loss)

[16, 3]


In [15]:
batch_size = 16

# inputs and labels
sent_inputs = tf.placeholder(shape=[batch_size,sent_length,vocabulary_size],dtype=tf.float32,name='sentence_inputs')
sent_labels = tf.placeholder(shape=[batch_size,num_classes],dtype=tf.float32,name='sentence_labels')

# 3 filters with different context window sizes (3,5,7)
# Each of these filters spans the full one-hot-encoded length of each word and the context window width
w1 = tf.Variable(tf.truncated_normal([3,vocabulary_size,1],stddev=0.02,dtype=tf.float32),name='weights_1')
b1 = tf.Variable(tf.random_uniform([1],0,0.01,dtype=tf.float32),name='bias_1')

w2 = tf.Variable(tf.truncated_normal([5,vocabulary_size,1],stddev=0.02,dtype=tf.float32),name='weights_2')
b2 = tf.Variable(tf.random_uniform([1],0,0.01,dtype=tf.float32),name='bias_2')

w3 = tf.Variable(tf.truncated_normal([7,vocabulary_size,1],stddev=0.02,dtype=tf.float32),name='weights_3')
b3 = tf.Variable(tf.random_uniform([1],0,0.01,dtype=tf.float32),name='bias_3')

# Calculate the output for all the filters with a stride 1
h1_1 = tf.nn.tanh(tf.nn.conv1d(sent_inputs,w1,stride=1,padding='SAME') + b1)
h1_2 = tf.nn.tanh(tf.nn.conv1d(sent_inputs,w2,stride=1,padding='SAME') + b2)
h1_3 = tf.nn.tanh(tf.nn.conv1d(sent_inputs,w3,stride=1,padding='SAME') + b3)

# This is doing the max pooling. There are two options to do the max pooling
# 1. Use tf.nn.max_pool operation on a tensor made by concatenating h1_1,h1_2,h1_3 and converting that tensor to 4D
# (Because max_pool expects a 4D tensor)
# 2. Do the max pooling separately for each filter output and combine them using tf.concat
# (this is the one used in the code)

h2_1 = tf.reduce_max(h1_1,axis=1)
h2_2 = tf.reduce_max(h1_2,axis=1)
h2_3 = tf.reduce_max(h1_3,axis=1)

h2 = tf.concat([h2_1,h2_2,h2_3],axis=1)
h2_shape = h2.get_shape().as_list()

# Weights and bias of the output layer
w_fc1 = tf.Variable(tf.truncated_normal([h2_shape[1],num_classes],stddev=0.005,dtype=tf.float32),name='weights_fulcon_1')
b_fc1 = tf.Variable(tf.random_uniform([num_classes],0,0.01,dtype=tf.float32),name='bias_fulcon_1')

# Since h2 is 2D [batch_size,output_width], no reshape is needed before the fully connected layer
logits = tf.matmul(h2,w_fc1) + b_fc1

predictions = tf.argmax(tf.nn.softmax(logits),axis=1)

# Loss (Cross-Entropy)
# Use the _v2 op: the non-v2 variant is deprecated and emits a warning. The
# labels come from a placeholder, so letting gradients flow into them (the
# only difference in v2) does not change training.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sent_labels,logits=logits))

# Momentum Optimizer
optimizer = tf.train.MomentumOptimizer(learning_rate=0.01,momentum=0.9).minimize(loss)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [16]:
# Author's note: with filter widths [3,5,7] the algorithm achieves around ~90% accuracy on test dataset (50 epochs).

# Training configuration
batch_size = 16
num_steps = 50       # number of passes of the outer training loop
test_interval = 1    # evaluate on the test set every `test_interval` passes

# Open the session and initialise every variable created so far in the graph.
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())

train_gen = BatchGenerator(batch_size,train_questions,train_labels)
test_gen = BatchGenerator(batch_size,test_questions,test_labels)

# Get accuracy
def accuracy(labels,preds):
    """Return the fraction of rows whose one-hot label matches the predicted class.

    Args:
        labels: 2-D one-hot array, one row per example.
        preds: predicted class index for each example.
    """
    true_classes = np.argmax(labels,axis=1)
    n_correct = np.sum(true_classes==preds)
    return n_correct/labels.shape[0]

print('Initialized\n')

# NOTE(review): with 1000 training questions and batch_size 16 this gives 61
# batches per pass, while the generator serves 62 full batches before
# wrapping -- confirm whether the "-1" is intended.
train_batches_per_epoch = (len(train_questions)//batch_size)-1
test_batches_per_eval = (len(test_questions)-1)//batch_size

for step in range(num_steps):
    # Training pass: one optimizer update per batch, collecting the batch losses.
    epoch_losses = []
    for tr_i in range(train_batches_per_epoch):
        tr_inputs, tr_labels = train_gen.generate_batch()
        train_feed = {sent_inputs: tr_inputs, sent_labels: tr_labels}
        batch_loss = session.run([loss,optimizer],feed_dict=train_feed)[0]
        epoch_losses.append(batch_loss)

    print('Train Loss at Epoch %d: %.2f'%(step,np.mean(epoch_losses)))

    # Periodic evaluation: mean per-batch accuracy over the test batches.
    test_accuracy = []
    if (step+1)%test_interval!=0:
        continue
    for ts_i in range(test_batches_per_eval):
        ts_inputs,ts_labels = test_gen.generate_batch()
        test_feed = {sent_inputs: ts_inputs, sent_labels: ts_labels}
        preds = session.run(predictions,feed_dict=test_feed)
        test_accuracy.append(accuracy(ts_labels,preds))

    print('Test accuracy at Epoch %d: %.3f'%(step,np.mean(test_accuracy)*100.0))


Initialized

Train Loss at Epoch 0: 1.74
Test accuracy at Epoch 0: 18.548
Train Loss at Epoch 1: 1.69
Test accuracy at Epoch 1: 18.548
Train Loss at Epoch 2: 1.67
Test accuracy at Epoch 2: 18.548
Train Loss at Epoch 3: 1.66
Test accuracy at Epoch 3: 18.548
Train Loss at Epoch 4: 1.65
Test accuracy at Epoch 4: 29.032
Train Loss at Epoch 5: 1.61
Test accuracy at Epoch 5: 30.242
Train Loss at Epoch 6: 1.52
Test accuracy at Epoch 6: 30.242
Train Loss at Epoch 7: 1.38
Test accuracy at Epoch 7: 31.452
Train Loss at Epoch 8: 1.29
Test accuracy at Epoch 8: 31.452
Train Loss at Epoch 9: 1.24
Test accuracy at Epoch 9: 31.452
Train Loss at Epoch 10: 1.20
Test accuracy at Epoch 10: 31.452
Train Loss at Epoch 11: 1.19
Test accuracy at Epoch 11: 31.452
Train Loss at Epoch 12: 1.17
Test accuracy at Epoch 12: 32.258
Train Loss at Epoch 13: 1.14
Test accuracy at Epoch 13: 53.629
Train Loss at Epoch 14: 1.11
Test accuracy at Epoch 14: 58.065
Train Loss at Epoch 15: 1.07
Test accuracy at Epoch 15: 58.871

In [17]:
# Close the InteractiveSession that was opened for training.
session.close()