#### Sentence Classification

We will download the corpus from [http://cogcomp.org/Data/QA/QC/](http://cogcomp.org/Data/QA/QC/).


In [7]:
import os
from urllib.request import urlretrieve
import shutil

url = 'http://cogcomp.org/Data/QA/QC/'
train_file_name = 'train_1000.label'
test_file_name = 'TREC_10.label'

def maybe_download(url, file_name):
    if os.path.exists(file_name):
        print('Requested file', file_name, 'exists locally, no download will be performed')
    else:
        file_url = url + file_name
        print('Downloading from', file_url)
        local_tmp_file, _ = urlretrieve(file_url)
        shutil.move(local_tmp_file, file_name)
        print('Remote file successfully downloaded')

maybe_download(url, train_file_name)
maybe_download(url, test_file_name)

Downloading from http://cogcomp.org/Data/QA/QC/train_1000.label
Remote file successfully downloaded
Downloading from http://cogcomp.org/Data/QA/QC/TREC_10.label
Remote file successfully downloaded


The file has following format
``<class>:<subclass> <question>?``

The taxonomy of the class and subclass can be found [here](http://cogcomp.org/Data/QA/QC/definition.html)


In [32]:
def read_questions(in_file):
    #returns 
    # class: List of class, the total results are same as the number of questions in the file
    # sub_class: List of sub class, the total results are same as the number of questions in the file    
    # questions: each item in the list of is a list of split of the question by space
    # max_question_len: Maximum length of the question after splitting by space
    question_class = []
    question_subclass = []
    splits = []
    max_len = 0
    with open(in_file, 'r', encoding = 'latin-1') as f:
        lines = f.readlines()
        for line in lines:
            head, *tail = line.split(':')
            tail = tail[0].lower().split()
            question_class.append(head)
            question_subclass.append(tail[0])
            splits.append(tail[1:])
            max_len = max(max_len, len(tail) - 1)
    
    return question_class, question_subclass, splits, max_len

In [33]:
train_question_class, _ , train_questions, train_max_len =  read_questions(train_file_name)
test_question_class, _ , test_questions, test_max_len =  read_questions(test_file_name)
print('Max num of words in train question corpus is', train_max_len)
print('Max num of words in test question corpus is', test_max_len)

Max num of words in train question corpus is 32
Max num of words in test question corpus is 17


Lets look at first few question categories and question splits

In [34]:
for i in range(3):
    print('Question category is', train_question_class[i])
    print('\tQuestion tokens are', train_questions[i])    

Question category is DESC
	Question tokens are ['how', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'russia', '?']
Question category is ENTY
	Question tokens are ['what', 'films', 'featured', 'the', 'character', 'popeye', 'doyle', '?']
Question category is DESC
	Question tokens are ['how', 'can', 'i', 'find', 'a', 'list', 'of', 'celebrities', "'", 'real', 'names', '?']


We will pad the words with a padding string ``PAD`` to ensure all questions have same length

In [38]:
def pad_questions(unpadded_questions, max_len):
    padded_questions = []
    for up in unpadded_questions:
        q = ['PAD'] * max_len
        padded_questions.append(q)
        for i, token in enumerate(up):
            q[i] = token
        
    return padded_questions

max_len = max(train_max_len, test_max_len)
padded_train_set = pad_questions(train_questions, max_len)
padded_test_set = pad_questions(test_questions, max_len)

print('Length of padded questions is', max_len)
print('Sample padded training question is\n\t', padded_train_set[0])

Length of padded questions is 32
Sample padded training question is
	 ['how', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'russia', '?', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


---

We will create the following 4 data structures from the padded questions

- dictionary: Mapping between (word, word_id) in corpus
- reverse_dictionary: Mapping between (word_id, word) in corpus
- count: list of tuples of (word, count of word) ordered by the number of occurrances
- data: The data where all words in the question are replaced by the id

In [63]:
import collections


def prepare_dataset(padded_question_set):
    all_words = [token for q in padded_question_set for token in q]
    counts = collections.Counter(all_words).most_common()
    dictionary = {}
    reverse_dictionary = {}
    
    for i, (word, _) in enumerate(counts):
        dictionary[word] = i
        reverse_dictionary[i] = word
    
    train_data = [[dictionary[w] for w in q] for q in padded_question_set]
    return dictionary, reverse_dictionary, train_data, counts

all_questions = list(padded_train_set)
all_questions.extend(padded_test_set)
dictionary, reverse_dictionary, dataset, counts = prepare_dataset(all_questions)
print('counts(top 5)', counts[:5])
print('Number of unique words in corpus are', len(counts))
print('Sample question(0) is', dataset[0])
print('Reversed sample question(0) is', [reverse_dictionary[i] for i in dataset[0]])
unique_labels = set(train_question_class)
unique_labels.update(test_question_class)
print('Unique labels are', unique_labels)

counts(top 5) [('PAD', 34407), ('?', 1454), ('the', 999), ('what', 963), ('is', 587)]
Number of unique words in corpus are 3349
Sample question(0) is [9, 15, 982, 983, 6, 23, 984, 985, 518, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Reversed sample question(0) is ['how', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'russia', '?', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
Unique labels are {'LOC', 'ENTY', 'DESC', 'NUM', 'HUM', 'ABBR'}


---

We will now define a ``BatchGenerator`` which will generate batches of the give data
the batch will return two value

- an input of dimension (batch_size, max_sent_length, embedding_size), in our case embedding size will be same as the size of the vocabulary size and the value will be one hot encoded vector
- The labels which will be one hot encoded vectors of size same as the number of labels

In [99]:
import numpy as np

class BatchGenerator(object):
    
    def __init__(self, batch_size, dataset, labels, word2emb, unique_labels):
        #
        # batch_size: Batch size
        # dataset: is the prepared dataset from previous step        
        # word2emb: function that generates embedding from given word
        #
        self.dataset = dataset
        self.word2emb = word2emb
        self.current_idx = 0
        self.labels = labels
        self.batch_size = batch_size
        self.unique_labels = list(unique_labels)
        None
    
    
    def __shape_question__(self, padded_question):
        return [to_one_hot_encoding(w) for w in padded_question]

    def reset(self):
        self.current_idx = 0
    
    def generate_batch(self):
        batch = []
        labels = []
        for _ in range(self.batch_size):
            batch.append(self.__shape_question__(dataset[self.current_idx]))
            c_label = [0] * len(self.unique_labels)
            c_label[self.unique_labels.index(self.labels[self.current_idx])] = 1
            labels.append(c_label)
            self.current_idx += self.current_idx % len(dataset)
            
        return np.array(batch), np.array(labels)
    

def to_one_hot_encoding(word):
    one_hot = [0] * len(dictionary)
    one_hot[word] = 1
    return one_hot


prepared_train_dataset = dataset[0:1000]
prepared_test_dataset = dataset[1000:]
the_one_hot = to_one_hot_encoding(dictionary['the'])
print('One hot for "the" has element number', the_one_hot.index(1), 'set')
batch_size = 16

train_batch_generator = BatchGenerator(
                            batch_size, 
                            prepared_train_dataset, 
                            train_question_class, 
                            to_one_hot_encoding, 
                            unique_labels)

input_batch, input_labels = train_batch_generator.generate_batch()
print('shape of input_batch is', input_batch.shape, ', input_labels has shape', input_labels.shape)
print('first label has value', input_labels[0], ' labels are', train_batch_generator.unique_labels)

One hot for "the" has element number 2 set
shape of input_batch is (16, 32, 3349) , input_labels has shape (16, 6)
first label has value [0 0 1 0 0 0]  labels are ['LOC', 'ENTY', 'DESC', 'NUM', 'HUM', 'ABBR']
