# KAIST Summer Session 2018

## Sentense Classification using Convolutional Neural Network (08.20.2018)

- This dataset is obtained from http://cogcomp.org/Data/QA/QC/. This consists of question sentences classified into 6 types (whether the question is about person, location, numeric information, etc.).
- This code is adapted from https://github.com/DSKSD/DeepNLP-models-Pytorch

### 0. Data Preparation

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import re
import gensim      # need to install      # pip install gensim
from gensim.models.wrappers import FastText
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [2]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch
        
def pad_to_batch(batch):
    x,y, y_sub = zip(*batch)
    max_x = max([s.size(1) for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
            x_p.append(torch.cat([x[i], torch.LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return torch.cat(x_p), torch.cat(y).view(-1), torch.cat(y_sub).view(-1)

def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<UNK>"], seq))
    return torch.LongTensor(idxs[1::])

def getKeysByValue(dictOfElements, valueToFind):
    listOfKeys = list()
    listOfItems = dictOfElements.items()
    for item  in listOfItems:
        if item[1] == valueToFind:
            listOfKeys.append(item[0])
    return  listOfKeys

In [3]:
data = open('sentence classification/train_5500.label.txt', 'r', encoding='latin-1').readlines()
data = [[d.split(':')[1][:-1], d.split(':')[0]] for d in data]
X, y = list(zip(*data))
y_sub = []
X = list(X)
for i in range(len(X)):
    y_sub.append(X[i].split(' ')[0])


for i, x in enumerate(X):
    X[i] = re.sub('\d', '#', x).split()
    
    
vocab = list(set(flatten(X)))

word2index={'<PAD>': 0, '<UNK>': 1}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

target2index = {}
target_sub2index = {}

for cl in set(y):
    if target2index.get(cl) is None:
        target2index[cl] = len(target2index)
        
for cl in set(y_sub):
    if target_sub2index.get(cl) is None:
        target_sub2index[cl] = len(target_sub2index)

index2target = {v:k for k, v in target2index.items()}
index2target_sub = {v:k for k, v in target_sub2index.items()}


X_p, y_p, y_sub_p = [], [], []

for pair in zip(X, y, y_sub):
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_p.append(torch.LongTensor([target2index[pair[1]]]).view(1, -1))
    y_sub_p.append(torch.LongTensor([target_sub2index[pair[2]]]).view(1, -1))


    
data_p = list(zip(X_p, y_p, y_sub_p))
data_p_5 = []

for i in range(len(data_p)):
    if len(data_p[i][0].data.tolist()[0])>= 5 :
        data_p_5.append(data_p[i])

random.shuffle(data_p_5)
train_data = data_p_5[: int(len(data_p_5) * 0.9)]
test_data = data_p_5[int(len(data_p_5) * 0.9):]

- Let's look inside the dataset

In [4]:
for i in range(10):
    ran = random.randint(0, len(train_data)-1)
    print('Source sentence: ', end=' ')
    for j in range(len(train_data[ran][0].data.tolist()[0])):
        word_find = getKeysByValue(word2index, train_data[ran][0].data.tolist()[0][j])
        print(word_find[0], end=' ')
    print('\nTrue category: ', getKeysByValue(target2index, train_data[ran][1].data.tolist()[0][0]))
    print('True sub_category: ', getKeysByValue(target_sub2index, train_data[ran][2].data.tolist()[0][0]))
    print("")
    
'''
Definition of Question Classes
    ABBR = ABBREVIATION: abbreviation
    ENTY = ENTITY: entities
    DESC = DESCRIPTION: description and abstract concepts
    HUM = HUMAN: human beings
    LOC = LOCATION: locations
    NUM = NUMERIC: numeric values
'''

Source sentence:  What two countries contain Sierra Nevada mountains ? 
True category:  ['LOC']
True sub_category:  ['country']

Source sentence:  The name of the actor who played the detective in the film Kindergarden Cop is what ? 
True category:  ['HUM']
True sub_category:  ['ind']

Source sentence:  What color flies closest to the staff on Belgium 's flag ? 
True category:  ['ENTY']
True sub_category:  ['color']

Source sentence:  The film `` Jaws '' was made in what year ? 
True category:  ['NUM']
True sub_category:  ['date']

Source sentence:  What gate opened on East and West Berlin ? 
True category:  ['LOC']
True sub_category:  ['other']

Source sentence:  What two states is Washington D.C. between ? 
True category:  ['LOC']
True sub_category:  ['state']

Source sentence:  What southwestern state is dubbed The Silver State ? 
True category:  ['LOC']
True sub_category:  ['state']

Source sentence:  At what age did Rossini stop writing opera ? 
True category:  ['NUM']
True sub_ca

'\nDefinition of Question Classes\n    ABBR = ABBREVIATION: abbreviation\n    ENTY = ENTITY: entities\n    DESC = DESCRIPTION: description and abstract concepts\n    HUM = HUMAN: human beings\n    LOC = LOCATION: locations\n    NUM = NUMERIC: numeric values\n'

### 1. Define a Neural Network

In [5]:
class CNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNN,self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])

        # kernal_size = (K,D) 
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)
    
    
    def init_weights(self, pretrained_word_vectors, is_static=False):
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False


    def forward(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1) # (B,1,T,D)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs] #[(N,Co,W), ...]*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] #[(N,Co), ...]*len(Ks)

        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated) # (N,len(Ks)*Co)
        out = self.fc(concated) 
        return F.log_softmax(out,1)

### 2. Define a Loss Function and Optimizer

- Random initialization for word embeddings

In [6]:
without_pretrained = []

for key in word2index.keys():
    without_pretrained.append(np.random.randn(300))

without_pretrained_vectors = np.vstack(without_pretrained)

In [7]:
n_epochs = 5
BATCH_SIZE = 50
KERNEL_SIZES = [3,4,5]
KERNEL_DIM = 100
Learning_Rate = 0.001

# Instantiate CNN model
model = CNN(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES).to(device)
model.init_weights(without_pretrained_vectors) # randomly initialize embedding matrix


# Set loss and optimizer function
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=Learning_Rate)

### 3. Training

In [8]:
for epoch in range(n_epochs):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs,targets, targets_sub = pad_to_batch(batch)
        
        preds = model(inputs, True)        
        loss = criterion(preds, targets)
        losses.append(loss.data)
        
        model.zero_grad()
        loss.backward()        
        optimizer.step()
        
        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, n_epochs, np.mean(losses)))
            losses = []          
            
print("Learning finished!")

[0/5] mean_loss : 2.22
[1/5] mean_loss : 0.41
[2/5] mean_loss : 0.26
[3/5] mean_loss : 0.14
[4/5] mean_loss : 0.08
Learning finished!


### 4. Evaluation

In [9]:
accuracy = 0

for test in test_data:
    pred = model(test[0]).max(1)[1]
    pred = pred.data.tolist()[0]
    target = test[1].data.tolist()[0][0]
    if pred == target:
        accuracy += 1

print(accuracy/len(test_data) * 100)

83.01886792452831


- Let's see how the model works well

In [10]:
for i in range(10):
    ran = random.randint(0, len(test_data)-1)
    print('Source sentence: ', end=' ')
    for j in range(len(test_data[ran][0].data.tolist()[0])):
        word_find = getKeysByValue(word2index, test_data[ran][0].data.tolist()[0][j])
        print(word_find[0], end=' ')
    print('\nTrue category: ', getKeysByValue(target2index, test_data[ran][1].data.tolist()[0][0]))
    pred = model(test_data[ran][0]).max(1)[1]
    pred = pred.data.tolist()[0]
    print('Predicted category: ', getKeysByValue(target2index, pred))
    print("")
    
'''
Definition of Question Classes
    ABBR = ABBREVIATION: abbreviation
    ENTY = ENTITY: entities
    DESC = DESCRIPTION: description and abstract concepts
    HUM = HUMAN: human beings
    LOC = LOCATION: locations
    NUM = NUMERIC: numeric values
'''

Source sentence:  What movie has made the most money ? 
True category:  ['ENTY']
Predicted category:  ['ENTY']

Source sentence:  What are faults in the earth 's crust ? 
True category:  ['DESC']
Predicted category:  ['HUM']

Source sentence:  Aspartame is also called what ? 
True category:  ['ENTY']
Predicted category:  ['ENTY']

Source sentence:  What was the name of the ball game played by the mayans ? 
True category:  ['ENTY']
Predicted category:  ['ENTY']

Source sentence:  What costume designer decided that Michael Jackson should only wear one glove ? 
True category:  ['HUM']
Predicted category:  ['ENTY']

Source sentence:  What do penguins eat ? 
True category:  ['ENTY']
Predicted category:  ['ENTY']

Source sentence:  What is a fear of fish ? 
True category:  ['ENTY']
Predicted category:  ['ENTY']

Source sentence:  What 's the mystery of the Bermuda Triangle ? 
True category:  ['DESC']
Predicted category:  ['ENTY']

Source sentence:  How can I get in touch with Michael Moore o

'\nDefinition of Question Classes\n    ABBR = ABBREVIATION: abbreviation\n    ENTY = ENTITY: entities\n    DESC = DESCRIPTION: description and abstract concepts\n    HUM = HUMAN: human beings\n    LOC = LOCATION: locations\n    NUM = NUMERIC: numeric values\n'

### 5. Leveraging Pre-trained Model
- We use pre-trained word vectors (Simple English), trained on Wikipedia using fastText.
- You should download the bin file (over 1GB) from https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md.

In [11]:
pretrained_model = FastText.load_fasttext_format('sentence classification/wiki.simple.bin')

In [12]:
print(pretrained_model.most_similar('professor'))
print('')
print(pretrained_model.most_similar('student'))

print('')
print('Similarity between student and professor is', pretrained_model.similarity('professor', 'student'))
print('Similarity between student and teacher is', pretrained_model.similarity('teacher', 'student'))
print('Similarity between student and pirate is', pretrained_model.similarity('pirate', 'student'))

[('professorial', 0.8434826135635376), ('professorship', 0.8112963438034058), ('professors', 0.7894449234008789), ('profesor', 0.7348014116287231), ('profess', 0.6934148669242859), ('emeritus', 0.675474226474762), ('professed', 0.671900749206543), ('professeur', 0.6563084721565247), ('faculty', 0.5926684141159058), ('university', 0.5912307500839233)]

[('studenţesc', 0.8113662600517273), ('studenten', 0.7857683897018433), ('students', 0.7746243476867676), ('studer', 0.6796103119850159), ('studentenverbindung', 0.625778079032898), ('teacher', 0.5754092931747437), ('enroll', 0.562594473361969), ('undergraduates', 0.5616234540939331), ('teachers', 0.5576453804969788), ('school', 0.5531904101371765)]

Similarity between student and professor is 0.4570124
Similarity between student and teacher is 0.5754093
Similarity between student and pirate is 0.07853423


  if np.issubdtype(vec.dtype, np.int):


In [13]:
pretrained = []

for key in word2index.keys():
    try:
        pretrained.append(pretrained_model[word2index[key]])
    except:
        pretrained.append(np.random.randn(300))
        
pretrained_vectors = np.vstack(pretrained)

In [14]:
model = CNN(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES).to(device)
model.init_weights(pretrained_vectors) # initialize embedding matrix using pretrained vectors


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=Learning_Rate)

In [15]:
for epoch in range(n_epochs):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs,targets, targets_sub = pad_to_batch(batch)
                
        preds = model(inputs, True)        
        loss = criterion(preds, targets)
        losses.append(loss.data)
        
        model.zero_grad()
        loss.backward()        
        optimizer.step()
        
        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, n_epochs, np.mean(losses)))
            losses = []          
            
print("Learning finished!")

[0/5] mean_loss : 2.03
[1/5] mean_loss : 0.52
[2/5] mean_loss : 0.36
[3/5] mean_loss : 0.11
[4/5] mean_loss : 0.10
Learning finished!


In [16]:
accuracy = 0

for test in test_data:
    pred = model(test[0]).max(1)[1]
    pred = pred.data.tolist()[0]
    target = test[1].data.tolist()[0][0]
    if pred == target:
        accuracy += 1

print(accuracy/len(test_data) * 100)

83.01886792452831


### 6. (Exercise) Classifying  Sub-Caterory
- Replace target_2index to target_sub2index
- Replace targets to targets_sub
- How is the classification performance for more detailed 47 sub-categories, compared to six categories?
- Let's try to improve the classifier performance.

In [17]:
# Instantiate CNN model
model = CNN(len(word2index), 300, len(target_sub2index), KERNEL_DIM, KERNEL_SIZES).to(device)
model.init_weights(pretrained_vectors) # randomly initialize embedding matrix


# Set loss and optimizer function
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=Learning_Rate)

In [18]:
#############################################
# Define the training loop

for epoch in range(n_epochs):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs,targets, targets_sub = pad_to_batch(batch)
                
        preds = model(inputs, True)        
        loss = criterion(preds, targets_sub)
        losses.append(loss.data)
        
        model.zero_grad()
        loss.backward()        
        optimizer.step()
        
        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, n_epochs, np.mean(losses)))
            losses = []          
            
print("Learning finished!")



[0/5] mean_loss : 4.17
[1/5] mean_loss : 1.58
[2/5] mean_loss : 0.90
[3/5] mean_loss : 0.35
[4/5] mean_loss : 0.29
Learning finished!


- Below is for evaluation

In [19]:
accuracy = 0

for test in test_data:
    pred = model(test[0]).max(1)[1]
    pred = pred.data.tolist()[0]
    target_sub = test[2].data.tolist()[0][0]
    if pred == target_sub:
        accuracy += 1

print(accuracy/len(test_data) * 100)

75.84905660377359


In [20]:
for i in range(10):
    ran = random.randint(0, len(test_data)-1)
    print('Source sentence: ', end=' ')
    for j in range(len(test_data[ran][0].data.tolist()[0])):
        word_find = getKeysByValue(word2index, test_data[ran][0].data.tolist()[0][j])
        print(word_find[0], end=' ')
    print('\nTrue sub_category: ', getKeysByValue(target_sub2index, test_data[ran][2].data.tolist()[0][0]))
    pred = model(test_data[ran][0]).max(1)[1]
    pred = pred.data.tolist()[0]
    print('Predicted sub_category: ', getKeysByValue(target_sub2index, pred))
    print("")

Source sentence:  How many members are in the California congressional delegation ? 
True sub_category:  ['count']
Predicted sub_category:  ['count']

Source sentence:  What year did Rossetti paint `` Beata Beatrix '' ? 
True sub_category:  ['date']
Predicted sub_category:  ['date']

Source sentence:  What kind of organization is ` Last Chance for Animals ' ? 
True sub_category:  ['gr']
Predicted sub_category:  ['desc']

Source sentence:  Who taught Matt Murdock to use his extraordinary abilities in Marvel comics ? 
True sub_category:  ['ind']
Predicted sub_category:  ['ind']

Source sentence:  When did Lucelly Garcia , a former ambassador of Columbia to Honduras , die ? 
True sub_category:  ['date']
Predicted sub_category:  ['date']

Source sentence:  What two major world religions began in India ? 
True sub_category:  ['religion']
Predicted sub_category:  ['other']

Source sentence:  What food did Marco Polo introduce into Italy from the court of Kubla Khan ? 
True sub_category:  ['f