In [74]:
import torch.nn.functional as F
from torch.autograd import Variable
import copy
from collections import Counter
import torch 
import numpy as np
import torch.nn as nn


In [24]:
def read_data(path):
    f = open(path, "r")
    lines = f.readlines()
    f.close()
    data = []
    for l in lines:
        labelSplit = l.replace('\n','').split(' ', 1)
        data.append([labelSplit[0], [word.lower() for word in labelSplit[1].split()]])
    return data

data = read_data('./questions.txt')
data_train = read_data('./training_data.txt')
data_test = read_data('./testing_data.txt')

In [25]:
def remove_stop_words(data, path):
    stop_words = []
    with open(path) as f:
        stop_words = [word for line in f for word in line.split(",")]
    data_without_stop_words = []
    for k, v in data:
        words = [t for t in v if t not in stop_words]
        data_without_stop_words.append((k, words))
    return data_without_stop_words


data = remove_stop_words(data, './stop_words.txt')
data_train = remove_stop_words(data_train, './stop_words.txt')
data_test = remove_stop_words(data_test, './stop_words.txt')

In [26]:
def get_labels(data):
    _labels = []
    for k,v in data:
        _labels.append(k)   
    _unique_label = list(set(_labels))
    _unique_label_dict = {}
    for k,v in enumerate(_unique_label):
        _unique_label_dict[v] = k
    return _unique_label_dict

labels = get_labels(data)
labels_train = get_labels(data_train)
labels_test = get_labels(data_test)


In [27]:
len(labels)

51

In [28]:
labels

{'ENTY:plant': 0,
 'DESC:desc': 1,
 'HUM:desc': 2,
 'HUM:ind': 3,
 'NUM:dist': 4,
 'NUM:money': 5,
 'NUM:weight': 6,
 'ENTY:body': 7,
 'ENTY:sport': 8,
 'ENTY:instru': 9,
 'HUM:title': 10,
 'NUM:speed': 11,
 'LOC:country': 12,
 'DESC:manner': 13,
 'NUM:code': 14,
 'ENTY:letter': 15,
 'ENTY:food': 16,
 'LOC:mount': 17,
 'ENTY:substance': 18,
 'ENTY:veh': 19,
 'ENTY:currency': 20,
 'ENTY:word': 21,
 'LOC:city': 22,
 'DESC:def': 23,
 'DESC:reason': 24,
 'ENTY:lang': 25,
 'NUM:perc': 26,
 'ENTY:event': 27,
 'ABBR:exp': 28,
 'ABBR:abb': 29,
 'NUM:volsize': 30,
 'ENTY:symbol': 31,
 'ENTY:termeq': 32,
 'NUM:period': 33,
 'NUM:date': 34,
 'LOC:state': 35,
 'ENTY:cremat': 36,
 'LOC:other': 37,
 'ENTY:product': 38,
 'HUM:gr': 39,
 'NUM:other': 40,
 'ENTY:other': 41,
 '\ufeffDESC:manner': 42,
 'ENTY:religion': 43,
 'NUM:temp': 44,
 'NUM:count': 45,
 'NUM:ord': 46,
 'ENTY:dismed': 47,
 'ENTY:animal': 48,
 'ENTY:techmeth': 49,
 'ENTY:color': 50}

In [29]:
def append_labels(data, labels):        
    cleaned_data = []
    for k,v in data:
        cleaned_data.append((labels[k],v))
        
    return np.array(cleaned_data)

data = append_labels(data, labels)
data_train = append_labels(data_train, labels)
data_test = append_labels(data_test,labels)

In [30]:
data[:5]

array([[42, list(['how', 'serfdom', 'develop', 'leave', 'russia', '?'])],
       [36,
        list(['what', 'films', 'featured', 'character', 'popeye', 'doyle', '?'])],
       [13,
        list(['how', 'find', 'list', 'celebrities', "'", 'real', 'names', '?'])],
       [48,
        list(['what', 'fowl', 'grabs', 'spotlight', 'chinese', 'year', 'monkey', '?'])],
       [28, list(['what', 'full', 'form', '.com', '?'])]], dtype=object)

In [31]:
data_test[:5]

array([[32, list(['what', 'chiricahua', 'name', '?'])],
       [1,
        list(['what', "'s", 'last', 'line', 'dickens', "'s", 'christmas', 'carol', '?'])],
       [23, list(['what', 'names', 'andrew', 'christina', 'mean', '?'])],
       [37,
        list(['where', 'song', 'anything', 'goes', 'take', 'place', '?'])],
       [13, list(['how', 'clean', 'cache', '?'])]], dtype=object)

In [32]:
data_train[:5]

array([[45, list(['how', 'many', 'varieties', 'twins', '?'])],
       [36,
        list(['what', 'tv', 'quiz', 'show', 'left', 'air', '1975', 'tune', 'vera', 'lynn', "'s", "'ll", 'meet', '?'])],
       [10, list(['what', 'oldest', 'profession', '?'])],
       [23, list(['what', 'feudal', 'system', '?'])],
       [36, list(['jude', 'law', 'what', 'movie', '?'])]], dtype=object)

In [33]:
def create_indexed_vocab(data):
    vocab = []
    for _, sent in data:
        for word in sent:
            vocab.append(word)
    count = Counter(vocab)
    count = {w : count[w] for w in count if count[w] >= 2}
    vocab = []
    for k, v in count.items():
        vocab.append(k)
    indexed_vocab = {word: idx for idx, word in enumerate(vocab)}
    return indexed_vocab
def create_vocab(data):
    total_words_orig = []
    for k,sent in data:
        for word in sent:
            total_words_orig.append(word)
    total_words = list(set(total_words_orig))
    total_words_str = ' '.join(total_words)
    vocab = set(total_words_str.split()) 
    word2idx = {word: idx for idx, word in enumerate(vocab)} # create word index
    return word2idx
word2idx=create_vocab(data)
word2idx['#UNK#'] = len(word2idx)
indexed_vocab = create_indexed_vocab(data)
indexed_vocab['#UNK#'] = len(indexed_vocab)


In [34]:
def load_glove_embeddings(path, indexed_vocab, embedding_dim=300):
    with open(path) as f:
        embeddings = np.zeros((len(indexed_vocab), embedding_dim))
        for line in f.readlines():
            values = line.split()
            word = values[0]
            index = indexed_vocab.get(word)
            if index:
                vector = np.array(values[1:], dtype='float32')
                embeddings[index] = vector
        return torch.from_numpy(embeddings).float()

glove_random = load_glove_embeddings('./glove.small.txt', indexed_vocab)
glove_pre = load_glove_embeddings('./glove.small.txt', word2idx)
embeddings_random = nn.Embedding(glove_random.size(0), glove_random.size(1))
embeddings_pretrained = nn.Embedding.from_pretrained(glove_pre, freeze=True)


In [35]:
# def split_train_test(data, test_ratio):
#     data_copy = copy.deepcopy(data)
#     np.random.shuffle(data_copy)
#     test_set_size = int(len(data) * test_ratio)
#     test = data_copy[:test_set_size]
#     train = data_copy[test_set_size:]
#     return train, test

# train, test = split_train_test(data, 0.2)
# train,dev =split_train_test(train,0.1)

In [36]:
data_train[:4]

array([[45, list(['how', 'many', 'varieties', 'twins', '?'])],
       [36,
        list(['what', 'tv', 'quiz', 'show', 'left', 'air', '1975', 'tune', 'vera', 'lynn', "'s", "'ll", 'meet', '?'])],
       [10, list(['what', 'oldest', 'profession', '?'])],
       [23, list(['what', 'feudal', 'system', '?'])]], dtype=object)

In [37]:
class BOWClassifier(nn.Module):
    def __init__(self,input_size,hidden_size, num_labels):
        super(BOWClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size  = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.input_size, num_labels)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(x)
        #output = self.sigmoid(output)
        return output

In [38]:
def make_bow_vector(sentence, indexed_vocab,embeddings):
    pt_tensor= torch.zeros(300, dtype=torch.long)
    count = 0
    for word in sentence:
        count += 1
        if word in indexed_vocab:
            pt_tensor = torch.add(pt_tensor, embeddings(torch.LongTensor([indexed_vocab[word]]))[0])
        else:
            pt_tensor = torch.add(pt_tensor, embeddings(torch.LongTensor([indexed_vocab['#UNK#']]))[0])
    pt_tensor=torch.div(pt_tensor, count)
    return pt_tensor

def get_bow_rep(data,word2idx,embeddings):
    bow_data = []
    for label, sent in data:
        bow_data.append(make_bow_vector(sent, word2idx,embeddings))
    return torch.stack(bow_data)
        
training_set = get_bow_rep(data_train,word2idx,embeddings_pretrained)
training_set_rand = get_bow_rep(data_train,indexed_vocab,embeddings_random)

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# input_size = 300
# hidden_size = 100
# num_classes = 51
# num_epochs = 350
# learning_rate = 0.0001
# batch_size = 100
accu=[]
input_size = 300
hidden_size = [50,100,200,250]
num_classes = 51
num_epochs = [10,300,500,1000]
learning_rate = [0.1,0.001,0.0001,0.00001]
batch_size = [2,50,100,200]
for k in hidden_size:
    for j in num_epochs:
        for r in learning_rate:
            for x in batch_size:
                print("hidden_size={0}\nnum_epochs={1}\nlearning_rate={2}\nbatch_size={3}".format(k,j,r,x))
                model = BOWClassifier(input_size, k, num_classes).to(device)
                criterion = torch.nn.CrossEntropyLoss()
                optimizer = torch.optim.Adam(model.parameters(), lr=r) 
                #training in batches
                for epoch in range(j):
                    permutation = torch.randperm(training_set.size()[0])
                    for i in range(0, training_set.size()[0], x):
                        optimizer.zero_grad()
                        indices = permutation[i:i + x]
                        batch_features = training_set[indices]
                        batch_features = batch_features.reshape(-1, 300).to(device)
                        batch_labels = torch.LongTensor([label for label, sent in data_train[indices]]).to(device)
                        batch_outputs = model(batch_features)
                        loss = criterion(batch_outputs, batch_labels)
                        loss.backward()
                        optimizer.step()
                  #print(loss.item())
                    #print(epoch)
                #acc.append(value_of_patameter)
                accur=accuracy(data_test)*100
                accu.append([k,j,r,x,accur])
                print("\nAccuracy=",accur)
                print("\n") 

hidden_size=50
num_epochs=10
learning_rate=0.1
batch_size=2


  



Accuracy= 65.3211009174312


hidden_size=50
num_epochs=10
learning_rate=0.1
batch_size=50

Accuracy= 66.97247706422019


hidden_size=50
num_epochs=10
learning_rate=0.1
batch_size=100

Accuracy= 69.72477064220183


hidden_size=50
num_epochs=10
learning_rate=0.1
batch_size=200

Accuracy= 71.92660550458716


hidden_size=50
num_epochs=10
learning_rate=0.001
batch_size=2

Accuracy= 71.37614678899082


hidden_size=50
num_epochs=10
learning_rate=0.001
batch_size=50

Accuracy= 60.91743119266055


hidden_size=50
num_epochs=10
learning_rate=0.001
batch_size=100

Accuracy= 56.513761467889914


hidden_size=50
num_epochs=10
learning_rate=0.001
batch_size=200

Accuracy= 48.62385321100918


hidden_size=50
num_epochs=10
learning_rate=0.0001
batch_size=2

Accuracy= 55.779816513761475


hidden_size=50
num_epochs=10
learning_rate=0.0001
batch_size=50

Accuracy= 37.06422018348624


hidden_size=50
num_epochs=10
learning_rate=0.0001
batch_size=100

Accuracy= 36.69724770642202


hidden_size=50
num_epochs=10


Accuracy= 73.39449541284404


hidden_size=100
num_epochs=300
learning_rate=0.0001
batch_size=2

Accuracy= 73.21100917431193


hidden_size=100
num_epochs=300
learning_rate=0.0001
batch_size=50

Accuracy= 70.27522935779817


hidden_size=100
num_epochs=300
learning_rate=0.0001
batch_size=100

Accuracy= 67.52293577981652


hidden_size=100
num_epochs=300
learning_rate=0.0001
batch_size=200

Accuracy= 63.11926605504588


hidden_size=100
num_epochs=300
learning_rate=1e-05
batch_size=2

Accuracy= 66.23853211009174


hidden_size=100
num_epochs=300
learning_rate=1e-05
batch_size=50

Accuracy= 45.50458715596331


hidden_size=100
num_epochs=300
learning_rate=1e-05
batch_size=100

Accuracy= 39.816513761467895


hidden_size=100
num_epochs=300
learning_rate=1e-05
batch_size=200

Accuracy= 36.3302752293578


hidden_size=100
num_epochs=500
learning_rate=0.1
batch_size=2

Accuracy= 65.13761467889908


hidden_size=100
num_epochs=500
learning_rate=0.1
batch_size=50

Accuracy= 66.05504587155964


hidden_s


Accuracy= 53.57798165137615


hidden_size=200
num_epochs=500
learning_rate=1e-05
batch_size=100

Accuracy= 45.68807339449542


hidden_size=200
num_epochs=500
learning_rate=1e-05
batch_size=200

Accuracy= 40.0


hidden_size=200
num_epochs=1000
learning_rate=0.1
batch_size=2

Accuracy= 63.48623853211009


hidden_size=200
num_epochs=1000
learning_rate=0.1
batch_size=50

Accuracy= 66.05504587155964


hidden_size=200
num_epochs=1000
learning_rate=0.1
batch_size=100

Accuracy= 64.77064220183486


hidden_size=200
num_epochs=1000
learning_rate=0.1
batch_size=200

Accuracy= 66.78899082568807


hidden_size=200
num_epochs=1000
learning_rate=0.001
batch_size=2

Accuracy= 66.42201834862385


hidden_size=200
num_epochs=1000
learning_rate=0.001
batch_size=50

Accuracy= 69.1743119266055


hidden_size=200
num_epochs=1000
learning_rate=0.001
batch_size=100

Accuracy= 70.64220183486239


hidden_size=200
num_epochs=1000
learning_rate=0.001
batch_size=200

Accuracy= 72.11009174311927


hidden_size=200
num

In [59]:
accu[:3]

[[50, 10, 0.1, 2, 65.3211009174312],
 [50, 10, 0.1, 50, 66.97247706422019],
 [50, 10, 0.1, 100, 69.72477064220183]]

In [70]:
accuracy_list=[]
l_rate=[]
for i in range(len(accu)):
    accuracy_list.append(accu[i][4])
    l_rate.append(accu[i][2])
print(max(accuracy_list))


73.76146788990826


In [95]:
# import matplotlib.pyplot as plt
# plt.scatter(l_rate,accuracy_list) 

In [71]:
for i in range(len(accu)):
    if accu[i][4]==73.76146788990826:
        print("Hidden size={}\nNum_epochs={}\nlearning rate={}\nBatch Size={}\nAccuracy={}".format(accu[i][0],accu[i][1],accu[i][2],accu[i][3],accu[i][4]))

Hidden size=250
Num_epochs=1000
learning rate=0.0001
Batch Size=50
Accuracy=73.76146788990826


In [75]:
#testing
def accuracy(data_test):
    test_len =len(data_test)
    correct=0
    for label,data in data_test:
        bow_vec = make_bow_vector(data, word2idx,embeddings_pretrained)
        logprobs = model(bow_vec)
        logprobs = F.softmax(logprobs)
        pred = np.argmax(logprobs.data.numpy())
        if pred==label:
            correct+=1
          #print('prediction: {}'.format(pred))
        #print('actual: {}'.format(label))
    acc = correct/test_len
    return acc
    #print('accuracy: {}'.format(accuracy))

In [79]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        #self.word_embeddings=nn.Embedding.from_pretrained(glove_pre, freeze=True)


        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        # Decode the hidden state of the last time step
        tag_space = self.hidden2tag(lstm_out[-1, :])
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [84]:
#LSTM training
vocab_size=glove_random.size(0)
embedding_dim=glove_random.size(1)
tagset_size=51
hidden_dim=300
learning_rate=[0.1,0.01,0.0001]
num_epochs=[1,2,3,5]
accur=0
accuu=[]
for n in num_epochs :
    for r in learning_rate:
        print("num_epochs={0}\nlearning_rate={1}".format(n,r))
        ModelLSTM = LSTMTagger(embedding_dim,hidden_dim,vocab_size,tagset_size)
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(ModelLSTM.parameters(), lr=r)

        for epoch in range(n):
            for label,sent in data_train:
                ids=[]
                unk = 3375
                for word in sent:
                    index =indexed_vocab.get(word)
                    if index:
                        ids.append([indexed_vocab[word]])
                    else:
                        ids.append([unk])
                batch_labels=torch.tensor([label])
                sentence=torch.tensor(ids).squeeze()
                optimizer.zero_grad()
                batch_outputs = ModelLSTM(sentence)
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()
                optimizer.step()
#                 
        accur=accuracy(data_test)*100
        accuu.append([,accur])
        print("\nAccuracy=",accur)
        print("\n") 

num_epochs=1
learning_rate=0.1





Accuracy= 12.477064220183486


num_epochs=1
learning_rate=0.01

Accuracy= 33.211009174311926


num_epochs=1
learning_rate=0.0001

Accuracy= 57.064220183486235


num_epochs=2
learning_rate=0.1

Accuracy= 11.926605504587156


num_epochs=2
learning_rate=0.01

Accuracy= 31.926605504587158


num_epochs=2
learning_rate=0.0001

Accuracy= 63.6697247706422


num_epochs=3
learning_rate=0.1

Accuracy= 13.394495412844037


num_epochs=3
learning_rate=0.01

Accuracy= 36.14678899082569


num_epochs=3
learning_rate=0.0001

Accuracy= 67.1559633027523


num_epochs=5
learning_rate=0.1

Accuracy= 13.761467889908257


num_epochs=5
learning_rate=0.01

Accuracy= 33.94495412844037


num_epochs=5
learning_rate=0.0001

Accuracy= 71.19266055045873




In [85]:
accuu

[[1, 0.1, 12.477064220183486],
 [1, 0.01, 33.211009174311926],
 [1, 0.0001, 57.064220183486235],
 [2, 0.1, 11.926605504587156],
 [2, 0.01, 31.926605504587158],
 [2, 0.0001, 63.6697247706422],
 [3, 0.1, 13.394495412844037],
 [3, 0.01, 36.14678899082569],
 [3, 0.0001, 67.1559633027523],
 [5, 0.1, 13.761467889908257],
 [5, 0.01, 33.94495412844037],
 [5, 0.0001, 71.19266055045873]]

In [83]:
#LSTM testing
def lstm_acc(data_test):
    test_len =len(data_test)
    with torch.no_grad():
        correct = 0
        for label,sent in data_test:
            ids=[]
            unk = 3375
            labels=[]
            for word in sent:
                index =indexed_vocab.get(word)
                if index:
                    ids.append([indexed_vocab[word]])
                else:
                    ids.append([unk])
            sentence=torch.tensor(ids).squeeze()
            batch_outputs = ModelLSTM(sentence)
            predicted = np.argmax(F.softmax(batch_outputs).data.numpy())
            if predicted==label:
                 correct+=1
#             print('prediction: {}'.format(predicted))
#             print('actual: {}'.format(label))
        accuracy = correct/test_len
#         print('accuracy: {}'.format(accuracy))
        return accuracy

In [14]:
len(data_train)

4416

In [119]:
class LSTMTaggerpre(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, glove):
        torch.backends.cudnn.deterministic = True
        super(LSTMTaggerpre, self).__init__()
        self.hidden_dim = hidden_dim
      
        self.word_embeddings = nn.Embedding.from_pretrained(glove, freeze=True)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        # Decode the hidden state of the last time step
        tag_space = self.hidden2tag(lstm_out[-1, :])
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores



In [122]:
vocab_size=glove_pre.size(0)
embedding_dim=glove_pre.size(1)
tagset_size=51
hidden_dim=300
learning_rate=0.0001
num_epochs=5
print("Training started !!! It may take a few minutes.")

ModelLSTMM = LSTMTaggerpre(embedding_dim, hidden_dim, vocab_size, tagset_size, glove_pre)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ModelLSTMM.parameters(), lr=learning_rate)

unk = word2idx.get('#UNK#')
for epoch in range(num_epochs):
    for label, sent in data:
        ids = []
        for word in sent:
            index = word2idx.get(word)
            if index:
                ids.append([word2idx[word]])
            else:
                ids.append([unk])
        batch_labels = torch.tensor([label])
        sentence = torch.tensor(ids).squeeze()
        optimizer.zero_grad()
        batch_outputs = ModelLSTMM(sentence)
        loss = criterion(batch_outputs, batch_labels)
        loss.backward()
        optimizer.step()
# print("loss", loss.item())
# print("epoch", epoch)
print("training finished")
print("accuracy=",lstm_accc(data_test)*100)
# return ModelLSTM

Training started !!! It may take a few minutes.
training finished




accuracy= 86.05504587155963


In [120]:
def lstm_accc(data_test):
    test_len =len(data_test)
    unk = word2idx.get('#UNK#')


    with torch.no_grad():
        correct = 0
        for label,sent in data_test:
            ids=[]
            #unk = 3375
            labels=[]
            for word in sent:
                index =word2idx.get(word)
                if index:
                    ids.append([word2idx[word]])
                else:
                    ids.append([unk])
            sentence=torch.tensor(ids).squeeze()
            batch_outputs = ModelLSTMM(sentence)
            predicted = np.argmax(F.softmax(batch_outputs).data.numpy())
            if predicted==label:
                 correct+=1
#             print('prediction: {}'.format(predicted))
#             print('actual: {}'.format(label))
        accuracy = correct/test_len
#         print('accuracy: {}'.format(accuracy))
        return accuracy

