In [1]:
import torch.nn.functional as F
from torch.autograd import Variable
import copy
from collections import Counter
import torch 
import numpy as np
import torch.nn as nn

In [2]:
def read_data(path):
    f = open(path, "r")
    lines = f.readlines()
    f.close()
    data = []
    for l in lines:
        labelSplit = l.replace('\n','').split(' ', 1)
        data.append([labelSplit[0], [word.lower() for word in labelSplit[1].split()]])
    return data

data = read_data('./questions.txt')

In [3]:
def remove_stop_words(data, path):
    stop_words = []
    with open(path) as f:
        stop_words = [word for line in f for word in line.split(",")]
    data_without_stop_words = []
    for k, v in data:
        words = [t for t in v if t not in stop_words]
        data_without_stop_words.append((k, words))
    return data_without_stop_words

data = remove_stop_words(data, './stop_words.txt')

In [4]:
def get_labels(data):
    _labels = []
    for k,v in data:
        _labels.append(k)   
    _unique_label = list(set(_labels))
    _unique_label_dict = {}
    for k,v in enumerate(_unique_label):
        _unique_label_dict[v] = k
    return _unique_label_dict

labels = get_labels(data)

In [5]:
labels

{'ENTY:veh': 0,
 'ENTY:cremat': 1,
 'ENTY:letter': 2,
 'ENTY:plant': 3,
 'ï»¿DESC:manner': 4,
 'NUM:code': 5,
 'ENTY:animal': 6,
 'DESC:desc': 7,
 'ENTY:instru': 8,
 'ENTY:other': 9,
 'ENTY:substance': 10,
 'NUM:perc': 11,
 'DESC:def': 12,
 'LOC:country': 13,
 'ENTY:color': 14,
 'ENTY:techmeth': 15,
 'ABBR:exp': 16,
 'ENTY:dismed': 17,
 'ENTY:currency': 18,
 'NUM:dist': 19,
 'DESC:reason': 20,
 'ENTY:termeq': 21,
 'HUM:gr': 22,
 'LOC:state': 23,
 'LOC:mount': 24,
 'ABBR:abb': 25,
 'ENTY:lang': 26,
 'HUM:title': 27,
 'HUM:desc': 28,
 'NUM:speed': 29,
 'ENTY:religion': 30,
 'NUM:weight': 31,
 'ENTY:sport': 32,
 'NUM:date': 33,
 'NUM:money': 34,
 'ENTY:symbol': 35,
 'ENTY:event': 36,
 'HUM:ind': 37,
 'NUM:period': 38,
 'DESC:manner': 39,
 'ENTY:body': 40,
 'LOC:city': 41,
 'ENTY:food': 42,
 'ENTY:product': 43,
 'NUM:count': 44,
 'NUM:temp': 45,
 'NUM:other': 46,
 'ENTY:word': 47,
 'LOC:other': 48,
 'NUM:volsize': 49,
 'NUM:ord': 50}

In [121]:
def append_labels(data, labels):        
    cleaned_data = []
    for k,v in data:
        cleaned_data.append((labels[k],v))
        
    return np.array(cleaned_data)

data = append_labels(data, labels)

In [122]:
def create_indexed_vocab(data):
    vocab = []
    for _, sent in data:
        for word in sent:
            vocab.append(word)
    count = Counter(vocab)
    count = {w : count[w] for w in count if count[w] >= 2}
    vocab = []
    for k, v in count.items():
        vocab.append(k)
    indexed_vocab = {word: idx for idx, word in enumerate(vocab)}
    return indexed_vocab

indexed_vocab = create_indexed_vocab(data)
indexed_vocab['#UNK#'] = len(indexed_vocab)

In [124]:
def load_glove_embeddings(path, indexed_vocab, embedding_dim=300):
    with open(path) as f:
        embeddings = np.zeros((len(indexed_vocab), embedding_dim))
        for line in f.readlines():
            values = line.split()
            word = values[0]
            index = indexed_vocab.get(word)
            if index:
                vector = np.array(values[1:], dtype='float32')
                embeddings[index] = vector
            else:
                vector = np.array(values[1:], dtype='float32')
                embeddings[len(indexed_vocab) - 1] = vector
        return torch.from_numpy(embeddings).float()

glove = load_glove_embeddings('./glove.txt', indexed_vocab)
embeddings_random = nn.Embedding(glove.size(0), glove.size(1))
embeddings_pretrained = nn.Embedding.from_pretrained(glove, freeze=False)

In [125]:
def split_train_test(data, test_ratio):
    data_copy = copy.deepcopy(data)
    np.random.shuffle(data_copy)
    test_set_size = int(len(data) * test_ratio)
    test = data_copy[:test_set_size]
    train = data_copy[test_set_size:]
    return train, test

train, test = split_train_test(data, 0.1)

In [130]:
class BOWClassifier(nn.Module):
    def __init__(self,input_size,hidden_size, num_labels):
        super(BOWClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size  = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size,num_labels)
        #self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        #output = self.sigmoid(output)
        return output

In [131]:
input_size = 300
hidden_size = 10
num_labels = 50
bow = BOWClassifier(input_size, hidden_size, num_labels)

In [132]:
def make_bow_vector(sentence, indexed_vocab):
    pt_tensor= torch.zeros(300, dtype=torch.long)
    count = 0
    for word in sentence:
        count += 1
        if word not in indexed_vocab:
            pt_tensor = torch.add(pt_tensor, embeddings_pretrained(torch.LongTensor([indexed_vocab['#UNK#']]))[0])
        else:
            pt_tensor = torch.add(pt_tensor, embeddings_pretrained(torch.LongTensor([indexed_vocab[word]]))[0])
    pt_tensor=torch.div(pt_tensor, count)
    return pt_tensor

def get_bow_rep(data):
    bow_data = []
    for label, sent in data:
        bow_data.append(make_bow_vector(sent, indexed_vocab))
    return torch.stack(bow_data)
        
training_set = get_bow_rep(train)

In [133]:
loss_function = nn.NLLLoss()
opt = torch.optim.SGD(bow.parameters(), lr = 0.1)

In [134]:
#training
n_epochs = 2
batch_size = 100
m = nn.LogSoftmax(dim=1)
for epoch in range(n_epochs):
    permutation = torch.randperm(training_set.size()[0])
    for i in range(0, training_set.size()[0], batch_size):
        opt.zero_grad()
        indices = permutation[i:i + batch_size]
        batch_features = training_set[indices]
        batch_labels = torch.LongTensor([label for label, sent in train[indices]])
        batch_outputs = bow(batch_features)
        print(m(batch_outputs).size())
        loss = loss_function(m(batch_outputs), batch_labels)
        loss.backward(retain_graph=True)
        opt.step()
        print("done")
    

torch.Size([100, 50])


IndexError: Target 50 is out of bounds.

In [None]:
# cdata=[]
# target = 
# for l,d in b:
#     bow_vec = make_bow_vector(d, indexed_vocab)
#     cdata.append(list(bow_vec.tolist()))
#     target.append(labels[l])
# c_tensor= torch.FloatTensor(cdata)
# t_tensor=torch.LongTensor(target)
# y_pred = bow(c_tensor)
# loss = loss_function(y_pred,t_tensor)
# loss.backward()
# opt.step()

In [None]:
#testing
for label,data in test:
    bow_vec = make_bow_vector(data, indexed_vocab)
    logprobs = bow(bow_vec)
    print(logprobs)
    pred = np.argmax(logprobs.data.numpy())
    print('prediction: {}'.format(pred))
    print('actual: {}'.format(label))


In [None]:
torch.cuda.is_available()