In [1]:
import torch.nn.functional as F
from torch.autograd import Variable
import copy
from collections import Counter
import torch 
import numpy as np
import torch.nn as nn

In [2]:
def read_data(path):
    f = open(path, "r")
    lines = f.readlines()
    f.close()
    data = []
    for l in lines:
        labelSplit = l.replace('\n','').split(' ', 1)
        data.append([labelSplit[0], [word.lower() for word in labelSplit[1].split()]])
    return data

data = read_data('./questions.txt')

In [3]:
def remove_stop_words(data, path):
    stop_words = []
    with open(path) as f:
        stop_words = [word for line in f for word in line.split(",")]
    data_without_stop_words = []
    for k, v in data:
        words = [t for t in v if t not in stop_words]
        data_without_stop_words.append((k, words))
    return data_without_stop_words

data = remove_stop_words(data, './stop_words.txt')

In [4]:
def get_labels(data):
    _labels = []
    for k,v in data:
        _labels.append(k)   
    _unique_label = list(set(_labels))
    _unique_label_dict = {}
    for k,v in enumerate(_unique_label):
        _unique_label_dict[v] = k
    return _unique_label_dict

labels = get_labels(data)

In [5]:
def append_labels(data, labels):        
    cleaned_data = []
    for k,v in data:
        cleaned_data.append((labels[k],v))
        
    return np.array(cleaned_data)

data = append_labels(data, labels)

In [6]:
def create_indexed_vocab(data):
    vocab = []
    for _, sent in data:
        for word in sent:
            vocab.append(word)
    count = Counter(vocab)
    count = {w : count[w] for w in count if count[w] >= 2}
    vocab = []
    for k, v in count.items():
        vocab.append(k)
    indexed_vocab = {word: idx for idx, word in enumerate(vocab)}
    return indexed_vocab
def create_vocab(data):
    total_words_orig = []
    for k,sent in data:
        for word in sent:
            total_words_orig.append(word)
    total_words = list(set(total_words_orig))
    total_words_str = ' '.join(total_words)
    vocab = set(total_words_str.split()) 
    word2idx = {word: idx for idx, word in enumerate(vocab)} # create word index
    return word2idx
word2idx=create_vocab(data)
word2idx['#UNK#'] = len(word2idx)
indexed_vocab = create_indexed_vocab(data)
indexed_vocab['#UNK#'] = len(indexed_vocab)

In [7]:
def load_glove_embeddings(path, indexed_vocab, embedding_dim=300):
    with open(path) as f:
        embeddings = np.zeros((len(indexed_vocab), embedding_dim))
        for line in f.readlines():
            values = line.split()
            word = values[0]
            index = indexed_vocab.get(word)
            if index:
                vector = np.array(values[1:], dtype='float32')
                embeddings[index] = vector
        return torch.from_numpy(embeddings).float()

glove_random = load_glove_embeddings('./glove.small.txt', indexed_vocab)
glove_pre = load_glove_embeddings('./glove.small.txt', word2idx)
embeddings_random = nn.Embedding(glove_random.size(0), glove_random.size(1))
embeddings_pretrained = nn.Embedding.from_pretrained(glove_pre, freeze=True)

In [8]:
def split_train_test(data, test_ratio):
    data_copy = copy.deepcopy(data)
    np.random.shuffle(data_copy)
    test_set_size = int(len(data) * test_ratio)
    test = data_copy[:test_set_size]
    train = data_copy[test_set_size:]
    return train, test

train, test = split_train_test(data, 0.2)

In [9]:
class BOWClassifier(nn.Module):
    def __init__(self,input_size,hidden_size, num_labels):
        super(BOWClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size  = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.input_size, num_labels)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(x)
        #output = self.sigmoid(output)
        return output

In [48]:
def make_bow_vector(sentence, indexed_vocab):
    pt_tensor= torch.zeros(300, dtype=torch.long)
    count = 0
    for word in sentence:
        count += 1
        if word in indexed_vocab:
            pt_tensor = torch.add(pt_tensor, embeddings_pretrained(torch.LongTensor([indexed_vocab[word]]))[0])
        else:
            pt_tensor = torch.add(pt_tensor, embeddings_pretrained(torch.LongTensor([indexed_vocab['#UNK']]))[0])
    pt_tensor=torch.div(pt_tensor, count)
    return pt_tensor

def get_bow_rep(data):
    bow_data = []
    for label, sent in data:
        bow_data.append(make_bow_vector(sent, word2idx).reshape(-1, 300))
    return torch.stack(bow_data)
        
training_set = get_bow_rep(train)

torch.Size([4362, 1, 300])
tensor([[[-0.0886,  0.1401, -0.1218,  ..., -0.0261,  0.0924,  0.1195]],

        [[-0.0574,  0.2847, -0.1079,  ...,  0.0217,  0.0816,  0.2187]],

        [[-0.0646,  0.1491,  0.1250,  ..., -0.0814,  0.0329, -0.0212]],

        ...,

        [[-0.0319,  0.3197,  0.0022,  ..., -0.0563,  0.1042, -0.0492]],

        [[-0.0812,  0.1550, -0.2108,  ...,  0.0016,  0.0891,  0.0668]],

        [[ 0.0192,  0.1936,  0.0139,  ...,  0.0501,  0.2304,  0.3875]]])
tensor([[32],
        [37],
        [42],
        [35],
        [17],
        [48],
        [13],
        [48],
        [ 1],
        [43]])


In [49]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size = 300
hidden_size = 100
num_classes = 51
num_epochs = 50
learning_rate = 0.0001

model = BOWClassifier(input_size, hidden_size, num_classes).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

In [56]:
for epoch in range(num_epochs):
    permutation = torch.randperm(len(train))
    i = 0
    for label, question in train[permutation]:
        i += 1
        optimizer.zero_grad()
        bow_vec = make_bow_vector(question, word2idx)
        bow_vec = bow_vec.reshape(-1, 300).to(device)
        label = torch.LongTensor([label])
        label = label.to(device)
        output = model(bow_vec)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

Epoch [1/50], Loss: 3.2282
Epoch [1/50], Loss: 4.4131
Epoch [1/50], Loss: 3.3566
Epoch [1/50], Loss: 3.4310
Epoch [1/50], Loss: 2.9971
Epoch [1/50], Loss: 3.0752


KeyboardInterrupt: 

In [None]:
for epoch in range(num_epochs):
    permutation = torch.randperm(training_set.size()[0])
    for i in range(0, training_set.size()[0], batch_size):
        opt.zero_grad()
        indices = permutation[i:i + batch_size]
        batch_features = training_set[indices]
        batch_features = batch_features.reshape(-1, 300).to(device)
        batch_labels = torch.LongTensor([label for label, sent in train[indices]])
        batch_features = batch_features.reshape(-1, 1).to(device)
        batch_outputs = model(batch_features)
        loss = loss_function(batch_outputs, batch_labels)
        print(loss)
        loss.backward()
        opt.step()

In [54]:
#training in batches
batch_size=100
for epoch in range(num_epochs):
    permutation = torch.randperm(training_set.size()[0])
    for i in range(0, training_set.size()[0], batch_size):
        optimizer.zero_grad()
        indices = permutation[i:i + batch_size]
        batch_features = training_set[indices]
        batch_features=batch_features.to(device)
        print(batch_features)
        bow_label = []
        for label, sent in train[indices]:
            label = torch.LongTensor([label])
            bow_label.append(label)
        batch_labels=torch.stack(bow_label).to(device)
        print(batch_labels)
        batch_outputs = model(batch_features)
        loss = criterion(batch_outputs, batch_labels)
        print(loss)
        loss.backward()
        opt.step()

tensor([[[ 0.1588,  0.1454,  0.1403,  ...,  0.1427,  0.0615,  0.0544]],

        [[ 0.1684,  0.1359,  0.0865,  ..., -0.0125,  0.0972, -0.1372]],

        [[-0.0158,  0.1209,  0.1114,  ..., -0.1614,  0.0982,  0.1127]],

        ...,

        [[ 0.1571,  0.0427,  0.3998,  ...,  0.0472,  0.1954, -0.0246]],

        [[ 0.0224,  0.0198, -0.2315,  ...,  0.1584,  0.1841,  0.0574]],

        [[-0.1022,  0.2185, -0.0167,  ...,  0.0636,  0.1347,  0.0565]]])
tensor([[17],
        [17],
        [42],
        [ 1],
        [45],
        [39],
        [45],
        [13],
        [17],
        [12],
        [ 1],
        [32],
        [34],
        [17],
        [37],
        [48],
        [46],
        [39],
        [17],
        [45],
        [11],
        [17],
        [46],
        [ 1],
        [37],
        [ 3],
        [17],
        [10],
        [37],
        [ 1],
        [49],
        [42],
        [13],
        [45],
        [50],
        [ 4],
        [39],
        [42],
        [46],
  

ValueError: Expected target size (100, 51), got torch.Size([100, 1])

In [None]:
#testing
test_len =len(test)
correct=0
for label,data in test:
    bow_vec = make_bow_vector(data, word2idx)
    logprobs = bow(bow_vec)
    print(logprobs)
    pred = np.argmax(logprobs.data.numpy())
    if pred==label:
        correct+=1
    print('prediction: {}'.format(pred))
    print('actual: {}'.format(label))
accuracy = correct/test_len
print('accuracy: {}'.format(accuracy))