In [85]:
import torch.nn.functional as F
from torch.autograd import Variable
import copy
from collections import Counter
import torch 
import numpy as np
import torch.nn as nn

In [86]:
def read_data(path):
    f = open(path, "r")
    lines = f.readlines()
    f.close()
    data = []
    for l in lines:
        labelSplit = l.replace('\n','').split(' ', 1)
        data.append([labelSplit[0], [word.lower() for word in labelSplit[1].split()]])
    return data

data = read_data('./questions.txt')

In [87]:
def remove_stop_words(data, path):
    stop_words = []
    with open(path) as f:
        stop_words = [word for line in f for word in line.split(",")]
    data_without_stop_words = []
    for k, v in data:
        words = [t for t in v if t not in stop_words]
        data_without_stop_words.append((k, words))
    return data_without_stop_words

data = remove_stop_words(data, './stop_words.txt')

In [88]:
def get_labels(data):
    _labels = []
    for k,v in data:
        _labels.append(k)   
    _unique_label = list(set(_labels))
    _unique_label_dict = {}
    for k,v in enumerate(_unique_label):
        _unique_label_dict[v] = k
    return _unique_label_dict

labels = get_labels(data)

In [89]:
def append_labels(data, labels):        
    cleaned_data = []
    for k,v in data:
        cleaned_data.append((labels[k],v))
        
    return np.array(cleaned_data)

data = append_labels(data, labels)

In [90]:
def create_indexed_vocab(data):
    vocab = []
    for _, sent in data:
        for word in sent:
            vocab.append(word)
    count = Counter(vocab)
    count = {w : count[w] for w in count if count[w] >= 2}
    vocab = []
    for k, v in count.items():
        vocab.append(k)
    indexed_vocab = {word: idx for idx, word in enumerate(vocab)}
    return indexed_vocab
def create_vocab(data):
    total_words_orig = []
    for k,sent in data:
        for word in sent:
            total_words_orig.append(word)
    total_words = list(set(total_words_orig))
    total_words_str = ' '.join(total_words)
    vocab = set(total_words_str.split()) 
    word2idx = {word: idx for idx, word in enumerate(vocab)} # create word index
    return word2idx
word2idx=create_vocab(data)
word2idx['#UNK#'] = len(word2idx)
indexed_vocab = create_indexed_vocab(data)
indexed_vocab['#UNK#'] = len(indexed_vocab)

In [93]:
def load_glove_embeddings(path, indexed_vocab, embedding_dim=300):
    with open(path) as f:
        embeddings = np.zeros((len(indexed_vocab), embedding_dim))
        for line in f.readlines():
            values = line.split()
            word = values[0]
            index = indexed_vocab.get(word)
            if index:
                vector = np.array(values[1:], dtype='float32')
                embeddings[index] = vector
        return torch.from_numpy(embeddings).float()

glove_random = load_glove_embeddings('./glove.small.txt', indexed_vocab)
glove_pre = load_glove_embeddings('./glove.small.txt', word2idx)
embeddings_random = nn.Embedding(glove_random.size(0), glove_random.size(1))
embeddings_pretrained = nn.Embedding.from_pretrained(glove_pre, freeze=True)

In [94]:
def split_train_test(data, test_ratio):
    data_copy = copy.deepcopy(data)
    np.random.shuffle(data_copy)
    test_set_size = int(len(data) * test_ratio)
    test = data_copy[:test_set_size]
    train = data_copy[test_set_size:]
    return train, test

train, test = split_train_test(data, 0.2)
train,dev =split_train_test(train,0.1)

In [95]:
class BOWClassifier(nn.Module):
    def __init__(self,input_size,hidden_size, num_labels):
        super(BOWClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size  = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.input_size, num_labels)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(x)
        #output = self.sigmoid(output)
        return output

In [101]:
def make_bow_vector(sentence, indexed_vocab,embeddings):
    pt_tensor= torch.zeros(300, dtype=torch.long)
    count = 0
    for word in sentence:
        count += 1
        if word in indexed_vocab:
            pt_tensor = torch.add(pt_tensor, embeddings(torch.LongTensor([indexed_vocab[word]]))[0])
        else:
            pt_tensor = torch.add(pt_tensor, embeddings(torch.LongTensor([indexed_vocab['#UNK#']]))[0])
    pt_tensor=torch.div(pt_tensor, count)
    return pt_tensor

def get_bow_rep(data,word2idx,embeddings):
    bow_data = []
    for label, sent in data:
        bow_data.append(make_bow_vector(sent, word2idx,embeddings))
    return torch.stack(bow_data)
        
training_set = get_bow_rep(train,word2idx,embeddings_pretrained)
training_set_rand = get_bow_rep(train,indexed_vocab,embeddings_random)

In [38]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size = 300
hidden_size = 100
num_classes = 51
num_epochs = 50
learning_rate = 0.0001

model = BOWClassifier(input_size, hidden_size, num_classes).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

In [39]:
for epoch in range(num_epochs):
    permutation = torch.randperm(len(train))
    i = 0
    for label, question in train[permutation]:
        i += 1
        optimizer.zero_grad()
        bow_vec = make_bow_vector(question, word2idx,embeddings_pretrained)
        bow_vec = bow_vec.reshape(-1, 300).to(device)
        label = torch.LongTensor([label])
        label = label.to(device)
        output = model(bow_vec)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

Epoch [1/50], Loss: 3.7833
Epoch [1/50], Loss: 3.6851
Epoch [1/50], Loss: 3.6093
Epoch [1/50], Loss: 3.8592


KeyboardInterrupt: 

In [102]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size = 300
hidden_size = 100
num_classes = 51
num_epochs = 350
learning_rate = 0.0001
batch_size = 100

model = BOWClassifier(input_size, hidden_size, num_classes).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
#training in batches
for epoch in range(num_epochs):
    permutation = torch.randperm(training_set.size()[0])
    for i in range(0, training_set.size()[0], batch_size):
        optimizer.zero_grad()
        indices = permutation[i:i + batch_size]
        batch_features = training_set[indices]
        batch_features = batch_features.reshape(-1, 300).to(device)
        batch_labels = torch.LongTensor([label for label, sent in train[indices]]).to(device)
        batch_outputs = model(batch_features)
        loss = criterion(batch_outputs, batch_labels)
        loss.backward()
        optimizer.step()
    print(loss.item())
    print(epoch)

3.8577375411987305
0
3.766239881515503
1
3.7015557289123535
2
3.6341633796691895
3
3.5899248123168945
4
3.461988687515259
5
3.5528314113616943
6
3.463732957839966
7
3.2996716499328613
8
3.40735125541687
9
3.2944467067718506
10
3.2035000324249268
11
3.2594258785247803
12
3.093846321105957
13
3.092564105987549
14
3.131394863128662
15
3.1297993659973145
16
2.9859044551849365
17
3.0528383255004883
18
2.8519093990325928
19
2.9279050827026367
20
2.905634641647339
21
2.8870065212249756
22
2.7975986003875732
23
2.788588523864746
24
2.7590572834014893
25
2.6858606338500977
26
2.706007719039917
27
2.9241583347320557
28
2.818058729171753
29
2.7365477085113525
30
2.747945547103882
31
2.7600347995758057
32
2.8307321071624756
33
2.7383768558502197
34
2.7723028659820557
35
2.5529563426971436
36
2.568894386291504
37
2.33998703956604
38
2.6940736770629883
39
2.386039972305298
40
2.563384532928467
41
2.7597479820251465
42
2.4870269298553467
43
2.408236503601074
44
2.4940176010131836
45
2.575275897979736

In [103]:
#testing
test_len =len(test)
correct=0
for label,data in test:
    bow_vec = make_bow_vector(data, word2idx,embeddings_pretrained)
    logprobs = model(bow_vec)
    logprobs = F.softmax(logprobs)
    pred = np.argmax(logprobs.data.numpy())
    if pred==label:
        correct+=1
    print('prediction: {}'.format(pred))
    print('actual: {}'.format(label))
accuracy = correct/test_len
print('accuracy: {}'.format(accuracy))

  import sys


prediction: 17
actual: 17
prediction: 39
actual: 1
prediction: 5
actual: 5
prediction: 24
actual: 24
prediction: 12
actual: 12
prediction: 13
actual: 13
prediction: 12
actual: 12
prediction: 1
actual: 1
prediction: 11
actual: 11
prediction: 39
actual: 1
prediction: 4
actual: 4
prediction: 42
actual: 42
prediction: 24
actual: 24
prediction: 32
actual: 44
prediction: 13
actual: 13
prediction: 17
actual: 17
prediction: 37
actual: 37
prediction: 17
actual: 17
prediction: 2
actual: 2
prediction: 32
actual: 32
prediction: 13
actual: 13
prediction: 4
actual: 4
prediction: 17
actual: 24
prediction: 20
actual: 20
prediction: 20
actual: 20
prediction: 12
actual: 1
prediction: 12
actual: 12
prediction: 32
actual: 32
prediction: 17
actual: 17
prediction: 17
actual: 17
prediction: 24
actual: 24
prediction: 10
actual: 10
prediction: 13
actual: 1
prediction: 13
actual: 48
prediction: 11
actual: 11
prediction: 17
actual: 17
prediction: 32
actual: 10
prediction: 45
actual: 45
prediction: 13
actual: 13


actual: 32
prediction: 12
actual: 37
prediction: 2
actual: 2
prediction: 11
actual: 11
prediction: 11
actual: 11
prediction: 37
actual: 37
prediction: 17
actual: 17
prediction: 42
actual: 42
prediction: 17
actual: 17
prediction: 17
actual: 10
prediction: 32
actual: 32
prediction: 20
actual: 20
prediction: 13
actual: 1
prediction: 32
actual: 32
prediction: 17
actual: 17
prediction: 39
actual: 39
prediction: 39
actual: 39
prediction: 32
actual: 32
prediction: 17
actual: 17
prediction: 11
actual: 11
prediction: 13
actual: 12
prediction: 2
actual: 2
prediction: 17
actual: 17
prediction: 48
actual: 41
prediction: 37
actual: 49
prediction: 12
actual: 12
prediction: 17
actual: 17
prediction: 17
actual: 17
prediction: 30
actual: 50
prediction: 12
actual: 12
prediction: 32
actual: 32
prediction: 46
actual: 46
prediction: 17
actual: 17
prediction: 12
actual: 17
prediction: 17
actual: 37
prediction: 39
actual: 39
prediction: 12
actual: 12
prediction: 12
actual: 12
prediction: 28
actual: 28
predic

prediction: 17
actual: 49
prediction: 17
actual: 17
prediction: 48
actual: 48
prediction: 11
actual: 48
prediction: 42
actual: 37
prediction: 46
actual: 46
prediction: 12
actual: 12
prediction: 17
actual: 17
prediction: 13
actual: 43
prediction: 12
actual: 12
prediction: 17
actual: 48
prediction: 17
actual: 17
prediction: 28
actual: 13
prediction: 32
actual: 5
prediction: 17
actual: 17
prediction: 12
actual: 12
prediction: 11
actual: 11
prediction: 42
actual: 17
prediction: 12
actual: 12
prediction: 50
actual: 50
prediction: 11
actual: 11
prediction: 4
actual: 4
prediction: 10
actual: 48
prediction: 11
actual: 11
prediction: 12
actual: 12
prediction: 47
actual: 48
prediction: 42
actual: 42
prediction: 46
actual: 46
prediction: 32
actual: 32
prediction: 13
actual: 13
prediction: 17
actual: 17
prediction: 11
actual: 18
prediction: 17
actual: 17
prediction: 11
actual: 34
prediction: 12
actual: 12
prediction: 17
actual: 17
prediction: 13
actual: 13
prediction: 10
actual: 10
prediction: 10


prediction: 17
actual: 17
prediction: 11
actual: 17
prediction: 13
actual: 13
prediction: 1
actual: 1
prediction: 13
actual: 48
prediction: 17
actual: 17
prediction: 2
actual: 42
prediction: 17
actual: 42
prediction: 12
actual: 39
prediction: 2
actual: 42
prediction: 17
actual: 17
prediction: 39
actual: 48
prediction: 45
actual: 50
prediction: 13
actual: 43
prediction: 36
actual: 36
prediction: 42
actual: 42
prediction: 12
actual: 12
prediction: 11
actual: 11
prediction: 13
actual: 13
prediction: 12
actual: 12
prediction: 32
actual: 32
prediction: 11
actual: 11
prediction: 13
actual: 13
prediction: 10
actual: 10
prediction: 17
actual: 42
prediction: 2
actual: 2
prediction: 46
actual: 46
prediction: 13
actual: 13
prediction: 45
actual: 45
prediction: 17
actual: 17
prediction: 20
actual: 20
prediction: 1
actual: 10
prediction: 32
actual: 32
prediction: 17
actual: 17
prediction: 17
actual: 17
prediction: 17
actual: 17
prediction: 17
actual: 17
prediction: 17
actual: 17
prediction: 1
actua