In [1]:
import torch
from torchtext.legacy import data

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True) #to make sure we get actual length of batch in return value

LABEL = data.LabelField(dtype = torch.int64)

In [2]:
from torchtext.legacy import datasets

train_dataset, test_dataset = datasets.TREC.splits(TEXT, LABEL)

downloading train_5500.label


100%|██████████| 336k/336k [00:01<00:00, 199kB/s]  


downloading TREC_10.label


100%|██████████| 23.4k/23.4k [00:00<00:00, 74.8kB/s]


In [3]:
#we need to split the test into val and test data
val_dataset,test_dataset = test_dataset.split(0.5)
print(len(train_dataset))
print(len(test_dataset))
print(len(val_dataset))

5452
250
250


In [4]:
MAX_VOCAB_SIZE = 15000

TEXT.build_vocab(train_dataset, 
                 max_size = MAX_VOCAB_SIZE)
                 #needed if you are training the model, as model trained on Kaggle, commenting it
                #  vectors = "glove.6B.100d", #using pretrained embeddings
                #  unk_init = torch.Tensor.normal_) #initializing all vocab, but not in pretrained, to random values

LABEL.build_vocab(train_dataset)

In [5]:
print(LABEL.vocab.itos[:])
# HUM for questions about humans
# ENTY for questions about entities
# DESC for questions asking you for a description
# NUM for questions where the answer is numerical
# LOC for questions where the answer is a location
# ABBR for questions asking about abbreviations

['ENTY', 'HUM', 'DESC', 'NUM', 'LOC', 'ABBR']


In [6]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, val_dataset, test_dataset), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True, #for using packed, we need each batch sorted by length
    device = device)

cpu


In [7]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx) #PAD INDEX as we 
        #don't want to learn the embeddings for the paddings
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim) #no need for relu or anything as loss will cover it
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths): #we need to give text lenghths in each pass, as we are using packed 
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        #to cpu is in documentation, text_lengths we will get from iterator, and its actual lenght of each sen
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded) #LSTM returns 3 things
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors as they aren't even trained
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)) #last two hidden are concat
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden)

In [8]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 6
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [None]:
# putting pretrained embeddings and initializing embedding layer
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
print(pretrained_embeddings.shape)

In [None]:
#changing <pad> and <unk> embeddings to zero(Not compolsary)
model.embedding.weight.data[0] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[1] = torch.zeros(EMBEDDING_DIM)
print(model.embedding.weight.data)

In [9]:
from tqdm import tqdm
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text,text_len = batch.text
        predictions = model(text,text_len).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = categorical_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text,text_len = batch.text
            predictions = model(text,text_len).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# N_EPOCHS = 15
# from tqdm import tqdm 

# for epoch in tqdm(range(N_EPOCHS)):

#     train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
#     valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)    
#     print(f'Epoch: {epoch+1:02}')
#     print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
#     print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
#     if (epoch%5 == 0):
#         torch.save(model.state_dict(), 'trec-5-epoch-{}.pt') 

In [44]:
model.load_state_dict(torch.load('trec-5-epoch.pt', map_location=device))
print("Model Loaded Successfully")
valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
train_loss, train_acc = evaluate(model, test_iterator, criterion)
print(valid_acc)
print(train_acc)

Model Loaded Successfully
0.8830818980932236
0.9108297377824783


In [45]:
import spacy
nlp = spacy.load('en_core_web_sm')
import numpy as np
def predict_class(model, sentence, min_len = 4):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length = [len(indexed)]
    length_tensor = torch.LongTensor(length)
    preds = model(tensor,length_tensor)
    max_preds = preds.argmax(dim = 1)
    return max_preds.item()

In [48]:
classes = ["HUM for questions about humans",
"ENTY for questions about entities",
"DESC for questions asking you for a description",
"NUM for questions where the answer is numerical",
"LOC for questions where the answer is a location",
"ABBR for questions asking about abbreviations"]

print(classes[predict_class(model,"Who was Galileo ?")])
print(classes[predict_class(model,"How old am I?")])
print(classes[predict_class(model,"What continent is Bulgaria in?")])
print(classes[predict_class(model,"What does NLP stand for?")])


ENTY for questions about entities
NUM for questions where the answer is numerical
LOC for questions where the answer is a location
ABBR for questions asking about abbreviations
