In [16]:
#codes are from https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

import torch
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split="train")

#Shows iteration of the dataset
#print(next(train_iter))




(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")
(3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.')
(3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.")
(3, 'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.')
(3, 'Oil prices soar to all-time record, posing new menace t

In [9]:
#Some basic data processing building blocks are torchtext library, vocab, word vectors and tokenizer.

from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')
counter = Counter()


for (label, line) in train_iter:
    counter.update(tokenizer(line))
    
vocab = Vocab(counter, min_freq=1)

#Example for numerazitaion 
[vocab[token] for token in ["let's", "have", "some", "kitty", "tonight"]]


NameError: name 'length' is not defined

In [12]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x) -1

text_pipeline("what the heck is going on?")


[184, 3, 15462, 22, 665, 11, 81]

In [34]:
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    
    return label_list.to(device), text_list.to(device), offsets.to(device)

train_iter = AG_NEWS(split="train")
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)


In [20]:
#data model, embeddings

from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)
    
    
    

In [25]:
#AG NEWS dataset has four class which are 1:World, 2: Sports, 3: Business, 4: Sci/Tech

train_iter = AG_NEWS(split="train")
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
#Embedding size
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
print("Model info: ", model, "\t, Model is run on: ", device)

Model info:  TextClassificationModel(
  (embedding): EmbeddingBag(95812, 64, mode=mean)
  (fc): Linear(in_features=64, out_features=4, bias=True)
) 	, Model is run on:  cuda


In [36]:
#Training the model and evaluation

import time
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()
    
    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
        if idx%log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print("| epoch: {:3d} -- batches: {:5d}/{:5d} -- accuracy: {:8.3f} |".format(epoch, idx, len(dataloader), total_acc/total_count))
            
            total_acc, total_count = 0,0
            start_time = time.time()
            

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label_size(0)
            
    return total_acc/total_count
            

In [38]:
#Splitting the dataset and testing the model

from torch.utils.data.dataset import random_split

EPOCHS = 10
LR = 5
BATCH_SIZE = 64

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

total_accu = None

train_iter, test_iter = AG_NEWS()
train_dataset = list(train_iter)
test_dataset = list(test_iter)
num_train = int(len(train_dataset)*0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset)-num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS+1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
        
    print('-'*59)
    print('| end of epoch: {:3d} -- time: {:5.2f}seconds -- valid accuracy: {:8.3f} |'.format(epoch, time.time()-epoch_start_time, accu_val))
    print('-'*59)


| epoch:   1 -- batches:   500/ 1782 -- accuracy:    0.881 |
| epoch:   1 -- batches:  1000/ 1782 -- accuracy:    0.891 |
| epoch:   1 -- batches:  1500/ 1782 -- accuracy:    0.894 |


NameError: name 'label_size' is not defined