In [None]:
! pip install -U pip
! pip install -U torch==1.5.0
! pip install -U torchtext==0.6.0
! pip install -U matplotlib==3.2.1
! pip install -U clearml>=0.15.0
! pip install -U tensorboard==2.2.1

In [None]:
import os
import time

import torch
import torch.nn as nn
from torchtext.datasets import text_classification
from torch.utils.tensorboard import SummaryWriter

from clearml import Task

%matplotlib inline

In [None]:
# Create the ClearML task for this run.
task = Task.init(project_name='Text Example', task_name='text classifier')

# Default hyper-parameters. Connecting them to the task enables configuration
# override by clearml; the object returned by connect() is used from here on.
configuration_dict = task.connect({
    'number_of_epochs': 6,
    'batch_size': 16,
    'ngrams': 2,
    'base_lr': 1.0,
})

# Show the actual configuration (after override in remote mode).
print(configuration_dict)

In [None]:
# Make sure the dataset root directory exists; a no-op when it is already there
# (avoids the separate exists-check followed by mkdir).
os.makedirs('./data', exist_ok=True)

# Build the AG_NEWS train/test datasets under ./data with the configured n-gram order.
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./data',
    ngrams=configuration_dict.get('ngrams', 2),
)

# Vocabulary of the training set, used later to map between tokens and ids.
vocabulary = train_dataset.get_vocab()

In [None]:
def generate_batch(batch):
    """Collate a list of ``(label, token tensor)`` entries into one flat batch.

    Returns a tuple ``(text, offsets, label)``:
      * ``text``    - the token tensors of all entries concatenated into one tensor
      * ``offsets`` - start index of each entry's tokens inside ``text``
      * ``label``   - tensor built from the first element of every entry
    """
    labels = []
    sequences = []
    for entry in batch:
        labels.append(entry[0])
        sequences.append(entry[1])
    label = torch.tensor(labels)

    # Each sequence starts where the previous ones end: a leading 0 followed by
    # the running sum of every length except the last.
    lengths = [len(sequence) for sequence in sequences]
    offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)

    text = torch.cat(sequences)
    return text, offsets, label

# Training batches are reshuffled on every pass; generate_batch flattens each
# batch into (text, offsets, label).
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=configuration_dict.get('batch_size', 16),
    shuffle=True,
    pin_memory=True,
    collate_fn=generate_batch,
)

# Test batches keep the dataset order.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=configuration_dict.get('batch_size', 16),
    shuffle=False,
    pin_memory=True,
    collate_fn=generate_batch,
)

# Display names, indexed by class id.
classes = ("World", "Sports", "Business", "Sci/Tec")

In [None]:
class TextSentiment(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` followed by a single linear layer."""

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the embedding bag and the output layer, then initialise their weights."""
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices uniformly from [-0.5, 0.5] and zero the linear bias."""
        bound = 0.5
        for weight in (self.embedding.weight, self.fc.weight):
            weight.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Embed the bags described by ``text``/``offsets`` and return the linear layer's output."""
        return self.fc(self.embedding(text, offsets))

In [None]:
# Model dimensions are derived from the training data.
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())  # number of classes (name kept as-is)
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS)

# Always keep `device` a torch.device: a bare integer index on GPU and a
# torch.device on CPU would give the variable two different types (and print
# just "0" on GPU).
if torch.cuda.is_available():
    device = torch.device('cuda', torch.cuda.current_device())
else:
    device = torch.device('cpu')
print('Device to use: {}'.format(device))
model.to(device)

In [None]:
# Loss function, placed on the selected device.
criterion = nn.CrossEntropyLoss().to(device)

# Plain SGD starting from the configured base learning rate.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=configuration_dict.get('base_lr', 1.0),
)

# Multiply the learning rate by 0.9 every 2 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)

In [None]:
# TensorBoard event files are written under ./tensorboard_logs.
tensorboard_writer = SummaryWriter(log_dir='./tensorboard_logs')

In [None]:
def train_func(data, epoch):
    """Train the model for one epoch over ``data`` and step the LR scheduler once.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``, ``scheduler``,
    ``device``, ``tensorboard_writer`` and ``log_interval``.

    Args:
        data: DataLoader yielding ``(text, offsets, cls)`` batches.
        epoch: index of the current epoch, used to compute the global iteration
            under which scalars are logged.

    Returns:
        Tuple ``(loss, accuracy)``: the sum of the per-batch losses divided by the
        number of samples, and the fraction of correctly classified samples.
    """
    model.train()

    # Size everything from the loader that was actually passed in, rather than
    # from the notebook-level train_loader / train_dataset.
    num_batches = len(data)
    num_samples = len(data.dataset)

    train_loss = 0
    train_acc = 0
    for batch_idx, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        loss_value = loss.item()  # plain float for accumulation and logging
        train_loss += loss_value
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

        iteration = epoch * num_batches + batch_idx
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                  .format(epoch, batch_idx * len(cls), num_samples,
                          100. * batch_idx / num_batches, loss_value))
            tensorboard_writer.add_scalar('training loss/loss', loss_value, iteration)
            tensorboard_writer.add_scalar('learning rate/lr', optimizer.param_groups[0]['lr'], iteration)

    # Adjust the learning rate
    scheduler.step()

    return train_loss / num_samples, train_acc / num_samples

In [None]:
def test(data, epoch):
    """Evaluate the model on ``data`` and log sample predictions to TensorBoard.

    Uses the notebook-level ``model``, ``criterion``, ``device``, ``vocabulary``,
    ``classes``, ``train_loader``, ``tensorboard_writer`` and ``debug_interval``.

    Args:
        data: DataLoader yielding ``(text, offsets, cls)`` batches.
        epoch: index of the current epoch; text samples are logged at step
            ``(epoch + 1) * len(train_loader)``.

    Returns:
        Tuple ``(loss, accuracy)`` of Python floats: the sum of the per-batch
        losses divided by the number of samples, and the fraction of correctly
        classified samples.
    """
    total_loss = 0.0
    correct = 0
    num_samples = len(data.dataset)
    # Same for every batch of this evaluation pass, so compute it once.
    iteration = (epoch + 1) * len(train_loader)

    # Evaluate in eval mode, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for idx, (text, offsets, cls) in enumerate(data):
                text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
                output = model(text, offsets)
                predicted = output.argmax(1)
                # Accumulate into a separate running total; the per-batch loss
                # must not overwrite the accumulator.
                total_loss += criterion(output, cls).item()
                correct += (predicted == cls).sum().item()

                if idx % debug_interval == 0:    # report debug text every "debug_interval" mini-batches
                    bounds = offsets.tolist() + [len(text)]
                    token_ids = text.tolist()
                    for n, (pred, label) in enumerate(zip(predicted.tolist(), cls.tolist())):
                        ids_to_text = [vocabulary.itos[token_id]
                                       for token_id in token_ids[bounds[n]:bounds[n + 1]]]
                        series = '{}_{}_label_{}_pred_{}'.format(idx, n, classes[label], classes[pred])
                        tensorboard_writer.add_text('Test text samples/{}'.format(series),
                                                    ' '.join(ids_to_text), iteration)
    finally:
        model.train(was_training)

    return total_loss / num_samples, correct / num_samples

In [None]:
# How often (in mini-batches) training scalars and test text samples are reported.
log_interval = 200
debug_interval = 500

for epoch in range(configuration_dict.get('number_of_epochs', 6)):
    start_time = time.time()

    # One pass over the training data, then one over the test data.
    train_loss, train_acc = train_func(train_loader, epoch)
    test_loss, test_acc = test(test_loader, epoch)

    secs = int(time.time() - start_time)

    print(
        'Epoch: %d' % (epoch + 1),
        " | time in %d minutes, %d seconds" % divmod(secs, 60),
    )
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

    # Both accuracies are logged at the training iteration reached by the end of this epoch.
    tensorboard_writer.add_scalar(
        'accuracy/train', train_acc, (epoch + 1) * len(train_loader))
    tensorboard_writer.add_scalar(
        'accuracy/test', test_acc, (epoch + 1) * len(train_loader))

In [None]:
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

def predict(text, model, vocab, ngrams):
    """Classify a raw text string and return the index of the predicted class.

    Args:
        text: raw input string; it is tokenized with the "basic_english" tokenizer.
        model: ``nn.Module`` called as ``model(token_ids, offsets)``.
        vocab: object indexed as ``vocab[token]`` to obtain a token id.
        ngrams: n-gram order passed to ``ngrams_iterator``.

    Returns:
        int: position of the largest value in the model output for this text.
    """
    tokenizer = get_tokenizer("basic_english")
    token_ids = [vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)]

    # Build the inputs on the device the model lives on, so the call works
    # without first moving the model to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        # Explicit dtype keeps the index tensor integral even when no tokens
        # were produced (an empty list would otherwise become a float tensor).
        token_tensor = torch.tensor(token_ids, dtype=torch.long, device=target_device)
        # A single bag starting at position 0: the whole text is one sample.
        offsets = torch.tensor([0], dtype=torch.long, device=target_device)
        output = model(token_tensor, offsets)
        return output.argmax(1).item()

# Example news text to classify. The backslash line continuations sit inside the
# string literal, so the leading spaces of each continued line are part of the string.
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

# Move the model to the CPU and classify the example text with the configured n-gram order.
ans = predict(
    ex_text_str,
    model.to("cpu"),
    vocabulary,
    configuration_dict.get('ngrams', 2),
)
print("This is a %s news" % classes[ans])