# Document Classification

Document classification is a process that involves assigning a label/category to an untagged document from predetermined sets of categories depending on its content.

In [1]:
import os
import time
import torch
import torch.nn as nn
from torchtext.datasets import AG_NEWS
from torch.utils.data import DataLoader
from torch.optim import SGD, lr_scheduler
from torch.nn.utils import clip_grad_norm_
from torchtext.data.utils import get_tokenizer
from torch.utils.data.dataset import random_split
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

In [2]:
# !pip install torchdata

## Preprocessing

In [3]:
train_iter, test_iter = AG_NEWS()
tokenizer = get_tokenizer('basic_english')

In [4]:
def get_tokens(data_iter):
  for _, text in data_iter:
    yield tokenizer(text)

In [5]:
specials = ['<UNK>', '<PAD>']
vocab = build_vocab_from_iterator(get_tokens(train_iter), specials=specials)
vocab.set_default_index(vocab["<UNK>"])
# vocab(['<UNK>'])

In [6]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

# text_pipeline('<UNK>')
# label_pipeline('10')

In [7]:
train_data = to_map_style_dataset(train_iter)
test_data = to_map_style_dataset(test_iter)
num_train = int(len(train_data) * 0.95)
train_data, valid_data = random_split(train_data, [num_train, len(train_data) - num_train])

In [8]:
EPOCHS = 10
LR = 5
BATCH_SIZE = 64
NUM_CLASSES = len(set([label for (label, text) in train_iter]))
VOCAB_SIZE = len(vocab)
EMBED_SIZE = 64

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
def collate_fn(batch):
  label_list, text_list, offsets = [], [], [0]
  for (_label, _text) in batch:
    label_list.append(label_pipeline(_label))
    processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
    text_list.append(processed_text)
    offsets.append(processed_text.size(0))
  label_list = torch.tensor(label_list, dtype=torch.int64)
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text_list = torch.cat(text_list)
  return label_list.to(device), text_list.to(device), offsets.to(device)

In [10]:
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

## Model Definition

In [11]:
class TextClassificationModel(nn.Module):
  def __init__(self, vocab_size, embed_dim, num_class):
    super(TextClassificationModel, self).__init__()
    self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
    self.fc = nn.Linear(embed_dim, num_class)
    self.init_weights()

  def init_weights(self):
    initrange = 0.5
    self.embedding.weight.data.uniform_(-initrange, initrange)
    self.fc.weight.data.uniform_(-initrange, initrange)
    self.fc.bias.data.zero_()

  def forward(self, text, offsets):
    embedded = self.embedding(text, offsets)
    return self.fc(embedded)

## Model Training

In [12]:
model = TextClassificationModel(VOCAB_SIZE, EMBED_SIZE, NUM_CLASSES).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

In [13]:
def train(dataloader):
  model.train()
  total_loss, total_acc, total_count = 0, 0, 0
  for idx, (label, text, offsets) in enumerate(dataloader):
    optimizer.zero_grad()
    predicted_label = model(text, offsets)
    loss = criterion(predicted_label, label)
    loss.backward()
    clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()
    total_loss += loss.item()
    total_acc += (predicted_label.argmax(1) == label).sum().item()
    total_count += label.size(0)
  return total_loss/total_count, total_acc/total_count

In [14]:
def evaluate(dataloader):
  model.eval()
  total_loss, total_acc, total_count = 0, 0, 0
  with torch.no_grad():
    for idx, (label, text, offsets) in enumerate(dataloader):
      predicted_label = model(text, offsets)
      loss = criterion(predicted_label, label)
      total_loss += loss.item()
      total_acc += (predicted_label.argmax(1) == label).sum().item()
      total_count += label.size(0)
  return total_loss/total_count, total_acc/total_count

In [15]:
if not os.path.exists('./../models'):
  os.mkdir('./../models')

best_valid_loss = float('inf')
for epoch in range(EPOCHS):
  start_time = time.time()
  train_loss, train_acc = train(train_dataloader)
  val_loss, val_acc = evaluate(valid_dataloader)
  if val_loss < best_valid_loss:
    best_valid_loss = val_loss
    torch.save(model.state_dict(), './../models/doc-basic.pt')
  print(f'Epoch: {epoch+1:02} | Time: {time.time()-start_time} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}')

Epoch: 01 | Time: 9.047109127044678 | Train Loss: 0.007 | Val Loss: 0.005
Epoch: 02 | Time: 7.889991760253906 | Train Loss: 0.004 | Val Loss: 0.005
Epoch: 03 | Time: 8.327462911605835 | Train Loss: 0.004 | Val Loss: 0.004
Epoch: 04 | Time: 7.527512073516846 | Train Loss: 0.003 | Val Loss: 0.005
Epoch: 05 | Time: 8.903427839279175 | Train Loss: 0.003 | Val Loss: 0.005
Epoch: 06 | Time: 8.236012935638428 | Train Loss: 0.003 | Val Loss: 0.006
Epoch: 07 | Time: 7.583796739578247 | Train Loss: 0.003 | Val Loss: 0.005
Epoch: 08 | Time: 8.807356834411621 | Train Loss: 0.003 | Val Loss: 0.005
Epoch: 09 | Time: 8.494863271713257 | Train Loss: 0.002 | Val Loss: 0.006
Epoch: 10 | Time: 8.160791873931885 | Train Loss: 0.002 | Val Loss: 0.006


## Model Test

In [16]:
test_loss, test_acc = evaluate(test_dataloader)
print(f'Test Loss: {test_loss:.3f} | Test Accuracy: {test_acc:.3f}')

Test Loss: 0.006 | Test Accuracy: 0.889


## Inferencing

In [17]:
def predict(text, text_pipeline):
  with torch.no_grad():
    text = torch.tensor(text_pipeline(text))
    output = model(text, torch.tensor([0]))
    return output.argmax(1).item() + 1

In [18]:
news_labels = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}
model = model.to('cpu')

text = 'The DJ used two artificial intelligence sites to create lyrics and a rap in the style of \
        the US star for a live show. The French producer has said he will not release the track commercially. \
        But he said he thinks musicians will use AI as a tool to create new sounds in the future, because \
        every new music style comes from a new technology.'

print(f"{news_labels[predict(text, text_pipeline)]} News")

World News


## References

- [Text Classification with the TorchText Library](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html#split-the-dataset-and-run-the-model)
- [Text Classification: Papers with code](https://paperswithcode.com/task/text-classification)