# Document Classification using a Hierarchical Attention Network

In [None]:
# !pip install torchdata

In [None]:
import torch
import torch.nn as nn
from torchtext.datasets import AG_NEWS
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torch.utils.data.dataset import random_split
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

## Preprocessing

In [None]:
# Unpack the two iterables returned by AG_NEWS() as the train and test splits.
train_iter, test_iter = AG_NEWS()
# Tokenizer shared by the vocabulary builder and the text pipeline below.
tokenizer = get_tokenizer('basic_english')

In [None]:
def get_tokens(data_iter):
  """Lazily yield the tokenized text of every entry in ``data_iter``.

  Each entry is unpacked as a 2-item pair; the first item is ignored and the
  second is passed through the module-level ``tokenizer``.
  """
  yield from (tokenizer(raw_text) for _label, raw_text in data_iter)

In [None]:
# Special tokens handed to the vocabulary builder.
specials = ['<PAD>', '<UNK>']
# Build the vocabulary from the tokenized texts of the training split only.
vocab = build_vocab_from_iterator(get_tokens(train_iter), specials=specials)
# Tokens that are not in the vocabulary resolve to the '<UNK>' index.
vocab.set_default_index(vocab['<UNK>'])
# vocab(['<UNK>'])

In [None]:
def text_pipeline(x):
  """Tokenize the raw string ``x`` and map its tokens to vocabulary indices.

  Relies on the module-level ``tokenizer`` and ``vocab``.
  """
  return vocab(tokenizer(x))

def label_pipeline(x):
  """Convert ``x`` to ``int`` and shift it down by one.

  Accepts anything ``int()`` accepts (e.g. ``'10'`` -> ``9``).
  Presumably maps labels that start at 1 onto 0-based class indices.
  """
  return int(x) - 1

# text_pipeline('')
# label_pipeline('10')

In [None]:
# Convert the iterable splits to map-style datasets so that len() and
# random_split() can be applied to them.
train_data = to_map_style_dataset(train_iter)
test_data = to_map_style_dataset(test_iter)
# Keep 95% of the training examples (truncated to an int) for training ...
num_train = int(len(train_data) * 0.95)
# ... and hold out the remainder as the validation set.
# NOTE(review): no generator/seed is passed, so the split presumably differs
# between runs unless the global torch seed is fixed elsewhere.
train_data, valid_data = random_split(train_data, [num_train, len(train_data) - num_train])

In [None]:
# Training hyperparameters. Only BATCH_SIZE is consumed by the code visible
# in this notebook so far; the others are presumably for the training loop.
EPOCHS = 10
# NOTE(review): 5 is a large step size — confirm it suits the optimizer that
# will be used for the HAN model.
LR = 5
BATCH_SIZE = 64
# Number of distinct labels in the training split. A set comprehension avoids
# building an intermediate list, and the unused text field is discarded.
NUM_CLASSES = len({label for label, _ in train_iter})
VOCAB_SIZE = len(vocab)
EMBED_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def collate_fn(batch):
  """Collate ``(label, text)`` pairs into flat tensors on ``device``.

  Returns a 3-tuple of int64 tensors:
    * the labels after ``label_pipeline``,
    * the token ids of all texts concatenated into one 1-D tensor,
    * the start offset of each text inside that concatenated tensor.

  NOTE(review): flat tokens + offsets is the input layout used by
  ``nn.EmbeddingBag`` — confirm it is what the HAN model will consume.
  """
  targets, token_chunks, starts = [], [], []
  cursor = 0  # running position of the next text in the flat token tensor
  for raw_label, raw_text in batch:
    targets.append(label_pipeline(raw_label))
    token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
    token_chunks.append(token_ids)
    starts.append(cursor)
    cursor += token_ids.size(0)
  labels = torch.tensor(targets, dtype=torch.int64)
  offsets = torch.tensor(starts)
  flat_tokens = torch.cat(token_chunks)
  return labels.to(device), flat_tokens.to(device), offsets.to(device)

In [None]:
# One DataLoader per split; all of them batch with the shared collate_fn.
# NOTE(review): shuffle=True is also set for the validation and test loaders.
# Shuffling is normally only needed for training — confirm this is intended.
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

## Model Definition

In [None]:
class WordAttention(nn.Module):
  """Placeholder module, presumably for the word-level attention of the HAN.

  Only the ``nn.Module`` base initialisation is performed; no layers and no
  ``forward`` are defined yet.
  """
  def __init__(self):
    super().__init__()

In [None]:
class HAN(nn.Module):
  """Skeleton of the hierarchical attention network model.

  NOTE(review): the class is unfinished. The constructor arguments are
  accepted but neither stored nor used, and ``forward`` implicitly returns
  ``None``.
  """
  def __init__(self, inp_dim, emb_dim, word_hid_dim, sent_hid_dim):
    # TODO: build the layers; the four size arguments are currently unused.
    super().__init__()

  def forward(self, inp, lock_dropout):
    # Size of the first dimension of `inp` (presumably the batch dimension —
    # TODO confirm). `lock_dropout` is not used yet.
    batch_size = inp.shape[0]

torch.Size([2781])


## Model Training

## References

- [Hierarchical Attention Networks for Document Classification](https://aclanthology.org/N16-1174.pdf)
- [Hierarchical Sentiment](https://github.com/cedias/Hierarchical-Sentiment)