<a href="https://colab.research.google.com/github/alexlimatds/victor-doc_classification/blob/main/victor_doc_classification_BiLSTM_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Document classification for the VICTOR project using a BiLSTM as the machine learning model

This notebook explores variations on the neural network architecture described in the paper _VICTOR: a Dataset for Brazilian Legal Documents Classification_. It adds components that were not explored in the paper, such as dropout and two dense layers.

Deep learning library: PyTorch

### Installing dependencies

In [None]:
!pip install tqdm
!python -m spacy download pt

Collecting pt_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz (21.2MB)
[K     |████████████████████████████████| 21.2MB 1.3MB/s 
Building wheels for collected packages: pt-core-news-sm
  Building wheel for pt-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for pt-core-news-sm: filename=pt_core_news_sm-2.2.5-cp36-none-any.whl size=21186283 sha256=c6fcaaaabb4111732b1b6015062766ec0bc811d808e0136ea8120d4331c3ab4f
  Stored in directory: /tmp/pip-ephem-wheel-cache-zsd24yt6/wheels/ea/94/74/ec9be8418e9231b471be5dc7e1b45dd670019a376a6b5bc1c0
Successfully built pt-core-news-sm
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
/usr/local/

### Mounting Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM; force_remount=True is passed so the
# mount is redone even when the cell is re-run.
drive.mount('/content/gdrive', force_remount=True)
# Base folder used as the prefix of `dataset_dir` in the parameters cell.
root_dir = '/content/gdrive/My Drive/'

Mounted at /content/gdrive


### Application parameters

In [None]:
# --- Input / batching -------------------------------------------------------
S = 250 # sentence length (used as `fix_length` of the TEXT field and as the model's `sentence_len`)
BATCH_SIZE = 32      # batch size shared by the train, validation and test iterators
NUM_OF_CLASSES = 6   # output dimension of the classifier

dataset_fraction = 0.6 # fraction of train and validation datasets to be used

# --- Model hyperparameters --------------------------------------------------
DROPOUT_RATE_HIDDEN = 0.6    # dropout rate for the MLP's hidden layer
DROPOUT_RATE_OUT = 0.3       # dropout rate for the MLP's output layer
RNN_HIDDEN_UNITS = 300       # hidden units of the BiLSTM
MLP_HIDDEN_UNITS = 100       # hidden units of the MLP
EMBEDDING_DIM = 300          # dimension of each word embedding vector

# --- Paths (all under the mounted Drive folder `root_dir`) ------------------
dataset_dir = root_dir + 'Machine Learning/Victor datasets/'
model_path = dataset_dir + 'LSTM_2/'
# Checkpoint file name encodes the sentence length and the dataset fraction.
model_file = model_path + f'pytorch_model-{S}-{dataset_fraction}.pt'

### Loading and preprocessing datasets

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
from tqdm.notebook import trange, tqdm_notebook
import numpy as np
from datetime import datetime

In [None]:
# Choose the CSV files for the train/validation splits: the complete files when
# the whole dataset is requested, otherwise the cropped files whose names embed
# the chosen fraction.
train_ds_file, validation_ds_file = (
  ('train_small.csv', 'validation_small.csv')
  if dataset_fraction == 1.0  # full dataset
  else (
    f'train_small.csv-croped_{dataset_fraction}.csv',
    f'validation_small.csv-croped_{dataset_fraction}.csv',
  )
)

In [None]:
import spacy

spacy_pt = spacy.load('pt')

def tokenizer(text):
  """Split `text` with the loaded spaCy tokenizer and return the token strings as a list."""
  token_texts = []
  for token in spacy_pt.tokenizer(text):
    token_texts.append(token.text)
  return token_texts


In [None]:
%%time

# Text field: tokenized with the spaCy-based `tokenizer`, lower-cased, and
# given a fixed length of S tokens (`fix_length=S`).
TEXT = data.Field(
    tokenize=tokenizer, 
    lower=True, 
    fix_length=S)
# Label field: a single non-sequential value, with no unknown token.
LABEL = data.Field(
    sequential=False, 
    unk_token=None)

# Load the three CSV splits from `dataset_dir`, skipping the header row.
# `fields` is positional (one entry per CSV column): the 4th column is read as
# the label and the 6th as the text; the (None, None) entries are placeholders
# for columns that are not loaded.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path=dataset_dir, 
    train=train_ds_file,
    validation=validation_ds_file, 
    test='test_small.csv', 
    format='csv', 
    skip_header = True, 
    fields=[(None, None), (None, None), (None, None), ('label', LABEL), (None, None), ('text', TEXT)])

CPU times: user 3min 58s, sys: 2.63 s, total: 4min
Wall time: 4min 2s


In [None]:
# Build both vocabularies from the training split only.
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One batch iterator per split, all with the same batch size; batches are
# created directly on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
  (train_data, valid_data, test_data),
  sort = False, #don't sort test/validation data
  batch_sizes=(BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
  device=device)

### Model

In [None]:
class VictorBiLSTM(nn.Module):
  """
  BiLSTM document classifier.

  Pipeline: embedding -> bidirectional LSTM -> element-wise sum of the forward
  and backward hidden states -> flatten over time -> dropout -> dense + ReLU ->
  dropout -> dense (class scores).
  """

  def __init__(self, sentence_len, vocab_size, embed_dim, n_classes, rnn_h, mlp_h, h_dropout, o_dropout):
    """
    sentence_len: the length of the each input sentence.
    vocab_size:   the number of tokens in the vocabulary.
    embed_dim:    the dimension of each embedding word vector.
    n_classes:    number of classes, i.e., the output dimension of this NN.
    rnn_h:        number of hidden units of the BiLSTM NN.
    mlp_h:        number of hidden units of the MLP NN.
    h_dropout:    the dropout rate of the MLP's hidden layer.
    o_dropout:    the dropout rate of the MLP's output layer.
    """
    super().__init__()
    self.rnn_hidden_dim = rnn_h

    self.word_embeddings = nn.Embedding(vocab_size, embed_dim)
    self.lstm = nn.LSTM(embed_dim, rnn_h, bidirectional=True)
    self.dropout_h = nn.Dropout(p=h_dropout)
    # The hidden states of every time step are flattened together, so the
    # first dense layer is sized for exactly `sentence_len` time steps.
    self.linear_h = nn.Linear(rnn_h * sentence_len, mlp_h)
    self.dropout_o = nn.Dropout(p=o_dropout)
    self.linear_o = nn.Linear(mlp_h, n_classes)

  def forward(self, sentence):
    """
    sentence: tensor of token indices with shape (sentence_len, batch_size).

    Returns the raw (un-normalised) class scores with shape
    (batch_size, n_classes).
    """
    embeds = self.word_embeddings(sentence) # embeds shape: (s_len, b_len, embedding_dim)
    lstm_out, _ = self.lstm(embeds)         # lstm_out shape: (s_len, b_len, 2 * hidden_dim)
    # The last dimension holds the forward states followed by the backward
    # states; add them element-wise to get one vector per time step.
    forward_states = lstm_out[:, :, :self.rnn_hidden_dim]
    backward_states = lstm_out[:, :, self.rnn_hidden_dim:]
    summed_states = forward_states + backward_states
    # Put the batch dimension first, then flatten time and hidden dimensions.
    features = torch.flatten(summed_states.transpose(0, 1), start_dim=1) # shape: (b_len, s_len * hidden_dim)
    hidden = F.relu(self.linear_h(self.dropout_h(features)))
    return self.linear_o(self.dropout_o(hidden))

### Training functions

In [None]:
from sklearn.metrics import f1_score

def compute_metrics(targets, predictions):
  """
  Macro-averaged F1 score.

  targets:     true class indices.
  predictions: per-class scores, one row per sample; the predicted class of a
               sample is the index of its highest score.
  """
  predicted_classes = np.argmax(predictions, axis=1)
  return f1_score(targets, predicted_classes, average='macro')

def train(model, iterator, optimizer, criterion, epoch):
  """
  Run one training pass over `iterator` and return the mean batch loss.

  model:     the network to optimise (switched to training mode here).
  iterator:  yields batches exposing `.text` and `.label`.
  optimizer: optimizer stepped once per batch.
  criterion: loss function called as criterion(predictions, labels).
  epoch:     accepted for the caller's convenience; not used in the body.
  """
  model.train()
  batch_losses = []
  for batch in tqdm_notebook(iterator, desc='Train', unit='batch', leave=False):
    optimizer.zero_grad()
    loss = criterion(model(batch.text), batch.label)
    loss.backward()
    optimizer.step()
    batch_losses.append(loss.item())

  return sum(batch_losses) / len(iterator)

def predict(model, iterator, set_name):
  """
  Run `model` in evaluation mode over every batch of `iterator`.

  model:    the network to evaluate (switched to eval mode here).
  iterator: yields batches exposing `.text` and `.label`.
  set_name: label shown in the progress bar (e.g. 'train set').

  Returns a tuple (predictions, targets) of NumPy arrays: the model outputs
  and the labels of all batches, concatenated along the first dimension in
  iteration order.

  Raises ValueError if the iterator yields no batches.
  """
  model.eval()
  batch_outputs = []
  batch_labels = []
  with torch.no_grad():
    for batch in tqdm_notebook(iterator, desc=f'Predicting ({set_name})', unit='batch', leave=False):
      batch_outputs.append(model(batch.text))
      batch_labels.append(batch.label)

  if not batch_outputs:
    raise ValueError(f'Cannot predict on {set_name}: the iterator yielded no batches')

  # Concatenate once at the end instead of re-copying the accumulated tensor
  # on every batch.
  predictions = torch.cat(batch_outputs, dim=0)
  targets = torch.cat(batch_labels, dim=0)
  return predictions.cpu().numpy(), targets.cpu().numpy()

def evaluate(model, iterator, set_name):
  """Predict over `iterator` and return the value of `compute_metrics` for the collected targets and outputs."""
  outputs, labels = predict(model, iterator, set_name)
  score = compute_metrics(labels, outputs)
  return score


### Training

In [None]:
# Training hyperparameters.
EPOCHS = 15
learning_rate = 1e-3

# Build the network from the application parameters; the vocabulary size comes
# from the TEXT field's vocabulary.
model = VictorBiLSTM(S, len(TEXT.vocab), EMBEDDING_DIM, NUM_OF_CLASSES, RNN_HIDDEN_UNITS, MLP_HIDDEN_UNITS, DROPOUT_RATE_HIDDEN, DROPOUT_RATE_OUT)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Cross-entropy on the raw class scores returned by the model.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
%%time
import pandas as pd
from IPython.display import display, update_display

# Table of per-epoch metrics; it is displayed once with a display id and then
# refreshed in place after every epoch.
metrics_df = pd.DataFrame(columns=['Epoch', 'Loss (train)', 'F1 macro (train)', 'F1 macro (validation)'])
metrics_display = display(metrics_df, display_id='metrics_table')

# Best validation macro-F1 seen so far; a checkpoint is written only when an
# epoch strictly improves on it.
best_valid_f1 = 0.0

for epoch in range(EPOCHS):
  # One optimisation pass, then macro-F1 on the train and validation sets.
  train_loss = train(model, train_iterator, optimizer, criterion, epoch)
  train_f1_m = evaluate(model, train_iterator, 'train set')
  valid_f1_m = evaluate(model, valid_iterator, 'validation set')
  
  # Checkpoint: overwrite `model_file` with the weights of the best epoch so far.
  if valid_f1_m > best_valid_f1:
    best_valid_f1 = valid_f1_m
    torch.save(model.state_dict(), model_file)

  # Append this epoch's row (1-based epoch number) and refresh the table.
  metrics_df.loc[epoch] = [epoch + 1, train_loss, train_f1_m, valid_f1_m]
  metrics_display.update(metrics_df)

Unnamed: 0,Epoch,Loss (train),F1 macro (train),F1 macro (validation)
0,1.0,0.325178,0.774358,0.69878
1,2.0,0.186868,0.859997,0.742539
2,3.0,0.143794,0.91159,0.762717
3,4.0,0.122653,0.927838,0.776225
4,5.0,0.108553,0.938557,0.779934
5,6.0,0.103567,0.944232,0.78273
6,7.0,0.094812,0.944534,0.776967
7,8.0,0.088143,0.944921,0.773049
8,9.0,0.084274,0.946299,0.779666
9,10.0,0.086324,0.94177,0.766669


HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…



HBox(children=(FloatProgress(value=0.0, description='Predicting (train set)', max=2987.0, style=ProgressStyle(…



HBox(children=(FloatProgress(value=0.0, description='Predicting (validation set)', max=1910.0, style=ProgressS…



HBox(children=(FloatProgress(value=0.0, description='Train', max=2987.0, style=ProgressStyle(description_width…

### Evaluation

In [None]:
def load_saved_model(file_name):
  """
  Rebuild a VictorBiLSTM from the notebook's parameters, load the weights
  stored in `file_name` into it, and return it in evaluation mode on `device`.
  """
  restored = VictorBiLSTM(
      S, len(TEXT.vocab), EMBEDDING_DIM, NUM_OF_CLASSES,
      RNN_HIDDEN_UNITS, MLP_HIDDEN_UNITS,
      DROPOUT_RATE_HIDDEN, DROPOUT_RATE_OUT).to(device)
  # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
  saved_weights = torch.load(file_name, map_location=device)
  restored.load_state_dict(saved_weights)
  restored.eval()
  return restored

# Leave the next line commented out to load the checkpoint written during
# training (`model_file`); uncomment it to evaluate a different checkpoint.
#model_file = model_path + 'pytorch_model-500-1.0.pt'
model = load_saved_model(model_file)

In [None]:
# Model outputs and true labels (as NumPy arrays) for the train and test sets,
# computed with the reloaded model.
train_predictions, train_targets = predict(model, train_iterator, 'train set')
test_predictions, test_targets = predict(model, test_iterator, 'test set')

In [None]:
from sklearn.metrics import classification_report

# Per-class reports for the test and train sets. The predicted class of each
# sample is the index of its highest score; class names come from the label
# vocabulary.
test_report = classification_report(
    test_targets,
    np.argmax(test_predictions, axis=1),
    digits=4,
    target_names=LABEL.vocab.itos)

train_report = classification_report(
    train_targets,
    np.argmax(train_predictions, axis=1),
    digits=4,
    target_names=LABEL.vocab.itos)

print(test_report)

# Save both reports and the run's hyperparameters next to the checkpoint.
# The `with` block closes the file even if one of the writes raises.
with open(model_file + "-test_report.txt", "wt") as rep_file:
  rep_file.write('BiLSTM+Dropout+Dense+Dropout+Dense evaluation report\n')
  rep_file.write(f'Test {test_report}\n')
  rep_file.write(f'Train {train_report}\n')
  rep_file.write(f'learning rate: {learning_rate}\n')
  rep_file.write(f'optimizer: {type(optimizer).__name__}\n')
  rep_file.write(f'criterion: {type(criterion).__name__}\n')
  rep_file.write(f'DROPOUT_RATE_HIDDEN: {DROPOUT_RATE_HIDDEN}\n')
  rep_file.write(f'DROPOUT_RATE_OUT: {DROPOUT_RATE_OUT}\n')
  rep_file.write(f'RNN_HIDDEN_UNITS: {RNN_HIDDEN_UNITS}\n')
  rep_file.write(f'MLP_HIDDEN_UNITS: {MLP_HIDDEN_UNITS}\n')
  rep_file.write(f'EMBEDDING_DIM: {EMBEDDING_DIM}\n')

References:

- https://medium.com/@rohit_agrawal/using-fine-tuned-gensim-word2vec-embeddings-with-torchtext-and-pytorch-17eea2883cd
