<a href="https://colab.research.google.com/github/alexlimatds/victor-doc_classification/blob/main/victor_doc_classification_BiLSTM_w2v.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Document classification for the Victor project using a BiLSTM as the machine learning model

We use pretrained Portuguese word embeddings (word2vec, 300 dimensions, skip-gram) from http://nilc.icmc.usp.br/nilc/index.php/repositorio-de-word-embeddings-do-nilc

The model has just one dense layer for its output.

Deep learning library: PyTorch

### Mounting Google Drive

In [None]:
# Colab only: mount the user's Google Drive under /content/gdrive so the dataset
# and embedding files can be read like local files. force_remount=True asks for a
# fresh mount even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Locations on the mounted Google Drive. Every directory string ends with '/'
# so that file names can be appended to it directly.
root_dir = '/content/gdrive/My Drive/'
dataset_dir = f'{root_dir}Machine Learning/Victor datasets/'
word_embeddings_dir = f'{dataset_dir}portuguese_word_vectors/'
word_embeddings_file = f'{word_embeddings_dir}w2v_skip_s300.zip'

### Installing dependencies

In [None]:
!pip install tqdm
!python -m spacy download pt

Collecting pt_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz (21.2MB)
[K     |████████████████████████████████| 21.2MB 1.2MB/s 
Building wheels for collected packages: pt-core-news-sm
  Building wheel for pt-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for pt-core-news-sm: filename=pt_core_news_sm-2.2.5-cp36-none-any.whl size=21186283 sha256=99597410e69085d29a1a9e77c819a0c4081a6a84c290682b88318f820c80b6df
  Stored in directory: /tmp/pip-ephem-wheel-cache-25ie9wkk/wheels/ea/94/74/ec9be8418e9231b471be5dc7e1b45dd670019a376a6b5bc1c0
Successfully built pt-core-news-sm
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
/usr/local/

In [None]:
import nltk
# Fetch the NLTK 'punkt' tokenizer data. No cell in this notebook calls nltk
# directly, so it is presumably required by the project's `preprocessing`
# module -- TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import sys
sys.path.append(word_embeddings_dir)
import preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
from tqdm import tqdm
import numpy as np
from datetime import datetime
from gensim.models import KeyedVectors
import zipfile

### Application parameters

In [None]:
S = 500  # sentence length (number of tokens per document fed to the model)

dataset_fraction = 0.6  # fraction of train and validation datasets to be used

# Checkpoints are stored under the dataset folder; the file name encodes both
# the sentence length and the dataset fraction of the run.
model_path = f'{dataset_dir}LSTM+w2v/'
model_file = f'{model_path}pytorch_model-{S}-{dataset_fraction}.pt'

### Load word embeddings

In [None]:
%%time

# Read the word2vec text file straight out of the zip archive. The context
# managers close both the archive and the member handle as soon as the vectors
# have been loaded into memory (the original left both open).
with zipfile.ZipFile(word_embeddings_file, 'r') as zip_file:
  with zip_file.open('skip_s300.txt', 'r') as path:
    gensim_model = KeyedVectors.load_word2vec_format(path)

CPU times: user 4min 19s, sys: 1.67 s, total: 4min 21s
Wall time: 4min 24s


### Loading and preprocessing datasets

In [None]:
# The full dataset uses the plain file names; a cropped dataset uses the same
# names followed by '-croped_<fraction>.csv' (spelling matches the files on disk).
crop_suffix = '' if dataset_fraction == 1.0 else f'-croped_{dataset_fraction}.csv'
train_ds_file = 'train_small.csv' + crop_suffix
validation_ds_file = 'validation_small.csv' + crop_suffix

In [None]:
import spacy

# Portuguese spaCy pipeline; only its tokenizer is used below.
spacy_pt = spacy.load('pt')

def tokenizer(text):
  """Clean `text` with preprocessing.clean_text and split it into token strings.

  Returns a list with the text of each token produced by the spaCy tokenizer.
  """
  cleaned = preprocessing.clean_text(text)
  spacy_tokens = spacy_pt.tokenizer(cleaned)
  return [token.text for token in spacy_tokens]


In [None]:
%%time

# Document text: tokenized by `tokenizer`, lower-cased, fixed to S tokens.
TEXT = data.Field(tokenize=tokenizer, lower=True, fix_length=S)
# Document label: a single non-sequential value, with no unknown-token entry.
LABEL = data.Field(sequential=False, unk_token=None)

# One entry per CSV column, in file order. (None, None) discards the column,
# so only the 4th column (label) and the 6th column (text) are loaded.
skipped = (None, None)
csv_fields = [skipped, skipped, skipped, ('label', LABEL), skipped, ('text', TEXT)]

train_data, valid_data, test_data = data.TabularDataset.splits(
    path=dataset_dir,
    train=train_ds_file,
    validation=validation_ds_file,
    test='test_small.csv',
    format='csv',
    skip_header=True,
    fields=csv_fields)

CPU times: user 6min 44s, sys: 2.74 s, total: 6min 46s
Wall time: 6min 48s


In [None]:
# Build the token and label vocabularies from the training split only.
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [None]:
W2V_SIZE = len(gensim_model.vectors[0])  # dimensionality of the pretrained vectors

# One vector per entry of TEXT.vocab.stoi, in its iteration order: the
# pretrained word2vec vector when the token is known, an all-zero vector otherwise.
word2vec_vectors = []
with tqdm(TEXT.vocab.stoi.items(), unit='token') as progress:
  for word, _index in progress:
    if word in gensim_model.vocab:
      vector = torch.FloatTensor(gensim_model[word])
    else:
      vector = torch.zeros(W2V_SIZE)
    word2vec_vectors.append(vector)

# Attach the vectors to the vocabulary so they can seed the embedding layer.
TEXT.vocab.set_vectors(TEXT.vocab.stoi, word2vec_vectors, W2V_SIZE)

  
100%|██████████| 148050/148050 [00:02<00:00, 63053.26token/s] 


In [None]:
BATCH_SIZE = 32

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The same batch size is used for the train, validation and test iterators.
all_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
  all_splits,
  batch_sizes=(BATCH_SIZE,) * len(all_splits),
  sort=False,  # don't sort test/validation data
  device=device)

### Model

In [None]:
class VictorBiLSTM(nn.Module):
  """Bidirectional LSTM document classifier with a single dense output layer.

  Token ids are mapped to pretrained embeddings (built with
  nn.Embedding.from_pretrained and its default settings), passed through a
  bidirectional LSTM, the forward and backward hidden states are summed per
  position, and the flattened sequence is fed to one linear layer that
  produces unnormalized class scores.

  Args:
    sentence_len: number of tokens in every input sequence. The linear layer
      is sized from it, so inputs must have exactly this length.
    embedding_weights: 2-D tensor (vocabulary size, embedding dim) holding the
      pretrained vectors.
    hidden_dim: hidden size of each LSTM direction.
    output_dim: number of output classes.
    dropout: dropout probability applied to the input of the linear layer.
  """

  def __init__(self, sentence_len, embedding_weights, hidden_dim=200, output_dim=6, dropout=0.20):
    super(VictorBiLSTM, self).__init__()
    self.hidden_dim = hidden_dim
    self.embedding_dim = embedding_weights.shape[1]
    self.sentence_len = sentence_len
    self.output_dim = output_dim

    # Attribute names are part of the saved state_dict keys; keep them stable.
    self.word_embeddings = nn.Embedding.from_pretrained(embedding_weights)
    self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, bidirectional=True)
    self.linear = nn.Linear(self.hidden_dim * sentence_len, self.output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, sentence):
    """Compute class scores for a batch of token-id sequences.

    Args:
      sentence: integer tensor of token ids shaped (sentence_len, batch size).

    Returns:
      Tensor of shape (batch size, output_dim) with unnormalized scores.
    """
    embeds = self.word_embeddings(sentence)  # (s_len, b_len, embedding_dim)
    # lstm_out: (s_len, b_len, 2 * hidden_dim); the factor 2 is the two directions.
    lstm_out, _ = self.lstm(embeds)
    directions_sum = (
        lstm_out[:, :, :self.hidden_dim] +  # hidden states from forward layer
        lstm_out[:, :, self.hidden_dim:])   # hidden states from backward layer
    # Batch-first, then flatten each document to (b_len, s_len * hidden_dim).
    linear_input = torch.flatten(directions_sum.transpose(0, 1), start_dim=1)
    linear_input = self.dropout(linear_input)
    return self.linear(linear_input)

### Training functions

In [None]:
from sklearn.metrics import f1_score

def compute_metrics(targets, predictions):
  """Return (macro F1, weighted F1) for a set of per-class scores.

  Args:
    targets: true class indices, one per example.
    predictions: 2-D array of per-class scores; the predicted class of each
      row is the index of its largest score.
  """
  predicted_classes = np.argmax(predictions, axis=1)
  return tuple(
      f1_score(targets, predicted_classes, average=averaging)
      for averaging in ('macro', 'weighted'))

def train(model, iterator, optimizer, criterion, epoch):
  """Run one training pass over `iterator` and return the mean batch loss.

  Args:
    model: module called on `batch.text`.
    iterator: sized iterable of batches exposing `.text` and `.label`.
    optimizer: optimizer stepped once per batch.
    criterion: loss function called as criterion(predictions, batch.label).
    epoch: zero-based epoch number, used only for the progress-bar caption.
  """
  model.train()
  running_loss = 0
  with tqdm(iterator, unit='batch') as progress:
    for minibatch in progress:
      progress.set_description(f"Epoch {epoch+1} (train)")
      optimizer.zero_grad()
      scores = model(minibatch.text).squeeze(1)
      batch_loss = criterion(scores, minibatch.label)
      batch_loss.backward()
      optimizer.step()
      loss_value = batch_loss.item()
      running_loss += loss_value
      progress.set_postfix(loss=loss_value)

  return running_loss / len(iterator)

def predict(model, iterator):
  """Run `model` over every batch of `iterator` without tracking gradients.

  Args:
    model: module called on `batch.text`; it is switched to eval mode.
    iterator: iterable of batches exposing `.text` and `.label`.

  Returns:
    A pair of NumPy arrays (predictions, targets): the model outputs and the
    labels of all batches, concatenated along the first axis in batch order.

  Raises:
    ValueError: if `iterator` yields no batches.
  """
  model.eval()
  # Collect per-batch tensors and concatenate once at the end; growing a tensor
  # with torch.cat on every batch re-copies everything accumulated so far.
  batch_outputs = []
  batch_targets = []
  with torch.no_grad():
    with tqdm(iterator, unit=' batch') as tbatch:
      tbatch.set_description('Predicting ')
      for batch in tbatch:
        batch_outputs.append(model(batch.text).squeeze(1))
        batch_targets.append(batch.label)

  if not batch_outputs:
    raise ValueError('predict() received an iterator that yielded no batches')

  predictions = torch.cat(batch_outputs, dim=0)
  targets = torch.cat(batch_targets, dim=0)
  return predictions.cpu().numpy(), targets.cpu().numpy()

def evaluate(model, iterator):
  """Predict over `iterator` with `model` and score the result.

  Returns whatever compute_metrics returns for the collected targets and
  predictions, i.e. the pair (macro F1, weighted F1).
  """
  scores, gold = predict(model, iterator)
  return compute_metrics(gold, scores)


### Training

In [None]:
EPOCHS = 20
learning_rate = 1e-3

# Model, optimizer and loss; the model and the loss are moved to the selected device.
model = VictorBiLSTM(S, TEXT.vocab.vectors).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
%%time
import pandas as pd
from IPython.display import display, update_display

# Per-epoch metrics table, shown once and then refreshed in place after every epoch.
metrics_df = pd.DataFrame(columns=['Epoch', 'Train Loss', 'Validation F1 macro', 'Validation F1 weighted'])
metrics_display = display(metrics_df, display_id='metrics_table')

# Best validation macro-F1 seen so far; used to decide when to checkpoint.
best_valid_f1 = 0.0

for epoch in range(EPOCHS):
  train_loss = train(model, train_iterator, optimizer, criterion, epoch)
  valid_f1_m, valid_f1_w = evaluate(model, valid_iterator)
  
  # Checkpoint only when the validation macro-F1 improves, so model_file always
  # holds the best epoch. Nothing is written if the score never exceeds 0.0.
  if valid_f1_m > best_valid_f1:
    best_valid_f1 = valid_f1_m
    torch.save(model.state_dict(), model_file)

  # Append this epoch's row and refresh the displayed table.
  metrics_df.loc[epoch] = [epoch + 1, train_loss, valid_f1_m, valid_f1_w]
  metrics_display.update(metrics_df)

Unnamed: 0,Epoch,Train Loss,Validation F1 macro,Validation F1 weighted
0,1.0,0.3303,0.680275,0.894597
1,2.0,0.183845,0.730629,0.906515
2,3.0,0.12272,0.756013,0.913665
3,4.0,0.09978,0.751363,0.913194
4,5.0,0.089001,0.7591,0.911326
5,6.0,0.080008,0.761584,0.917467
6,7.0,0.073276,0.765631,0.915173
7,8.0,0.068845,0.761339,0.916512
8,9.0,0.065283,0.757974,0.917758
9,10.0,0.060705,0.771743,0.919248


Epoch 1 (train): 100%|██████████| 2987/2987 [02:30<00:00, 19.90batch/s, loss=0.187]
Predicting : 100%|██████████| 1910/1910 [00:40<00:00, 46.63 batch/s]
Epoch 2 (train): 100%|██████████| 2987/2987 [02:34<00:00, 19.30batch/s, loss=0.0739]
Predicting : 100%|██████████| 1910/1910 [00:40<00:00, 46.76 batch/s]
Epoch 3 (train): 100%|██████████| 2987/2987 [02:34<00:00, 19.32batch/s, loss=0.355]
Predicting : 100%|██████████| 1910/1910 [00:40<00:00, 47.41 batch/s]
Epoch 4 (train): 100%|██████████| 2987/2987 [02:35<00:00, 19.20batch/s, loss=0.0248]
Predicting : 100%|██████████| 1910/1910 [00:40<00:00, 47.28 batch/s]
Epoch 5 (train): 100%|██████████| 2987/2987 [02:35<00:00, 19.26batch/s, loss=0.0411]
Predicting : 100%|██████████| 1910/1910 [00:40<00:00, 47.11 batch/s]
Epoch 6 (train): 100%|██████████| 2987/2987 [02:35<00:00, 19.20batch/s, loss=0.00794]
Predicting : 100%|██████████| 1910/1910 [00:40<00:00, 47.06 batch/s]
Epoch 7 (train): 100%|██████████| 2987/2987 [02:35<00:00, 19.25batch/s, loss=

CPU times: user 59min 21s, sys: 4min 20s, total: 1h 3min 41s
Wall time: 1h 5min 10s


### Evaluation

In [None]:
def load_saved_model(file_name):
  """Rebuild a VictorBiLSTM and restore its weights from `file_name`.

  The network is created with the notebook's current S and TEXT.vocab.vectors,
  moved to `device`, loaded from the saved state_dict and returned in eval mode.
  """
  restored = VictorBiLSTM(S, TEXT.vocab.vectors).to(device)
  saved_state = torch.load(file_name, map_location=device)
  restored.load_state_dict(saved_state)
  restored.eval()
  return restored

# Leave the next line commented out to evaluate the checkpoint written by this
# run (model_file as defined in the parameters cell). Uncomment it to evaluate
# another checkpoint instead; the example name follows the pattern for S=500, fraction 1.0.
#model_file = model_path + 'pytorch_model-500-1.0.pt'
model = load_saved_model(model_file)

In [None]:
# Run the reloaded model over the test split: `predictions` holds the model
# outputs (one row of class scores per document), `targets` the true labels.
predictions, targets = predict(model, test_iterator)

Predicting : 100%|██████████| 2986/2986 [01:03<00:00, 47.38 batch/s]


In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the test split; the predicted class of each
# document is the index of its highest score, and class names come from the
# label vocabulary.
report = classification_report(
    targets,
    np.argmax(predictions, axis=1),
    digits=4,
    target_names=LABEL.vocab.itos)

print(report)

# Save the report next to the checkpoint together with the training settings of
# the current session. The with-block closes the file even if a write fails.
# NOTE(review): when a different checkpoint was selected above, the settings
# written here describe this session, not necessarily that checkpoint.
with open(model_file + "-test_report.txt", "wt") as rep_file:
  rep_file.write(report + '\n')
  rep_file.write(f'learning rate: {learning_rate}\n')
  rep_file.write(f'optimizer: {type(optimizer).__name__}\n')
  rep_file.write(f'criterion: {type(criterion).__name__}\n')

                                  precision    recall  f1-score   support

                          outros     0.9658    0.9691    0.9675     85408
                   peticao_do_RE     0.7083    0.7272    0.7176      6331
agravo_em_recurso_extraordinario     0.6074    0.5269    0.5643      1841
                        sentenca     0.7874    0.6956    0.7387      1475
          acordao_de_2_instancia     0.8710    0.8901    0.8804       273
     despacho_de_admissibilidade     0.6929    0.4899    0.5740       198

                        accuracy                         0.9391     95526
                       macro avg     0.7721    0.7165    0.7404     95526
                    weighted avg     0.9382    0.9391    0.9385     95526



References:

- https://medium.com/@rohit_agrawal/using-fine-tuned-gensim-word2vec-embeddings-with-torchtext-and-pytorch-17eea2883cd
