# Facts extraction with AILA data and LEGAL-BERT-BASE
### PyTorch version

The model is trained through a fine-tuning process.

The model is evaluated through a cross-validation.

We use the train dataset from AILA 2020, which can be obtained at https://github.com/Law-AI/semantic-segmentation.

LEGAL-BERT is available at https://huggingface.co/nlpaueb/legal-bert-base-uncased.

https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb

### Notebook parameters

In [1]:
# Checkpoint identifier handed to the tokenizer and encoder `from_pretrained` calls.
model_id = 'nlpaueb/legal-bert-base-uncased'
# Short model name embedded in the file name of the saved results report.
model_reference = 'legal-bert-base'

### Dependencies

In [2]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [3]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import transformers
transformers.__version__

'4.17.0'

In [5]:
import torch
torch.__version__

'1.9.0a0+df837d0'

In [6]:
import sklearn
sklearn.__version__

'0.24.1'

In [7]:
import numpy as np
np.__version__

'1.19.2'

### Random numbers' seed

In [8]:
import random

# One shared seed for every random number generator used by the notebook.
seed_val = 42
random.seed(seed_val)                 # Python's built-in RNG
np.random.seed(seed_val)              # NumPy's global RNG
torch.manual_seed(seed_val)           # PyTorch RNG
torch.cuda.manual_seed_all(seed_val)  # PyTorch RNGs of all CUDA devices

### Loading dataset

In [9]:
from os import listdir
import pandas as pd
import csv

# Directory whose files are read as the annotated training documents.
train_dir = 'train/'

def read_docs(dir_name):
  """
  Read the docs in a directory.

  Every entry of the directory is parsed as a headerless, tab-separated table
  with two columns (sentence, label). Quoting is disabled, so quote characters
  inside a sentence are kept as literal text.

  Params:
    dir_name : the directory that contains the documents. A trailing path
      separator is optional.
  Returns:
    A dictionary whose keys are the names of the read files and the values are
    pandas dataframes. Each dataframe has sentence and label columns.
  """
  import os  # local import: the cell only imports `listdir` from os

  docs = {} # key: file name, value: dataframe with sentences and labels
  for f in listdir(dir_name):
    df = pd.read_csv(
        # join instead of string concatenation so 'train' and 'train/' both work
        os.path.join(dir_name, f),
        sep='\t',
        quoting=csv.QUOTE_NONE,
        names=['sentence', 'label'])
    docs[f] = df
  return docs

# Load the whole corpus once; the per-fold datasets are later built from this
# dictionary (keyed by file name) instead of re-reading the files.
docs_dic = read_docs(train_dir)

print('Number of documents:', len(docs_dic))

Number of documents: 50


In [10]:
# Fold definitions: one row per fold holding the fold id plus the
# comma-separated names of its train and test documents. The file's own header
# row is discarded (header=0) and replaced by the names given here.
df_folds = pd.read_csv(
  'train_docs_by_fold.csv',
  sep=';',
  header=0,
  names=['fold id', 'train', 'test']
)

fold_ids = []             # fold ids, in file order
train_files_by_fold = {}  # key: fold id, value: list of train file names
test_files_by_fold = {}   # key: fold id, value: list of test file names

for raw_id, train_csv, test_csv in zip(
    df_folds['fold id'], df_folds['train'], df_folds['test']):
  fold_id = int(raw_id)
  train_files = train_csv.split(',')
  test_files = test_csv.split(',')
  fold_ids.append(fold_id)
  train_files_by_fold[fold_id] = train_files
  test_files_by_fold[fold_id] = test_files
  print(f'Fold {fold_id}:\n\tTrain files: {train_files}\n\tTest files: {test_files}')


Fold 0:
	Train files: ['d_44.txt', 'd_39.txt', 'd_12.txt', 'd_2.txt', 'd_7.txt', 'd_33.txt', 'd_16.txt', 'd_8.txt', 'd_42.txt', 'd_34.txt', 'd_40.txt', 'd_24.txt', 'd_36.txt', 'd_11.txt', 'd_13.txt', 'd_19.txt', 'd_18.txt', 'd_4.txt', 'd_1.txt', 'd_21.txt', 'd_15.txt', 'd_23.txt', 'd_32.txt', 'd_9.txt', 'd_5.txt', 'd_3.txt', 'd_26.txt', 'd_20.txt', 'd_30.txt', 'd_41.txt', 'd_46.txt', 'd_43.txt', 'd_50.txt', 'd_27.txt', 'd_25.txt', 'd_35.txt', 'd_45.txt', 'd_17.txt', 'd_48.txt', 'd_6.txt']
	Test files: ['d_22.txt', 'd_31.txt', 'd_49.txt', 'd_14.txt', 'd_29.txt', 'd_47.txt', 'd_10.txt', 'd_38.txt', 'd_28.txt', 'd_37.txt']
Fold 1:
	Train files: ['d_22.txt', 'd_31.txt', 'd_49.txt', 'd_14.txt', 'd_29.txt', 'd_47.txt', 'd_10.txt', 'd_38.txt', 'd_28.txt', 'd_37.txt', 'd_40.txt', 'd_24.txt', 'd_36.txt', 'd_11.txt', 'd_13.txt', 'd_19.txt', 'd_18.txt', 'd_4.txt', 'd_1.txt', 'd_21.txt', 'd_15.txt', 'd_23.txt', 'd_32.txt', 'd_9.txt', 'd_5.txt', 'd_3.txt', 'd_26.txt', 'd_20.txt', 'd_30.txt', 'd_41.

### Tokenizer and Dataset preparation

In [11]:
from transformers import AutoTokenizer

# Tokenizer loaded from the same checkpoint id (`model_id`) as the encoder.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [12]:
from torch.utils.data import Dataset

# Run on the first GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Mapping from the textual labels to the class indices used as targets.
labels_to_idx = {
  'Facts' : 0,
  'Other' : 1
}

class MyDataset(Dataset):
  """
  Sentence classification dataset.

  All sentences are tokenized in a single tokenizer call when the dataset is
  built; items are then served by indexing the resulting tensors.
  """
  def __init__(self, sentences, labels, tokenizer):
    """
    Arguments:
      sentences : list of strings.
      labels : iterable of strings. Every value must be a key of labels_to_idx
        (a KeyError is raised otherwise).
      tokenizer : callable used to encode the sentences. It must return a
        mapping with 'input_ids', 'attention_mask' and 'token_type_ids'.
    """
    self.len = len(sentences)
    # Materialize the labels once and derive the targets from that copy, so a
    # one-shot iterable (e.g. a generator) is not exhausted before the targets
    # are built.
    self.labels = list(labels)
    self.targets = torch.tensor(
      [labels_to_idx[label] for label in self.labels],
      dtype=torch.long
    )
    self.data = tokenizer(
      sentences,
      None,
      add_special_tokens=True,
      padding='longest',
      return_token_type_ids=True,
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt'
    )

  def __getitem__(self, index):
    """
    Returns a dictionary with the encoded sentence ('ids', 'mask',
    'token_type_ids'), its class index ('targets') and its textual label
    ('labels').
    """
    return {
      'ids': self.data['input_ids'][index],
      'mask': self.data['attention_mask'][index],
      'token_type_ids': self.data['token_type_ids'][index],
      'targets': self.targets[index],
      'labels': self.labels[index]
    }

  def __len__(self):
    """Returns the number of sentences."""
    return self.len


In [13]:
def get_dataset(docs_list):
  """
  Build a MyDataset with the sentences and labels of the given documents.

  Arguments:
    docs_list : list of file names; each one must be a key of docs_dic.
  Returns:
    A MyDataset holding the sentences of all the documents, concatenated in
    the order of docs_list and encoded with the notebook's tokenizer.
  """
  frames = [docs_dic[doc_id] for doc_id in docs_list]
  sentences = [s for frame in frames for s in frame['sentence'].to_list()]
  labels = [l for frame in frames for l in frame['label'].to_list()]
  # For a quick code validation, slice both lists (e.g. [:10]) before building.
  return MyDataset(sentences, labels, tokenizer)

def count_labels(set_title, ds):
  """
  Print the size and the label distribution of a dataset.

  Arguments:
    set_title : name of the dataset, used in the printed heading.
    ds : object exposing `len` (number of sentences) and `labels` (the textual
      labels).
  Returns:
    A tuple (number of "Facts" labels, number of "Other" labels).
  """
  def _tally(wanted):
    # number of entries of ds.labels equal to the wanted label
    return sum(1 for label in ds.labels if label == wanted)

  print(f'{set_title} numbers:')
  print(f' Total number of sentences: {ds.len}')
  facts_total = _tally("Facts")
  print(f' Number of Facts labels: {facts_total}')
  other_total = _tally("Other")
  print(f' Number of Other labels: {other_total}')
  return facts_total, other_total

### Model

In [14]:
class SentenceClassifier(torch.nn.Module):
  """
  Encoder loaded from `model_id` followed by dropout and a linear layer that
  maps the hidden state of the first token to two class logits.
  """
  def __init__(self):
    super().__init__()
    self.bert = transformers.AutoModel.from_pretrained(model_id)
    self.dropout = torch.nn.Dropout(0.4)
    # Take the hidden vector's dimension from the loaded encoder instead of
    # hard-coding 768, so changing `model_id` to a checkpoint with another
    # width does not break the classifier (768 for the base model).
    hidden_size = self.bert.config.hidden_size
    self.classifier = torch.nn.Linear(hidden_size, 2) # 2 => two classes
    torch.nn.init.xavier_uniform_(self.classifier.weight)

  def forward(self, input_ids, attention_mask, token_type_ids):
    """
    Arguments (all of shape (batch_size, seq_len)):
      input_ids : token ids.
      attention_mask : attention mask.
      token_type_ids : segment ids.
    Returns:
      The logits, of shape (batch_size, 2).
    """
    output_1 = self.bert(
      input_ids=input_ids,            # input_ids.shape: (batch_size, seq_len)
      attention_mask=attention_mask,  # attention_mask.shape: (batch_size, seq_len)
      token_type_ids=token_type_ids   # token_type_ids.shape: (batch_size, seq_len)
    )
    hidden_state = output_1.last_hidden_state # hidden states of last BERT's layer => shape: (batch_size, seq_len, embedd_dim)
    cls_embeddings = hidden_state[:, 0]       # hidden states of the CLS tokens => shape: (batch_size, embedd_dim)
    cls_embeddings = self.dropout(cls_embeddings)
    logits = self.classifier(cls_embeddings)  # shape: (batch_size, 2)

    return logits

### Evaluation function

In [15]:
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

def evaluate(model, test_dataloader, loss_function):
  """
  Evaluate a model on a dataloader.

  Arguments:
    model : the classifier; called with (ids, mask, token_type_ids) and
      expected to return the logits of the batch.
    test_dataloader : dataloader yielding dictionaries with the keys 'ids',
      'mask', 'token_type_ids' and 'targets'.
    loss_function : callable receiving (logits, targets) and returning the
      batch loss.
  Returns:
    A tuple (loss, precision, recall, F1). The loss is the mean of the batch
    losses; precision, recall and F1 are computed for the Facts class.
  """
  predictions = []
  y_true = []
  eval_loss = 0
  model.eval()
  with torch.no_grad():
    for data in tqdm(test_dataloader, desc='Evaluation'):
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      token_type_ids = data['token_type_ids'].to(device)
      y_true_batch = data['targets'].to(device)
      # Call the module itself rather than .forward() so registered hooks run.
      y_hat = model(ids, mask, token_type_ids)
      loss = loss_function(y_hat, y_true_batch)
      eval_loss += loss.item()
      # Collect per-batch results on the CPU and concatenate once at the end;
      # this keeps the class ids as integers instead of promoting them to
      # float by concatenating onto an empty float tensor.
      predictions.append(y_hat.argmax(dim=1).cpu())
      y_true.append(y_true_batch.cpu())
  predictions = torch.cat(predictions).numpy()
  y_true = torch.cat(y_true).numpy()
  eval_loss = eval_loss / len(test_dataloader)
  # Precision, Recall, F1
  t_metrics = precision_recall_fscore_support(
    y_true,
    predictions,
    average='binary',
    pos_label=labels_to_idx['Facts'],
    zero_division=0)

  return eval_loss, t_metrics[0], t_metrics[1], t_metrics[2]

### Fine-tuning

In [16]:
import sys, time

supports_by_fold = {} # key: fold id, value: dict with the Facts/Other counts of the train and test sets

def train(fold_id, learning_rate, weight_decay, n_epochs, batch_size):
  """
  Fine-tune a new SentenceClassifier on one fold and evaluate it after every
  epoch.

  Arguments:
    fold_id : key of train_files_by_fold / test_files_by_fold.
    learning_rate : initial learning rate of the Adam optimizer.
    weight_decay : weight decay of the Adam optimizer.
    n_epochs : number of training epochs.
    batch_size : batch size of the train and test dataloaders.
  Returns:
    A dictionary whose keys are the epoch numbers (starting at 1) and the
    values are numpy arrays holding train loss, test loss, Precision, Recall
    and F1.
  Side effects:
    Stores the label counts of the fold in supports_by_fold[fold_id] and prints
    the dataset numbers and the metrics of each epoch.
  """
  ds_train = get_dataset(train_files_by_fold[fold_id])
  ds_test = get_dataset(test_files_by_fold[fold_id])
  n_facts_train, n_other_train = count_labels('Train dataset', ds_train)
  n_facts_test, n_other_test = count_labels('Test dataset', ds_test)
  supports_by_fold[fold_id] = {
    'facts_train' : n_facts_train,
    'other_train' : n_other_train,
    'facts_test' : n_facts_test,
    'other_test' : n_other_test
  }
  dl_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True)
  # The test set is not shuffled: the test loss is a mean of batch means, so a
  # random regrouping of the sentences into batches would make the reported
  # value vary between runs whenever the last batch is smaller, and drawing a
  # new permutation at every evaluation would also consume the global RNG that
  # orders the training batches.
  dl_test = torch.utils.data.DataLoader(ds_test, batch_size=batch_size, shuffle=False)

  sentence_classifier = SentenceClassifier().to(device)
  criterion = torch.nn.CrossEntropyLoss().to(device)
  optimizer = torch.optim.Adam(
    sentence_classifier.parameters(),
    lr=learning_rate,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=weight_decay
  )
  # The scheduler is stepped once per batch, hence one training step per batch
  # of every epoch.
  lr_scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = len(dl_train) * n_epochs
  )

  metrics = {} # key: epoch number, value: numpy tensor storing train loss, test loss, Precision, Recall, F1
  for epoch in range(1, n_epochs + 1):
    epoch_loss = 0
    sentence_classifier.train() # evaluate() leaves the model in eval mode
    for train_data in tqdm(dl_train, desc=f'Epoch {epoch} (train)'):
      optimizer.zero_grad()
      ids = train_data['ids'].to(device)
      mask = train_data['mask'].to(device)
      token_type_ids = train_data['token_type_ids'].to(device)
      y_hat = sentence_classifier(ids, mask, token_type_ids)
      y_true = train_data['targets'].to(device)
      loss = criterion(y_hat, y_true)
      epoch_loss += loss.item()
      loss.backward()
      # clip the gradient norm to 1.0 before the parameter update
      torch.nn.utils.clip_grad_norm_(sentence_classifier.parameters(), 1.0)
      optimizer.step()
      lr_scheduler.step()
    epoch_loss = epoch_loss / len(dl_train) # mean of the batch losses
    # evaluation
    optimizer.zero_grad()
    eval_loss, p, r, f1 = evaluate(sentence_classifier, dl_test, criterion)
    #storing metrics
    metrics[epoch] = np.array([epoch_loss, eval_loss, p, r, f1])
    print(f'=> Epoch {epoch}')
    print(f'  Train loss: {epoch_loss:.6f}')
    print(f'  Test loss:  {eval_loss:.6f}')
    print(f'  Precision:  {p:.6f}')
    print(f'  Recall:     {r:.6f}')
    print(f'  F1:         {f1:.6f}')
    time.sleep(0.5) # in order to don't mess the progress bars

  return metrics


In [17]:
%%time
# Training params
n_epochs = 4
learning_rate = 2e-5
weight_decay = 1e-3
batch_size = 32

#fold_ids = [0, 1] # for code validation
raw_metrics = {} # key: epoch, value: numpy tensor of shape (n_folds, 5)
for fold_id in fold_ids:
  print('********************************************')
  print(f'               FOLD {fold_id}')
  print('********************************************')
  fold_metrics = train(fold_id, learning_rate, weight_decay, n_epochs, batch_size)
  for epoch, scores in fold_metrics.items():
    # Store each fold as a (1, 5) row so the value is 2-D from the first fold
    # on; otherwise a single-fold run leaves a 1-D array and the per-column
    # mean/std below (and the per-fold report) break.
    fold_row = scores[np.newaxis, :]
    if epoch in raw_metrics:
      raw_metrics[epoch] = np.vstack((raw_metrics[epoch], fold_row))
    else:
      raw_metrics[epoch] = fold_row

# Mean and standard deviation over the folds, one row per epoch. The rows are
# collected first and the dataframe is built once instead of growing it row by
# row.
metric_rows = []
for epoch, scores in raw_metrics.items():
  mean = np.mean(scores, axis=0)
  std = np.std(scores, axis=0)
  metric_rows.append([
      f'{epoch}',
      f'{mean[0]:.6f}', f'{std[0]:.6f}',  # train loss
      f'{mean[1]:.6f}', f'{std[1]:.6f}',  # test loss
      f'{mean[2]:.4f}', f'{std[2]:.4f}',  # precision
      f'{mean[3]:.4f}', f'{std[3]:.4f}',  # recall
      f'{mean[4]:.4f}', f'{std[4]:.4f}'   # f1
    ])
metrics = pd.DataFrame(
  metric_rows,
  columns=['Epoch', 'Train loss', 'std', 'Test loss', 'std', 'Precision', 'P std', 'Recall', 'R std', 'F1', 'F1 std'])

********************************************
               FOLD 0
********************************************
Train dataset numbers:
 Total number of sentences: 7693
 Number of Facts labels: 1584
 Number of Other labels: 6109
Test dataset numbers:
 Total number of sentences: 1687
 Number of Facts labels: 635
 Number of Other labels: 1052


Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 241/241 [03:20<00:00,  1.20it/s]
Evaluation: 100%|██████████| 53/53 [00:13<

=> Epoch 1
  Train loss: 0.427436
  Test loss:  0.655845
  Precision:  0.789954
  Recall:     0.272441
  F1:         0.405152


Epoch 2 (train): 100%|██████████| 241/241 [03:20<00:00,  1.20it/s]
Evaluation: 100%|██████████| 53/53 [00:13<00:00,  3.99it/s]


=> Epoch 2
  Train loss: 0.337942
  Test loss:  0.513950
  Precision:  0.715217
  Recall:     0.518110
  F1:         0.600913


Epoch 3 (train): 100%|██████████| 241/241 [03:20<00:00,  1.20it/s]
Evaluation: 100%|██████████| 53/53 [00:13<00:00,  4.00it/s]


=> Epoch 3
  Train loss: 0.285297
  Test loss:  0.511954
  Precision:  0.710775
  Recall:     0.592126
  F1:         0.646048


Epoch 4 (train): 100%|██████████| 241/241 [03:20<00:00,  1.20it/s]
Evaluation: 100%|██████████| 53/53 [00:13<00:00,  3.99it/s]


=> Epoch 4
  Train loss: 0.239111
  Test loss:  0.554965
  Precision:  0.721116
  Recall:     0.570079
  F1:         0.636763
********************************************
               FOLD 1
********************************************
Train dataset numbers:
 Total number of sentences: 7782
 Number of Facts labels: 1772
 Number of Other labels: 6010
Test dataset numbers:
 Total number of sentences: 1598
 Number of Facts labels: 447
 Number of Other labels: 1151


Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 244/244 [02:55<00:00,  1.39it/s]
Evaluation: 100%|██████████| 50/50 [00:14<

=> Epoch 1
  Train loss: 0.435274
  Test loss:  0.541622
  Precision:  0.822430
  Recall:     0.393736
  F1:         0.532526


Epoch 2 (train): 100%|██████████| 244/244 [02:55<00:00,  1.39it/s]
Evaluation: 100%|██████████| 50/50 [00:14<00:00,  3.44it/s]


=> Epoch 2
  Train loss: 0.329004
  Test loss:  0.516198
  Precision:  0.777778
  Recall:     0.469799
  F1:         0.585774


Epoch 3 (train): 100%|██████████| 244/244 [02:55<00:00,  1.39it/s]
Evaluation: 100%|██████████| 50/50 [00:14<00:00,  3.45it/s]


=> Epoch 3
  Train loss: 0.272608
  Test loss:  0.534407
  Precision:  0.739394
  Recall:     0.545861
  F1:         0.628057


Epoch 4 (train): 100%|██████████| 244/244 [02:55<00:00,  1.39it/s]
Evaluation: 100%|██████████| 50/50 [00:14<00:00,  3.45it/s]


=> Epoch 4
  Train loss: 0.216023
  Test loss:  0.594240
  Precision:  0.740385
  Recall:     0.516779
  F1:         0.608696
********************************************
               FOLD 2
********************************************
Train dataset numbers:
 Total number of sentences: 7002
 Number of Facts labels: 1844
 Number of Other labels: 5158
Test dataset numbers:
 Total number of sentences: 2378
 Number of Facts labels: 375
 Number of Other labels: 2003


Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 219/219 [03:01<00:00,  1.21it/s]
Evaluation: 100%|██████████| 75/75 [00:16<

=> Epoch 1
  Train loss: 0.500213
  Test loss:  0.306907
  Precision:  0.593407
  Recall:     0.576000
  F1:         0.584574


Epoch 2 (train): 100%|██████████| 219/219 [03:01<00:00,  1.21it/s]
Evaluation: 100%|██████████| 75/75 [00:16<00:00,  4.44it/s]


=> Epoch 2
  Train loss: 0.358386
  Test loss:  0.293164
  Precision:  0.630573
  Recall:     0.528000
  F1:         0.574746


Epoch 3 (train): 100%|██████████| 219/219 [03:01<00:00,  1.21it/s]
Evaluation: 100%|██████████| 75/75 [00:16<00:00,  4.43it/s]


=> Epoch 3
  Train loss: 0.294027
  Test loss:  0.317186
  Precision:  0.611765
  Recall:     0.554667
  F1:         0.581818


Epoch 4 (train): 100%|██████████| 219/219 [03:01<00:00,  1.21it/s]
Evaluation: 100%|██████████| 75/75 [00:16<00:00,  4.44it/s]


=> Epoch 4
  Train loss: 0.242084
  Test loss:  0.350818
  Precision:  0.566351
  Recall:     0.637333
  F1:         0.599749
********************************************
               FOLD 3
********************************************
Train dataset numbers:
 Total number of sentences: 7581
 Number of Facts labels: 1863
 Number of Other labels: 5718
Test dataset numbers:
 Total number of sentences: 1799
 Number of Facts labels: 356
 Number of Other labels: 1443


Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 237/237 [03:16<00:00,  1.21it/s]
Evaluation: 100%|██████████| 57/57 [00:12<

=> Epoch 1
  Train loss: 0.487663
  Test loss:  0.354011
  Precision:  0.635945
  Recall:     0.387640
  F1:         0.481675


Epoch 2 (train): 100%|██████████| 237/237 [03:16<00:00,  1.21it/s]
Evaluation: 100%|██████████| 57/57 [00:12<00:00,  4.44it/s]


=> Epoch 2
  Train loss: 0.378554
  Test loss:  0.393379
  Precision:  0.566416
  Recall:     0.634831
  F1:         0.598675


Epoch 3 (train): 100%|██████████| 237/237 [03:16<00:00,  1.21it/s]
Evaluation: 100%|██████████| 57/57 [00:12<00:00,  4.44it/s]


=> Epoch 3
  Train loss: 0.313340
  Test loss:  0.408837
  Precision:  0.534653
  Recall:     0.758427
  F1:         0.627178


Epoch 4 (train): 100%|██████████| 237/237 [03:16<00:00,  1.21it/s]
Evaluation: 100%|██████████| 57/57 [00:12<00:00,  4.43it/s]


=> Epoch 4
  Train loss: 0.256533
  Test loss:  0.405952
  Precision:  0.562780
  Recall:     0.705056
  F1:         0.625935
********************************************
               FOLD 4
********************************************
Train dataset numbers:
 Total number of sentences: 7462
 Number of Facts labels: 1813
 Number of Other labels: 5649
Test dataset numbers:
 Total number of sentences: 1918
 Number of Facts labels: 406
 Number of Other labels: 1512


Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 234/234 [03:13<00:00,  1.21it/s]
Evaluation: 100%|██████████| 60/60 [00:09<

=> Epoch 1
  Train loss: 0.517244
  Test loss:  0.356118
  Precision:  0.768657
  Recall:     0.253695
  F1:         0.381481


Epoch 2 (train): 100%|██████████| 234/234 [03:13<00:00,  1.21it/s]
Evaluation: 100%|██████████| 60/60 [00:09<00:00,  6.31it/s]


=> Epoch 2
  Train loss: 0.399296
  Test loss:  0.331965
  Precision:  0.650273
  Recall:     0.586207
  F1:         0.616580


Epoch 3 (train): 100%|██████████| 234/234 [03:13<00:00,  1.21it/s]
Evaluation: 100%|██████████| 60/60 [00:09<00:00,  6.32it/s]


=> Epoch 3
  Train loss: 0.339074
  Test loss:  0.370021
  Precision:  0.592742
  Recall:     0.724138
  F1:         0.651885


Epoch 4 (train): 100%|██████████| 234/234 [03:13<00:00,  1.21it/s]
Evaluation: 100%|██████████| 60/60 [00:09<00:00,  6.32it/s]


=> Epoch 4
  Train loss: 0.293725
  Test loss:  0.359000
  Precision:  0.623853
  Recall:     0.669951
  F1:         0.646081
CPU times: user 51min, sys: 16min 57s, total: 1h 7min 58s
Wall time: 1h 8min 10s


### Outline

In [18]:
from datetime import datetime

def save_report(avg_metrics, complete_metrics, dest_dir):
  """
  Write a plain-text report with the training parameters, the averaged metrics,
  the label supports of each fold and the per-fold scores of each epoch.

  The batch size, learning rate, weight decay, supports_by_fold and
  model_reference are read from the notebook's global variables. The file is
  named 'report-<model_reference>_ft_<timestamp>.txt'.

  Arguments:
    avg_metrics : A pandas Dataframe with the averaged metrics.
    complete_metrics : A dictionary with the metrics by epoch. The key indicates the epoch.
              Each value must be a numpy tensor of shape (n_folds, 5).
    dest_dir : The directory where the report will be saved. A trailing path
              separator is optional.
  """
  import os  # local import: os is not imported at notebook level

  parts = [
      'RESULTS REPORT\n'
      'Model: Legal BERT\n'
      'Evaluation: cross-validation\n'
      'Train scheme: fine-tuning\n'
      f'Batch size: {batch_size}\n'
      f'Learning rate: {learning_rate}\n'
      f'Weight decay: {weight_decay}\n\n'
  ]

  parts.append('Averages:\n')
  parts.append(avg_metrics.to_string(index=False, justify='center'))

  parts.append('\n\nDetailed report:\n')
  parts.append('Supports by fold:\n')
  for fold_id, support in supports_by_fold.items():
    parts.append(f'=> Fold {fold_id}:\n')
    parts.append(f'  Facts (train): {support["facts_train"]}\t\tOther (train): {support["other_train"]}\n')
    parts.append(f'  Facts (test): {support["facts_test"]}\t\tOther (test): {support["other_test"]}\n')

  parts.append('\nScores:\n')
  for epoch, scores in complete_metrics.items():
    # rows are labelled by their position in the scores array
    df = pd.DataFrame(
      scores,
      columns=['Train loss', 'Test loss', 'Precision', 'Recall', 'F1'],
      index=[f'Fold {i}' for i in range(scores.shape[0])])
    parts.append(f'Epoch: {epoch}\n' + df.to_string() + '\n\n')

  file_name = f'report-{model_reference}_ft_{datetime.now().strftime("%Y-%m-%d-%Hh%Mmin")}.txt'
  # join instead of string concatenation so dest_dir works with or without a
  # trailing separator
  with open(os.path.join(dest_dir, file_name), 'w') as f:
    f.write(''.join(parts))

In [19]:
# Save the report (averaged and per-fold metrics) into the current directory.
save_report(metrics, raw_metrics, './')

In [20]:
from IPython.display import display, HTML

# Show the averaged metrics table as rich output under a fixed display id.
metrics_display = display(metrics, display_id='metrics_table')

Unnamed: 0,Epoch,Train loss,std,Test loss,std.1,Precision,P std,Recall,R std,F1,F1 std
0,1,0.473566,0.035807,0.4429,0.13343,0.7221,0.0904,0.3767,0.115,0.4771,0.0761
1,2,0.360637,0.025831,0.409731,0.09176,0.6681,0.0725,0.5474,0.0573,0.5953,0.0142
2,3,0.300869,0.023252,0.428481,0.082919,0.6379,0.0761,0.635,0.0888,0.627,0.0246
3,4,0.249495,0.025647,0.452995,0.10182,0.6429,0.0752,0.6198,0.0681,0.6234,0.0172
