# Facts extraction with AILA data, BERT base and positional encoding of sentences
### PyTorch version

The model is trained trough a fune-tuning process. A feature vector is generated from the embedding vector of  CLS token and from the positional encoding (PE) of the sentence. The PE is generated in order to represent the document position occupied by a sentence.

The model is evaluated through a cross-validation.

We use the train dataset from AILA 2020. This can be obtained at https://github.com/Law-AI/semantic-segmentation;

### Notebook parameters

In [1]:
model_id = 'bert-base-uncased'
model_reference = 'bert-base-uncased_PE'

### Dependencies

In [2]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [3]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import transformers
transformers.__version__

'4.17.0'

In [5]:
import torch
torch.__version__

'1.9.0a0+df837d0'

In [6]:
import sklearn
sklearn.__version__

'0.24.1'

In [7]:
import numpy as np
np.__version__

'1.19.2'

### Random numbers' seed

In [8]:
import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

### Loading dataset

In [9]:
from os import listdir
import pandas as pd
import csv

train_dir = 'train/'

def read_docs(dir_name):
  """
  Read the docs in a directory.
  Params:
    dir_name : the directory that contains the documents.
  Returns:
    A dictionary whose keys are the names of the read files and the values are 
    pandas dataframes. Each dataframe has sentence and label columns.
  """
  docs = {} # key: file name, value: dataframe with sentences and labels
  for f in listdir(dir_name):
    df = pd.read_csv(
        dir_name + f, 
        sep='\t', 
        quoting=csv.QUOTE_NONE, 
        names=['sentence', 'label'])
    docs[f] = df
  return docs

docs_dic = read_docs(train_dir)

print('Number of documents:', len(docs_dic))

Number of documents: 50


In [10]:
df_folds = pd.read_csv(
  'train_docs_by_fold.csv', 
  sep=';', 
  names=['fold id', 'train', 'test'], 
  header=0
)

train_files_by_fold = {}
test_files_by_fold = {}
fold_ids = []

for _, row in df_folds.iterrows():
  fold_id = int(row['fold id'])
  fold_ids.append(int(fold_id))
  train_files_by_fold[fold_id] = row['train'].split(',')
  test_files_by_fold[fold_id] = row['test'].split(',')
  print(f'Fold {fold_id}:\n\tTrain files: {train_files_by_fold[fold_id]}\n\tTest files: {test_files_by_fold[fold_id]}')


Fold 0:
	Train files: ['d_44.txt', 'd_39.txt', 'd_12.txt', 'd_2.txt', 'd_7.txt', 'd_33.txt', 'd_16.txt', 'd_8.txt', 'd_42.txt', 'd_34.txt', 'd_40.txt', 'd_24.txt', 'd_36.txt', 'd_11.txt', 'd_13.txt', 'd_19.txt', 'd_18.txt', 'd_4.txt', 'd_1.txt', 'd_21.txt', 'd_15.txt', 'd_23.txt', 'd_32.txt', 'd_9.txt', 'd_5.txt', 'd_3.txt', 'd_26.txt', 'd_20.txt', 'd_30.txt', 'd_41.txt', 'd_46.txt', 'd_43.txt', 'd_50.txt', 'd_27.txt', 'd_25.txt', 'd_35.txt', 'd_45.txt', 'd_17.txt', 'd_48.txt', 'd_6.txt']
	Test files: ['d_22.txt', 'd_31.txt', 'd_49.txt', 'd_14.txt', 'd_29.txt', 'd_47.txt', 'd_10.txt', 'd_38.txt', 'd_28.txt', 'd_37.txt']
Fold 1:
	Train files: ['d_22.txt', 'd_31.txt', 'd_49.txt', 'd_14.txt', 'd_29.txt', 'd_47.txt', 'd_10.txt', 'd_38.txt', 'd_28.txt', 'd_37.txt', 'd_40.txt', 'd_24.txt', 'd_36.txt', 'd_11.txt', 'd_13.txt', 'd_19.txt', 'd_18.txt', 'd_4.txt', 'd_1.txt', 'd_21.txt', 'd_15.txt', 'd_23.txt', 'd_32.txt', 'd_9.txt', 'd_5.txt', 'd_3.txt', 'd_26.txt', 'd_20.txt', 'd_30.txt', 'd_41.

### Positional encodings
Since this implementation of positional encoding computing is relatively slow, let's do it in advance.

In [11]:
%%time

def get_PE(seq_len, d, n=10000):
  """
  Returns a positional encoding matrix.
  Code adapted from https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/
  Arguments:
    seq_len : the length of the sequence.
    d : the embedding (encoding) dimension.
  Returns:
    A PyTorch tensor with shape (seq_len, d).
  """
  P = torch.zeros((seq_len, d))
  for k in range(seq_len):
    for i in torch.arange(int(d/2)):
      denominator = np.power(n, 2*i/d)
      P[k, 2*i] = np.sin(k/denominator)
      P[k, 2*i+1] = np.cos(k/denominator)
  return P

def get_PE_by_doc():
  pe_dic = {}
  for doc_id in docs_dic.keys():
    doc_len = len(docs_dic[doc_id]['sentence'].to_list())
    pe_dic[doc_id] = get_PE(doc_len, 768) # 768 is the BERT embedding dimension
  return pe_dic

pe_by_doc = get_PE_by_doc()

CPU times: user 3min 31s, sys: 52.6 ms, total: 3min 31s
Wall time: 3min 29s


### Tokenizer and Dataset preparation

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [13]:
from torch.utils.data import Dataset

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

labels_to_idx = {
  'Facts' : 0, 
  'Other' : 1
}

class MyDataset(Dataset):
  def __init__(self, sentences, labels, pe, tokenizer):
    """
    Arguments:
      sentences : list of strings.
      labels : list of strings.
      pe : PyTorch tensor with the positional encoding of sentences.
      tokenizer : an instance of transformes tokenizer.
    """
    self.len = len(sentences)
    self.labels = list(labels)
    self.pe = pe
    # encoding targets
    self.targets = []
    for l in labels:
      self.targets.append(labels_to_idx[l])
    self.targets = torch.tensor(self.targets, dtype=torch.long)
    # tokenizing sentences
    self.data = tokenizer(
      sentences, 
      None,
      add_special_tokens=True,
      padding='longest', 
      return_token_type_ids=True, 
      return_attention_mask=True, 
      truncation=True, 
      return_tensors='pt'
    )

  def __getitem__(self, index):
    return {
      'ids': self.data['input_ids'][index],
      'mask': self.data['attention_mask'][index], 
      'token_type_ids': self.data['token_type_ids'][index], 
      'target': self.targets[index], 
      'label': self.labels[index], 
      'pe': self.pe[index]
    }
  
  def __len__(self):
    return self.len


In [14]:
def get_dataset(docs_list):
  sentences = []
  labels = []
  pe = []
  for doc_id in docs_list:
    doc_sentences = docs_dic[doc_id]['sentence'].to_list()
    sentences.extend(doc_sentences)
    labels.extend(docs_dic[doc_id]['label'].to_list())
    pe.append(pe_by_doc[doc_id]) # 768 is the BERT embedding dimension
  pe = torch.vstack(pe)
  #return MyDataset(sentences[:10], labels[:10], pe, tokenizer) # for code validation
  return MyDataset(sentences, labels, pe, tokenizer)

def count_labels(set_title, ds):
  print(f'{set_title} numbers:')
  print(f' Total number of sentences: {ds.len}')
  n_facts = len([l for l in ds.labels if l == "Facts"])
  print(f' Number of Facts labels: {n_facts}')
  n_other = len([l for l in ds.labels if l == "Other"])
  print(f' Number of Other labels: {n_other}')
  return n_facts, n_other

### Model

In [15]:
class SentenceClassifier(torch.nn.Module):
  def __init__(self):
    super(SentenceClassifier, self).__init__()
    self.bert = transformers.AutoModel.from_pretrained(model_id)
    self.dropout = torch.nn.Dropout(0.4)
    self.classifier = torch.nn.Linear(768, 2) # 768 => hidden vector's dimension, 2 => two classes
    torch.nn.init.xavier_uniform_(self.classifier.weight)

  def forward(self, input_ids, attention_mask, token_type_ids, pe):
    output_1 = self.bert(
      input_ids=input_ids,            # input_ids.shape: (batch_size, seq_len)
      attention_mask=attention_mask,  # attention_mask.shape: (batch_size, seq_len)
      token_type_ids=token_type_ids   # token_type_ids.shape: (batch_size, seq_len)
    )
    hidden_state = output_1.last_hidden_state # hidden states of last BERT's layer => shape: (batch_size, seq_len, embedd_dim)
    cls_embeddings = hidden_state[:, 0]       # hidden states of the CLS tokens => shape: (batch_size, embedd_dim)
    embeddings = cls_embeddings + pe
    embeddings = self.dropout(embeddings)
    logits = self.classifier(embeddings)  # shape: (batch_size, 2)

    return logits

### Evaluation function

In [16]:
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

def evaluate(model, test_dataloader, loss_function):
  predictions = torch.tensor([]).to(device)
  y_true = torch.tensor([]).to(device)
  eval_loss = 0
  model.eval()
  with torch.no_grad():
    for data in tqdm(test_dataloader, desc='Evaluation'):
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      token_type_ids = data['token_type_ids'].to(device)
      pe = data['pe'].to(device)
      y_true_batch = data['target'].to(device)
      y_hat = model.forward(ids, mask, token_type_ids, pe)
      loss = loss_function(y_hat, y_true_batch)
      eval_loss += loss.item()
      predictions_batch = y_hat.argmax(dim=1)
      predictions = torch.cat((predictions, predictions_batch))
      y_true = torch.cat((y_true, y_true_batch))
    predictions = predictions.detach().to('cpu').numpy()
    y_true = y_true.detach().to('cpu').numpy()
  eval_loss = eval_loss / len(test_dataloader)
  # Precision, Recall, F1
  t_metrics = precision_recall_fscore_support(
    y_true, 
    predictions, 
    average='binary', 
    pos_label=labels_to_idx['Facts'], 
    zero_division=0)
  
  return eval_loss, t_metrics[0], t_metrics[1], t_metrics[2]

### Fine-tuning

In [17]:
import sys, time

supports_by_fold = {}

def train(fold_id, learning_rate, weight_decay, n_epochs, batch_size):
  ds_train = get_dataset(train_files_by_fold[fold_id])
  ds_test = get_dataset(test_files_by_fold[fold_id])
  n_facts_train, n_other_train = count_labels('Train dataset', ds_train)
  n_facts_test, n_other_test = count_labels('Test dataset', ds_test)
  supports_by_fold[fold_id] = {
    'facts_train' : n_facts_train, 
    'other_train' : n_other_train, 
    'facts_test' : n_facts_test, 
    'other_test' : n_other_test
  }
  dl_train = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True)
  dl_test = torch.utils.data.DataLoader(ds_test, batch_size=batch_size, shuffle=True)

  sentence_classifier = SentenceClassifier().to(device)
  criterion = torch.nn.CrossEntropyLoss().to(device)
  optimizer = torch.optim.Adam(
    sentence_classifier.parameters(), 
    lr=learning_rate, 
    betas=(0.9, 0.999), 
    eps=1e-8, 
    weight_decay=weight_decay
  )
  lr_scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 0, 
    num_training_steps = len(dl_train) * n_epochs
  )
  
  metrics = {} # key: epoch number, value: numpy tensor storing train loss, test loss, Precision, Recall, F1
  for epoch in range(1, n_epochs + 1):
    epoch_loss = 0
    sentence_classifier.train()
    for train_data in tqdm(dl_train, desc=f'Epoch {epoch} (train)'):
      optimizer.zero_grad()
      ids = train_data['ids'].to(device)
      mask = train_data['mask'].to(device)
      token_type_ids = train_data['token_type_ids'].to(device)
      pe = train_data['pe'].to(device)
      y_hat = sentence_classifier(ids, mask, token_type_ids, pe)
      y_true = train_data['target'].to(device)
      loss = criterion(y_hat, y_true)
      epoch_loss += loss.item()
      loss.backward()
      torch.nn.utils.clip_grad_norm_(sentence_classifier.parameters(), 1.0)
      optimizer.step()
      lr_scheduler.step()
    epoch_loss = epoch_loss / len(dl_train)
    # evaluation
    optimizer.zero_grad()
    eval_loss, p, r, f1 = evaluate(sentence_classifier, dl_test, criterion)
    #storing metrics
    metrics[epoch] = np.array([epoch_loss, eval_loss, p, r, f1])
    print(f'=> Epoch {epoch}')
    print(f'  Train loss: {epoch_loss:.6f}')
    print(f'  Test loss:  {eval_loss:.6f}')
    print(f'  Precision:  {p:.6f}')
    print(f'  Recall:     {r:.6f}')
    print(f'  F1:         {f1:.6f}')
    time.sleep(0.5) # in order to don't mess the progress bars
  
  return metrics


In [18]:
%%time
# Training params
n_epochs = 4
learning_rate = 2e-5
weight_decay = 1e-3
batch_size = 32

#fold_ids = [0, 1] # for code validation
raw_metrics = {} # key: epoch, value: numpy tensor of shape (n_folds, 5)
for fold_id in fold_ids:
  print('********************************************')
  print(f'               FOLD {fold_id}')
  print('********************************************')
  fold_metrics = train(fold_id, learning_rate, weight_decay, n_epochs, batch_size)
  for epoch, scores in fold_metrics.items():
    epoch_metrics = raw_metrics.get(epoch, None)
    if epoch_metrics is None:
      raw_metrics[epoch] = scores
    else:
      raw_metrics[epoch] = np.vstack((epoch_metrics, scores))

metrics = pd.DataFrame(columns=['Epoch', 'Train loss', 'std', 'Test loss', 'std', 'Precision', 'P std', 'Recall', 'R std', 'F1', 'F1 std'])
for i, (epoch, scores) in enumerate(raw_metrics.items()):
  mean = np.mean(scores, axis=0)
  std = np.std(scores, axis=0)
  metrics.loc[i] = [
      f'{epoch}', 
      f'{mean[0]:.6f}', f'{std[0]:.6f}',  # train loss
      f'{mean[1]:.6f}', f'{std[1]:.6f}',  # test loss
      f'{mean[2]:.4f}', f'{std[2]:.4f}',  # precision
      f'{mean[3]:.4f}', f'{std[3]:.4f}',  # recall
      f'{mean[4]:.4f}', f'{std[4]:.4f}'   # f1
    ]

********************************************
               FOLD 0
********************************************
Train dataset numbers:
 Total number of sentences: 7693
 Number of Facts labels: 1584
 Number of Other labels: 6109
Test dataset numbers:
 Total number of sentences: 1687
 Number of Facts labels: 635
 Number of Other labels: 1052


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 241/241 [03:19<00:00,  1.21it/s]
Evaluation: 100%|██████████| 53/53 [00:12<00:00,  4.20it/s]


=> Epoch 1
  Train loss: 0.415959
  Test loss:  0.730664
  Precision:  0.878788
  Recall:     0.274016
  F1:         0.417767


Epoch 2 (train): 100%|██████████| 241/241 [03:19<00:00,  1.21it/s]
Evaluation: 100%|██████████| 53/53 [00:12<00:00,  4.20it/s]


=> Epoch 2
  Train loss: 0.301840
  Test loss:  0.485368
  Precision:  0.758879
  Recall:     0.639370
  F1:         0.694017


Epoch 3 (train): 100%|██████████| 241/241 [03:19<00:00,  1.21it/s]
Evaluation: 100%|██████████| 53/53 [00:12<00:00,  4.21it/s]


=> Epoch 3
  Train loss: 0.228913
  Test loss:  0.534734
  Precision:  0.745027
  Recall:     0.648819
  F1:         0.693603


Epoch 4 (train): 100%|██████████| 241/241 [03:19<00:00,  1.21it/s]
Evaluation: 100%|██████████| 53/53 [00:12<00:00,  4.21it/s]


=> Epoch 4
  Train loss: 0.183557
  Test loss:  0.649592
  Precision:  0.732955
  Recall:     0.609449
  F1:         0.665520
********************************************
               FOLD 1
********************************************
Train dataset numbers:
 Total number of sentences: 7782
 Number of Facts labels: 1772
 Number of Other labels: 6010
Test dataset numbers:
 Total number of sentences: 1598
 Number of Facts labels: 447
 Number of Other labels: 1151


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 244/244 [02:48<00:00,  1.45it/s]
Evaluation: 100%|██████████| 50/50 [00:14<00:00,  3.44it/s]


=> Epoch 1
  Train loss: 0.501135
  Test loss:  0.561656
  Precision:  0.739437
  Recall:     0.469799
  F1:         0.574555


Epoch 2 (train): 100%|██████████| 244/244 [02:48<00:00,  1.45it/s]
Evaluation: 100%|██████████| 50/50 [00:14<00:00,  3.44it/s]


=> Epoch 2
  Train loss: 0.349783
  Test loss:  0.481582
  Precision:  0.716621
  Recall:     0.588367
  F1:         0.646192


Epoch 3 (train): 100%|██████████| 244/244 [02:48<00:00,  1.45it/s]
Evaluation: 100%|██████████| 50/50 [00:14<00:00,  3.44it/s]


=> Epoch 3
  Train loss: 0.270351
  Test loss:  0.681710
  Precision:  0.759398
  Recall:     0.451902
  F1:         0.566620


Epoch 4 (train): 100%|██████████| 244/244 [02:48<00:00,  1.45it/s]
Evaluation: 100%|██████████| 50/50 [00:14<00:00,  3.43it/s]


=> Epoch 4
  Train loss: 0.208616
  Test loss:  0.685879
  Precision:  0.729730
  Recall:     0.483221
  F1:         0.581427
********************************************
               FOLD 2
********************************************
Train dataset numbers:
 Total number of sentences: 7002
 Number of Facts labels: 1844
 Number of Other labels: 5158
Test dataset numbers:
 Total number of sentences: 2378
 Number of Facts labels: 375
 Number of Other labels: 2003


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 219/219 [03:01<00:00,  1.21it/s]
Evaluation: 100%|██████████| 75/75 [00:16<00:00,  4.48it/s]


=> Epoch 1
  Train loss: 0.590187
  Test loss:  0.355810
  Precision:  0.580071
  Recall:     0.434667
  F1:         0.496951


Epoch 2 (train): 100%|██████████| 219/219 [03:01<00:00,  1.21it/s]
Evaluation: 100%|██████████| 75/75 [00:16<00:00,  4.47it/s]


=> Epoch 2
  Train loss: 0.416358
  Test loss:  0.361636
  Precision:  0.567123
  Recall:     0.552000
  F1:         0.559459


Epoch 3 (train): 100%|██████████| 219/219 [03:01<00:00,  1.21it/s]
Evaluation: 100%|██████████| 75/75 [00:16<00:00,  4.47it/s]


=> Epoch 3
  Train loss: 0.317612
  Test loss:  0.380199
  Precision:  0.575843
  Recall:     0.546667
  F1:         0.560876


Epoch 4 (train): 100%|██████████| 219/219 [03:01<00:00,  1.21it/s]
Evaluation: 100%|██████████| 75/75 [00:16<00:00,  4.47it/s]


=> Epoch 4
  Train loss: 0.247916
  Test loss:  0.423867
  Precision:  0.550369
  Recall:     0.597333
  F1:         0.572890
********************************************
               FOLD 3
********************************************
Train dataset numbers:
 Total number of sentences: 7581
 Number of Facts labels: 1863
 Number of Other labels: 5718
Test dataset numbers:
 Total number of sentences: 1799
 Number of Facts labels: 356
 Number of Other labels: 1443


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 237/237 [03:16<00:00,  1.20it/s]
Evaluation: 100%|██████████| 57/57 [00:12<00:00,  4.49it/s]


=> Epoch 1
  Train loss: 0.481942
  Test loss:  0.325161
  Precision:  0.692308
  Recall:     0.556180
  F1:         0.616822


Epoch 2 (train): 100%|██████████| 237/237 [03:16<00:00,  1.20it/s]
Evaluation: 100%|██████████| 57/57 [00:12<00:00,  4.49it/s]


=> Epoch 2
  Train loss: 0.353289
  Test loss:  0.400022
  Precision:  0.540490
  Recall:     0.806180
  F1:         0.647125


Epoch 3 (train): 100%|██████████| 237/237 [03:16<00:00,  1.20it/s]
Evaluation: 100%|██████████| 57/57 [00:12<00:00,  4.49it/s]


=> Epoch 3
  Train loss: 0.275034
  Test loss:  0.403940
  Precision:  0.550980
  Recall:     0.789326
  F1:         0.648961


Epoch 4 (train): 100%|██████████| 237/237 [03:16<00:00,  1.20it/s]
Evaluation: 100%|██████████| 57/57 [00:12<00:00,  4.49it/s]


=> Epoch 4
  Train loss: 0.211402
  Test loss:  0.440946
  Precision:  0.548729
  Recall:     0.727528
  F1:         0.625604
********************************************
               FOLD 4
********************************************
Train dataset numbers:
 Total number of sentences: 7462
 Number of Facts labels: 1813
 Number of Other labels: 5649
Test dataset numbers:
 Total number of sentences: 1918
 Number of Facts labels: 406
 Number of Other labels: 1512


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1 (train): 100%|██████████| 234/234 [03:13<00:00,  1.21it/s]
Evaluation: 100%|██████████| 60/60 [00:10<00:00,  5.82it/s]


=> Epoch 1
  Train loss: 0.634917
  Test loss:  0.403533
  Precision:  0.597619
  Recall:     0.618227
  F1:         0.607748


Epoch 2 (train): 100%|██████████| 234/234 [03:13<00:00,  1.21it/s]
Evaluation: 100%|██████████| 60/60 [00:10<00:00,  5.82it/s]


=> Epoch 2
  Train loss: 0.431159
  Test loss:  0.386178
  Precision:  0.587196
  Recall:     0.655172
  F1:         0.619325


Epoch 3 (train): 100%|██████████| 234/234 [03:13<00:00,  1.21it/s]
Evaluation: 100%|██████████| 60/60 [00:10<00:00,  5.82it/s]


=> Epoch 3
  Train loss: 0.326162
  Test loss:  0.417306
  Precision:  0.568627
  Recall:     0.714286
  F1:         0.633188


Epoch 4 (train): 100%|██████████| 234/234 [03:13<00:00,  1.21it/s]
Evaluation: 100%|██████████| 60/60 [00:10<00:00,  5.82it/s]


=> Epoch 4
  Train loss: 0.251302
  Test loss:  0.462850
  Precision:  0.565762
  Recall:     0.667488
  F1:         0.612429
CPU times: user 50min 40s, sys: 16min 41s, total: 1h 7min 22s
Wall time: 1h 7min 35s


### Outline

In [19]:
from datetime import datetime

def save_report(avg_metrics, complete_metrics, dest_dir):
  """
  Arguments:
    avg_metrics : A pandas Dataframe with the averaged metrics.
    complete_metrics : A dictionary with the metrics by epoch. The key indicates the epoch. 
              Each value must be a numpy tensor of shape (n_folds, 5).
    dest_dir : The directory where the report will be saved.
  """
  report = (
      'RESULTS REPORT\n'
      'Model: Legal BERT\n'
      'Evaluation: cross-validation\n'
      'Train scheme: fine-tuning\n'
      f'Batch size: {batch_size}\n'
      f'Learning rate: {learning_rate}\n'
      f'Weight decay: {weight_decay}\n\n'
  )
  
  report += 'Averages:\n'
  report += avg_metrics.to_string(index=False, justify='center')
  
  report += '\n\nDetailed report:\n'
  report += 'Supports by fold:\n'
  for fold_id, support in supports_by_fold.items():
    report += f'=> Fold {fold_id}:\n'
    report += f'  Facts (train): {support["facts_train"]}\t\tOther (train): {support["other_train"]}\n'
    report += f'  Facts (test): {support["facts_test"]}\t\tOther (test): {support["other_test"]}\n'
  
  report += '\nScores:\n'
  for epoch, scores in complete_metrics.items():
    df = pd.DataFrame(
      scores, 
      columns=['Train loss', 'Test loss', 'Precision', 'Recall', 'F1'], 
      index=[f'Fold {i}' for i in range(scores.shape[0])])
    report += f'Epoch: {epoch}\n' + df.to_string() + '\n\n'
    
  with open(dest_dir + f'report-{model_reference}_ft_{datetime.now().strftime("%Y-%m-%d-%Hh%Mmin")}.txt', 'w') as f:
    f.write(report)

In [20]:
save_report(metrics, raw_metrics, './')

In [21]:
from IPython.display import display, HTML

metrics_display = display(metrics, display_id='metrics_table')

Unnamed: 0,Epoch,Train loss,std,Test loss,std.1,Precision,P std,Recall,R std,F1,F1 std
0,1,0.524828,0.078287,0.475365,0.151419,0.6976,0.1081,0.4706,0.1175,0.5428,0.0754
1,2,0.370486,0.047375,0.422957,0.050933,0.6341,0.087,0.6482,0.0871,0.6332,0.044
2,3,0.283614,0.035236,0.483578,0.112512,0.64,0.0921,0.6302,0.1196,0.6206,0.0505
3,4,0.220559,0.025648,0.532627,0.111597,0.6255,0.0866,0.617,0.0814,0.6116,0.0332
