<a href="https://colab.research.google.com/github/alexlimatds/fact_extraction/blob/main/AILA2020/FACTS_AILA_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Facts extraction with AILA data and BERT

Data used:

- For training: the training set of AILA 2020, available at https://github.com/Law-AI/semantic-segmentation.
- For testing: the additional training documents from AILA 2021.


### Installing dependencies

In [None]:
!pip install transformers



In [None]:
!pip install datasets



### Loading dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive, requesting a remount on every run.
drive.mount('/content/gdrive', force_remount=True)
# Base Drive folder. Other cells prepend it directly to archive paths, so the
# trailing slash is required.
g_drive_dir = "/content/gdrive/MyDrive/"

Mounted at /content/gdrive


In [None]:
!rm -r data
!mkdir data
!mkdir data/train
!tar -xf {g_drive_dir}fact_extraction_AILA/train.tar.xz -C data/train
!mkdir data/test
!tar -xf {g_drive_dir}fact_extraction_AILA/test.tar.xz -C data/test

train_dir = 'data/train/'
test_dir = 'data/test/'

In [None]:
from os import listdir
import pandas as pd

def read_docs(dir_name):
  """
  Read every tab-separated document found in a directory.

  Each file is parsed with pandas as a header-less TSV whose first column is
  the sentence and whose second column is its label.
  Params:
    dir_name : the directory that contains the documents. A trailing path
      separator is optional.
  Returns:
    A dictionary with two parallel lists: 'sentences' (first column of every
    file) and 'labels' (second column), concatenated in file-name order.
  """
  # Local import: only `listdir` is imported from `os` at notebook level.
  from os.path import join

  sentences = []
  labels = []
  # listdir() returns entries in arbitrary order; sorting makes runs reproducible.
  for f in sorted(listdir(dir_name)):
    # NOTE(review): pandas' default quote handling is applied here. If sentences
    # can contain double quotes, confirm that rows are not being merged.
    df = pd.read_csv(
        join(dir_name, f),  # works with or without a trailing slash in dir_name
        sep='\t',
        names=['sentence', 'label'])
    sentences.extend(df['sentence'].to_list())
    labels.extend(df['label'].to_list())
  return {'sentences': sentences, 'labels': labels}

# Parse both splits up front: training documents first, then the test documents.
dic_train, dic_test = read_docs(train_dir), read_docs(test_dir)

### Tokenizer and Dataset preparation

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the model-loading cell reuses this variable.
model_name = 'bert-base-uncased'
# Tokenizer loaded for that same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from datasets import Dataset

def tokenize_function(ds):
  """
  Tokenize a batch of examples with the notebook-level `tokenizer`.
  Params:
    ds : mapping whose 'sentence' entry holds the text(s) to encode.
  Returns:
    The tokenizer output for those texts, requested with truncation enabled
    and padding='max_length'.
  """
  texts = ds['sentence']
  encoded = tokenizer(texts, truncation=True, padding='max_length')
  return encoded

def create_dataset(dic_data):
  """
  Build a tokenized, torch-formatted dataset from the output of `read_docs`.
  Params:
    dic_data : dictionary with the parallel lists 'sentences' and 'labels'.
  Returns:
    A `Dataset` holding a binary 'label' column (1 for 'Facts', 0 for any
    other value), the original 'sentence' column and the tokenizer outputs.
    Only the model inputs and the label are exposed in torch format.
  """
  # Binarise the labels: 'Facts' is the positive class, everything else is 0.
  binary_labels = []
  for name in dic_data['labels']:
    binary_labels.append(1 if name == 'Facts' else 0)

  raw = Dataset.from_dict({'label': binary_labels, 'sentence': dic_data['sentences']})
  tokenized = raw.map(tokenize_function, batched=True)
  # set_format works in place on the dataset object.
  tokenized.set_format(
      'torch',
      columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
  return tokenized

# Build the tokenized datasets for both splits: train first, then test.
ds_train, ds_test = create_dataset(dic_train), create_dataset(dic_test)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Display the training dataset summary (features and number of rows) as the cell output.
ds_train

Dataset({
    features: ['label', 'sentence', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9380
})

### Evaluation function

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_fscore_support
import numpy as np

# Best evaluation seen so far. The very low starting F1 guarantees that the
# first real evaluation replaces this placeholder.
best_metrics = dict(f1=-1000)
# Every call to `evaluate` appends one metrics dictionary to this list.
metrics_by_epoch = list()

def evaluate(eval_predictions):
  """
  Compute binary classification metrics for one evaluation pass.

  Besides returning the scores, the function appends them (together with the
  confusion matrix) to the notebook-level `metrics_by_epoch` list and keeps
  the highest-F1 result seen so far in `best_metrics`.
  Params:
    eval_predictions : a pair (logits, target_ids). The predicted class is the
      argmax of the logits over the last axis.
  Returns:
    A dictionary with 'precision', 'recall' and 'f1', computed for the
    positive class (label 1).
  """
  global best_metrics

  logits, target_ids = eval_predictions
  predicted_ids = np.argmax(logits, axis=-1)
  # Precision, Recall, F1 for the positive class; zero_division=0 reports 0
  # when a score's denominator is empty.
  precision, recall, f1, _ = precision_recall_fscore_support(
    target_ids,
    predicted_ids,
    average='binary',
    pos_label=1,
    zero_division=0)
  # Confusion matrix. Both labels are listed explicitly so the matrix is always
  # 2x2, even if only one class shows up in the targets and predictions.
  cm = confusion_matrix(
    target_ids,
    predicted_ids,
    labels=[0, 1])

  scores = {
      'precision' : precision,
      'recall' : recall,
      'f1' : f1
  }
  # The stored record carries the confusion matrix as well; the returned
  # dictionary does not.
  metrics = dict(scores, cm=cm)
  metrics_by_epoch.append(metrics)

  if metrics['f1'] > best_metrics['f1']:
    best_metrics = metrics

  return scores


### Model and fine-tuning

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint for sequence classification with 2 output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
!mkdir results

mkdir: cannot create directory ‘results’: File exists


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
  # Model selection: reload the checkpoint with the highest F1 when training ends.
  load_best_model_at_end=True,
  metric_for_best_model='f1',
  greater_is_better=True,
  # Evaluate and save a checkpoint once per epoch.
  evaluation_strategy='epoch',
  save_strategy='epoch',
  # Schedule and batch sizes.
  num_train_epochs=5,
  per_device_eval_batch_size=8,
  per_device_train_batch_size=8,
  # Checkpoints are written under this folder.
  output_dir='results'
)

# NOTE(review): per-epoch evaluation, and therefore best-checkpoint selection,
# runs on ds_test (the test split). Confirm that this is intended.
trainer = Trainer(
  args=training_args,
  model=model,
  compute_metrics=evaluate,
  eval_dataset=ds_test,
  train_dataset=ds_train
)

In [None]:
%%time

# Run the fine-tuning configured in `trainer`; %%time reports how long the cell took.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9380
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7038


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4103,0.33036,0.690104,0.699208,0.694626
2,0.3276,0.523733,0.727969,0.501319,0.59375


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1673
  Batch size = 8
Saving model checkpoint to results/checkpoint-1173
Configuration saved in results/checkpoint-1173/config.json
Model weights saved in results/checkpoint-1173/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1673
  Batch size = 8
Saving model checkpoint to results/checkpoint-2346
Configuration saved in results/checkpoint-2346/config.json
Model weights saved in results/checkpoint

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.4103,0.33036,0.690104,0.699208,0.694626
2,0.3276,0.523733,0.727969,0.501319,0.59375
3,0.2195,0.632003,0.675595,0.598945,0.634965
4,0.1103,0.713506,0.688705,0.659631,0.673854
5,0.0769,0.76644,0.689655,0.633245,0.660248


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1673
  Batch size = 8
Saving model checkpoint to results/checkpoint-3519
Configuration saved in results/checkpoint-3519/config.json
Model weights saved in results/checkpoint-3519/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1673
  Batch size = 8
Saving model checkpoint to results/checkpoint-4692
Configuration saved in results/checkpoint-4692/config.json
Model weights saved in results/checkpoint

### Saving trained model

In [None]:
# Folder that receives the tokenizer and model files; the archive cell packs this same folder.
model_directory = "results/best_model"
tokenizer.save_pretrained(model_directory)
# `model` is the object that was handed to the Trainer. Presumably it holds the
# best checkpoint's weights at this point (load_best_model_at_end=True) — TODO confirm.
model.save_pretrained(model_directory)

In [None]:
# Pack the saved model folder into a gzip-compressed tar inside the Google Drive project folder.
!tar -czvf {g_drive_dir}fact_extraction_AILA/bert-trained.tar.gz {model_directory}

### Best metrics

In [None]:
# Report the evaluation with the highest F1 recorded by `evaluate`, to 4 decimal places.
print(f'Precision: {best_metrics["precision"]:.4f}')
print(f'Recall:    {best_metrics["recall"]:.4f}')
print(f'F-score:   {best_metrics["f1"]:.4f}')
# Plot the confusion matrix stored with that same evaluation; class 0 is shown
# as 'Other' and class 1 as 'Facts'.
ConfusionMatrixDisplay(best_metrics['cm'], display_labels=['Other', 'Facts']).plot()