<a href="https://colab.research.google.com/github/alexlimatds/fact_extraction/blob/main/AILA2020/FACTS_AILA_data_augmentation_mixup_SBERT_LaBSE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mixup Data Augmentation

In this notebook we exploit the Mixup data augmentation approach to create additional data using the AILA dataset as source. Care was taken to use two vectors from different classes when creating a new augmented vector.

The feature vectors are created with an SBERT/LaBSE model.

#### Installing dependencies

In [1]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 5.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 29.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 37.8 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 37.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64

#### Loading dataset

In [2]:
from google.colab import drive
# Root of the user's Drive once mounted; later cells use it as a path prefix
# (hence the trailing slash).
g_drive_dir = "/content/gdrive/MyDrive/"

# force_remount=True makes the cell safe to run again when the drive is
# already mounted.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
!mkdir data
!mkdir data/train
!tar -xf {g_drive_dir}fact_extraction_AILA/train.tar.xz -C data/train

train_dir = 'data/train/'

In [4]:
import pandas as pd
from os import listdir

def read_docs(dir_name):
  """
  Read the docs in a directory.
  Every regular file in the directory is parsed as a headerless, tab-separated
  file whose two columns are the sentence and its label. Sub-directories are
  skipped.
  Params:
    dir_name : the directory that contains the documents. It may be given
      with or without a trailing path separator.
  Returns:
    A dictionary whose keys are the names of the read files and the values are
    pandas dataframes. Each dataframe has the sentence and label columns.
    Entries are inserted in sorted file name order.
  """
  from os import path  # local import keeps this block self-contained

  docs = {} # key: file name, value: dataframe with sentences and labels
  # listdir gives no ordering guarantee; sorting makes the result reproducible
  for f in sorted(listdir(dir_name)):
    # join (instead of string concatenation) works with or without a trailing slash
    file_path = path.join(dir_name, f)
    if not path.isfile(file_path):
      continue # not a document
    docs[f] = pd.read_csv(
        file_path,
        sep='\t',
        names=['sentence', 'label'])
  return docs

# Load every training document (key: file name, value: dataframe).
docs_train = read_docs(train_dir)

num_train_docs = len(docs_train)
print(f'TRAIN: {num_train_docs} documents read.')

TRAIN: 50 documents read.


#### Splitting documents according to folds

In [5]:
# Load the CSV that lists, for each fold, the training documents and the
# test documents. header=0 discards the file's own header row in favour of
# the column names given below.
df_folds = pd.read_csv(
  g_drive_dir + 'fact_extraction_AILA/train_docs_by_fold.csv',
  sep=';',
  header=0,
  names=['fold id', 'train', 'test'])

# Key: fold ID, value: names of the training files of the fold (list of strings).
# The 'train' column stores the file names as one comma-separated string.
train_files_by_fold = {
  fold_id: train_files.split(',')
  for fold_id, train_files in zip(df_folds['fold id'], df_folds['train'])
}


#### SBERT model

In [6]:
from sentence_transformers import SentenceTransformer

# Identifier of the LaBSE model loaded through sentence-transformers.
labse_model_name = 'sentence-transformers/LaBSE'
sent_encoder = SentenceTransformer(labse_model_name)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

#### Encoding sentences

In [8]:
# key: document ID, value: encoded sentences (PyTorch matrix, one row per sentence)
encoded_docs_train = {
  doc_id: sent_encoder.encode(doc_df['sentence'].to_list(), convert_to_tensor=True)
  for doc_id, doc_df in docs_train.items()
}

#### Data augmentation functions

In [9]:
import numpy as np

# Mixup function: generates synthetic vectors from pairs of source vectors
def _mixup_lam(lam, ref):
  """
  Adapts a 1-D array of mixing coefficients (one per row) to the operand it
  is going to multiply.
  Params:
    lam: NumPy array of shape (n,).
    ref: the operand (NumPy array, PyTorch tensor or scalar).
  Returns:
    lam reshaped to (n, 1, ..., 1) so it broadcasts over the rows of ref,
    with the type, dtype and device matching ref where applicable.
  """
  ref_ndim = getattr(ref, 'ndim', 0)
  if ref_ndim > 1:
    lam = lam.reshape((-1,) + (1,) * (ref_ndim - 1))
  if hasattr(ref, 'new_tensor'):
    # PyTorch operand: build the coefficients on the operand's device. An
    # integer operand is promoted first, otherwise lam would be truncated to 0.
    like = ref if ref.is_floating_point() else ref.float()
    return like.new_tensor(lam)
  if isinstance(ref, np.ndarray) and np.issubdtype(ref.dtype, np.floating):
    # keeps the precision of the source data (e.g. float32 embeddings)
    return lam.astype(ref.dtype)
  return lam

def mixup(xi, xj, yi, yj, alpha):
  """
  Blends two samples (or two batches of samples) and their targets:
    x_hat = lam * xi + (1 - lam) * xj
    y_hat = lam * yi + (1 - lam) * yj
  where lam is drawn from Beta(alpha, alpha).
  When xi is a batch (two or more dimensions), an independent lam is drawn
  for each row; otherwise all generated samples would share a single mixing
  ratio and, consequently, a single target value. For a single vector one
  scalar lam is drawn.
  Params:
    xi, xj: feature vectors, or row-wise batches of feature vectors, of the
      same shape (NumPy arrays or PyTorch tensors).
    yi, yj: targets of xi and xj. For batches they must be scalars or have
      one entry per row of xi.
    alpha: hyperparameter of the beta distribution. If alpha <= 0, lam is
      fixed to 1, i.e. the result reproduces xi and yi.
  Returns:
    x_hat: the synthetic feature vector(s).
    y_hat: the synthetic target(s).
  """
  if alpha <= 0:
    lam_x = lam_y = 1
  elif getattr(xi, 'ndim', 0) >= 2:
    lam = np.random.beta(alpha, alpha, size=xi.shape[0])
    # the same coefficients are applied to the features and to the targets
    lam_x = _mixup_lam(lam, xi)
    lam_y = _mixup_lam(lam, yi)
  else:
    lam_x = lam_y = np.random.beta(alpha, alpha)
  x_hat = lam_x * xi + (1 - lam_x) * xj
  y_hat = lam_y * yi + (1 - lam_y) * yj
  return x_hat, y_hat

In [26]:
# Generating augmented vectors.
# It uses vectors from different classes to generate an augmented one
import torch
import random
# The source vectors are picked with `random`, while the mixup coefficient is
# drawn with `np.random`: both generators are seeded so that the generated
# data is reproducible.
random.seed(0)
np.random.seed(0)

def data_by_class(doc_id_list):
  """
  Gathers the sentence embeddings of a set of documents and separates them
  according to the label of the sentences. Sentences whose label is neither
  'Facts' nor 'Other' are left out.
  Params:
    doc_id_list: the IDs of the documents (keys of encoded_docs_train and
      docs_train).
  Returns:
    The embeddings of the Facts class (PyTorch matrix).
    The embeddings of the Other class (PyTorch matrix).
  Raises:
    ValueError: if doc_id_list is empty.
  """
  doc_id_list = list(doc_id_list)
  if not doc_id_list:
    raise ValueError('doc_id_list must contain at least one document ID.')

  # Stacking once avoids re-copying the accumulated data for every document
  sent_embeddings = torch.vstack(
    [encoded_docs_train[doc_id] for doc_id in doc_id_list])
  labels = np.concatenate(
    [docs_train[doc_id]['label'].to_numpy() for doc_id in doc_id_list])

  # row i of sent_embeddings corresponds to labels[i]
  facts_idx = np.nonzero(labels == 'Facts')[0]
  facts_embeddings = sent_embeddings[facts_idx,:]
  other_idx = np.nonzero(labels == 'Other')[0]
  other_embeddings = sent_embeddings[other_idx,:]

  return facts_embeddings, other_embeddings

def augment_data(alpha, doc_id_list, n_synthetic=3500):
  """
  Generates a set of synthetic data from the sentences in a provided set of
  documents. Each synthetic vector is produced by mixup from one sentence of
  the Facts class and one sentence of the Other class, both selected at
  random (with replacement).
  Params:
    alpha: hyperparameter of the beta distribution to be used with the mixup algorithm.
    doc_id_list: a list with the IDs of the source documents (list of strings).
    n_synthetic: number of synthetic vectors to be generated.
  Returns:
    The generated feature vectors (PyTorch tensor).
    The generated target vectors (PyTorch tensor). The source targets are 1
    for the Facts class and 0 for the Other class.
  Raises:
    ValueError: if the documents have no sentence of one of the classes.
  """
  facts_embeddings, other_embeddings = data_by_class(doc_id_list)
  n_facts, n_other = facts_embeddings.shape[0], other_embeddings.shape[0]
  if n_facts == 0 or n_other == 0:
    raise ValueError(
      f'Both classes are required to mix vectors: got {n_facts} Facts and '
      f'{n_other} Other sentences.')

  # random indexes for the Facts class
  idx_i = random.choices(range(n_facts), k=n_synthetic)
  # random indexes for the Other class
  idx_j = random.choices(range(n_other), k=n_synthetic)
  # getting source vectors to generate the augmented vectors
  x_i = facts_embeddings[idx_i, :]
  x_j = other_embeddings[idx_j, :]
  y_i = torch.ones(x_i.shape[0], 1)   # targets of the Facts class
  y_j = torch.zeros(x_j.shape[0], 1)  # targets of the Other class
  # data augmentation
  return mixup(x_i, x_j, y_i, y_j, alpha)


#### Generating and writing the augmented data

In [28]:
import os

alphas = [0.1, 0.5, 1.0, 4.0]

# The destination depends neither on alpha nor on the fold, so it is resolved
# once. It is created when missing because np.save does not create directories.
output_dir = f'{g_drive_dir}fact_extraction_AILA/mixup_data_labse/'
os.makedirs(output_dir, exist_ok=True)

# One pair of files (features, targets) per combination of alpha and fold
for a in alphas:
  alpha_tag = str(a).replace(".", "_")  # e.g. 0.5 -> 0_5, keeps dots out of the file name
  for fold_id, doc_ids in train_files_by_fold.items():
    X_hat, Y_hat = augment_data(a, doc_ids)
    file_prefix = output_dir + f'alpha_{alpha_tag}_fold_{fold_id}'
    # tensors are moved to the CPU and converted to NumPy arrays before saving
    np.save(file_prefix + '_features.npy', X_hat.detach().cpu().numpy())
    np.save(file_prefix + '_targets.npy', Y_hat.detach().cpu().numpy())
