<a href="https://colab.research.google.com/github/alexlimatds/circle-2022/blob/main/RRLLJ_data_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Rhetorical Role Labeling for Legal Judgments - Data Augmentation

In this notebook we use the Mixup data augmentation approach to create additional training data. Care was taken to use two vectors from different classes when creating each new augmented vector.

#### Installing dependencies

In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 2.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 26.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 12.2 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 5.4 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6

#### Loading AILA dataset

In [None]:
from google.colab import drive
# Base folder of the mounted Drive; later cells build their paths from it.
g_drive_dir = "/content/gdrive/MyDrive/"
# Mount Google Drive in the Colab VM (force_remount=True is passed so the mount is redone).
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
!mkdir data
!mkdir data/train
!tar -xf {g_drive_dir}AILA_2021/AILA_2021_train.tar.xz -C data/train

train_dir = 'data/train/'

In [None]:
import pandas as pd
from os import listdir

columns = ['sentence', 'label']

# Read every tab-separated training file into its own frame.
# sorted() fixes the file order (os.listdir returns entries in arbitrary order),
# so the row order of train_df is the same on every run.
train_frames = [
    pd.read_csv(train_dir + f, sep='\t', names=columns)
    for f in sorted(listdir(train_dir))
]

# Concatenate once instead of growing the frame inside a loop.
# ignore_index=True gives a unique 0..N-1 index; without it every file keeps its
# own 0..k index, so index labels repeat and cannot be used as row positions.
if train_frames:
  train_df = pd.concat(train_frames, ignore_index=True)
else:
  # No files found: keep the empty, correctly-named frame.
  train_df = pd.DataFrame(columns=columns)

train_df.head()

Unnamed: 0,sentence,label
0,This civil appeal by special leave is directed...,Facts
1,defendants 1 and 2 for redemption of suit prop...,Facts
2,Plaintiff filed the suit for redemption of the...,Facts
3,"That suit was resisted by the defendants,each ...",Facts
4,The defence in those written statements was th...,Facts


In [None]:
# Distinct labels present in the training data, in order of first appearance.
train_df['label'].drop_duplicates().tolist()

['Facts',
 'Ruling by Lower Court',
 'Ratio of the decision',
 'Argument',
 'Statute',
 'Precedent',
 'Ruling by Present Court']

#### Generating feature and target vectors

In [None]:
from sentence_transformers import SentenceTransformer

# Encode every training sentence with the LaBSE sentence encoder.
sentences = train_df['sentence'].to_list()
sent_encoder = SentenceTransformer('sentence-transformers/LaBSE')
# convert_to_tensor=True asks encode() for a tensor rather than a list of arrays.
embeddings = sent_encoder.encode(sentences, convert_to_tensor=True)
embeddings.shape

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

torch.Size([10024, 768])

In [None]:
# Converting labels to one-hot vectors
from sklearn.preprocessing import OneHotEncoder
import torch

# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and 1.4
# removed the old name. Try the new keyword first and fall back to the old one
# so the cell works on both old and new scikit-learn releases.
try:
  one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
  one_hot_encoder = OneHotEncoder(sparse=False)

# The encoder expects a 2-D input: one row per sentence, a single label column.
labels = train_df['label'].values.reshape(-1, 1)
Y_np = one_hot_encoder.fit_transform(labels)

# Keep the targets as a float tensor on the GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Y = torch.from_numpy(Y_np).float().to(device)

print(Y.shape)
print(labels.shape)

torch.Size([10024, 7])
(10024, 1)


#### Data augmentation

In [None]:
import numpy as np

# Mixup function: generates an augmented vector from two source vectors
def mixup(xi, xj, yi, yj, alpha, rng=None):
  """Blend two samples (features and targets) with a Beta-distributed weight.

  The outputs are ``lam * xi + (1 - lam) * xj`` and ``lam * yi + (1 - lam) * yj``,
  where ``lam`` is drawn from ``Beta(alpha, alpha)``.

  Args:
    xi, xj: feature vectors (or batches of vectors) to combine.
    yi, yj: target vectors (or batches) matching ``xi`` and ``xj``.
    alpha: parameter of the Beta distribution. When it is not greater than 0,
      no sampling is done and ``lam`` is fixed to 1, so only the first sample
      contributes.
    rng: optional random generator exposing ``beta(a, b)`` (for example
      ``np.random.default_rng(seed)``). Defaults to NumPy's global generator,
      which is the original behavior.

  Returns:
    Tuple ``(x_hat, y_hat)`` with the mixed features and mixed targets.
  """
  if alpha > 0:
    sampler = np.random if rng is None else rng
    # A single coefficient is drawn per call: when batches are passed,
    # every row of the batch is mixed with this same lambda.
    lam = sampler.beta(alpha, alpha)
  else:
    lam = 1
  x_hat = lam * xi + (1 - lam) * xj
  y_hat = lam * yi + (1 - lam) * yj
  return x_hat, y_hat

In [None]:
# Getting the input positions by class.
# The values stored here are used to index `embeddings` and `Y`, which are
# ordered by row position. enumerate() yields that position directly, whereas
# iterrows() yields the DataFrame index label, which repeats across the
# concatenated files and therefore points at the wrong rows.
idx_classes = {}
for position, label in enumerate(train_df['label']):
  # setdefault keeps the classes in order of first appearance.
  idx_classes.setdefault(label, []).append(position)

print('CLASS LABEL - NUM OF RECORDS')
for label, positions in idx_classes.items():
  print(f'{label} {len(positions)}')

CLASS LABEL - NUM OF RECORDS
Facts 2368
Ruling by Lower Court 341
Ratio of the decision 3919
Argument 901
Statute 671
Precedent 1523
Ruling by Present Court 301


In [None]:
# Size of the augmented set, expressed as a fraction of the number of input vectors.
input_ratio = 0.3
# Total number of augmented vectors to generate (over all classes).
n_hat = round(input_ratio * embeddings.shape[0])

In [None]:
# Generating augmented vectors.
# The class ratio is preserved.
# It uses vectors from different classes to generate an augmented one
import random
# Seed both generators: `random` drives the index sampling below, while mixup()
# draws its mixing coefficient from NumPy's global generator.
random.seed(0)
np.random.seed(0)

def augment_data(alpha):
  """Create mixup-augmented feature and target tensors for every class.

  For each class, a share of `n_hat` proportional to the class size is
  generated. Each augmented vector mixes a randomly chosen vector of the class
  with a randomly chosen vector from any other class (sampling with
  replacement).

  Args:
    alpha: Beta-distribution parameter forwarded to mixup().

  Returns:
    Tuple ``(X_aug, Y_aug)`` with the augmented features and targets of all
    classes concatenated along the first dimension, or ``(None, None)`` when
    `idx_classes` is empty.
  """
  X_parts, Y_parts = [], []
  for label, idx in idx_classes.items():
    n_temp = round(n_hat * len(idx) / Y_np.shape[0]) # number of augmented vectors for the current class
    # random indexes for the current class
    idx_i = random.choices(idx, k=n_temp)
    # random indexes for other classes; classes are told apart by their label
    # (dictionary key) instead of comparing whole index lists by value
    other_idx = [
        i
        for other_label, other in idx_classes.items()
        if other_label != label
        for i in other
    ]
    idx_j = random.choices(other_idx, k=n_temp)
    # getting source vectors to generate the augmented vectors
    x_i = embeddings[idx_i, :]
    x_j = embeddings[idx_j, :]
    y_i = Y[idx_i, :]
    y_j = Y[idx_j, :]
    # data augmentation
    # NOTE(review): mixup() is called once per class, so all augmented vectors
    # of a class share the same mixing coefficient - confirm this is intended.
    X_, Y_ = mixup(x_i, x_j, y_i, y_j, alpha)
    X_parts.append(X_)
    Y_parts.append(Y_)

  if not X_parts:
    return None, None
  # Concatenate once at the end instead of re-allocating on every iteration.
  return torch.cat(X_parts), torch.cat(Y_parts)


#### Generating and writing the augmented data

In [None]:
!mkdir data
!mkdir data/train
!tar -xf {g_drive_dir}AILA_2021/AILA_2021_train.tar.xz -C data/train

train_dir = 'data/train/'

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘data/train’: File exists


In [None]:
import os

# Folder on Google Drive that receives the generated files; create it if it is
# missing so np.save does not fail with FileNotFoundError.
output_dir = f'{g_drive_dir}RRLLJ/'
os.makedirs(output_dir, exist_ok=True)

# One augmented dataset is generated and saved per alpha value.
alphas = [1.0, 0.7, 0.3, 0.1]
for a in alphas:
  X_hat, Y_hat = augment_data(a)
  # File-name friendly form of alpha, e.g. 0.7 -> "0_7".
  alpha_tag = str(a).replace(".", "_")
  # Move the tensors to host memory and convert them explicitly to NumPy arrays.
  np.save(f'{output_dir}mixup_data_alpha_{alpha_tag}_features.npy', X_hat.detach().cpu().numpy())
  np.save(f'{output_dir}mixup_data_alpha_{alpha_tag}_targets.npy', Y_hat.detach().cpu().numpy())


In [None]:
# labels and vector dictionary
# Writes one "<label>:<one-hot vector>" line per distinct label so the saved
# target vectors can be mapped back to their class names.
with open(f'{g_drive_dir}RRLLJ/labels.txt', 'w') as labels_file:
  label_names = train_df['label'].unique().tolist()
  labels_file.writelines(
      f'{name}:{one_hot_encoder.transform([[name]])[0]}\n'
      for name in label_names
  )