<a href="https://colab.research.google.com/github/alexlimatds/circle-2022/blob/main/RRLLJ_SBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Rhetorical Role Labeling for Legal Judgments - experiments with SBERT features

In this notebook we utilize Sentence BERT (SBERT) features to represent the sentences.
We use the SentenceTransformer library for SBERT implementation.

### Installing dependencies

In [1]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 5.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 48.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 34.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 4.4 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.8 MB/s 
[?25h

### Loading dataset

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
g_drive_dir = "/content/gdrive/MyDrive/"

Mounted at /content/gdrive


In [3]:
!mkdir data
!mkdir data/train
!tar -xf {g_drive_dir}AILA_2021/AILA_2021_train.tar.xz -C data/train

train_dir = 'data/train/'

In [4]:
import pandas as pd
from os import listdir

def read_docs(dir_name):
  docs_ = {} # key: file name, value: dataframe with sentences and labels
  labels_ = set()
  for f in listdir(dir_name):
    df = pd.read_csv(
        dir_name + f, 
        sep='\t', 
        names=['sentence', 'label'])
    docs_[f] = df
    labels_.update(df['label'].to_list())
  return docs_, labels_

docs_train, labels_train = read_docs(train_dir)
n_classes = len(labels_train)
print(f'TRAIN: {len(docs_train)} documents read.')
print(f'Number of classes: {n_classes}')

TRAIN: 60 documents read.
Number of classes: 7


### SBERT features

In [5]:
from sentence_transformers import SentenceTransformer

sent_encoder = SentenceTransformer('sentence-transformers/LaBSE')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

In [6]:
n_features = sent_encoder.get_sentence_embedding_dimension()

In [7]:
%%time
docs_train_features_numpy = {} # key: file id, value: numpy matrix of features
for doc_id, df in docs_train.items():
  docs_train_features_numpy[doc_id] = sent_encoder.encode(df['sentence'].tolist())


CPU times: user 1min 30s, sys: 1.66 s, total: 1min 32s
Wall time: 1min 40s


### Label encoder

In [8]:
from sklearn.preprocessing import LabelBinarizer

label_encoder = LabelBinarizer()
label_encoder.fit(list(labels_train))


LabelBinarizer()

### Evaluation functions

In [9]:
import numpy as np
import sklearn
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from IPython.display import display, HTML

def docs_to_sentences(docs_idx, doc_keys_list):
  features_ = None
  targets_ = []
  for idx in docs_idx:
    doc_id = doc_keys_list[idx]
    if features_ is None:
      features_ = docs_train_features_numpy[doc_id]
    else:
      features_ = np.vstack((features_, docs_train_features_numpy[doc_id]))
    targets_.extend(docs_train[doc_id]['label'].tolist())
  return features_, targets_

def metrics_report(title, averages, stds):
  report_df = pd.DataFrame(columns=['Score', 'Standard Deviation'])
  report_df.loc['Precision'] = [f'{averages[0]:.4f}', f'{stds[0]:.4f}']
  report_df.loc['Recall'] = [f'{averages[1]:.4f}', f'{stds[1]:.4f}']
  report_df.loc['F1'] = [f'{averages[2]:.4f}', f'{stds[2]:.4f}']
  display(HTML(f'<br><span style="font-weight: bold">{title}: cross-validation macro averages</span>'))
  display(report_df)

def classification_report(metrics):
  report_df = pd.DataFrame(columns=['Precision', 'Recall', 'F1'])
  for i, l in enumerate(label_encoder.classes_):
    report_df.loc[l] = [
      f'{metrics[i, 0]:.4f}', 
      f'{metrics[i, 1]:.4f}', 
      f'{metrics[i, 2]:.4f}', 
    ]
  display(HTML(f'<br><span style="font-weight: bold">Classification Report (cross-validation test averages)</span>'))
  display(report_df)

test_metrics = {}

def cross_validation(trainer):
  train_metrics_cross = []
  test_metrics_cross = []
  test_metrics_by_class = np.zeros((n_classes, 3)) # 3 metrics (P, R, F1)
  n_folds = 5
  skf = KFold(n_splits=n_folds) # for cross-validation
  docs_list = list(docs_train.keys())
  for train_docs_idx, test_docs_idx in skf.split(docs_list): # The cross-validation splitting is document-oriented
    # train
    train_features_fold, train_targets_fold = docs_to_sentences(train_docs_idx, docs_list)
    model = trainer(train_features_fold, train_targets_fold)
    # test
    test_features_fold, test_targets_fold = docs_to_sentences(test_docs_idx, docs_list)
    predictions = model.predict(test_features_fold)
    # averaged test metrics
    p_test, r_test, f1_test, _ = precision_recall_fscore_support(
        test_targets_fold, 
        predictions, 
        average='macro', 
        zero_division=0)
    test_metrics_cross.append([p_test, r_test, f1_test])
    # test metrics by class
    metrics = precision_recall_fscore_support(
        test_targets_fold, 
        predictions, 
        average=None, 
        zero_division=0, 
        labels=label_encoder.classes_)
    test_metrics_by_class = test_metrics_by_class + np.hstack((
        metrics[0].reshape(-1, 1),  # precision
        metrics[1].reshape(-1, 1),  # recall
        metrics[1].reshape(-1, 1))) # F1
    # train metrics
    predictions = model.predict(train_features_fold)
    p_train, r_train, f1_train, _ = precision_recall_fscore_support(
        train_targets_fold, 
        predictions, 
        average='macro', 
        zero_division=0)
    train_metrics_cross.append([p_train, r_train, f1_train])
  
  train_metrics_cross = np.array(train_metrics_cross)
  train_mean = np.mean(train_metrics_cross, axis=0)
  train_std = np.std(train_metrics_cross, axis=0)
  metrics_report('TRAIN', train_mean, train_std)

  test_metrics_cross = np.array(test_metrics_cross)
  test_mean = np.mean(test_metrics_cross, axis=0)
  test_std = np.std(test_metrics_cross, axis=0)
  metrics_report('TEST', test_mean, test_std)

  test_metrics_by_class /= n_classes
  classification_report(test_metrics_by_class)

  test_metrics[model.__class__.__name__] = test_mean

### Scikit-learn Models

#### Logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression

def lr_trainer(X, y):
  logreg = LogisticRegression(solver='sag', random_state=1)
  return logreg.fit(X, y)

In [11]:
%%time

cross_validation(lr_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.6691,0.0272
Recall,0.4582,0.0063
F1,0.4978,0.0044


Unnamed: 0,Score,Standard Deviation
Precision,0.5117,0.0294
Recall,0.3794,0.0205
F1,0.4064,0.0218


Unnamed: 0,Precision,Recall,F1
Argument,0.369,0.1557,0.1557
Facts,0.3834,0.4459,0.4459
Precedent,0.327,0.2123,0.2123
Ratio of the decision,0.3711,0.5092,0.5092
Ruling by Lower Court,0.068,0.0065,0.0065
Ruling by Present Court,0.6097,0.2894,0.2894
Statute,0.4301,0.278,0.278


CPU times: user 42.1 s, sys: 832 ms, total: 43 s
Wall time: 42 s


#### Linear SVM

In [12]:
from sklearn.svm import LinearSVC

def linear_svm_trainer(X, y):
  svm = LinearSVC(random_state=1)
  return svm.fit(X, y)

In [13]:
%%time

cross_validation(linear_svm_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.6953,0.0304
Recall,0.4935,0.0071
F1,0.5267,0.0065


Unnamed: 0,Score,Standard Deviation
Precision,0.4824,0.0337
Recall,0.3906,0.0162
F1,0.4097,0.0185


Unnamed: 0,Precision,Recall,F1
Argument,0.3554,0.1611,0.1611
Facts,0.3785,0.4611,0.4611
Precedent,0.326,0.2222,0.2222
Ratio of the decision,0.3788,0.4899,0.4899
Ruling by Lower Court,0.0357,0.0049,0.0049
Ruling by Present Court,0.5316,0.3049,0.3049
Statute,0.4061,0.3087,0.3087


CPU times: user 26.3 s, sys: 881 ms, total: 27.1 s
Wall time: 26.1 s


#### KNN

In [14]:
from sklearn.neighbors import KNeighborsClassifier

def knn_trainer(X, y):
  knn = KNeighborsClassifier(5)
  return knn.fit(X, y)

In [15]:
%%time

cross_validation(knn_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.7175,0.0028
Recall,0.6199,0.0059
F1,0.6467,0.004


Unnamed: 0,Score,Standard Deviation
Precision,0.4191,0.0213
Recall,0.3746,0.024
F1,0.3804,0.0236


Unnamed: 0,Precision,Recall,F1
Argument,0.1816,0.1917,0.1917
Facts,0.3543,0.4047,0.4047
Precedent,0.2455,0.2807,0.2807
Ratio of the decision,0.3704,0.3674,0.3674
Ruling by Lower Court,0.1107,0.0159,0.0159
Ruling by Present Court,0.4661,0.3028,0.3028
Statute,0.3667,0.3099,0.3099


CPU times: user 38.3 s, sys: 4.37 s, total: 42.7 s
Wall time: 25.7 s


#### MLP Classifier

In [16]:
from sklearn.neural_network import MLPClassifier

def mlp_trainer(X, y):
  # Default MLP from scikit-learn
  mlp = MLPClassifier(early_stopping=True, random_state=1)
  return mlp.fit(X, y)

In [17]:
%%time

cross_validation(mlp_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.6871,0.0192
Recall,0.5454,0.0536
F1,0.5783,0.0504


Unnamed: 0,Score,Standard Deviation
Precision,0.4831,0.0236
Recall,0.4031,0.0116
F1,0.4201,0.0128


Unnamed: 0,Precision,Recall,F1
Argument,0.3117,0.1998,0.1998
Facts,0.3906,0.4461,0.4461
Precedent,0.3292,0.2297,0.2297
Ratio of the decision,0.3821,0.4807,0.4807
Ruling by Lower Court,0.1051,0.0326,0.0326
Ruling by Present Court,0.5049,0.3184,0.3184
Statute,0.3919,0.308,0.308


CPU times: user 34.9 s, sys: 17.2 s, total: 52.1 s
Wall time: 27 s


#### Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree_trainer(X, y):
  dtree = DecisionTreeClassifier(max_depth=5, random_state=1)
  return dtree.fit(X, y)

In [19]:
%%time

cross_validation(decision_tree_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.4517,0.024
Recall,0.2756,0.0081
F1,0.2833,0.0094


Unnamed: 0,Score,Standard Deviation
Precision,0.3409,0.0493
Recall,0.2321,0.0174
F1,0.2291,0.0237


Unnamed: 0,Precision,Recall,F1
Argument,0.0522,0.006,0.006
Facts,0.2808,0.3765,0.3765
Precedent,0.2347,0.0661,0.0661
Ratio of the decision,0.3247,0.5018,0.5018
Ruling by Lower Court,0.0,0.0,0.0
Ruling by Present Court,0.5546,0.1283,0.1283
Statute,0.2576,0.0816,0.0816


CPU times: user 20.6 s, sys: 74.3 ms, total: 20.7 s
Wall time: 20.6 s


#### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

def random_forest_trainer(X, y):
  rforest = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=1)
  return rforest.fit(X, y)

In [21]:
%%time

cross_validation(random_forest_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.5089,0.0585
Recall,0.232,0.0091
F1,0.2166,0.0099


Unnamed: 0,Score,Standard Deviation
Precision,0.3584,0.0689
Recall,0.2068,0.0116
F1,0.1857,0.0186


Unnamed: 0,Precision,Recall,F1
Argument,0.1429,0.0012,0.0012
Facts,0.3697,0.2656,0.2656
Precedent,0.2653,0.005,0.005
Ratio of the decision,0.2997,0.639,0.639
Ruling by Lower Court,0.0,0.0,0.0
Ruling by Present Court,0.7143,0.1235,0.1235
Statute,0.0,0.0,0.0


CPU times: user 5.68 s, sys: 50 ms, total: 5.73 s
Wall time: 5.69 s


#### AdaBoost

In [22]:
from sklearn.ensemble import AdaBoostClassifier

def adaboost_trainer(X, y):
  adab = AdaBoostClassifier(random_state=1)
  return adab.fit(X, y)

In [23]:
%%time

cross_validation(adaboost_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.3641,0.0201
Recall,0.3027,0.0155
F1,0.296,0.0128


Unnamed: 0,Score,Standard Deviation
Precision,0.3249,0.0123
Recall,0.2673,0.0162
F1,0.2559,0.0114


Unnamed: 0,Precision,Recall,F1
Argument,0.2017,0.0242,0.0242
Facts,0.297,0.3532,0.3532
Precedent,0.2447,0.0504,0.0504
Ratio of the decision,0.3248,0.4931,0.4931
Ruling by Lower Court,0.0238,0.0013,0.0013
Ruling by Present Court,0.3099,0.2239,0.2239
Statute,0.2228,0.1904,0.1904


CPU times: user 3min 48s, sys: 334 ms, total: 3min 48s
Wall time: 3min 47s


#### Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB

def naive_bayes_trainer(X, y):
  nb = GaussianNB()
  return nb.fit(X, y)

In [25]:
%%time
cross_validation(naive_bayes_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.421,0.0037
Recall,0.5046,0.0043
F1,0.4369,0.0048


Unnamed: 0,Score,Standard Deviation
Precision,0.3788,0.0158
Recall,0.4425,0.0196
F1,0.3862,0.0107


Unnamed: 0,Precision,Recall,F1
Argument,0.2192,0.2579,0.2579
Facts,0.3675,0.3969,0.3969
Precedent,0.2919,0.2357,0.2357
Ratio of the decision,0.4328,0.2706,0.2706
Ruling by Lower Court,0.0742,0.2016,0.2016
Ruling by Present Court,0.2542,0.367,0.367
Statute,0.2543,0.4827,0.4827


CPU times: user 3.25 s, sys: 86.1 ms, total: 3.34 s
Wall time: 3.32 s


#### XGBoost

In [29]:
from xgboost.sklearn import XGBClassifier
import torch

def xgboost_trainer(X, y):
  if torch.cuda.is_available():
    xgboost = XGBClassifier(objective="multi:softmax", tree_method='gpu_hist', gpu_id=0)
  else:
    xgboost = XGBClassifier(objective="multi:softmax", tree_method='hist')
  return xgboost.fit(X, y)

In [30]:
%%time
cross_validation(xgboost_trainer)

Unnamed: 0,Score,Standard Deviation
Precision,0.8518,0.0075
Recall,0.5821,0.0084
F1,0.6509,0.0076


Unnamed: 0,Score,Standard Deviation
Precision,0.5272,0.0496
Recall,0.3376,0.0186
F1,0.364,0.0228


Unnamed: 0,Precision,Recall,F1
Argument,0.3961,0.0777,0.0777
Facts,0.3745,0.4388,0.4388
Precedent,0.3719,0.1717,0.1717
Ratio of the decision,0.3544,0.5399,0.5399
Ruling by Lower Court,0.0714,0.0049,0.0049
Ruling by Present Court,0.6148,0.2308,0.2308
Statute,0.4527,0.2243,0.2243


CPU times: user 1min 23s, sys: 1.38 s, total: 1min 24s
Wall time: 1min 24s


### PyTorch models

In [31]:
gpu_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [32]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
  def __init__(self, inputs, targets, device):
    self.X = torch.from_numpy(inputs).float().to(device)
    self.y = torch.from_numpy(label_encoder.transform(targets)).float().to(device)

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return [self.X[idx], self.y[idx]]


In [33]:
from torch.optim import Adam
from torch.utils.data import DataLoader
from sklearn.model_selection import ShuffleSplit

torch.manual_seed(1)

class MLPTrainer:

  def __init__(self, model, device, l2_penalty=0.0001):
    self.model = model.to(device)
    setattr(self.model.__class__, 'predict', self.predict)
    self.device = device
    # The training replicates the default configuration from scikit-learn's MLPClassifier
    self.criterion = torch.nn.CrossEntropyLoss().to(device)
    self.lambd = l2_penalty # weight decay for Adam optmizer
    self.n_epochs = 200

  def fit(self, inputs, targets, verbose=False):
    # early stopping params and variables
    tol = 0.001
    n_iter_no_change = 7
    early_stop_count = 0
    best_loss_validation = float("inf")
    # splitting train data into train and validation sets in order to perform early stopping
    spl = ShuffleSplit(n_splits=1, train_size=0.9, random_state=1)
    targets = np.array(targets)
    for train_index, val_index in spl.split(inputs):
      # getting datasets
      train_x = inputs[train_index]
      train_y = targets[train_index]
      validation_x = inputs[val_index]
      validation_y = targets[val_index]
      train_dl = DataLoader(
        MyDataset(train_x, train_y, self.device), 
        batch_size=64)
      validation_dl = DataLoader(
        MyDataset(validation_x, validation_y, self.device), 
        batch_size=len(validation_x))
      # training
      self.model.train()
      optimizer = Adam(
        self.model.parameters(), 
        weight_decay=self.lambd)
      for epoch in range(self.n_epochs):
        # iterate mini batches
        for x, y in train_dl:
          optimizer.zero_grad()
          yhat = self.model(x)
          loss = self.criterion(yhat, y)
          loss.backward()
          optimizer.step()
        # Early stopping
        for x, y in validation_dl:
          loss_validation = self.criterion(self.model(x), y)
        if loss_validation >= best_loss_validation - tol:
          early_stop_count += 1
        else:
          early_stop_count = 0
          best_loss_validation = loss_validation
        if early_stop_count == n_iter_no_change:
          break
    if verbose:
      print(f'TRAIN: Stopped at epoch {epoch + 1} {"(MAX EPOCH)" if epoch + 1 == self.n_epochs else ""}')
    
    self.model.eval()
    return self.model

  def predict(self, X):
    y = self.model.forward(torch.from_numpy(X).float().to(self.device))
    return label_encoder.inverse_transform(y.detach().to('cpu').numpy())

#### TorchMLP

In [34]:
import torch.nn
from torch.nn.init import xavier_uniform_
from torch.nn.init import kaiming_uniform_

class TorchMLP(torch.nn.Module):
  def __init__(self, n_inputs, n_classes):
    super(TorchMLP, self).__init__()
    # hidden layer
    n_hidden_units = 100
    hidden1 = torch.nn.Linear(n_inputs, n_hidden_units)
    kaiming_uniform_(hidden1.weight, nonlinearity='relu')
    relu = torch.nn.ReLU()
    # output layer
    output = torch.nn.Linear(n_hidden_units, n_classes)
    xavier_uniform_(output.weight)
    # There's no need of softmax function because it's included in the CrossEntropyLoss function
    self.layers = torch.nn.Sequential(
      hidden1, 
      relu, 
      output)
  
  def forward(self, X):
    return self.layers(X)

In [35]:
def torch_mlp_trainer(X, y):
  trainer = MLPTrainer(
      TorchMLP(n_features, n_classes), 
      gpu_device, 
      l2_penalty=0.0015)
  return trainer.fit(X, y, verbose=True)

In [36]:
%%time
cross_validation(torch_mlp_trainer)

TRAIN: Stopped at epoch 126 
TRAIN: Stopped at epoch 94 
TRAIN: Stopped at epoch 65 
TRAIN: Stopped at epoch 66 
TRAIN: Stopped at epoch 91 


Unnamed: 0,Score,Standard Deviation
Precision,0.6443,0.0378
Recall,0.4906,0.0175
F1,0.526,0.0152


Unnamed: 0,Score,Standard Deviation
Precision,0.5,0.0253
Recall,0.3882,0.0229
F1,0.4113,0.0217


Unnamed: 0,Precision,Recall,F1
Argument,0.3749,0.1813,0.1813
Facts,0.3887,0.4288,0.4288
Precedent,0.3276,0.2307,0.2307
Ratio of the decision,0.3687,0.5,0.5
Ruling by Lower Court,0.0643,0.0065,0.0065
Ruling by Present Court,0.5703,0.2893,0.2893
Statute,0.4054,0.3045,0.3045


CPU times: user 1min 41s, sys: 3.09 s, total: 1min 44s
Wall time: 1min 43s


#### TorchMLPMaxPool

In [37]:
import math

class TorchMLPMaxPool(torch.nn.Module):
  def __init__(self, n_inputs, n_classes):
    super(TorchMLPMaxPool, self).__init__()
    # max pool
    window_size = 2
    max_pool = torch.nn.MaxPool1d(window_size, ceil_mode=True)
    n_out_pool = math.ceil((n_inputs - window_size) / window_size + 1)
    # hidden layers
    n_hidden_units = 100
    hidden1 = torch.nn.Linear(n_out_pool, n_hidden_units)
    kaiming_uniform_(hidden1.weight, nonlinearity='relu')
    relu = torch.nn.ReLU()
    # output layer
    output = torch.nn.Linear(n_hidden_units, n_classes)
    xavier_uniform_(output.weight)
    # There's no need of softmax function because it's included in the CrossEntropyLoss function
    self.layers = torch.nn.Sequential(
      max_pool, 
      hidden1, 
      relu, 
      output)
  
  def forward(self, X):
    return self.layers(X)

In [38]:
def torch_mlp_maxpool_trainer(X, y):
  trainer = MLPTrainer(
      TorchMLPMaxPool(n_features, n_classes), 
      gpu_device, 
      l2_penalty=0.0015)
  return trainer.fit(X, y, verbose=True)

In [39]:
%%time
cross_validation(torch_mlp_maxpool_trainer)

TRAIN: Stopped at epoch 54 
TRAIN: Stopped at epoch 48 
TRAIN: Stopped at epoch 46 
TRAIN: Stopped at epoch 45 
TRAIN: Stopped at epoch 48 


Unnamed: 0,Score,Standard Deviation
Precision,0.5786,0.0065
Recall,0.4142,0.0107
F1,0.4519,0.0087


Unnamed: 0,Score,Standard Deviation
Precision,0.5073,0.0149
Recall,0.3503,0.0238
F1,0.3779,0.024


Unnamed: 0,Precision,Recall,F1
Argument,0.3991,0.1299,0.1299
Facts,0.3849,0.4221,0.4221
Precedent,0.3414,0.2001,0.2001
Ratio of the decision,0.3565,0.5297,0.5297
Ruling by Lower Court,0.0,0.0,0.0
Ruling by Present Court,0.624,0.2231,0.2231
Statute,0.4304,0.2467,0.2467


CPU times: user 57.5 s, sys: 1.83 s, total: 59.4 s
Wall time: 58.9 s


#### TorchLogisticRegression

In [40]:
class TorchLogisticRegression(torch.nn.Module):
  def __init__(self, n_inputs, n_classes, device, verbose=False):
    super(TorchLogisticRegression, self).__init__()
    self.verbose = verbose
    self.device = device
    self.layer = torch.nn.Linear(n_inputs, n_classes)
    xavier_uniform_(self.layer.weight)

  def forward(self, X):
    return self.layer(X)
  
  def predict(self, X):
    y = self.forward(torch.from_numpy(X).float().to(self.device))
    return label_encoder.inverse_transform(y.detach().to('cpu').numpy())

  def fit(self, X, y):
    # SGD params
    learning_rate = 0.5
    momentum = 0.9
    lambda_param = 0.0001 # L2 regularization
    n_iterations = 1000
    decay_rate = 0.95  # learning rate decay
    # early stopping params and variables
    tol = 0.001
    n_iter_no_change = 5
    early_stop_count = 0
    best_loss = float("inf")
    # loss function and optmizer
    self.train()
    criterion = torch.nn.CrossEntropyLoss().to(self.device)
    optimizer = torch.optim.SGD(
      self.parameters(), 
      lr=learning_rate, 
      momentum=momentum, 
      weight_decay=lambda_param)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
      optimizer=optimizer, 
      gamma=decay_rate)
    # Data loader
    batch_size = 64
    train_dl = DataLoader(
      MyDataset(X, y, self.device), 
      batch_size=batch_size, 
      shuffle=True)
    # Train loop
    for i in range(1, n_iterations + 1):
      # iterate mini batches
      for x_batch, y_batch in train_dl:
        optimizer.zero_grad()
        y_hat = self(x_batch)
        loss = criterion(y_hat, y_batch)
        loss.backward()
        optimizer.step()
      lr_scheduler.step()
      # early stop
      if loss >= best_loss - tol:
        early_stop_count += 1
      else:
        early_stop_count = 0
        best_loss = loss
      if early_stop_count == n_iter_no_change:
        break
    
    if self.verbose:
      print(f'TRAIN: Stopped at iteration {i} {"(MAX ITERATION)" if i == n_iterations else ""}')
    self.eval()
    return self


In [44]:
def torch_lr_trainer(X, y):
  lr_ = TorchLogisticRegression(n_features, n_classes, gpu_device, verbose=True).to(gpu_device)
  return lr_.fit(X, y)

In [45]:
%%time
cross_validation(torch_lr_trainer)

TRAIN: Stopped at iteration 11 
TRAIN: Stopped at iteration 15 
TRAIN: Stopped at iteration 9 
TRAIN: Stopped at iteration 13 
TRAIN: Stopped at iteration 9 


Unnamed: 0,Score,Standard Deviation
Precision,0.6307,0.0321
Recall,0.4409,0.0132
F1,0.4807,0.0107


Unnamed: 0,Score,Standard Deviation
Precision,0.5179,0.0308
Recall,0.3674,0.0213
F1,0.3934,0.0205


Unnamed: 0,Precision,Recall,F1
Argument,0.3488,0.1655,0.1655
Facts,0.3897,0.4276,0.4276
Precedent,0.3275,0.2036,0.2036
Ratio of the decision,0.364,0.5156,0.5156
Ruling by Lower Court,0.094,0.0108,0.0108
Ruling by Present Court,0.622,0.2576,0.2576
Statute,0.4434,0.2564,0.2564


CPU times: user 10.6 s, sys: 576 ms, total: 11.2 s
Wall time: 11.1 s


### Summary

In [46]:
from IPython.display import display, update_display

metrics_df = pd.DataFrame(columns=['Precision', 'Recall', 'F1'])
for model_name, metrics in test_metrics.items():
  metrics_df.loc[model_name] = [f'{metrics[0]:.4f}', f'{metrics[1]:.4f}', f'{metrics[2]:.4f}']
metrics_display = display(metrics_df, display_id='metrics_table')

Unnamed: 0,Precision,Recall,F1
LogisticRegression,0.5117,0.3794,0.4064
LinearSVC,0.4824,0.3906,0.4097
KNeighborsClassifier,0.4191,0.3746,0.3804
MLPClassifier,0.4831,0.4031,0.4201
DecisionTreeClassifier,0.3409,0.2321,0.2291
RandomForestClassifier,0.3584,0.2068,0.1857
AdaBoostClassifier,0.3249,0.2673,0.2559
GaussianNB,0.3788,0.4425,0.3862
XGBClassifier,0.5272,0.3376,0.364
TorchMLP,0.5,0.3882,0.4113
