In [None]:
!pip install transformers datasets
import json
import pandas as pd
import numpy as np
from sklearn import metrics

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Resolve the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Names of the eight unfair-clause categories. The list order matches the
# integer label ids 0-7 used by `enc_labels`, and is reused as the column
# order when predictions are turned into a DataFrame further down.
UNFAIR_CATEGORIES = [
    "Limitation of liability",
    "Unilateral termination",
    "Unilateral change",
    "Content removal",
    "Contract by using",
    "Choice of law",
    "Jurisdiction",
    "Arbitration",
]

## Create UnfairToS dataframe

In [None]:
from datasets import load_dataset, list_datasets
dataset = load_dataset("lex_glue", 'unfair_tos')


def _split_to_frame(split):
    """Materialise one dataset split as a DataFrame in a single pass.

    Each row of the split is a dict; collecting them into a list first keeps
    the `labels` values as plain Python lists and avoids growing a frame
    row by row (quadratic, and `DataFrame.append` no longer exists in
    pandas >= 2.0).
    """
    return pd.DataFrame(list(split))


df_train = _split_to_frame(dataset['train'])
df_val = _split_to_frame(dataset['validation'])
df_test = _split_to_frame(dataset['test'])

# Full corpus in train -> validation -> test order with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_val, df_test], ignore_index=True)


In [None]:
# Peek at the first 15 training rows.
df_train.head(15)


In [None]:
from transformers import BertTokenizer

# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize the first training clause into a fixed-length (50 token) PyTorch batch.
example_text = df_train['text'].iloc[0]
bert_input = tokenizer(
    example_text,
    truncation=True,
    padding='max_length',
    max_length=50,
    return_tensors="pt",
)

# The three tensors the tokenizer returns:
print(bert_input['input_ids'])       # id representation of each token 101=CLS, 4430=notice etc.
print(bert_input['token_type_ids'])  # in which sequence a token belongs
print(bert_input['attention_mask'])  # 1 for real tokens ([CLS], [SEP], words), 0 for [PAD]

In [None]:
# Decode the first row of token ids back to text to inspect what the tokenizer
# produced. Note: this rebinds `example_text` (previously the raw clause).
example_text = tokenizer.decode(bert_input.input_ids[0])

print(example_text)

In [None]:
# Label id -> category name for the eight unfair-clause types.
enc_labels = dict(enumerate([
    "Limitation of liability",
    "Unilateral termination",
    "Unilateral change",
    "Content removal",
    "Contract by using",
    "Choice of law",
    "Jurisdiction",
    "Arbitration",
]))

# Multi-hot encode every training row: position i is 1 when label id i is
# listed for that row and 0 otherwise. Rows with no labels become all zeros
# (no separate "no type" column is appended).
labels = [
    [1 if label_id in row_labels else 0 for label_id in enc_labels]
    for row_labels in df_train['labels']
]

print(labels)

In [None]:
# One 0/1 indicator column per category, in label-id order.
labels_df_train = pd.DataFrame.from_records(labels, columns=list(enc_labels.values()))
LABEL_COLUMNS = list(labels_df_train.columns)
# Number of positive training rows per category, smallest count first.
labels_df_train[LABEL_COLUMNS].sum().sort_values()


In [None]:
# Number of encoded training rows (one multi-hot vector per row of df_train).
print(len(labels))

# Dataset Class

In [None]:
# Bidirectional lookup between category names and their integer ids (0-7).
# `id2label` is spelled out once; `label2id` is its inverse.
id2label = dict(enumerate((
    "Limitation of liability",
    "Unilateral termination",
    "Unilateral change",
    "Content removal",
    "Contract by using",
    "Choice of law",
    "Jurisdiction",
    "Arbitration",
)))

label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Label id -> category name. The ids double as positions in the multi-hot
# target vector built by CustomDataset.
enc_labels = {0: "Limitation of liability",
              1: "Unilateral termination",
              2: "Unilateral change",
              3: "Content removal",
              4: "Contract by using",
              5: "Choice of law",
              6: "Jurisdiction",
              7: "Arbitration",
              }


class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized clauses and multi-hot targets.

    Args:
        df: DataFrame with a `text` column (clause string) and a `labels`
            column (iterable of integer label ids per row).
        tokenizer: callable tokenizer; it is invoked as
            `tokenizer(text, max_length=..., padding='max_length', ...)` and
            must return `input_ids`, `attention_mask` and `token_type_ids`.
        max_len: fixed sequence length every example is padded/truncated to.

    Each item is a dict of tensors: `ids`, `mask`, `token_type_ids`
    (all `torch.long`) and `targets` (`torch.float`, one entry per key of
    `enc_labels`).
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = df
        self.text = df['text']
        self.max_len = max_len

        # Multi-hot encode once up front: slot i is 1 iff label id i is listed.
        label_ids = list(enc_labels)
        self.targets = [
            [1 if label_id in row_labels else 0 for label_id in label_ids]
            for row_labels in df['labels']
        ]

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional lookup: DataLoader hands us 0..len-1, which only matches
        # the Series *labels* when the frame has a default RangeIndex.
        # `.iloc` keeps this correct for filtered or re-indexed frames too.
        text = str(self.text.iloc[index])
        # Collapse all runs of whitespace to single spaces.
        text = " ".join(text.split())

        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
        }

In [None]:
# Config
MAX_LEN = 64  # sequence length passed to the tokenizer as max_length
# The train, validation and test loaders all use the same batch size.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = TEST_BATCH_SIZE = 64

## Train:Val:Test Shapes

In [None]:
from transformers import AutoTokenizer, BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Alternative tokenizers that were tried:
# tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("zlucia/custom-legalbert")

# Wrap each split in a CustomDataset sharing the tokenizer and sequence length.
training_set, validation_set, test_set = (
    CustomDataset(frame, tokenizer, MAX_LEN)
    for frame in (df_train, df_val, df_test)
)

# Shapes of the full corpus and each split; the final line is the sum of the
# three split sizes, for comparison against the FULL row count.
print("FULL Dataset: {}".format(df_all.shape))
print("TRAIN Dataset: {}".format(df_train.shape))
print("VAL Dataset: {}".format(df_val.shape))
print("TEST Dataset: {}".format(df_test.shape))
print(len(df_train) + len(df_val) + len(df_test))

In [None]:
# DataLoader keyword arguments per split. Only the training loader shuffles;
# all three load in the main process (num_workers=0).
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
val_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)
test_params = dict(batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
test_loader = DataLoader(test_set, **test_params)

## BERT Model Building

In [None]:
from torch import nn
from transformers import BertModel, AutoModel

# *-base configuration of each pre-trained model, 
# i.e., 12 Transformer blocks, 768 hidden units (DEFAULT), 
# and 12 attention heads (DEFAULT). We train models with the Adam optimizer 
# and an initial learning rate of 3e-5 up to 20 epochs 
# using early stopping on development data. 

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Args:
        model_name: checkpoint name or path handed to
            `BertModel.from_pretrained` (default 'bert-base-uncased').
        num_labels: number of output logits, one per category (default 8).
        dropout: dropout probability applied to the pooled output
            (default 0.3).

    The defaults reproduce the original fixed configuration. The attribute
    names `l1`/`l2`/`l3` are kept so existing state_dict checkpoints still
    load.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=8, dropout=0.3):
        super().__init__()
        self.l1 = BertModel.from_pretrained(model_name)
        # Other BERT-architecture checkpoints that were tried:
        #   "nlpaueb/legal-bert-base-uncased", "zlucia/custom-legalbert"
        self.l2 = torch.nn.Dropout(dropout)
        # Read the head's input width from the encoder config rather than
        # hard-coding 768, so non-base checkpoints work too.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits, one per label, for each example."""
        # With return_dict=False the encoder returns a tuple whose first item
        # is the per-token embeddings (unused here) and whose second item is
        # the pooled sequence representation.
        _, pooled_output = self.l1(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        dropout_output = self.l2(pooled_output)
        linear_output = self.l3(dropout_output)

        return linear_output

# Build the classifier and move its parameters to the selected device.
# As the last expression of the cell, `model.to(device)` also displays the module.
model = BERTClass()
model.to(device)

Save the best model during training

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """Restore model and optimizer state from a saved checkpoint file.

    Args:
        checkpoint_fpath: path of the checkpoint file to load.
        model: model that we want to load checkpoint parameters into.
        optimizer: optimizer whose state is restored from the checkpoint.

    Returns:
        Tuple `(model, optimizer, epoch, valid_loss_min)` where `epoch` is the
        value stored under 'epoch' and `valid_loss_min` is a Python float.
    """
    # Load onto the CPU first so a checkpoint written on a GPU machine can be
    # read on a CPU-only one; load_state_dict then copies the values into the
    # parameters wherever they currently live.
    checkpoint = torch.load(checkpoint_fpath, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # The stored loss may be a plain float or a 0-dim tensor; float() accepts
    # both, whereas `.item()` fails on a plain float.
    valid_loss_min = float(checkpoint['valid_loss_min'])
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [None]:
import shutil, sys   
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint to disk and optionally mirror it as the best model.

    state: checkpoint object to serialise
    is_best: when true, the file is also copied to `best_model_path`
    checkpoint_path: destination of the regular checkpoint file
    best_model_path: destination of the best-model copy
    """
    # Always write the latest checkpoint first.
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Best so far: duplicate the file just written under the best-model name.
    shutil.copyfile(checkpoint_path, best_model_path)

## Train the model

In [None]:
from sklearn.model_selection import GridSearchCV

# EPOCHS = 20

def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and multi-hot targets.

    Args:
        outputs: tensor of logits (pre-sigmoid).
        targets: float tensor of 0/1 targets with the same shape.

    Returns:
        Scalar loss tensor on the same device as `outputs`.
    """
    # NOTE(review): an earlier version built per-class weights
    # [0.2, 0.3, 0.35, 0.5, 0.55, 0.7, 0.75, 0.8] here on every call but never
    # passed them to the loss. If class re-weighting is intended, supply them
    # via BCEWithLogitsLoss(pos_weight=...) — confirm before enabling.
    # Targets follow the logits' device, so no global device is needed.
    return torch.nn.BCEWithLogitsLoss()(outputs, targets.to(outputs.device))


# Candidate learning rates for a sweep; not referenced elsewhere in the
# visible notebook.
params = {
    'lr': [3e-5, 1e-5, 3e-4, 1e-4, 3e-3, 3e-2]
}
# NOTE(review): the model notes above cite an initial rate of 3e-5 — confirm
# that 3e-4 is the intended value.
LEARNING_RATE = 3e-4

# Use the device chosen at the top of the notebook. `model.cuda()` would
# raise on a CPU-only machine. The model is moved before the optimizer is
# built so the optimizer is created over the on-device parameters.
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Targets and sigmoid outputs of the most recent validation pass; later cells
# read these module-level lists.
val_targets = []
val_outputs = []


def train_model(start_epochs, n_epochs, valid_loss_min_input,
                training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, patience=None):
    """Train `model`, validate after every epoch and checkpoint the result.

    Args:
        start_epochs: first epoch number (inclusive).
        n_epochs: last epoch number (inclusive).
        valid_loss_min_input: best validation loss seen so far (use infinity
            for a fresh run).
        training_loader: iterable of batches with keys 'ids', 'mask',
            'token_type_ids' and 'targets'; must support len().
        validation_loader: same batch format, used for validation.
        model: module called as `model(ids, mask, token_type_ids)`.
        optimizer: optimizer over `model`'s parameters.
        checkpoint_path: file the latest checkpoint is written to each epoch.
        best_model_path: file the best checkpoint is copied to.
        patience: optional number of consecutive non-improving epochs after
            which training stops early; None (default) disables this.

    Returns:
        The model as it stands after the last epoch that was run.

    Side effects:
        Rebinds the module-level `val_targets` / `val_outputs` to the targets
        and sigmoid outputs of the latest validation pass only.
    """
    global val_targets, val_outputs

    # Run on whatever device the model already lives on, not a fixed 'cuda'.
    device = next(model.parameters()).device

    def _to_device(batch):
        # Move one batch to the model's device with the dtypes used below.
        return (
            batch['ids'].to(device=device, dtype=torch.long),
            batch['mask'].to(device=device, dtype=torch.long),
            batch['token_type_ids'].to(device=device, dtype=torch.long),
            batch['targets'].to(device=device, dtype=torch.float),
        )

    # initialize tracker for minimum validation loss
    valid_loss_min = valid_loss_min_input
    stale_epochs = 0

    for epoch in range(start_epochs, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids, mask, token_type_ids, targets = _to_device(data)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Running mean of the batch losses. It is already an average, so
            # it must not be divided by the number of batches again.
            train_loss += (loss.item() - train_loss) / (batch_idx + 1)
        print('############# Epoch {}: Training End     #############'.format(epoch))

        ######################
        # validate the model #
        ######################
        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        model.eval()
        # Start from empty lists each epoch; otherwise the results of every
        # epoch pile up and no longer line up with the validation rows.
        val_targets = []
        val_outputs = []
        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader):
                ids, mask, token_type_ids, targets = _to_device(data)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss += (loss.item() - valid_loss) / (batch_idx + 1)
                val_targets.extend(targets.cpu().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
        print('############# Epoch {}: Validation End     #############'.format(epoch))

        # print training/validation statistics
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        is_best = valid_loss <= valid_loss_min
        if is_best:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min, valid_loss))
            valid_loss_min = valid_loss
            stale_epochs = 0
        else:
            stale_epochs += 1

        # Store the running minimum so a resumed run keeps comparing against
        # the best loss, not just the previous epoch's.
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss_min,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        # One write per epoch; save_ckp also copies it to best_model_path
        # when this epoch is the best so far.
        save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)

        print('############# Epoch {}  Done   #############\n'.format(epoch))

        if patience is not None and stale_epochs >= patience:
            print('Early stopping: no improvement in validation loss for {} epoch(s).'.format(stale_epochs))
            break

    return model

In [None]:
# Absolute paths under /content — presumably a Colab runtime; adjust when
# running elsewhere.
checkpoint_path = '/content/current_checkpoint.pt'
best_model = '/content/best_model.pt'

# early stopping patience; how long to wait after last time validation loss improved.
# NOTE(review): `patience` is defined here but is not passed to train_model in the call below.
patience = 3
model = model.to(device)
# Epochs 1..20, starting from an infinite "best" validation loss.
# `np.inf` is the supported spelling; the `np.Inf` alias was removed in NumPy 2.0.
trained_model = train_model(1,
                            20,
                            np.inf,
                            training_loader,
                            validation_loader,
                            model, optimizer,
                            checkpoint_path,
                            best_model)

In [None]:
# Threshold the sigmoid probabilities at 0.5 to get 0/1 predictions per category.
val_preds = np.greater(np.asarray(val_outputs), 0.5).astype(int)
val_preds

In [None]:
# Label the prediction columns with the category names (same order as the label ids).
preds_df = pd.DataFrame(val_preds, columns = UNFAIR_CATEGORIES) 

In [None]:
# Put the predictions next to the validation rows. Column-wise concat aligns on
# the row index, so this assumes preds_df has exactly one row per df_val row
# in the same order — TODO confirm.
df_concat = pd.concat([df_val, preds_df], axis=1)

In [None]:
# Accuracy plus micro- and macro-averaged F1 for the validation predictions.
accuracy = metrics.accuracy_score(val_targets, val_preds)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(val_targets, val_preds, average=averaging)
    for averaging in ('micro', 'macro')
)
print(
    f"Accuracy Score = {accuracy}",
    f"F1 Score (Micro) = {f1_score_micro}",
    f"F1 Score (Macro) = {f1_score_macro}",
    sep="\n",
)

In [None]:
test_targets = []
test_outputs = []

def test(device, model, test_loader):
    # Settings
    model.eval()

    with torch.no_grad():
      for batch_idx, data in enumerate(test_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            # loss = loss_fn(outputs, targets)
            # valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            test_targets.extend(targets.cpu().detach().numpy().tolist())
            test_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())


In [None]:
# Evaluate on the held-out test split using the in-memory `model`.
# NOTE(review): no checkpoint is loaded in the visible cells, so this scores the
# model as training left it rather than the saved best model — confirm intended.
test(device, model, test_loader)

In [None]:
# Threshold the sigmoid probabilities at 0.5; kept as floats (0.0 / 1.0).
test_preds = np.greater(np.asarray(test_outputs), 0.5).astype(float)
test_preds

In [None]:
import numpy
# Append one extra column that is 1.0 for rows with no positive category and
# 0.0 otherwise, so rows without any label still count as a class in the metrics.
test_targets = numpy.array(test_targets)
# Builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
test_targets_ext = np.zeros((test_targets.shape[0], test_targets.shape[1] + 1), dtype=float)
test_targets_ext[:, :-1] = test_targets
test_targets_ext[:, -1] = (np.sum(test_targets, axis=1) == 0).astype(float)
test_targets_ext

In [None]:
# Same extension for the predictions: the extra last column is 1.0 when no
# category was predicted for the row.
# Builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
test_preds_ext = np.zeros((test_preds.shape[0], test_preds.shape[1] + 1), dtype=float)
test_preds_ext[:, :-1] = test_preds
test_preds_ext[:, -1] = (np.sum(test_preds, axis=1) == 0).astype(float)
test_preds_ext

In [None]:
# Accuracy plus micro- and macro-averaged F1 on the test split, computed over
# the extended matrices (eight categories plus the "none" column).
accuracy = metrics.accuracy_score(test_targets_ext, test_preds_ext)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(test_targets_ext, test_preds_ext, average=averaging)
    for averaging in ('micro', 'macro')
)
print(
    f"Accuracy Score = {accuracy}",
    f"F1 Score (Micro) = {f1_score_micro}",
    f"F1 Score (Macro) = {f1_score_macro}",
    sep="\n",
)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix as mcm, classification_report

In [None]:
# Multilabel confusion matrices for the validation predictions (`mcm` is
# sklearn's multilabel_confusion_matrix).
cm = mcm(val_targets, val_preds)

In [None]:
# Text report from sklearn's classification_report for the validation predictions.
print(classification_report(val_targets, val_preds))