In [None]:
!pip install transformers datasets
!pip install sentencepiece
import json
import pandas as pd
import numpy as np
from sklearn import metrics

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Choose where tensors and the model will live: CUDA when PyTorch can see a GPU,
# the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of the one at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
import torch
# Ask PyTorch's CUDA caching allocator to release the memory it holds but is not using.
torch.cuda.empty_cache()

In [None]:
# Clause categories of the unfair-ToS task, keyed by integer class id.
# Ids 0-7 are the unfair clause types. Id 8 is the sentinel that the data-loading
# cell appends to sentences that carry no label at all; without an entry for it,
# mapping ids back to names raises KeyError for every such sentence.
id2label = {
    0: "Limitation of liability",
    1: "Unilateral termination",
    2: "Unilateral change",
    3: "Content removal",
    4: "Contract by using",
    5: "Choice of law",
    6: "Jurisdiction",
    7: "Arbitration",
    8: "No violation",
}

# Reverse lookup: label name -> integer class id.
label2id = {name: idx for idx, name in id2label.items()}


In [None]:
from datasets import load_dataset, list_datasets
dataset = load_dataset("lex_glue", 'unfair_tos')

# Class id given to sentences whose label list is empty.
NO_VIOLATION_ID = 8


def _parse_label_names(cell):
    """Turn a comma-separated string of label names into a list of class ids.

    Raises KeyError when a name is not a key of `label2id`.
    """
    return [label2id[name.strip()] for name in cell.split(',')]


def _split_to_records(split):
    """Copy every row of a dataset split into a plain dict.

    Rows with an empty label list get `[NO_VIOLATION_ID]` so that every
    example has at least one target label.
    """
    records = []
    for row in split:
        record = dict(row)
        record['labels'] = list(row['labels']) or [NO_VIOLATION_ID]
        records.append(record)
    return records


# Training data comes from a local CSV whose `labels` column holds label names.
# The column is replaced as a whole: assigning to the row objects yielded by
# `iterrows()` may write to a copy and leave the frame unchanged.
df_train = pd.read_csv('/content/train_alphabetical.csv')
df_train['labels'] = df_train['labels'].apply(_parse_label_names)

# Validation and test data come from the hub dataset. Each frame is built once
# from a list of records; `DataFrame.append` in a loop copies the frame on every
# call and no longer exists in pandas 2.x.
val_records = _split_to_records(dataset['validation'])
test_records = _split_to_records(dataset['test'])

df_val = pd.DataFrame(val_records)
df_test = pd.DataFrame(test_records)

# `df_all` holds the validation rows followed by the test rows (no training rows).
df_all = pd.DataFrame(val_records + test_records)

In [None]:
# Preview the first 35 rows of the training frame.
df_train.head(35)

In [None]:
# Preview the first 35 rows of the validation frame.
df_val.head(35)

In [None]:
# # get frequency of labels
# import collections, itertools

# list_labelslist = df_train['labels'].tolist()
# freq = collections.defaultdict(int)  # 0 by default
# for x in itertools.chain.from_iterable(list_labelslist):
#     freq[x] += 1

# sorted_label_freq = dict(sorted(freq.items(), key=lambda item: item[1], reverse=True))
# sorted_label_freq = list(sorted_label_freq.keys())
# sorted_label_freq

In [None]:
# # reorder labels in train set by frequency
# for index, row in df_train.iterrows():
#   label_list = row['labels']
#   if len(label_list) > 1:
#     row['labels'] = sorted(label_list, key=lambda x: sorted_label_freq.index(x))

In [None]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes a text column and its label names.

    Each item pairs the whitespace-normalised text (the source) with the string
    ``'summarize: <name>, <name>, ...'`` built from the row's label names (the
    target). Both are padded or truncated to fixed lengths.

    Args:
        df: DataFrame with a ``text`` column and a ``labels`` column holding an
            iterable of class ids per row.
        tokenizer: callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_len: token length of the encoded source.
        SUMMARY_LEN: token length of the encoded target.

    Raises:
        KeyError: if a class id in ``df['labels']`` is not a key of ``id2label``.
    """

    def __init__(self, df, tokenizer, max_len, SUMMARY_LEN):
        super().__init__()
        self.tokenizer = tokenizer
        self.data = df
        self.text = df['text']
        self.max_len = max_len
        self.labels_len = SUMMARY_LEN

        # Resolve class ids to label names once, in row order.
        self.labels = [
            [id2label[label_id] for label_id in label_ids]
            for label_ids in df['labels']
        ]

    def __len__(self):
        """Return the number of rows."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded source/target tensors of the row at position `index`."""
        # `self.labels` is a list and therefore positional, so the text has to be
        # looked up by position too; a label-based lookup fails or picks the wrong
        # row as soon as the frame's index is not 0..n-1.
        text = str(self.text.iloc[index])
        text = " ".join(text.split())

        labelstostring = 'summarize: ' + ", ".join(self.labels[index])

        source = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        target = self.tokenizer(
            labelstostring,
            max_length=self.labels_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Drop only the leading batch axis; a bare squeeze() would also remove
        # the sequence axis whenever its length is 1.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)
        target_mask = target['attention_mask'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            # Despite its name this entry carries the target attention mask.
            'target_ids_y': target_mask.to(dtype=torch.long)
        }


In [None]:
# Creating the training function. This will be called in the main function. It is run once per epoch.
# The model is put into train mode, then we enumerate over the training loader and pass each batch to the defined network.

def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one optimisation pass over `loader`.

    Args:
        epoch: epoch number, used only in the progress message.
        tokenizer: object exposing `pad_token_id`.
        model: sequence-to-sequence model called with `input_ids`,
            `attention_mask` and `labels`; element 0 of its output is the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with `source_ids`, `source_mask` and
            `target_ids` tensors.
        optimizer: optimizer stepped once per batch.
    """
    model.train()
    for step, data in enumerate(loader, 0):
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)

        # Use the complete target sequence as labels and let the model derive the
        # decoder inputs from it. Hand-shifting the targets (targets[:, :-1] as
        # decoder input, targets[:, 1:] as labels) drops the decoder start token,
        # so the first target token is never learned and training does not match
        # the way generate() begins decoding.
        # clone() keeps the batch tensor intact when .to() returns it unchanged.
        lm_labels = data['target_ids'].to(device, dtype=torch.long).clone()
        # -100 marks padding positions to be excluded from the loss.
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100

        outputs = model(input_ids=ids, attention_mask=mask, labels=lm_labels)
        loss = outputs[0]

        if step % 10 == 0:
            print("Training Loss: ", loss.item())

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validate(epoch, tokenizer, model, device, loader, max_length=10, num_beams=2,
             repetition_penalty=2.5, length_penalty=1.0, early_stopping=True):
    """Generate predictions for every batch in `loader` and decode them to text.

    Args:
        epoch: accepted for call compatibility; not used.
        tokenizer: object whose `decode` turns a sequence of token ids into text.
        model: model exposing `eval()` and `generate(...)`.
        device: device the batch tensors are moved to.
        loader: iterable of batches with `source_ids`, `source_mask` and
            `target_ids` tensors.
        max_length, num_beams, repetition_penalty, length_penalty, early_stopping:
            forwarded unchanged to `model.generate`. The defaults are the values
            this function previously hard-coded.

    Returns:
        `(predictions, actuals)`: two lists of decoded strings, aligned by example.
    """
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for step, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype=torch.long)
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
                num_beams=num_beams,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty,
                early_stopping=early_stopping,
            )
            preds = [
                tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for g in generated_ids
            ]
            target = [
                tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for t in y
            ]
            if step % 100 == 0:
                print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
# Hyperparameters and run configuration used by the cells below.
TRAIN_BATCH_SIZE = 128    # batch size of the training DataLoader
VALID_BATCH_SIZE = 128    # batch size of the validation DataLoader
TRAIN_EPOCHS = 20        # number of passes over the training loader
VAL_EPOCHS = 20  # number of iterations of the evaluation loop in the training cell
LEARNING_RATE = 1e-4    # learning rate handed to the optimizer
SEED = 42               # seed for the PyTorch and NumPy random generators
MAX_LEN = 32  # token length of the encoded source text (longer inputs are truncated)
SUMMARY_LEN = 10  # token length of the encoded target string (longer targets are truncated)

# Seed the random generators and request deterministic cuDNN kernels for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# Tokenizer for encoding the text; loaded from the "t5-small" checkpoint
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Wrap the three frames in CustomDataset objects that share the tokenizer and length limits
training_set = CustomDataset(df_train, tokenizer, MAX_LEN, SUMMARY_LEN)
validation_set = CustomDataset(df_val, tokenizer, MAX_LEN, SUMMARY_LEN)
test_set = CustomDataset(df_test, tokenizer, MAX_LEN, SUMMARY_LEN)

# Report the (rows, columns) shape of each frame
print("FULL Dataset: {}".format(df_all.shape))
print("TRAIN Dataset: {}".format(df_train.shape))
print("VAL Dataset: {}".format(df_val.shape))
print("TEST Dataset: {}".format(df_test.shape))

# DataLoader settings: training batches are shuffled, validation batches keep their order
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Create the DataLoaders for training and validation.
# No loader is created for test_set in this cell.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(validation_set, **val_params)

In [None]:
# Load the pretrained "t5-small" checkpoint with its language-modelling head and
# move it to the selected device.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to(device)

# Optimizer that updates the network weights during fine-tuning.
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


# Evaluation: generate predictions for the validation set and save predictions
# and actuals side by side in predictions.csv.
# The weights no longer change at this point, so one pass is enough. Looping
# VAL_EPOCHS times only repeated the same generation and overwrote the same file.
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
predictions, actuals = validate(0, tokenizer, model, device, val_loader)
final_df = pd.DataFrame({'pred':predictions,'true':actuals})
final_df.to_csv('/content/sample_data/predictions.csv')
print('Output Files generated for review')

In [None]:
# Preview the first 150 prediction/target pairs.
final_df.head(150)

## Clean Outputs
i.e. remove the string 'summarize:' from the target outputs and keep only the label strings in the prediction outputs.

For example: ': Unilateral change' -> 'Unilateral change', 'summarize: No violation' -> 'No violation'


In [None]:
def clean_pred(row_pred):
    """Reduce a decoded prediction to the known label names it contains.

    Args:
        row_pred: decoded model output; any value is accepted and passed
            through `str()` first.

    Returns:
        The matching label names joined with ', ' in `id2label` order, or an
        empty string when no known label occurs in the prediction.
    """
    decoded = str(row_pred)
    matched = [name for name in id2label.values() if name in decoded]
    # Join the matches; cutting at the first comma to drop a trailing separator
    # would also discard every label after the first one.
    return ', '.join(matched)

# Keep only known label names in the predictions.
final_df['pred'] = final_df['pred'].apply(clean_pred)

# Strip the 'summarize: ' prefix from the targets. Splitting once and taking the
# last piece leaves strings without the prefix unchanged, so re-running this
# cell no longer raises IndexError.
final_df['true'] = final_df['true'].apply(lambda x: x.split('summarize: ', 1)[-1])

In [None]:
# Preview the first 100 rows after cleaning.
final_df.head(100)

## Evaluate performance

In [None]:
from sklearn import metrics
# Pull the cleaned prediction and target strings out as NumPy arrays.
val_targets, val_preds = (final_df[column].to_numpy() for column in ('true', 'pred'))

# Score the whole validation set once per averaging mode.
f1_score_micro, f1_score_macro = (
    metrics.f1_score(val_targets, val_preds, average=mode)
    for mode in ('micro', 'macro')
)

print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
# Per-label F1, scored against the whole validation set.
# Restricting the rows to those whose true label equals `label` removes every
# false positive for that label, so the result is not an F1 score (and an empty
# subset would be passed to the scorer). `labels=[label]` limits the score to one
# class while still counting its false positives and false negatives.
all_targets = final_df['true'].to_numpy()
all_preds = final_df['pred'].to_numpy()

for label in id2label.values():
    # zero_division=0 reports 0.0 for a label that never occurs instead of warning.
    label_f1 = metrics.f1_score(
        all_targets, all_preds, labels=[label], average='micro', zero_division=0
    )
    # With a single label the micro and macro averages coincide; both names and
    # both output lines are kept so the printed report keeps its format.
    f1_score_micro = label_f1
    f1_score_macro = label_f1
    print(f"F1 Score (Micro) {label} = {f1_score_micro}")
    print(f"F1 Score (Macro) {label} = {f1_score_macro}")
    print()
  print()