In [None]:
!pip install transformers
!pip install sentencepiece
!pip install pyspellchecker

In [None]:
import pandas as pd
import json
import torch

Data: ContractNLI dataset, available at https://huggingface.co/datasets/kiddothe2b/contract-nli

## Hyperparameters

In [None]:
# Training configuration shared by the cells below.
BATCH_SIZE = 16  # batch size handed to get_data_loaders()
NUM_EPOCHS = 10  # read as a global inside train()
LEARNING_RATE = 2e-5  # lr passed to the AdamW optimizer
MAX_TOKEN_LEN = 512  # max_len given to CNLIDataAlbert; the model printout shows 512 position embeddings
SEED = 444  # value passed to set_seed()

## Seeders

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook may touch.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices) so that repeated runs start from the
    same state.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Local imports keep this cell self-contained.
    import random

    import numpy as np

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Harmless no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

## Prepare CUDA

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Data Splitting

In [None]:
data = pd.read_json("/content/contract_nli_v1.jsonl", lines=True)
#data = pd.read_json("/content/contract_nli_long.jsonl", lines=True)

## Load raw data into Pandas DF

In [None]:
# One list of {"premise", "hypothesis", "label"} records per split.
train_data = []
val_data = []
test_data = []

def process_data(path='/content/contract_nli_long.jsonl'):
    """Flatten a document-level JSONL file into per-hypothesis records.

    Each non-blank line of ``path`` is parsed as a JSON object with the keys
    ``subset``, ``premise`` and ``hypothesises/labels`` (a list of objects
    holding ``hypothesis`` and ``label``). One record is appended per
    hypothesis to ``train_data``, ``val_data`` or ``test_data`` depending on
    whether ``subset`` is ``'train'``, ``'dev'`` or ``'test'``. Documents with
    any other ``subset`` value are ignored.

    Args:
        path: Location of the JSONL file. Defaults to the path this notebook
            originally hard-coded.
    """
    # Dispatch table: subset name -> module-level list that collects it.
    buckets = {'train': train_data, 'dev': val_data, 'test': test_data}

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            document = json.loads(line)
            bucket = buckets.get(document['subset'])
            if bucket is None:
                continue
            for sample in document['hypothesises/labels']:
                bucket.append({
                    "premise": document['premise'],
                    "hypothesis": sample['hypothesis'],
                    "label": sample['label'],
                })

process_data()
train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)
test_df = pd.DataFrame(test_data)

In [None]:
train_df

Unnamed: 0,premise,hypothesis,label
0,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not reverse engineer any...,neutral
1,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall destroy or return some C...,entailment
2,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Agreement shall not grant Receiving Party any ...,entailment
3,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Receiving Party shall not disclose the fact th...,entailment
4,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...,Confidential Information shall only include te...,neutral
...,...,...,...
7186,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may create a copy of some Conf...,neutral
7187,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party shall notify Disclosing Party ...,entailment
7188,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may acquire information simila...,entailment
7189,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...,Receiving Party may share some Confidential In...,entailment


## Preprocessing & Cleaning Data

In [None]:
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy and does not cross line breaks.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

def remove_special_characters(text):
    """Flatten newlines to spaces, then drop everything except ASCII
    letters, digits and whitespace."""
    single_line = ' '.join(text.split('\n'))
    return re.sub(r'[^a-zA-Z0-9\s]', '', single_line)

def convert_to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    return text.lower()

def remove_duplicates(texts):
    """Join the distinct items of ``texts`` with ``', '``.

    Items are kept in order of first appearance, so the result is the same
    on every run. Going through a ``set`` would leave the order arbitrary.

    Args:
        texts: Iterable of strings (items must be hashable).

    Returns:
        A single string containing each distinct item once.
    """
    # dict preserves insertion order, giving a deterministic de-duplication.
    unique_texts = dict.fromkeys(texts)
    return ', '.join(unique_texts)

def correct_spelling(text):
    """Spell-correct ``text`` token by token and re-join with single spaces.

    Args:
        text: Raw string; it is split with ``word_tokenize``.

    Returns:
        The corrected tokens joined by ``' '``.
    """
    spell = SpellChecker()
    corrected_tokens = []
    for word in word_tokenize(text):
        suggestion = spell.correction(word)
        # correction() may return None when it has no candidate; keep the
        # original token so the join below never sees a non-string.
        corrected_tokens.append(word if suggestion is None else suggestion)
    return ' '.join(corrected_tokens)

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

def remove_stopwords(tokens):
    """Drop English stop words from a token list.

    The comparison is exact, so tokens should already be lower-cased.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with every token that is not in NLTK's English stop-word list.
    """
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        # Only 'punkt' and 'wordnet' are downloaded alongside the imports,
        # so fetch the stop-word corpus on first use.
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

def stem_text(tokens):
    """Apply the Porter stemmer to every token and return the stems as a list."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

def lemmatize_text(tokens):
    """Lemmatize every token with WordNet (default part of speech) and
    return the results as a list."""
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))

def balance_text_data(X, y):
    """Oversample with SMOTE (``sampling_strategy='auto'``).

    Returns the resampled ``(X, y)`` pair produced by ``fit_resample``.
    """
    sampler = SMOTE(sampling_strategy='auto')
    features, targets = sampler.fit_resample(X, y)
    return features, targets

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Cleaning steps applied, in this order, to the `premise` column of every split.
# Currently disabled experiments: remove_html_tags, correct_spelling,
# remove_duplicates and stem_text.
premise_steps = (
    remove_special_characters,
    convert_to_lowercase,
    tokenize_text,
    lemmatize_text,
)

# After the last step each `premise` cell holds a list of lemmatized tokens.
for split_df in (train_df, val_df, test_df):
    for step in premise_steps:
        split_df['premise'] = split_df['premise'].apply(step)

In [None]:
' '.join(train_df.iloc[0]['premise'])

In [None]:
# Rebind the three split names to slices of `data`, selected by its `subset` column.
# NOTE(review): this replaces the frames built by process_data() and cleaned in the
# preprocessing cell above, so that cleaning does not reach the data loaders — confirm
# this is intended.
train_df = data[data['subset'] == 'train'] # 70%
val_df = data[data['subset'] == 'dev'] # 10%
test_df = data[data['subset'] == 'test'] # 20%

In [None]:
train_df

Unnamed: 0,premise,hypothesis,label,subset
0,2.3 Provided that the Recipient has a written ...,Receiving Party shall not reverse engineer any...,neutral,train
1,5. All Confidential Information in any form an...,Receiving Party shall destroy or return some C...,entailment,train
2,4. Nothing in this Agreement is to be construe...,Agreement shall not grant Receiving Party any ...,entailment,train
3,11. The Recipient shall not advertise or other...,Receiving Party shall not disclose the fact th...,entailment,train
4,"1. “Confidential Information”, whenever used i...",Confidential Information shall only include te...,neutral,train
...,...,...,...,...
6814,"For purposes of this Agreement, the Provider’s...",Confidential Information may include verbally ...,entailment,train
6815,"“Confidential Information” does not include, a...",Receiving Party may create a copy of some Conf...,neutral,train
6816,(c) If the Recipient or any of the Recipient’s...,Receiving Party shall notify Disclosing Party ...,entailment,train
6817,"However, the Provider’s “Confidential Informat...",Receiving Party may acquire information simila...,entailment,train


In [None]:
#per_train = (train_df.shape[0]/len(data))*100
#per_val = (val_df.shape[0]/len(data))*100
#per_test = (test_df.shape[0]/len(data))*100
#per_train, per_val, per_test

(69.68698517298188, 10.049423393739703, 20.263591433278417)

## Data Loaders

In [None]:
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import os
from transformers import AlbertTokenizer

class CNLIDataAlbert(Dataset):
    """Tokenise premise/hypothesis pairs for ALBERT and serve them as DataLoaders.

    Every row of a split becomes one or more examples laid out as
    ``[CLS] premise [SEP] hypothesis [SEP]``. Segment id 0 covers the
    ``[CLS] premise [SEP]`` part and segment id 1 the ``hypothesis [SEP]``
    part. When a pair does not fit in ``max_len`` tokens the premise is split
    into consecutive windows. Each window is paired with the full hypothesis
    and carries the row's label, so no example exceeds ``max_len``.

    Args:
        train_df, val_df, test_df: Frames with ``premise``, ``hypothesis`` and
            ``label`` columns. Labels must be keys of ``label_dict``.
        max_len: Maximum number of tokens per example, special tokens included.
    """

    def __init__(self, train_df, val_df, test_df, max_len):
        self.label_dict = {'entailment':0, 'contradiction':1, 'neutral':2}
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.max_len = max_len

        self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', do_lower_case=True)
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.init_data()

    def init_data(self):
        """Tokenise all three splits and cache them as TensorDatasets."""
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)
        self.test_data = self.load_data(self.test_df)

    @staticmethod
    def _build_pair_chunks(premise_id, hypo_id, cls_id, sep_id, max_len):
        """Assemble well-formed model inputs of at most ``max_len`` tokens.

        Args:
            premise_id: Token ids of the premise (no special tokens).
            hypo_id: Token ids of the hypothesis (no special tokens).
            cls_id, sep_id: Ids of the ``[CLS]`` and ``[SEP]`` tokens.
            max_len: Upper bound on the length of each returned example.

        Returns:
            A list of ``(token_ids, segment_ids)`` pairs of equal length. A
            pair that already fits yields exactly one entry.

        Raises:
            ValueError: If ``max_len`` cannot hold the three special tokens
                plus at least one content token.
        """
        budget = max_len - 3  # room left after [CLS] and the two [SEP]
        if budget < 1:
            raise ValueError(f"max_len must be at least 4 to fit the special tokens, got {max_len}")

        premise_id = list(premise_id)
        # Always leave room for at least one premise token per chunk.
        hypo_id = list(hypo_id)[:budget - 1 if premise_id else budget]
        window = budget - len(hypo_id)

        # An empty premise still produces a single (hypothesis-only) example.
        starts = range(0, len(premise_id), window) if premise_id else [0]

        chunks = []
        for start in starts:
            piece = premise_id[start:start + window]
            tokens = [cls_id] + piece + [sep_id] + hypo_id + [sep_id]
            segments = [0] * (len(piece) + 2) + [1] * (len(hypo_id) + 1)
            chunks.append((tokens, segments))
        return chunks

    def load_data(self, df):
        """Convert one split into a padded ``TensorDataset``.

        Args:
            df: Object whose ``premise``, ``hypothesis`` and ``label`` columns
                expose ``to_list()``.

        Returns:
            ``TensorDataset(token_ids, attention_mask, segment_ids, labels)``.
            Sequences are right-padded with 0 to the longest example in the
            split. The dataset size is also printed.
        """
        max_len = self.max_len
        token_ids = []
        mask_ids = []
        seg_ids = []
        y = []

        premise_list = df['premise'].to_list()
        hypo_list = df['hypothesis'].to_list()
        label_list = df['label'].to_list()

        for (premise, hypothesis, label) in zip(premise_list, hypo_list, label_list):
            # Encode each side on its own; special tokens are added when the pair is assembled.
            premise_id = self.tokenizer.encode(premise, add_special_tokens=False, max_length=max_len, truncation=True)
            hypo_id = self.tokenizer.encode(hypothesis, add_special_tokens=False, max_length=max_len, truncation=True)
            label_id = self.label_dict[label]

            pair_chunks = self._build_pair_chunks(
                premise_id,
                hypo_id,
                self.tokenizer.cls_token_id,
                self.tokenizer.sep_token_id,
                max_len,
            )
            for pair_token_ids, segment_ids in pair_chunks:
                token_ids.append(torch.tensor(pair_token_ids))
                seg_ids.append(torch.tensor(segment_ids))
                # Real tokens get mask 1; padding added below gets 0.
                mask_ids.append(torch.ones(len(pair_token_ids), dtype=torch.long))
                y.append(label_id)

        token_ids = pad_sequence(token_ids, batch_first=True)
        mask_ids = pad_sequence(mask_ids, batch_first=True)
        seg_ids = pad_sequence(seg_ids, batch_first=True)
        y = torch.tensor(y)
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
        print(len(dataset))
        return dataset

    def get_data_loaders(self, batch_size=32, shuffle=True):
        """Build the train, validation and test loaders.

        Args:
            batch_size: Batch size used by all three loaders.
            shuffle: Whether the train and validation loaders shuffle; the
                test loader never shuffles.

        Returns:
            ``(train_loader, val_loader, test_loader)``.
        """
        train_loader = DataLoader(
            self.train_data,
            shuffle=shuffle,
            batch_size=batch_size
        )
        val_loader = DataLoader(
            self.val_data,
            shuffle=shuffle,
            batch_size=batch_size
        )
        test_loader = DataLoader(
            self.test_data,
            shuffle=False,
            batch_size=batch_size
        )
        return train_loader, val_loader, test_loader

In [None]:
cnli_dataset = CNLIDataAlbert(train_df, val_df, test_df, max_len=MAX_TOKEN_LEN)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

6852
988
2005


In [None]:
train_loader, val_loader, test_loader = cnli_dataset.get_data_loaders(batch_size=BATCH_SIZE)

In [None]:
#for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
    #print('batch', pair_token_ids.shape)

## Load ALBERT Model

In [None]:
from transformers import AlbertForSequenceClassification, AdamW, AlbertConfig

model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=3)
model.to(device)

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [None]:
# Cross-layer parameters Sharing DONT RUN
#from transformers import AlbertConfig, AlbertModel

# Initialize the configuration
#config = AlbertConfig(
#   num_hidden_layers=12, # Total number of layers
#   num_attention_heads=12,
#   hidden_size=768,
#   intermediate_size=3072,
#   num_hidden_groups=1, # Number of layer groups
#)

# Initialize the model
#model = AlbertModel(config)


## Optimizer

In [None]:
# Split the parameters into two groups: weights that receive L2 weight decay,
# and biases / layer-norm scales that do not.
param_optimizer = list(model.named_parameters())
# Substrings identifying parameters exempt from decay. The model printout
# names its norm layers `LayerNorm` and `full_layer_layer_norm`.
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    # AdamW reads the per-group option under the key `weight_decay`.
    {'params' : [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params' : [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

In [None]:
# Build the optimizer over the two parameter groups defined in the previous cell.
# `AdamW` here is the class imported from `transformers` alongside the model.
# NOTE(review): transformers' own AdamW (the one accepting `correct_bias`) has been
# deprecated in favour of torch.optim.AdamW — confirm the installed version still exports it.
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)



In [None]:
#if dont want to retrain albert
#for name, param in model.named_parameters():
  #if name.startswith('albert'):
    #param.requires_grad = False

In [None]:
def count_parameters(model):
    """Return the number of scalar parameters in ``model`` that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 11,685,891 trainable parameters


## Metrics

In [None]:
def multi_acc(y_pred, y_test):
    """Fraction of rows whose highest-scoring class equals the target.

    Args:
        y_pred: Per-class scores; the class dimension is dim 1.
        y_test: Target class indices, one per row.

    Returns:
        A scalar float tensor with the batch accuracy.
    """
    log_probs = torch.log_softmax(y_pred, dim=1)
    predicted = log_probs.argmax(dim=1)
    n_correct = (predicted == y_test).sum().float()
    return n_correct / float(y_test.size(0))

## Trainer

In [None]:
#Trainable
import time

def _run_epoch(model, loader, optimizer=None):
    """Run one pass over ``loader``; update weights only when ``optimizer`` is given.

    Returns:
        ``(mean_loss, accuracy)``, both weighted by batch size so that a short
        final batch does not skew the result. Both are NaN for an empty loader.
    """
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0
    for pair_token_ids, mask_ids, seg_ids, y in loader:
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)

        if optimizer is not None:
            optimizer.zero_grad()

        # Read the fields by name rather than relying on the order of .values().
        outputs = model(pair_token_ids, token_type_ids=seg_ids, attention_mask=mask_ids, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        if optimizer is not None:
            loss.backward()
            optimizer.step()

        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        n_correct += (logits.argmax(dim=1) == labels).sum().item()
        n_samples += batch_size

    if n_samples == 0:
        return float('nan'), float('nan')
    return loss_sum / n_samples, n_correct / n_samples


def train(model, train_loader, val_loader, optimizer):
    """Fine-tune ``model`` for ``NUM_EPOCHS`` epochs, validating after each one.

    Uses the module-level ``device`` and ``NUM_EPOCHS``. After every epoch the
    train/validation loss and accuracy and the elapsed wall-clock time are printed.

    Args:
        model: Sequence-classification model whose output exposes ``loss`` and ``logits``.
        train_loader: Yields ``(token_ids, attention_mask, segment_ids, labels)`` batches.
        val_loader: Same layout as ``train_loader``; used without gradient tracking.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        A list with one dict per epoch (``epoch``, ``train_loss``,
        ``train_acc``, ``val_loss``, ``val_acc``, ``seconds``).
    """
    history = []

    for epoch in range(NUM_EPOCHS):
        start = time.time()

        model.train()
        train_loss, train_acc = _run_epoch(model, train_loader, optimizer)

        model.eval()
        with torch.no_grad():
            val_loss, val_acc = _run_epoch(model, val_loader)

        elapsed = time.time() - start
        hours, rem = divmod(elapsed, 3600)
        minutes, seconds = divmod(rem, 60)

        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

        history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'seconds': elapsed,
        })

    return history


In [None]:
train(model, train_loader, val_loader, optimizer)

Epoch 1: train_loss: 0.9890 train_acc: 0.4557 | val_loss: 0.9369 val_acc: 0.4735
00:11:02.09
Epoch 2: train_loss: 0.9863 train_acc: 0.4529 | val_loss: 0.9399 val_acc: 0.4721
00:11:04.31
Epoch 3: train_loss: 0.9789 train_acc: 0.4484 | val_loss: 0.9390 val_acc: 0.4724
00:11:04.43
Epoch 4: train_loss: 0.9108 train_acc: 0.4851 | val_loss: 0.7860 val_acc: 0.5901
00:11:05.13
Epoch 5: train_loss: 0.7587 train_acc: 0.6356 | val_loss: 0.7696 val_acc: 0.6435
00:11:04.67
Epoch 6: train_loss: 0.6649 train_acc: 0.7032 | val_loss: 0.6949 val_acc: 0.6882
00:11:04.54


In [None]:
torch.save(model.state_dict(), '/content/model.pth')

In [None]:
from sklearn.metrics import f1_score

# TESTing
def test(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print loss, accuracy and F1 scores.

    Uses the module-level ``device``. Predictions are the arg-max of the
    model's logits; metrics are computed over all samples at once.

    Args:
        model: Sequence-classification model whose output exposes ``loss`` and ``logits``.
        test_loader: Yields ``(token_ids, attention_mask, segment_ids, labels)`` batches.

    Returns:
        Dict with ``test_loss``, ``test_acc``, ``f1_micro`` and
        ``f1_per_class`` (class id -> one-vs-rest F1).

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for pair_token_ids, mask_ids, seg_ids, y in test_loader:
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)

            outputs = model(pair_token_ids, token_type_ids=seg_ids, attention_mask=mask_ids, labels=labels)

            # f1_score needs class ids, not raw logits.
            predicted_labels.extend(outputs.logits.argmax(dim=1).cpu().tolist())
            true_labels.extend(y.cpu().tolist())

            # Weight by batch size so a short final batch is not over-counted.
            loss_sum += outputs.loss.item() * y.size(0)

    n_samples = len(true_labels)
    if n_samples == 0:
        raise ValueError("test_loader yielded no samples")

    test_loss = loss_sum / n_samples
    test_acc = sum(t == p for t, p in zip(true_labels, predicted_labels)) / n_samples

    f1_micro = f1_score(true_labels, predicted_labels, average='micro')

    # One-vs-rest F1 for every class present in the ground truth.
    class_ids = sorted(set(true_labels))
    per_class = f1_score(true_labels, predicted_labels, labels=class_ids, average=None)
    f1_per_class = {label: float(score) for label, score in zip(class_ids, per_class)}

    print(f'Test: test_loss: {test_loss:.4f} test_acc: {test_acc:.4f}')
    print(f'Test: F1 Micro: {f1_micro:.4f}')
    print(f'Test: F1 Macro for each class: {f1_per_class}')

    return {
        'test_loss': test_loss,
        'test_acc': test_acc,
        'f1_micro': float(f1_micro),
        'f1_per_class': f1_per_class,
    }


In [None]:
test(model, test_loader)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Last training result (retrain albert layer)

Epoch 1: train_loss: 0.9567 train_acc: 0.4397 | val_loss: 0.9272 val_acc: 0.4368
00:11:09.54

Epoch 2: train_loss: 0.9858 train_acc: 0.4493 | val_loss: 0.9425 val_acc: 0.4724
00:11:09.34

Epoch 3: train_loss: 0.9806 train_acc: 0.4528 | val_loss: 0.9182 val_acc: 0.4671
00:11:10.46

Epoch 4: train_loss: 0.9042 train_acc: 0.4764 | val_loss: 0.8471 val_acc: 0.4953
00:11:09.96

Epoch 5: train_loss: 0.8807 train_acc: 0.4777 | val_loss: 0.8419 val_acc: 0.4472
00:11:08.39

## Last Training Result (only train head layer)

Epoch 1: train_loss: 0.9571 train_acc: 0.4875 | val_loss: 0.9122 val_acc: 0.5302
00:04:44.54

Epoch 2: train_loss: 0.9353 train_acc: 0.5144 | val_loss: 0.9046 val_acc: 0.5360
00:04:47.93

Epoch 3: train_loss: 0.9275 train_acc: 0.5203 | val_loss: 0.8975 val_acc: 0.5444
00:04:48.08

Epoch 4: train_loss: 0.9238 train_acc: 0.5221 | val_loss: 0.8930 val_acc: 0.5652
00:04:48.10

Epoch 5: train_loss: 0.9174 train_acc: 0.5380 | val_loss: 0.8910 val_acc: 0.5749
00:04:48.12

## Last Training Result (retrain ALBERT, 10 epochs)

Epoch 1: train_loss: 0.9890 train_acc: 0.4557 | val_loss: 0.9369 val_acc: 0.4735
00:11:16.79

Epoch 2: train_loss: 0.9863 train_acc: 0.4529 | val_loss: 0.9399 val_acc: 0.4721
00:11:16.10

Epoch 3: train_loss: 0.9789 train_acc: 0.4484 | val_loss: 0.9390 val_acc: 0.4724
00:11:16.08

Epoch 4: train_loss: 0.9108 train_acc: 0.4851 | val_loss: 0.7860 val_acc: 0.5901
00:11:16.25

Epoch 5: train_loss: 0.7587 train_acc: 0.6356 | val_loss: 0.7696 val_acc: 0.6435
00:11:16.10

Epoch 6: train_loss: 0.6649 train_acc: 0.7032 | val_loss: 0.6949 val_acc: 0.6882
00:11:16.11

Epoch 7: train_loss: 0.5915 train_acc: 0.7497 | val_loss: 0.5912 val_acc: 0.7440
00:11:16.18

Epoch 8: train_loss: 0.5108 train_acc: 0.8029 | val_loss: 0.5750 val_acc: 0.7712
00:11:16.18

Epoch 9: train_loss: 0.4462 train_acc: 0.8329 | val_loss: 0.5691 val_acc: 0.7893
00:11:15.99

Epoch 10: train_loss: 0.4007 train_acc: 0.8587 | val_loss: 0.5881 val_acc: 0.7903
00:11:16.03