# Model Training

In [None]:
!pip install -q transformers[torch] accelerate
!pip3 install -q wandb
!pip3 install openpyxl

In [3]:
import json 
import pandas as pd 
import os 
import pickle

In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np 

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import LabelEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# !pip3 uninstall -q wandb -y

In [8]:
import wandb

In [8]:
wandb.login(key="4de6103347df6561e7258cdef0ef60bbc1233695", relogin=True)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
MODEL_DIRECTORY = "./trained_model/v9-small"
DATA_DIRECTORY = "./data/dataset/v7/"

In [10]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [11]:
train_df = pd.read_json(os.path.join(DATA_DIRECTORY, "train.json"))

In [12]:
# print(train_df.shape)

In [13]:
texts = train_df['section_content'].values
labels_text = train_df['tags'].values

In [14]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_text)

In [15]:
os.makedirs(MODEL_DIRECTORY)

In [16]:
with open(os.path.join(MODEL_DIRECTORY, "tag.le"), 'wb') as file: 
    pickle.dump(label_encoder, file)

In [17]:
num_labels = train_df.tags.unique().shape[0]
print("Number of labels", num_labels)

Number of labels 23


## Calculate Class weights

In [18]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [19]:
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

print("Class Weights: ", class_weights_dict)


Class Weights:  {0: 2.2355072463768115, 1: 3.8322981366459627, 2: 1.889161053276179, 3: 3.945012787723785, 4: 2.273397199705232, 5: 1.166351606805293, 6: 2.1633941093969145, 7: 1.6160293347302253, 8: 2.6826086956521737, 9: 3.945012787723785, 10: 1.8125734430082256, 11: 2.0635451505016724, 12: 2.353165522501907, 13: 0.07037273598248095, 14: 7.451690821256038, 15: 3.8322981366459627, 16: 2.0635451505016724, 17: 2.9158790170132325, 18: 1.8125734430082256, 19: 7.059496567505721, 20: 3.945012787723785, 21: 3.6251468860164513, 22: 2.57943143812709}


In [20]:
class_weight_tensor = torch.tensor(class_weights, dtype=torch.float) 

In [21]:
def compute_weighted_loss(model, inputs, return_outputs=False): 
    labels = inputs.get("labels") 
    outputs = model(**inputs)
    logits = outputs.get("logits")
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weight_tensor.to(logits.device))
    loss = loss_fct(logits, labels) 
    return (loss, outputs) if return_outputs else loss

In [22]:
# model_name = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

## Load Model

In [23]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load Dataset

In [24]:
dataset = TextDataset(texts, labels, tokenizer, max_length=512)

## Compute Metrics callback

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [26]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_score(labels, predictions, )
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [27]:
MODEL_DIRECTORY

'./trained_model/v9-small'

In [28]:
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [29]:
from pathlib import Path

In [30]:

training_args = TrainingArguments(
    output_dir=os.path.join(MODEL_DIRECTORY, './results'),
    run_name=f"{Path(MODEL_DIRECTORY).name}",
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_dir=os.path.join(MODEL_DIRECTORY, './logs'),
    logging_steps=10,
    learning_rate=2e-5,
    eval_strategy="epoch"
)

# Initialize Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset, 
    compute_metrics=compute_metrics, 
    class_weights=class_weight_tensor
)

# Train the model


In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33manurag-credcore[0m ([33mcredcore[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7252,2.175436,0.534522,0.522477,0.838442,0.603741
2,0.7566,0.908236,0.847326,0.753365,0.957801,0.830798
3,0.1758,0.420771,0.901135,0.809032,0.985435,0.883094
4,0.0801,0.146331,0.962723,0.915118,0.980429,0.944691
5,0.029,0.128852,0.971799,0.928795,0.994975,0.959055
6,0.0301,0.10985,0.978606,0.94982,0.99671,0.971821
7,0.2163,0.096705,0.983793,0.960313,0.995266,0.976844
8,0.0131,0.079971,0.98282,0.957836,0.993618,0.974326
9,0.0169,0.054256,0.988979,0.971061,0.996876,0.983388
10,0.0094,0.051726,0.991248,0.974612,0.998819,0.986109


  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [32]:
# eval_results = trainer.evaluate()
# print(f"Evaluation results: {eval_results}")

In [32]:
# Save the model and tokenizer to disk
model.save_pretrained(MODEL_DIRECTORY)
tokenizer.save_pretrained(MODEL_DIRECTORY)

('./trained_model/v9-small/tokenizer_config.json',
 './trained_model/v9-small/special_tokens_map.json',
 './trained_model/v9-small/vocab.txt',
 './trained_model/v9-small/added_tokens.json')

# Load Model For Inferences

In [None]:
!pip install -q transformers[torch] accelerate
!pip3 install -q wandb
!pip3 install openpyxl

In [None]:
MODEL_DIRECTORY = "./model" # latest deployed model 

In [9]:
import json 
import pandas as pd 
import os 
import pickle

In [10]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np 

In [11]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import LabelEncoder


In [12]:
print(MODEL_DIRECTORY)

./trained_model/v9-small


In [13]:
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIRECTORY)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_DIRECTORY)

In [14]:
test_df = pd.read_json(os.path.join(DATA_DIRECTORY, "test.json"))
test_df = test_df.drop_duplicates("section_content")


In [15]:
with open(os.path.join(MODEL_DIRECTORY, "tag.le"), 'rb') as file: 
    label_encoder = pickle.load(file)

In [16]:
test_texts = test_df['section_content'].values
test_labels = test_df['tags'].values
test_labels = label_encoder.transform(test_labels)

In [17]:
from sklearn.metrics import classification_report

In [18]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [19]:
test_dataset = TextDataset(test_texts, test_labels, tokenizer, max_length=512)

In [20]:
inference_args = TrainingArguments(
    output_dir='./results',  
    per_device_eval_batch_size=32, 
    dataloader_drop_last=False,  
    no_cuda=False if torch.cuda.is_available() else True, 
    seed=42,  
    report_to="none" # Disable wandb reporting. 
)
trainer = Trainer(
    model=model, 
    args = inference_args
)

In [21]:
os.environ["WANDB_DISABLED"] = "true"

## Training Accuracy

In [22]:
# Make predictions on the test set
predictions = trainer.predict(dataset)

# Extract the logits and convert to predicted labels
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(labels)

# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


NameError: name 'dataset' is not defined

In [48]:
print(report)

                                    precision    recall  f1-score   support

                  Additional Liens       1.00      1.00      1.00        60
                 Asset Disposition       1.00      1.00      1.00        35
            Compliance Certificate       1.00      1.00      1.00        71
           Consequences of Default       0.94      1.00      0.97        34
                  Event of Default       1.00      1.00      1.00        59
           Facilities / Instrument       1.00      1.00      1.00       115
                Financial Covenant       1.00      1.00      1.00        62
              Financial Statements       0.99      1.00      0.99        83
                    Governing Laws       1.00      1.00      1.00        50
            Incremental Facilities       1.00      1.00      1.00        34
                     Interest Rate       1.00      0.99      0.99        74
                    Loan Repayment       1.00      1.00      1.00        65
Mandatory P

## Test Accuracy.

In [24]:
# Make predictions on the test set
predictions = trainer.predict(test_dataset)

# Extract the logits and convert to predicted labels
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(test_labels)

# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [25]:
print(report)

                                    precision    recall  f1-score   support

                  Additional Liens       1.00      0.94      0.97        18
                 Asset Disposition       0.86      1.00      0.92        12
            Compliance Certificate       0.67      0.67      0.67         3
           Consequences of Default       0.71      0.91      0.80        11
                  Event of Default       1.00      1.00      1.00        13
           Facilities / Instrument       0.90      0.90      0.90        20
                Financial Covenant       0.92      1.00      0.96        12
              Financial Statements       0.57      1.00      0.73         4
                    Governing Laws       1.00      0.91      0.95        11
            Incremental Facilities       1.00      0.83      0.91         6
                     Interest Rate       0.85      0.89      0.87        19
                    Loan Repayment       1.00      0.93      0.97        15
Mandatory P

In [26]:
test_df['Predicted_Tag'] = predicted_labels_text

In [27]:
test_df = test_df.rename(columns = {"tag": "Original_Tag"})

In [28]:
print(test_df.shape)

(764, 9)


In [57]:
test_df.to_excel("./data/output/predictions_v9_small.xlsx")

## Generate Results on Unsampled NA data

In [47]:
seen_df = pd.read_json("./data/cleaned_tags_data.json")
all_df = pd.read_json("./data/all-data.json")

In [54]:
unseen_df = all_df.merge(seen_df, how='left', on=['filename', 'tags', 'title', 'category', 'word_count', 'section_content'], indicator=True, )
unseen_df = unseen_df[unseen_df['_merge'] == 'left_only']
unseen_df = unseen_df.drop(columns=['_merge'])

In [63]:

unseen_df['tags'] = unseen_df['tags'].str.replace("Events of Default", "Event of Default")

In [64]:
unseen_texts = unseen_df['section_content'].values
unseen_labels = unseen_df['tags'].values
unseen_labels = label_encoder.transform(unseen_labels)

In [65]:
unseen_dataset = TextDataset(unseen_texts, unseen_labels, tokenizer, max_length=512)

In [66]:
# Make predictions on the test set
predictions = trainer.predict(unseen_dataset)

# Extract the logits and convert to predicted labels
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(unseen_labels)

# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [67]:
print(report)

                                    precision    recall  f1-score   support

                  Additional Liens       0.51      0.93      0.66        29
                 Asset Disposition       0.41      0.83      0.55        18
            Compliance Certificate       0.50      1.00      0.67        14
           Consequences of Default       0.45      0.26      0.33        92
                  Event of Default       0.83      0.64      0.73       107
           Facilities / Instrument       0.17      0.97      0.30        36
                Financial Covenant       0.53      0.75      0.62        24
              Financial Statements       0.07      1.00      0.13         1
                    Governing Laws       0.46      0.96      0.62        51
            Incremental Facilities       0.17      1.00      0.29         8
                     Interest Rate       0.42      0.95      0.59        44
                    Loan Repayment       0.44      0.93      0.60        29
Mandatory P

In [None]:
all_df = pd.read_json("./data/all-data.json")
print(all_df.shape)
all_df = all_df[all_df.word_count > 20]
print(all_df.shape)

In [None]:
test_df.head()

In [None]:
test_df.shape

In [None]:
test_df['predicted_tags'] = predicted_labels_text

In [None]:
test_df.head()

In [None]:
group_df = all_df[['section_content', 'tags']].groupby('section_content').agg(list).reset_index()
group_df['n_tags'] = group_df.tags.apply(lambda x: len(x))
group_df['n_unique_tags'] = group_df.tags.apply(lambda x: len(set(x)))
print(group_df[group_df.n_tags > 1].shape)
print(group_df[group_df.n_unique_tags > 1].shape)
group_df.sort_values(by='n_unique_tags', ascending=False).head(10)

In [None]:
test_df[["section_content", "predicted_tags"]].merge(all_df.drop_duplicates("section_content"), how="left", on="section_content", validate="one_to_one").to_excel("./data/output/test_data_predictions.xlsx")

In [None]:
test_df.shape

In [None]:
print(report)

In [None]:
def predict_labels(texts, model, tokenizer, label_encoder, max_length=512, device='cuda'):
    # Move model to the specified device
    model.to(device)
    
    # Tokenize the texts
    encodings = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    # Move tensors to the same device as the model
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    
    # Put the model in evaluation mode
    model.eval()
    
    # Perform the inference
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    
    # Move predictions back to the CPU before converting to numpy
    predicted_labels = label_encoder.inverse_transform(predictions.cpu().numpy())
    
    return predicted_labels

In [None]:
predicted_labels_text = predict_labels(test_texts.tolist(), model, tokenizer, label_encoder)
true_labels_text = label_encoder.inverse_transform(test_labels)

In [None]:
!nvidia-smi

In [None]:
# Extract the logits and convert to predicted labels
# logits = predictions.predictions
# predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [None]:
print(report)