# Model Training 

In [2]:
!pip install -q transformers[torch] accelerate
!pip3 install -q wandb
!pip3 install -q openpyxl

[0m

In [2]:
import json 
import pandas as pd 
import os 
import pickle

In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np 

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import LabelEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# !pip3 uninstall -q wandb -y

In [7]:
import wandb

In [8]:
# Never commit API keys. The key is read from the WANDB_API_KEY environment variable;
# when it is unset, key=None lets wandb fall back to its own credential lookup/prompt.
# NOTE(review): the key that used to be hardcoded here should be treated as leaked and rotated.
wandb.login(key=os.environ.get("WANDB_API_KEY"), relogin=True)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
MODEL_DIRECTORY = "./trained_model/v15-small"
DATA_DIRECTORY = "./data/dataset/v9/"

In [10]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable, sized collection of raw strings.
        labels: Indexable, sized collection of integer class ids aligned with
            ``texts``, or ``None`` for unlabeled (inference-only) data.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``truncation``, ``padding``, ``return_tensors`` and
            ``return_attention_mask`` and must return a mapping holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every item is truncated/padded to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized item at ``idx`` (plus ``'labels'`` when labels exist)."""
        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True,
        )
        # return_tensors='pt' adds a leading batch dimension; flatten it away so the
        # default collate function can stack items into a batch.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [23]:
train_df = pd.read_json(os.path.join(DATA_DIRECTORY, "train.json"))

In [24]:
print(train_df.shape)

(6785, 9)


In [25]:
print(train_df.filename.unique().shape)

(80,)


In [26]:
texts = train_df['section_content'].values
labels_text = train_df['tags'].values

In [27]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_text)

In [16]:
# exist_ok=True keeps this cell idempotent: re-running it (or Restart & Run All) no
# longer raises FileExistsError once the directory has been created.
os.makedirs(MODEL_DIRECTORY, exist_ok=True)

In [17]:
with open(os.path.join(MODEL_DIRECTORY, "tag.le"), 'wb') as file: 
    pickle.dump(label_encoder, file)

In [18]:
num_labels = train_df.tags.unique().shape[0]
print("Number of labels", num_labels)

Number of labels 25


## Calculate Class weights

In [19]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [20]:
# 'balanced' weighting over the class ids that occur in `labels`; the dict view
# (class id -> weight) is only used for the printout below.
present_classes = np.unique(labels)
class_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=labels)
class_weights_dict = dict(enumerate(class_weights))

print("Class Weights: ", class_weights_dict)


Class Weights:  {0: 3.049438202247191, 1: 3.7694444444444444, 2: 6.619512195121951, 3: 3.7694444444444444, 4: 27.14, 5: 6.0311111111111115, 6: 6.619512195121951, 7: 1.8214765100671142, 8: 3.933333333333333, 9: 15.077777777777778, 10: 4.112121212121212, 11: 6.168181818181818, 12: 20.876923076923077, 13: 0.049086634111050825, 14: 11.8, 15: 7.982352941176471, 16: 4.240625, 17: 3.9911764705882353, 18: 5.9, 19: 3.155813953488372, 20: 4.934545454545455, 21: 13.57, 22: 6.461904761904762, 23: 7.335135135135135, 24: 5.219230769230769}


In [21]:
class_weight_tensor = torch.tensor(class_weights, dtype=torch.float) 

In [22]:
def compute_weighted_loss(model, inputs, return_outputs=False):
    """Run ``model`` on ``inputs`` and return the class-weighted cross-entropy loss.

    The weights come from the module-level ``class_weight_tensor`` and are moved to
    the device of the logits. Returns ``(loss, outputs)`` when ``return_outputs`` is
    true, otherwise only the loss.
    """
    targets = inputs.get("labels")
    outputs = model(**inputs)
    logits = outputs.get("logits")
    weights = class_weight_tensor.to(logits.device)
    loss = torch.nn.functional.cross_entropy(logits, targets, weight=weights)
    if return_outputs:
        return loss, outputs
    return loss

In [23]:
# model_name = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

## Load Model

In [24]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)



In [28]:
dataset = TextDataset(texts, labels, tokenizer, max_length=512)

## Compute Metrics callback

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [27]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax of
            the logits over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 yields the same numbers as the default ("warn" also scores an
    # undefined class as 0) but stops the UndefinedMetricWarning emitted whenever a
    # class is never predicted or is missing from the evaluated labels.
    macro = {"average": "macro", "zero_division": 0}

    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, **macro),
        "recall": recall_score(labels, predictions, **macro),
        "f1": f1_score(labels, predictions, **macro),
    }

In [28]:
MODEL_DIRECTORY

'./trained_model/v15-small'

In [29]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: 1-D float tensor with one weight per class, or ``None`` for
            an unweighted loss. All other arguments are forwarded to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the (optionally class-weighted) cross-entropy loss for one batch.

        ``num_items_in_batch`` is accepted only so the override stays call-compatible
        with Trainer versions that pass it as a keyword; it is not used here.
        Returns ``(loss, outputs)`` when ``return_outputs`` is true, else the loss.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # The weights must live on the same device as the logits; None means unweighted.
        weight = None if self.class_weights is None else self.class_weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

## Train using K-fold Cross Validation

In [30]:
from sklearn.model_selection import StratifiedKFold, KFold
import json 
from pathlib import Path

In [31]:
k=5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [32]:
unique_files = train_df[train_df.filename.str.contains("synth") == False].filename.unique()

In [33]:
# Materialise the file-level folds once so later cells can reuse the same splits,
# then print row counts and the number of distinct tags on each side of every fold.
splits = list(kf.split(unique_files))
for fold, (train_file_index, val_file_index) in enumerate(splits):
    train_file_names = set(unique_files[train_file_index])
    val_file_names = set(unique_files[val_file_index])

    train = train_df.loc[train_df.filename.isin(train_file_names)]
    val = train_df.loc[train_df.filename.isin(val_file_names)]
    for part in (train, val):
        print(part.shape, part.tags.unique().shape)

(5615, 9) (25,)
(1170, 9) (24,)
(5287, 9) (25,)
(1498, 9) (25,)
(5387, 9) (25,)
(1398, 9) (24,)
(5366, 9) (25,)
(1419, 9) (25,)
(5485, 9) (25,)
(1300, 9) (23,)


## Split by files

In [34]:
unique_labels = train_df.tags.unique()

In [None]:
fold_accuracies = []
# Per-fold predictions and reports are written under ./data/output; make sure it exists.
os.makedirs("./data/output", exist_ok=True)
for fold, (train_file_index, val_file_index) in enumerate(splits):
    # Split by file name so that rows of one file never land on both sides of a fold.
    train_file_names = set(unique_files[train_file_index])
    val_file_names = set(unique_files[val_file_index])

    train = train_df[train_df.filename.isin(train_file_names)]
    # Copy: a 'predicted_tags' column is added below, and assigning into a slice of
    # train_df triggers pandas' SettingWithCopyWarning.
    val = train_df[train_df.filename.isin(val_file_names)].copy()

    val_labels = label_encoder.transform(val['tags'].values)
    train_labels = label_encoder.transform(train['tags'].values)

    train_dataset = TextDataset(train['section_content'].values, train_labels, tokenizer, max_length=512)
    val_dataset = TextDataset(val['section_content'].values, val_labels, tokenizer, max_length=512)

    # Start every fold from the pretrained checkpoint.
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

    run_name = f"{Path(MODEL_DIRECTORY).name}_fold_{fold+1}"
    print("Starting ", run_name)
    wandb.init(project="huggingface", name=run_name)

    training_args = TrainingArguments(
        output_dir=os.path.join(MODEL_DIRECTORY, f'./results_fold_{fold+1}'),
        run_name=run_name,
        num_train_epochs=30,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        logging_dir=os.path.join(MODEL_DIRECTORY, f'./logs_fold_{fold+1}'),
        logging_steps=10,
        learning_rate=2e-5,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='recall'
    )

    # NOTE(review): class_weight_tensor comes from an earlier cell and appears to be
    # computed from the labels of the whole train_df, not from this fold's training
    # rows - confirm that this is intended.
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        class_weights=class_weight_tensor,
    )
    trainer.train()

    # Predict on this fold's held-out rows.
    predictions = trainer.predict(val_dataset)

    # Extract the logits and convert to predicted labels
    logits = predictions.predictions
    predicted_labels = np.argmax(logits, axis=-1)

    # Convert numeric labels back to original text labels for comparison
    predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
    true_labels_text = label_encoder.inverse_transform(val_labels)

    val['predicted_tags'] = predicted_labels_text
    val.to_json(f"./data/output/val_{Path(MODEL_DIRECTORY).name}_fold_{fold+1}.json", orient="records")

    # `labels` and `target_names` are matched by position, so both must use the same
    # ordering. Previously `labels` was in order of first appearance while
    # `target_names` was the encoder's sorted order, which attached metrics to the
    # wrong tag names. Passing every class also keeps the report shape stable when a
    # fold lacks some tag.
    report = classification_report(
        true_labels_text,
        predicted_labels_text,
        labels=label_encoder.classes_,
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0,
    )
    with open(f"./data/output/tag_classification_fold_{fold+1}_report.json", "w") as file:
        json.dump(report, file)

    print(report)

    print("Generating Evaluation results")
    eval_result = trainer.evaluate()
    # Keep the per-tag report together with the overall accuracy of this fold.
    fold_accuracies.append((report, eval_result['eval_accuracy']))

    # Close this fold's W&B run so the next fold logs to its own run.
    wandb.finish()
    # The trainer also references the model; drop both before clearing the CUDA cache.
    del model, trainer
    torch.cuda.empty_cache()

  _warn_prf(average, modifier, msg_start, len(result))


---

## Split by rows

In [None]:
fold_accuracies = []
# Per-fold predictions and reports are written under ./data/output; make sure it exists.
# NOTE: the output paths and run names below are the same ones used by the
# file-level cross-validation cell, so running both overwrites the earlier results.
os.makedirs("./data/output", exist_ok=True)
# NOTE(review): `kf` is a plain KFold, which ignores the tags passed as `y`; the folds
# are therefore not stratified. Use StratifiedKFold if stratification was intended.
for fold, (train_index, val_index) in enumerate(kf.split(train_df, train_df['tags'].values)):
    print(f"Fold {fold + 1}/{k}")
    train = train_df.iloc[train_index]
    # Copy: a 'predicted_tags' column is added below, and assigning into a slice of
    # train_df triggers pandas' SettingWithCopyWarning.
    val = train_df.iloc[val_index].copy()

    val_labels = label_encoder.transform(val['tags'].values)
    train_labels = label_encoder.transform(train['tags'].values)

    train_dataset = TextDataset(train['section_content'].values, train_labels, tokenizer, max_length=512)
    val_dataset = TextDataset(val['section_content'].values, val_labels, tokenizer, max_length=512)

    # Start every fold from the pretrained checkpoint.
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

    run_name = f"{Path(MODEL_DIRECTORY).name}_fold_{fold+1}"
    print("Starting ", run_name)
    # `wandb.run_name = ...` only set an unused attribute on the wandb module; start
    # a run per fold explicitly so each fold is logged separately.
    wandb.init(project="huggingface", name=run_name)

    training_args = TrainingArguments(
        output_dir=os.path.join(MODEL_DIRECTORY, f'./results_fold_{fold+1}'),
        run_name=run_name,
        num_train_epochs=30,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        logging_dir=os.path.join(MODEL_DIRECTORY, f'./logs_fold_{fold+1}'),
        logging_steps=10,
        learning_rate=2e-5,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='recall'
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        class_weights=class_weight_tensor,
    )
    trainer.train()

    # Predict on this fold's held-out rows.
    predictions = trainer.predict(val_dataset)

    # Extract the logits and convert to predicted labels
    logits = predictions.predictions
    predicted_labels = np.argmax(logits, axis=-1)

    # Convert numeric labels back to original text labels for comparison
    predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
    true_labels_text = label_encoder.inverse_transform(val_labels)

    val['predicted_tags'] = predicted_labels_text
    val.to_json(f"./data/output/val_{Path(MODEL_DIRECTORY).name}_fold_{fold+1}.json", orient="records")

    # Pass `labels` explicitly: without it, a fold in which some tag is neither
    # present nor predicted has fewer classes than `target_names`, and
    # classification_report raises. Both lists use the encoder's order so that the
    # names line up with the metrics.
    report = classification_report(
        true_labels_text,
        predicted_labels_text,
        labels=label_encoder.classes_,
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0,
    )
    with open(f"./data/output/tag_classification_fold_{fold+1}_report.json", "w") as file:
        json.dump(report, file)

    print(report)

    print("Generating Evaluation results")
    eval_result = trainer.evaluate()
    # Keep the per-tag report together with the overall accuracy of this fold.
    fold_accuracies.append((report, eval_result['eval_accuracy']))

    # Close this fold's W&B run so the next fold logs to its own run.
    wandb.finish()
    # The trainer also references the model; drop both before clearing the CUDA cache.
    del model, trainer
    torch.cuda.empty_cache()

In [None]:
print("")

In [None]:
x[0]

In [None]:
fold_data = []
# Flatten the per-tag rows of EVERY fold's classification report into one list so the
# later groupby('tag') mean is a real cross-fold average. The previous
# `fold_accuracies[1:]` plus unconditional `break` kept only the second fold.
for x in fold_accuracies:
    report = x[0]
    for key, value in report.items():
        # Scalar entries (e.g. the overall accuracy) carry no per-tag metrics; dict
        # entries also include the report's averaged summary rows.
        if not isinstance(value, dict):
            continue
        fold_data.append({
            "tag": key,
            "precision": value['precision'],
            'recall': value['recall'],
            'f1-score': value['f1-score'],
            'support': value['support']
        })

In [None]:
for i in fold_accuracies: 
    print(i[1])

In [None]:
fold_df = pd.DataFrame(fold_data)

In [None]:
print(fold_df.groupby('tag').agg('mean').to_string())

## Normal Training 

In [33]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
dataset = TextDataset(texts, labels, tokenizer, max_length=512)

In [35]:
import os 
from pathlib import Path

In [36]:

# Final fit on the full dataset. `eval_dataset` is the same object as `train_dataset`,
# so the per-epoch metrics describe fit on the training data, not held-out performance.
run_label = f"{Path(MODEL_DIRECTORY).name}"
results_dir = os.path.join(MODEL_DIRECTORY, './results')
logs_dir = os.path.join(MODEL_DIRECTORY, './logs')

training_args = TrainingArguments(
    output_dir=results_dir,
    logging_dir=logs_dir,
    run_name=run_label,
    num_train_epochs=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    eval_strategy="epoch",
)

# Trainer with the class-weighted loss
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics,
    class_weights=class_weight_tensor,
)

# Train the model


In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33manurag-credcore[0m ([33mcredcore[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3018,0.920087,0.899189,0.630986,0.814861,0.675411
2,0.241,0.188968,0.97185,0.90609,0.972276,0.932525
3,0.3957,0.057707,0.969934,0.888395,0.995696,0.932934
4,0.0232,0.024118,0.990273,0.957133,0.995353,0.974942
5,0.0766,0.016977,0.992631,0.967311,0.997074,0.981501
6,0.0163,0.018199,0.99322,0.966537,0.995515,0.979839
7,0.0211,0.008274,0.994399,0.977524,0.998562,0.98776


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# eval_results = trainer.evaluate()
# print(f"Evaluation results: {eval_results}")

In [None]:
# Save the model and tokenizer to disk
model.save_pretrained(MODEL_DIRECTORY)
tokenizer.save_pretrained(MODEL_DIRECTORY)

# Load Model for Inference

In [3]:
!pip install -q transformers[torch] accelerate
!pip3 install -q wandb
!pip3 install -q openpyxl

[0m

In [1]:
MODEL_DIRECTORY = "./trained_model/v15-small"
DATA_DIRECTORY = "./data/carlyle-inference"

In [3]:
import json 
import pandas as pd 
import os 
import pickle
from glob import glob

In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np 
from sklearn.metrics import classification_report

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import LabelEncoder


  from .autonotebook import tqdm as notebook_tqdm


## TextDataset Class

In [6]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable, sized collection of raw strings.
        labels: Indexable, sized collection of integer class ids aligned with
            ``texts``, or ``None`` for unlabeled (inference-only) data.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``truncation``, ``padding``, ``return_tensors`` and
            ``return_attention_mask`` and must return a mapping holding
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every item is truncated/padded to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized item at ``idx`` (plus ``'labels'`` when labels exist)."""
        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True,
        )
        # return_tensors='pt' adds a leading batch dimension; flatten it away so the
        # default collate function can stack items into a batch.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

## Inference 

In [7]:
print(MODEL_DIRECTORY)

./trained_model/v15-small


In [8]:
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIRECTORY)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_DIRECTORY)

In [31]:
with open(os.path.join(MODEL_DIRECTORY, "tag.le"), 'rb') as file: 
    label_encoder = pickle.load(file)

In [None]:
inputs = "This is a sample text"

tokens = tokenizer(inputs, return_tensors='pt', padding=True, truncation=True)

data = tokens
with torch.no_grad():
    outputs = model(**data)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1).tolist()

predictions

## Inference Trainer

In [35]:
# Prediction-only Trainer: no training arguments are needed beyond the batch size.
inference_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=32,
    dataloader_drop_last=False,  # keep the final partial batch so every row is predicted
    # `use_cpu` replaces the deprecated `no_cuda`; fall back to CPU only without CUDA.
    use_cpu=not torch.cuda.is_available(),
    seed=42,
    report_to="none"  # Disable wandb reporting.
)
trainer = Trainer(
    model=model,
    args=inference_args
)

In [49]:
import re
def extract_section_num(x):
    """Return the section number that starts the second line of ``x``.

    The second line is stripped and upper-cased, then matched against, in order:
    ``SECTION <n>.<dd>``, ``SECTION <n>[.]<dd>`` and a bare ``<n>[.]<dd>``.

    Args:
        x: Text whose second line is expected to hold the section heading.

    Returns:
        The matched text (upper-cased), or ``"NOT FOUND"`` when nothing matches or
        ``x`` has no second line.
    """
    lines = x.split("\n")
    if len(lines) < 2:
        # Previously this raised IndexError for single-line input.
        return "NOT FOUND"
    text = lines[1].strip().upper()

    # Raw strings: "\s" and "\d" are invalid escapes in ordinary string literals.
    patterns = (
        r"^SECTION\s+\d+\.\d{1,2}",
        r"^SECTION\s+\d+\.?\d{1,2}",
        r"^\d+\.?\d{1,2}",
    )
    for pattern in patterns:
        found = re.match(pattern, text)
        if found:
            return found.group(0)
    return "NOT FOUND"

In [199]:
def run_predict(filename):
    """Predict a tag for every section in a JSON file of sections.

    Rows without ``section_content`` are dropped, the title is prepended to the
    content as ``"Title is <title>"`` on its own line, and the module-level
    ``trainer``, ``tokenizer`` and ``label_encoder`` are used for prediction.

    Args:
        filename: Path of a JSON file readable by ``pd.read_json`` that has
            ``title`` and ``section_content`` columns.

    Returns:
        DataFrame with the columns ``toc``, ``title``, ``section_content``,
        ``AITag`` and ``ExpertTag``; a predicted ``"NA"`` becomes an empty string
        and ``ExpertTag`` starts out as a copy of ``AITag``.
    """
    df = pd.read_json(filename)
    # .copy() so the column assignments below write to an independent frame instead
    # of a filtered view (avoids SettingWithCopyWarning).
    df = df[df.section_content.notna()].copy()
    df['section_content'] = df['title'].apply(lambda x: f"Title is {x}") + "\n" + df['section_content']
    test_texts = df['section_content'].values
    # The dataset expects labels; zeros are placeholders that are never evaluated.
    test_dataset = TextDataset(test_texts, [0] * len(test_texts), tokenizer, max_length=512)
    predictions = trainer.predict(test_dataset)

    # Extract the logits and convert to predicted labels
    logits = predictions.predictions
    predicted_labels = np.argmax(logits, axis=-1)

    # Convert numeric labels back to original text labels for comparison
    predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
    df['toc'] = df.section_content.apply(extract_section_num)

    df['AITag'] = predicted_labels_text
    df['AITag'] = df.AITag.str.replace("^NA$", "", regex=True)
    df["ExpertTag"] = df['AITag']
    df = df[['toc', 'title', 'section_content', 'AITag', 'ExpertTag']]
    return df

In [94]:
# filename = os.path.join(DATA_DIRECTORY, "test.json")

In [200]:
filenames = glob(os.path.join(DATA_DIRECTORY, "*/*.section.json"))
filename = filenames[0]
cat_filenames = glob(os.path.join(DATA_DIRECTORY, "*/categorized.xlsx"))

In [201]:
from pathlib import Path
import os 
import shutil

In [202]:
def extract_num(x):
    """Return all digit characters of ``x`` concatenated, e.g. ``"SECTION 2.01"`` -> ``"201"``.

    Returns an empty string when ``x`` contains no digits. If ``x`` is not a string,
    the offending value is printed for debugging and the exception is re-raised.
    """
    try:
        # Raw string: "\d" is an invalid escape in an ordinary string literal.
        return "".join(re.findall(r"\d", x))
    except Exception:
        print(x)
        raise

Unnamed: 0,toc_number,title,article,toc,AICategory,ExpertCategory,key,AITag,ExpertTag
0,,I DEFINITIONS AND ACCOUNTING TERMS,I DEFINITIONS AND ACCOUNTING TERMS,ARTICLE I DEFINITIONS AND ACCOUNTING TERMS,FYI,FYI,_i definitions and accounting terms,,
1,1.01,Defined Terms,I DEFINITIONS AND ACCOUNTING TERMS,Section 1.01 Defined Terms,Definitions,Definitions,101_defined terms,,
2,1.02,Other Interpretive Provisions,I DEFINITIONS AND ACCOUNTING TERMS,Section 1.02 Other Interpretive Provisions,FYI,FYI,102_other interpretive provisions,,
3,1.03,Accounting Terms Generally,I DEFINITIONS AND ACCOUNTING TERMS,Section 1.03 Accounting Terms Generally,FYI,FYI,103_accounting terms generally,,
4,1.04,Uniform Commercial Code,I DEFINITIONS AND ACCOUNTING TERMS,Section 1.04 Uniform Commercial Code,FYI,FYI,104_uniform commercial code,,
...,...,...,...,...,...,...,...,...,...
148,11.06,Termination; Reinstatement,XI CONTINUING GUARANTY,Section 11.06 Termination; Reinstatement,Event Driven,Event Driven,1106_termination; reinstatement,,
149,11.07,Subordination,XI CONTINUING GUARANTY,Section 11.07 Subordination,Event Driven,Event Driven,1107_subordination,,
150,11.08,Stay of Acceleration,XI CONTINUING GUARANTY,Section 11.08 Stay of Acceleration,Event Driven,Event Driven,1108_stay of acceleration,,
151,11.09,Condition of Borrower,XI CONTINUING GUARANTY,Section 11.09 Condition of Borrower,Event Driven,Event Driven,1109_condition of borrower,,


PosixPath('data/carlyle-inference')

In [209]:
for filename in filenames:
    # Pair each section file with the workbook in the SAME folder. Zipping two
    # independent glob results relies on both lists having identical order and length,
    # which silently mis-pairs documents as soon as one folder lacks either file.
    cat_filename = os.path.join(Path(filename).parent, "categorized.xlsx")
    if not os.path.exists(cat_filename):
        print(f"Skipping {filename}: no categorized.xlsx next to it")
        continue

    test_df = run_predict(filename)
    cat_df = pd.read_excel(cat_filename)
    # Existing tag columns are replaced by the fresh predictions merged in below.
    cat_df = cat_df.drop(columns=["AITag", "ExpertTag"], errors="ignore")

    # "<name>.section.json" -> "<name>"
    document_name = Path(filename).stem.split(".")[0]
    output_dir = os.path.join(DATA_DIRECTORY, document_name)
    output_file = os.path.join(output_dir, document_name + ".tags.xlsx")
    root_output_file = os.path.join(Path(output_file).parent.parent, document_name + ".tags.xlsx")

    # Create keys for joining: digits of the section number + "_" + lower-cased title
    test_df['key'] = test_df.toc.apply(extract_num) + "_" + test_df.title.str.lower()
    cat_df['key'] = cat_df.toc_number.map(str).apply(extract_num) + "_" + cat_df.title.str.lower()

    merged = cat_df.merge(test_df[['AITag', 'ExpertTag', 'key']], on='key', how='left')
    merged = merged.drop(columns="key")
    # Count rows without a prediction BEFORE filling blanks; checking isna() after
    # fillna always reported zero unmatched rows.
    unmatched = merged[merged.AITag.isna()]
    result_df = merged.fillna("   ")

    print(output_dir)
    print(output_file)
    print(result_df.shape, unmatched.shape)
    result_df.to_excel(output_file, index=False)
    result_df.to_excel(root_output_file, index=False)
    

./data/carlyle-inference/Coupa CA
./data/carlyle-inference/Coupa CA/Coupa CA.tags.xlsx
(188, 8) (0, 8)


./data/carlyle-inference/PDI CA
./data/carlyle-inference/PDI CA/PDI CA.tags.xlsx
(156, 8) (0, 8)


./data/carlyle-inference/FPG CA
./data/carlyle-inference/FPG CA/FPG CA.tags.xlsx
(151, 8) (0, 8)


./data/carlyle-inference/Transtar CA
./data/carlyle-inference/Transtar CA/Transtar CA.tags.xlsx
(224, 8) (0, 8)


./data/carlyle-inference/NEFCO CA
./data/carlyle-inference/NEFCO CA/NEFCO CA.tags.xlsx
(153, 8) (0, 8)


In [194]:
output_file

'./data/carlyle-inference/Coupa CA/Coupa CA.tags.xlsx'

Unnamed: 0,toc,title,section_content,AITag,ExpertTag,key
14,2.2,Minimum Amount of Each Borrowing; Maximum Numb...,Title is Minimum Amount of Each Borrowing; Max...,,,22_minimum amount of each borrowing; maximum n...


(188, 9)

(26, 9)

In [166]:
cat_df.head()

Unnamed: 0,toc_number,title,article,toc,AICategory,ExpertCategory,AITag,ExpertTag,key
0,1.0,Definitions,Definitions,Section 1. Definitions,FYI,FYI,,,10_ definitions
1,1.1,Defined Terms,Definitions,1.1 Defined Terms,Definitions,Definitions,,,11_defined terms
2,1.2,Other Interpretive Provisions,Definitions,1.2 Other Interpretive Provisions,FYI,FYI,,,12_other interpretive provisions
3,1.3,Accounting Terms,Definitions,1.3 Accounting Terms,FYI,FYI,,,13_accounting terms
4,1.4,Rounding,Definitions,1.4 Rounding,FYI,FYI,,,14_rounding


In [167]:
test_df.head()

Unnamed: 0,toc,title,section_content,AITag,ExpertTag,key
0,1.2,Other Interpretive Provisions,Title is Other Interpretive Provisions\n1.2 Ot...,,,12_other interpretive provisions
1,1.3,Accounting Terms,Title is Accounting Terms\n1.3 Accounting Term...,,,13_accounting terms
2,1.4,Rounding,Title is Rounding\n1.4 Rounding. Any financia...,,,14_rounding
3,1.5,"References to Agreements, Laws, Etc","Title is References to Agreements, Laws, Etc\n...",,,"15_references to agreements, laws, etc"
4,1.6,Exchange Rates,Title is Exchange Rates\n1.6 Exchange Rates. ...,,,16_exchange rates


In [119]:
cat_df

Unnamed: 0,toc_number,title,article,toc,AICategory,ExpertCategory,AITag,ExpertTag
0,1.00,Definitions,Definitions,Section 1. Definitions,FYI,FYI,,
1,1.10,Defined Terms,Definitions,1.1 Defined Terms,Definitions,Definitions,,
2,1.20,Other Interpretive Provisions,Definitions,1.2 Other Interpretive Provisions,FYI,FYI,,
3,1.30,Accounting Terms,Definitions,1.3 Accounting Terms,FYI,FYI,,
4,1.40,Rounding,Definitions,1.4 Rounding,FYI,FYI,,
...,...,...,...,...,...,...,...,...
183,14.20,Payments Set Aside,Miscellaneous,14.20 Payments Set Aside,FYI,FYI,,
184,14.21,No Fiduciary Duty,Miscellaneous,14.21 No Fiduciary Duty,FYI,FYI,,
185,14.22,Nature of Borrower Obligations,Miscellaneous,14.22 Nature of Borrower Obligations,FYI,FYI,,
186,14.23,Acknowledgment and Consent to Bail-In of EEA F...,Miscellaneous,14.23 Acknowledgment and Consent to Bail-In of...,FYI,FYI,,


In [99]:
test_df[test_df.AITag!=""]

Unnamed: 0,toc,title,section_content,AITag,ExpertTag
9,SECTION 2.01,The Loans,Title is The Loans\nSection 2.01. The Loans. ...,Facilities / Instrument,Facilities / Instrument
11,SECTION 2.03,Letters of Credit,Title is Letters of Credit\nSection 2.03. Let...,Facilities / Instrument,Facilities / Instrument
13,SECTION 2.05,Prepayments,Title is Prepayments\nSection 2.05. Prepaymen...,Prepayment / Redemption,Prepayment / Redemption
15,SECTION 2.07,Repayment of Loans,Title is Repayment of Loans\nSection 2.07. Re...,Amortization Schedule / Loan Repayment,Amortization Schedule / Loan Repayment
16,SECTION 2.08,Interest,Title is Interest\nSection 2.08. Interest. (a...,Pricing / Interest,Pricing / Interest
17,SECTION 2.09,Fees,Title is Fees\nSection 2.09. Fees. In additi...,Premium and Fees,Premium and Fees
22,SECTION 2.14,Incremental Credit Extensions,Title is Incremental Credit Extensions\nSectio...,Incremental Facilities,Incremental Facilities
57,SECTION 6.01,Financial Statements,Title is Financial Statements\nSection 6.01. ...,Financial Statements,Financial Statements
58,SECTION 6.02,Certificates; Other Information,Title is Certificates; Other Information\nSect...,Compliance Certificate,Compliance Certificate
71,SECTION 6.15,Transactions with Affiliates,Title is Transactions with Affiliates\nSection...,Transactions with Affiliates,Transactions with Affiliates


In [75]:
print(output_file)

./data/carlyle-inference/PDI CA_tags.xlsx


In [55]:
test_df.head()

Unnamed: 0,title,category,tags,checklists,section_content,toc,AITag
0,Other Interpretive Provisions,FYI,[],[],Title is Other Interpretive Provisions\nSectio...,SECTION 1.02,
1,Accounting Terms,FYI,[],[],Title is Accounting Terms\nSection 1.03. Acco...,SECTION 1.03,
2,Rounding,FYI,[],[],Title is Rounding\nSection 1.04. Rounding. A...,SECTION 1.04,
3,"References to Agreements, Laws, Etc",FYI,[],[],"Title is References to Agreements, Laws, Etc\n...",SECTION 1.05,
4,Times of Day,FYI,[],[],Title is Times of Day\nSection 1.06. Times of...,SECTION 1.06,


In [40]:
# test_labels = test_df['tags'].values
# test_labels = label_encoder.transform(test_labels)

In [42]:
# Make predictions on the test set
predictions = trainer.predict(test_dataset)

# Extract the logits and convert to predicted labels
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
# true_labels_text = label_encoder.inverse_transform(test_labels)

# # Generate a classification report
# report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


---

In [11]:
nst_to_st = {
    "Interest / Pricing": "Pricing / Interest",
    "Other Reporting Requirements": "Reporting Requirements"
}

In [12]:
def to_standard_tag(x):
    """Translate a tag listed in ``nst_to_st`` to its standard name; return any other value unchanged."""
    return nst_to_st.get(x, x)
        
    

In [13]:
test_df['tags'] = test_df.tags.apply(to_standard_tag)

In [14]:
test_texts = test_df['section_content'].values
test_labels = test_df['tags'].values
test_labels = label_encoder.transform(test_labels)

In [15]:
from sklearn.metrics import classification_report

In [17]:
# Wrap the test texts and labels in the tokenizing Dataset; every sample is
# truncated/padded to 512 tokens.
test_dataset = TextDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
    max_length=512,
)

In [18]:
# Trainer used purely for prediction: only evaluation-related settings are given.
inference_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=32,
    dataloader_drop_last=False,
    no_cuda=not torch.cuda.is_available(),  # fall back to CPU when no GPU is visible
    seed=42,
    report_to="none",  # Disable wandb reporting.
)
trainer = Trainer(model=model, args=inference_args)

In [19]:
# Set the WANDB_DISABLED environment variable for this process.
# NOTE(review): this runs after the Trainer above was already constructed (which
# itself passes report_to="none") — confirm whether it needs to come first.
os.environ["WANDB_DISABLED"] = "true"

## Training Accuracy

In [29]:
# Make predictions on `dataset`
# NOTE(review): presumably the training set, given the "Training Accuracy"
# heading; `dataset` and `labels` are defined elsewhere in the notebook — confirm.
predictions = trainer.predict(dataset)

# Extract the logits and convert to predicted labels
# (argmax over the last axis selects the highest-scoring class index per row)
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(labels)

# Generate a classification report (per-class precision / recall / F1 / support)
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [31]:
# Show the per-class precision / recall / F1 / support table built above.
print(report)

                                        precision    recall  f1-score   support

                            Amendments       1.00      1.00      1.00        89
Amortization Schedule / Loan Repayment       1.00      1.00      1.00        72
                     Asset Disposition       1.00      1.00      1.00        41
                           Assignments       1.00      1.00      1.00        72
                Compliance Certificate       1.00      1.00      1.00        10
               Consequences of Default       0.98      1.00      0.99        45
                      Event of Default       1.00      1.00      1.00        41
               Facilities / Instrument       1.00      1.00      1.00       149
                    Financial Covenant       1.00      1.00      1.00        69
                  Financial Statements       1.00      1.00      1.00        18
                        Governing Laws       1.00      1.00      1.00        66
                Incremental Facilities 

## Test Accuracy

In [32]:
# Evaluate the model on test_dataset and compare with the encoded test_labels.
# Make predictions on the test set
predictions = trainer.predict(test_dataset)

# Extract the logits and convert to predicted labels
# (argmax over the last axis selects the highest-scoring class index per row)
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(test_labels)

# Generate a classification report (per-class precision / recall / F1 / support)
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [33]:
# Show the per-class precision / recall / F1 / support table built above.
print(report)

                                        precision    recall  f1-score   support

                            Amendments       0.95      0.95      0.95        20
Amortization Schedule / Loan Repayment       0.92      0.92      0.92        13
                     Asset Disposition       0.92      1.00      0.96        11
                           Assignments       0.75      1.00      0.86        15
                Compliance Certificate       1.00      1.00      1.00         3
               Consequences of Default       0.82      0.90      0.86        10
                      Event of Default       0.88      1.00      0.93         7
               Facilities / Instrument       0.70      1.00      0.82        16
                    Financial Covenant       0.85      1.00      0.92        11
                  Financial Statements       1.00      1.00      1.00         4
                        Governing Laws       0.91      0.91      0.91        11
                Incremental Facilities 

In [34]:
# Store the decoded predictions next to the source rows as a new column.
test_df['Predicted_Tag'] = predicted_labels_text

In [35]:
# Expose the ground-truth label under a clearer name next to Predicted_Tag.
# The label column is named "tags" (it is read as test_df['tags'] earlier in the
# notebook); the previous key "tag" matched no column, and DataFrame.rename
# ignores unknown keys by default, so the frame was silently left unchanged.
test_df = test_df.rename(columns={"tags": "Original_Tag"})

In [36]:
# Print (rows, columns) of test_df after the new column was added.
print(test_df.shape)

(1493, 9)


In [37]:
# Write test_df, including the prediction column, to an Excel workbook.
test_df.to_excel("./data/output/predictions_v15_small.xlsx")

## Generate Results on Unsampled NA data

In [None]:
# seen_df: rows from the cleaned tags file; all_df: rows from the full data file.
# The next cell subtracts seen_df from all_df to obtain the unseen rows.
seen_df = pd.read_json("./data/cleaned_tags_data.json")
all_df = pd.read_json("./data/all-data.json")

In [None]:
# Anti-join: keep only the rows of all_df that have no exact counterpart in
# seen_df on the listed columns, then drop the merge indicator again.
unseen_df = (
    all_df
    .merge(
        seen_df,
        how='left',
        on=['filename', 'tags', 'title', 'category', 'word_count', 'section_content'],
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)


In [None]:

# Normalise the plural spelling "Events of Default" to "Event of Default".
# NOTE(review): test_df tags are normalised through to_standard_tag() instead;
# confirm whether the unseen tags also need that mapping before they are passed
# to label_encoder.transform in the next cell.
unseen_df['tags'] = unseen_df['tags'].str.replace("Events of Default", "Event of Default")

In [None]:
# Model inputs: the section text of every unseen row.
unseen_texts = unseen_df['section_content'].values
# Targets: tag names converted to integer class ids by label_encoder.
unseen_labels = label_encoder.transform(unseen_df['tags'].values)

In [None]:
# Wrap the unseen texts and labels in the tokenizing Dataset; every sample is
# truncated/padded to 512 tokens.
unseen_dataset = TextDataset(
    texts=unseen_texts,
    labels=unseen_labels,
    tokenizer=tokenizer,
    max_length=512,
)

In [None]:
# Make predictions on the unseen data (rows of all_df that are not in seen_df)
predictions = trainer.predict(unseen_dataset)

# Extract the logits and convert to predicted labels
# (argmax over the last axis selects the highest-scoring class index per row)
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(unseen_labels)

# Generate a classification report.
# `labels` pins the report to every class the encoder knows. Without it,
# classification_report derives the class list from the values present in
# y_true/y_pred and raises ValueError when that list is shorter than
# target_names, which happens whenever this subset does not cover every class.
report = classification_report(
    true_labels_text,
    predicted_labels_text,
    labels=label_encoder.classes_,
    target_names=label_encoder.classes_,
)


In [None]:
# Show the classification report computed for the unseen data.
print(report)

In [None]:
# Reload the full data file and keep only rows whose word_count exceeds 20.
# The shape is printed before and after so the effect of the filter is visible.
all_df = pd.read_json("./data/all-data.json")
print(all_df.shape)
all_df = all_df.loc[all_df["word_count"] > 20]
print(all_df.shape)

In [None]:
# Preview test_df before the predicted_tags column is added below.
test_df.head()

In [None]:
# (rows, columns) of test_df; the row count must match the predictions assigned next.
test_df.shape

In [None]:
# Add the decoded predictions to test_df as the predicted_tags column.
# NOTE(review): when the notebook runs top to bottom, predicted_labels_text was
# last assigned from the predictions on unseen_dataset, not test_dataset —
# confirm it holds test-set predictions (same length and order as test_df).
test_df['predicted_tags'] = predicted_labels_text

In [None]:
# Preview test_df again, now including the predicted_tags column.
test_df.head()

In [None]:
# For every distinct section text, collect all tags attached to it so that
# repeated sections and sections carrying conflicting tags can be counted.
group_df = (
    all_df[['section_content', 'tags']]
    .groupby('section_content')
    .agg(list)
    .reset_index()
)
group_df['n_tags'] = group_df['tags'].apply(len)
group_df['n_unique_tags'] = group_df['tags'].apply(lambda tag_list: len(set(tag_list)))

# Shape of: texts that occur more than once / texts with more than one distinct tag.
print(group_df.loc[group_df['n_tags'] > 1].shape)
print(group_df.loc[group_df['n_unique_tags'] > 1].shape)

# Ten texts with the highest number of distinct tags.
group_df.sort_values(by='n_unique_tags', ascending=False).head(10)

In [None]:
# Join each test prediction to the metadata of its section text (one all_df row
# per distinct text) and export the result. validate="one_to_one" makes the
# merge raise if section_content is not unique on both sides.
(
    test_df[["section_content", "predicted_tags"]]
    .merge(
        all_df.drop_duplicates("section_content"),
        how="left",
        on="section_content",
        validate="one_to_one",
    )
    .to_excel("./data/output/test_data_predictions.xlsx")
)

In [None]:
# (rows, columns) of test_df after the export.
test_df.shape

In [None]:
# Re-print the most recently computed classification report.
print(report)

In [None]:
def predict_labels(texts, model, tokenizer, label_encoder, max_length=512, device='cuda', batch_size=32):
    """Predict a text label for every input text, without going through a Trainer.

    The texts are tokenized and scored in mini-batches so that memory use is
    bounded by ``batch_size`` instead of growing with the number of texts.

    Args:
        texts: A sequence of strings, or a single string (treated as one text).
        model: Sequence-classification model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object with ``.logits``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when given ``return_tensors="pt"``.
        label_encoder: Object whose ``inverse_transform`` maps class indices
            back to label names.
        max_length: Maximum token length; longer texts are truncated.
        device: Device to run on. If a CUDA device is requested but CUDA is not
            available, the CPU is used instead of raising.
        batch_size: Number of texts scored per forward pass (must be >= 1).

    Returns:
        Whatever ``label_encoder.inverse_transform`` returns for the predicted
        class indices, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string is one document, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Degrade gracefully on machines without a GPU instead of failing in .to().
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        device = 'cpu'

    # Move model to the target device and switch off dropout etc.
    model.to(device)
    model.eval()

    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # padding=True pads only up to the longest text of this batch
            encodings = tokenizer(
                batch_texts,
                max_length=max_length,
                truncation=True,
                padding=True,
                return_tensors="pt"
            )

            # Inputs must live on the same device as the model
            outputs = model(
                input_ids=encodings['input_ids'].to(device),
                attention_mask=encodings['attention_mask'].to(device)
            )

            # Keep only the class indices, on the CPU, to free device memory early
            batch_predictions.append(torch.argmax(outputs.logits, dim=-1).cpu())

    if batch_predictions:
        predicted_ids = torch.cat(batch_predictions).numpy()
    else:
        # No input texts: hand an empty index array to the encoder
        predicted_ids = torch.empty(0, dtype=torch.long).numpy()

    return label_encoder.inverse_transform(predicted_ids)

In [None]:
# Score the test texts with the predict_labels helper instead of the Trainer,
# and decode the encoded test labels back to tag names for comparison.
predicted_labels_text = predict_labels(test_texts.tolist(), model, tokenizer, label_encoder)
true_labels_text = label_encoder.inverse_transform(test_labels)

In [None]:
# Run the nvidia-smi shell command and show its output.
!nvidia-smi

In [None]:
# The logits/argmax steps stay commented out: predict_labels() already returned
# decoded tag names in predicted_labels_text.
# Extract the logits and convert to predicted labels
# logits = predictions.predictions
# predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [None]:
# Show the classification report built from the predict_labels() predictions.
print(report)