In [1]:
# !pip install -q transformers[torch] accelerate
# !pip3 install -q wandb
# !pip3 install -q openpyxl

In [33]:
import json 
import pandas as pd 
import os 
import pickle
from pathlib import Path

In [3]:
import numpy as np 

In [5]:

import torch
from torch.utils.data import Dataset
from transformers import LongformerTokenizer, LongformerForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# !pip3 uninstall -q wandb -y

In [7]:
import wandb

In [8]:
wandb.login(key="4de6103347df6561e7258cdef0ef60bbc1233695", relogin=True)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
MODEL_DIRECTORY = "./trained_model/v1-longformer"
DATA_DIRECTORY = "./data/dataset/v7/"

In [10]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=4096):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [11]:
train_df = pd.read_json(os.path.join(DATA_DIRECTORY, "train.json"))

In [12]:
print(train_df.shape)

(3085, 9)


In [13]:
print(train_df.filename.unique().shape)

(138,)


In [14]:
texts = train_df['section_content'].values
labels_text = train_df['tags'].values

In [15]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_text)

In [16]:
os.makedirs(MODEL_DIRECTORY)

FileExistsError: [Errno 17] File exists: './trained_model/v1-longformer'

In [17]:
with open(os.path.join(MODEL_DIRECTORY, "tag.le"), 'wb') as file: 
    pickle.dump(label_encoder, file)

In [18]:
num_labels = train_df.tags.unique().shape[0]
print("Number of labels", num_labels)

Number of labels 23


## Calculate Class weights

In [19]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [20]:
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

print("Class Weights: ", class_weights_dict)


Class Weights:  {0: 2.2355072463768115, 1: 3.8322981366459627, 2: 1.889161053276179, 3: 3.945012787723785, 4: 2.273397199705232, 5: 1.166351606805293, 6: 2.1633941093969145, 7: 1.6160293347302253, 8: 2.6826086956521737, 9: 3.945012787723785, 10: 1.8125734430082256, 11: 2.0635451505016724, 12: 2.353165522501907, 13: 0.07037273598248095, 14: 7.451690821256038, 15: 3.8322981366459627, 16: 2.0635451505016724, 17: 2.9158790170132325, 18: 1.8125734430082256, 19: 7.059496567505721, 20: 3.945012787723785, 21: 3.6251468860164513, 22: 2.57943143812709}


In [21]:
class_weight_tensor = torch.tensor(class_weights, dtype=torch.float) 

In [22]:
def compute_weighted_loss(model, inputs, return_outputs=False): 
    labels = inputs.get("labels") 
    outputs = model(**inputs)
    logits = outputs.get("logits")
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weight_tensor.to(logits.device))
    loss = loss_fct(logits, labels) 
    return (loss, outputs) if return_outputs else loss

# Load Model

In [23]:
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')



## Compute Metrics callback

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [25]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_score(labels, predictions, )
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [26]:
MODEL_DIRECTORY

'./trained_model/v1-longformer'

In [27]:
class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [28]:
dataset = TextDataset(texts, labels, tokenizer, max_length=4096)

In [29]:
print(MODEL_DIRECTORY)

./trained_model/v1-longformer


In [30]:
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', num_labels=23)

  return self.fget.__get__(instance, owner)()
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:

training_args = TrainingArguments(
    output_dir=os.path.join(MODEL_DIRECTORY, './results'),
    run_name=f"{Path(MODEL_DIRECTORY).name}",
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_dir=os.path.join(MODEL_DIRECTORY, './logs'),
    logging_steps=10,
    learning_rate=2e-5,
    eval_strategy="epoch"
)

# Initialize Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset, 
    compute_metrics=compute_metrics, 
    class_weights=class_weight_tensor
)

# Train the model


In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33manurag-credcore[0m ([33mcredcore[0m). Use [1m`wandb login --relogin`[0m to force relogin


Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss


In [None]:
# eval_results = trainer.evaluate()
# print(f"Evaluation results: {eval_results}")

In [None]:
# Save the model and tokenizer to disk
model.save_pretrained(MODEL_DIRECTORY)
tokenizer.save_pretrained(MODEL_DIRECTORY)

# Load Model For Inferences

In [None]:
import json 
import pandas as pd 
import os 
import pickle

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np 

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import LabelEncoder


In [None]:
print(MODEL_DIRECTORY)

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIRECTORY)
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_DIRECTORY)

In [None]:
test_df = pd.read_json(os.path.join(DATA_DIRECTORY, "test.json"))
test_df = test_df.drop_duplicates("section_content")


In [None]:
with open(os.path.join(MODEL_DIRECTORY, "tag.le"), 'rb') as file: 
    label_encoder = pickle.load(file)

In [None]:
test_texts = test_df['section_content'].values
test_labels = test_df['tags'].values
test_labels = label_encoder.transform(test_labels)

In [None]:
from sklearn.metrics import classification_report

In [None]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [None]:
test_dataset = TextDataset(test_texts, test_labels, tokenizer, max_length=512)

In [None]:
inference_args = TrainingArguments(
    output_dir='./results',  
    per_device_eval_batch_size=32, 
    dataloader_drop_last=False,  
    no_cuda=False if torch.cuda.is_available() else True, 
    seed=42,  
    report_to="none" # Disable wandb reporting. 
)
trainer = Trainer(
    model=model, 
    args = inference_args
)

In [None]:
os.environ["WANDB_DISABLED"] = "true"

## Training Accuracy

In [None]:
# Make predictions on the test set
predictions = trainer.predict(dataset)

# Extract the logits and convert to predicted labels
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(labels)

# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [None]:
print(report)

## Test Accuracy.

In [None]:
# Make predictions on the test set
predictions = trainer.predict(test_dataset)

# Extract the logits and convert to predicted labels
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(test_labels)

# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [None]:
print(report)

In [None]:
test_df['Predicted_Tag'] = predicted_labels_text

In [None]:
test_df = test_df.rename(columns = {"tag": "Original_Tag"})

In [None]:
print(test_df.shape)

In [None]:
test_df.to_excel("./data/output/predictions_v9_small.xlsx")

# Train using K-fold Cross Validation

In [29]:
from sklearn.model_selection import StratifiedKFold, KFold
import json 
from pathlib import Path

In [30]:
k=5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [31]:
unique_files = train_df[train_df.filename.str.contains("synth") == False].filename.unique()

In [32]:
splits = list(kf.split(unique_files))
for fold, (train_file_index, val_file_index)  in enumerate(splits):
    train_file_names = set(unique_files[train_file_index])
    val_file_names = set(unique_files[val_file_index])
    
    train = train_df[train_df.filename.isin(train_file_names)]
    val  = train_df[train_df.filename.isin(val_file_names)]
    print(train.shape, train.tags.unique().shape)
    print(val.shape, val.tags.unique().shape)

(2341, 9) (23,)
(573, 9) (23,)
(2358, 9) (23,)
(556, 9) (21,)
(2204, 9) (23,)
(710, 9) (23,)
(2372, 9) (23,)
(542, 9) (23,)
(2381, 9) (23,)
(533, 9) (23,)


# Split by files

In [36]:
unique_labels = train_df.tags.unique()

In [None]:
fold_accuracies = [] 
for fold, (train_file_index, val_file_index)  in enumerate(splits):
    
    train_file_names = set(unique_files[train_file_index])
    val_file_names = set(unique_files[val_file_index])
    
    train = train_df[train_df.filename.isin(train_file_names)]
    val  = train_df[train_df.filename.isin(val_file_names)]
    
    val_labels = label_encoder.transform(val['tags'].values)
    train_labels = label_encoder.transform(train['tags'].values)
    
    train_dataset = TextDataset(train['section_content'].values, train_labels ,  tokenizer, max_length=512)
    val_dataset = TextDataset(val['section_content'].values, val_labels,  tokenizer, max_length=512)
    
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
    
    run_name = f"{Path(MODEL_DIRECTORY).name}_fold_{fold+1}"    
    print("Starting ", run_name)
    wandb.init(project="huggingface", name=run_name)
    
    training_args = TrainingArguments(
        output_dir=os.path.join(MODEL_DIRECTORY, f'./results_fold_{fold+1}'),
        run_name=run_name,
        num_train_epochs=30,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        logging_dir=os.path.join(MODEL_DIRECTORY, f'./logs_fold_{fold+1}'),
        logging_steps=10,
        learning_rate=2e-5,
        eval_strategy="epoch", 
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='recall'

    )

    # Initialize Trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset, 
        compute_metrics=compute_metrics, 
        class_weights=class_weight_tensor, 
    )
    trainer.train()

    # Train the model
    # Make predictions on the test set
    predictions = trainer.predict(val_dataset)

    # Extract the logits and convert to predicted labels
    logits = predictions.predictions
    predicted_labels = np.argmax(logits, axis=-1)

    # Convert numeric labels back to original text labels for comparison
    predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
    true_labels_text = label_encoder.inverse_transform(val_labels)
    
    val['predicted_tags'] = predicted_labels_text
    val.to_json(f"./data/output/val_{Path(MODEL_DIRECTORY).name}_fold_{fold+1}.json", orient="records")
    # Generate a classification report
    report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_, labels=unique_labels, output_dict=True)
    with open(f"./data/output/tag_classification_fold_{fold+1}_report.json", "w") as file: 
        json.dump(report, file)
        
    print(report)
        
    
    print("Generating Evaluation results")
    eval_result = trainer.evaluate()
    # Save accuracy for this fold
    fold_accuracies.append((report, eval_result['eval_accuracy']))
    del model 
    torch.cuda.empty_cache()

# Split by rows

In [None]:
fold_accuracies = [] 
for fold, (train_index, val_index) in enumerate(kf.split(train_df, train_df['tags'].values)):
    print(f"Fold {fold + 1}/{k}")
    train = train_df.iloc[train_index]
    val = train_df.iloc[val_index]
    
    val_labels = label_encoder.transform(val['tags'].values)
    train_labels = label_encoder.transform(train['tags'].values)
    
    train_dataset = TextDataset(train['section_content'].values, train_labels ,  tokenizer, max_length=512)
    val_dataset = TextDataset(val['section_content'].values, val_labels,  tokenizer, max_length=512)
    
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
    
    run_name = f"{Path(MODEL_DIRECTORY).name}_fold_{fold+1}"
    wandb.run_name = run_name


    
    print("Starting ", run_name)
    
    training_args = TrainingArguments(
        output_dir=os.path.join(MODEL_DIRECTORY, f'./results_fold_{fold+1}'),
        run_name=run_name,
        num_train_epochs=30,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        logging_dir=os.path.join(MODEL_DIRECTORY, f'./logs_fold_{fold+1}'),
        logging_steps=10,
        learning_rate=2e-5,
        eval_strategy="epoch", 
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='recall'

    )

    # Initialize Trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset, 
        compute_metrics=compute_metrics, 
        class_weights=class_weight_tensor, 
    )
    trainer.train()

    # Train the model
    # Make predictions on the test set
    predictions = trainer.predict(val_dataset)

    # Extract the logits and convert to predicted labels
    logits = predictions.predictions
    predicted_labels = np.argmax(logits, axis=-1)

    # Convert numeric labels back to original text labels for comparison
    predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
    true_labels_text = label_encoder.inverse_transform(val_labels)
    
    val['predicted_tags'] = predicted_labels_text
    val.to_json(f"./data/output/val_{Path(MODEL_DIRECTORY).name}_fold_{fold+1}.json", orient="records")
    # Generate a classification report
    report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_, output_dict=True)
    with open(f"./data/output/tag_classification_fold_{fold+1}_report.json", "w") as file: 
        json.dump(report, file)
        
    print(report)
        
    
    print("Generating Evaluation results")
    eval_result = trainer.evaluate()
    # Save accuracy for this fold
    fold_accuracies.append((report, eval_result['eval_accuracy']))
    del model 
    torch.cuda.empty_cache()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/5
Starting  v11-small_fold_1


[34m[1mwandb[0m: Currently logged in as: [33manurag-credcore[0m ([33mcredcore[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0772,2.362797,0.473258,0.469511,0.838682,0.549513
2,0.8162,1.399544,0.708266,0.592619,0.890267,0.688403
3,0.3193,0.696363,0.846029,0.761781,0.93949,0.823779
4,0.1769,0.331885,0.910859,0.837361,0.915628,0.86735
5,0.0684,0.349098,0.91248,0.823768,0.952591,0.876586
6,0.0308,0.320608,0.9141,0.840754,0.968206,0.893099
7,0.0228,0.250025,0.93517,0.87828,0.932558,0.900807
8,0.1612,0.27392,0.941653,0.887081,0.953572,0.917525
9,0.0083,0.318282,0.936791,0.87363,0.9553,0.908921
10,0.1018,0.296882,0.940032,0.880521,0.958841,0.915121


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['predicted_tags'] = predicted_labels_text


{'Additional Liens': {'precision': 0.9166666666666666, 'recall': 0.9166666666666666, 'f1-score': 0.9166666666666666, 'support': 12}, 'Asset Disposition': {'precision': 0.875, 'recall': 1.0, 'f1-score': 0.9333333333333333, 'support': 7}, 'Compliance Certificate': {'precision': 0.875, 'recall': 0.9333333333333333, 'f1-score': 0.9032258064516129, 'support': 15}, 'Consequences of Default': {'precision': 0.8571428571428571, 'recall': 0.8571428571428571, 'f1-score': 0.8571428571428571, 'support': 7}, 'Event of Default': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11}, 'Facilities / Instrument': {'precision': 0.875, 'recall': 0.9130434782608695, 'f1-score': 0.8936170212765957, 'support': 23}, 'Financial Covenant': {'precision': 0.9230769230769231, 'recall': 1.0, 'f1-score': 0.9600000000000001, 'support': 12}, 'Financial Statements': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 16}, 'Governing Laws': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support

Fold 2/5


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting  v11-small_fold_2


Epoch,Training Loss,Validation Loss


In [35]:
print("")




In [None]:
x[0]

In [73]:
fold_data = []
for x in fold_accuracies[1:]:
    for key, value in x[0].items(): 
        if type(value) != dict:
            continue
        fold_data.append({
            "tag": key, 
            "precision": value['precision'], 
            'recall' : value['recall'], 
            'f1-score': value['f1-score'], 
            'support': value['support']
        })
    break

In [76]:
for i in fold_accuracies: 
    print(i[1])

0.9529983792544571
0.946515397082658
0.9124797406807131
0.9562398703403565
0.9141004862236629
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


In [74]:
fold_df = pd.DataFrame(fold_data)

In [75]:
print(fold_df.groupby('tag').agg('mean').to_string())

                                    precision    recall  f1-score  support
tag                                                                       
Additional Liens                     0.909091  1.000000  0.952381     50.0
Asset Disposition                    0.966667  1.000000  0.983051     29.0
Compliance Certificate               0.920635  1.000000  0.958678     58.0
Consequences of Default              0.935484  1.000000  0.966667     29.0
Event of Default                     0.962963  1.000000  0.981132     52.0
Facilities / Instrument              0.938776  1.000000  0.968421     92.0
Financial Covenant                   0.961538  1.000000  0.980392     50.0
Financial Statements                 0.971014  1.000000  0.985294     67.0
Governing Laws                       0.911111  1.000000  0.953488     41.0
Incremental Facilities               0.781250  1.000000  0.877193     25.0
Interest Rate                        0.919355  1.000000  0.957983     57.0
Loan Repayment           

## Generate Results on Unsampled NA data

In [None]:
seen_df = pd.read_json("./data/cleaned_tags_data.json")
all_df = pd.read_json("./data/all-data.json")

In [None]:
unseen_df = all_df.merge(seen_df, how='left', on=['filename', 'tags', 'title', 'category', 'word_count', 'section_content'], indicator=True, )
unseen_df = unseen_df[unseen_df['_merge'] == 'left_only']
unseen_df = unseen_df.drop(columns=['_merge'])


In [None]:

unseen_df['tags'] = unseen_df['tags'].str.replace("Events of Default", "Event of Default")

In [None]:
unseen_texts = unseen_df['section_content'].values
unseen_labels = unseen_df['tags'].values
unseen_labels = label_encoder.transform(unseen_labels)

In [None]:
unseen_dataset = TextDataset(unseen_texts, unseen_labels, tokenizer, max_length=512)

In [None]:
# Make predictions on the test set
predictions = trainer.predict(unseen_dataset)

# Extract the logits and convert to predicted labels
logits = predictions.predictions
predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
predicted_labels_text = label_encoder.inverse_transform(predicted_labels)
true_labels_text = label_encoder.inverse_transform(unseen_labels)

# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [None]:
print(report)

In [None]:
all_df = pd.read_json("./data/all-data.json")
print(all_df.shape)
all_df = all_df[all_df.word_count > 20]
print(all_df.shape)

In [None]:
test_df.head()

In [None]:
test_df.shape

In [None]:
test_df['predicted_tags'] = predicted_labels_text

In [None]:
test_df.head()

In [None]:
group_df = all_df[['section_content', 'tags']].groupby('section_content').agg(list).reset_index()
group_df['n_tags'] = group_df.tags.apply(lambda x: len(x))
group_df['n_unique_tags'] = group_df.tags.apply(lambda x: len(set(x)))
print(group_df[group_df.n_tags > 1].shape)
print(group_df[group_df.n_unique_tags > 1].shape)
group_df.sort_values(by='n_unique_tags', ascending=False).head(10)

In [None]:
test_df[["section_content", "predicted_tags"]].merge(all_df.drop_duplicates("section_content"), how="left", on="section_content", validate="one_to_one").to_excel("./data/output/test_data_predictions.xlsx")

In [None]:
test_df.shape

In [None]:
print(report)

In [None]:
def predict_labels(texts, model, tokenizer, label_encoder, max_length=512, device='cuda'):
    # Move model to the specified device
    model.to(device)
    
    # Tokenize the texts
    encodings = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    # Move tensors to the same device as the model
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)
    
    # Put the model in evaluation mode
    model.eval()
    
    # Perform the inference
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    
    # Move predictions back to the CPU before converting to numpy
    predicted_labels = label_encoder.inverse_transform(predictions.cpu().numpy())
    
    return predicted_labels

In [None]:
predicted_labels_text = predict_labels(test_texts.tolist(), model, tokenizer, label_encoder)
true_labels_text = label_encoder.inverse_transform(test_labels)

In [None]:
!nvidia-smi

In [None]:
# Extract the logits and convert to predicted labels
# logits = predictions.predictions
# predicted_labels = np.argmax(logits, axis=-1)

# Convert numeric labels back to original text labels for comparison
# Generate a classification report
report = classification_report(true_labels_text, predicted_labels_text, target_names=label_encoder.classes_)


In [None]:
print(report)