# Evaluate the Pre-trained Model (Before Fine-Tuning):

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import accelerate

# Load pre-trained PatentBERT model and tokenizer
model_name = "AI-Growth-Lab/PatentSBERTa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=8)  # Adjust num_labels as needed

# Load the dataset from train_dataset.csv
file_path = 'train_dataset.csv'
dataset = load_dataset('csv', data_files={'train': file_path})

# Tokenize the dataset
def tokenize_function(example):
    return tokenizer(example['claims'], padding='max_length', truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split the dataset into training and validation sets (80% train, 20% validation)
train_test_split = tokenized_dataset['train'].train_test_split(test_size=0.2)
validation_dataset = train_test_split['test']

# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Set up training arguments for evaluation
training_args = TrainingArguments(
    output_dir="./results_pretrained",  
    per_device_eval_batch_size=16,
    logging_dir='./logs_pretrained',
    evaluation_strategy="epoch"
)

# Set up Trainer for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics
)

# Evaluate the pre-trained model
pretrain_eval_result = trainer.evaluate()

# Save the pre-trained evaluation results
import json
with open('pretrained_eval_results.json', 'w') as f:
    json.dump(pretrain_eval_result, f)

# Print the evaluation metrics for the pre-trained model
print("Pre-trained Model Evaluation Metrics:", pretrain_eval_result)


Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at AI-Growth-Lab/PatentSBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre-trained Model Evaluation Metrics: {'eval_model_preparation_time': 0.006, 'eval_runtime': 1481.7994, 'eval_samples_per_second': 60.737, 'eval_steps_per_second': 3.796}


# Fine Tuning PatentBERT

In [12]:
import torch
torch.cuda.empty_cache()


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import joblib
import pandas as pd
import os

# Enable Tokenizer Parallelism (if safe)
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Clear CUDA cache before starting
torch.cuda.empty_cache()

# Load the dataset from train_dataset.csv
file_path = 'train_dataset.csv'
df = pd.read_csv(file_path)

# Extract the IPC section, class, and subclass from the IPC code
df['ipc_section'] = df['ipc'].apply(lambda x: x[0] if pd.notna(x) and len(x) > 0 else '')
df['ipc_class'] = df['ipc'].apply(lambda x: x[:3] if pd.notna(x) and len(x) >= 3 else '')
df['ipc_subclass'] = df['ipc'].apply(lambda x: x[:4] if pd.notna(x) and len(x) >= 4 else '')

# Initialize LabelEncoders for each level (section, class, subclass)
label_encoder_section = LabelEncoder()
label_encoder_class = LabelEncoder()
label_encoder_subclass = LabelEncoder()

# Encode the section, class, and subclass IPC codes into numerical labels
df['encoded_section'] = label_encoder_section.fit_transform(df['ipc_section'])
df['encoded_class'] = label_encoder_class.fit_transform(df['ipc_class'])
df['encoded_subclass'] = label_encoder_subclass.fit_transform(df['ipc_subclass'])

# Rename the label column to 'labels' for Hugging Face compatibility
df = df.rename(columns={'encoded_section': 'labels'})  # Start with section labels

# Convert the DataFrame into Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

# Load the PatentBERT tokenizer
model_name = "AI-Growth-Lab/PatentSBERTa"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset (using 'claims' column for input text)
def tokenize_function(example):
    return tokenizer(example['claims'], padding='max_length', truncation=True)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split the dataset into training and validation sets (80% train, 20% validation)
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
validation_dataset = train_test_split['test']

# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }



tokenizer_config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/450000 [00:00<?, ? examples/s]

In [2]:
import torch
import os
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
import joblib
import time

print("Training for IPC Section Classification...")

# Function to check available memory
def get_free_memory():
    # Get the current GPU memory usage
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    free_memory = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)
    return free_memory / (1024 ** 3)  # Return memory in GB

# Custom Trainer to dynamically adjust batch size
class DynamicBatchTrainer(Trainer):
    def __init__(self, *args, batch_size_step=5, max_batch_size=50, **kwargs):
        super().__init__(*args, **kwargs)
        self.batch_size_step = batch_size_step
        self.max_batch_size = max_batch_size

    def train(self, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None, **kwargs):
        # Start the training loop
        for epoch in range(int(self.args.num_train_epochs)):
            print(f"Starting Epoch {epoch + 1}/{self.args.num_train_epochs}")
            # Check for available GPU memory before each epoch
            free_memory = get_free_memory()
            print(f"Free GPU Memory at Epoch Start: {free_memory:.2f} GB")
            
            # Dynamically increase batch size if more memory is available and under the max limit
            if free_memory > 2 and self.args.per_device_train_batch_size < self.max_batch_size:
                new_batch_size = min(self.args.per_device_train_batch_size + self.batch_size_step, self.max_batch_size)
                print(f"Increasing batch size to: {new_batch_size}")
                self.args.per_device_train_batch_size = new_batch_size
            
            # Call the original training method
            super().train(resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
            # Pause between epochs to clear memory and check for memory status
            time.sleep(10)  # Sleep for 10 seconds to allow for memory cleanup

# Check for any existing checkpoints
checkpoint_dir = "./results_section/checkpoint"
last_checkpoint = None
if os.path.isdir(checkpoint_dir):
    last_checkpoint = checkpoint_dir

# TrainingArguments for IPC Section
initial_batch_size = 15  # Start with a smaller batch size
training_args_section = TrainingArguments(
    output_dir="./results_section",  # Directory for saving checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save checkpoint after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=initial_batch_size,
    per_device_eval_batch_size=initial_batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_section',
    logging_steps=10,
    fp16=False,  # Disable mixed precision training
    save_steps=100,  # Save model checkpoint every 500 steps
    save_total_limit=2,  # Limit the total number of saved checkpoints
    dataloader_num_workers=4,  # Speed up data loading
    load_best_model_at_end=True,
)

# Load PatentBERT for IPC Section Classification
num_section_labels = len(df['labels'].unique())
model_section = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_section_labels)

# Set up the dynamic batch trainer
trainer_section = DynamicBatchTrainer(
    model=model_section,
    args=training_args_section,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
    batch_size_step=5,  # Increase batch size by 5 when memory is available
    max_batch_size=50,  # Maximum batch size limit
)

# Fine-tune the model for IPC Section, resuming from the last checkpoint if available
trainer_section.train(resume_from_checkpoint=last_checkpoint)

# Evaluate the model for IPC Section
evaluation_results_section = trainer_section.evaluate()

# Print detailed evaluation results for IPC Section
print("Evaluation Metrics for IPC Section Classification:")
print(f"Accuracy: {evaluation_results_section['eval_accuracy'] * 100:.2f}%")
print(f"Precision: {evaluation_results_section['eval_precision'] * 100:.2f}%")
print(f"Recall: {evaluation_results_section['eval_recall'] * 100:.2f}%")
print(f"F1 Score: {evaluation_results_section['eval_f1'] * 100:.2f}%")

# Save the fine-tuned model and tokenizer for IPC Section
trainer_section.save_model("./fine_tuned_patentbert_ipc_section")
tokenizer.save_pretrained("./fine_tuned_patentbert_ipc_section")
joblib.dump(label_encoder_section, './fine_tuned_patentbert_ipc_section/label_encoder_section.pkl') 

Training for IPC Section Classification...




config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at AI-Growth-Lab/PatentSBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting Epoch 1/3
Free GPU Memory at Epoch Start: 15.36 GB
Increasing batch size to: 20


Epoch,Training Loss,Validation Loss


# Compare with the Pretrained and Finetuned for Section Prediction

In [None]:
print("\n### Model Comparison ###\n")

# Compare Accuracy
print(f"Pre-trained PatentBERT Accuracy: {evaluation_results_pretrained['eval_accuracy'] * 100:.2f}%")
print(f"Fine-tuned PatentBERT Accuracy: {evaluation_results_finetuned['eval_accuracy'] * 100:.2f}%\n")

# Compare Precision
print(f"Pre-trained PatentBERT Precision: {evaluation_results_pretrained['eval_precision'] * 100:.2f}%")
print(f"Fine-tuned PatentBERT Precision: {evaluation_results_finetuned['eval_precision'] * 100:.2f}%\n")

# Compare Recall
print(f"Pre-trained PatentBERT Recall: {evaluation_results_pretrained['eval_recall'] * 100:.2f}%")
print(f"Fine-tuned PatentBERT Recall: {evaluation_results_finetuned['eval_recall'] * 100:.2f}%\n")

# Compare F1 Score
print(f"Pre-trained PatentBERT F1 Score: {evaluation_results_pretrained['eval_f1'] * 100:.2f}%")
print(f"Fine-tuned PatentBERT F1 Score: {evaluation_results_finetuned['eval_f1'] * 100:.2f}%\n")
