In [1]:
import os
os.environ['HF_TOKEN'] = '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'

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from datasets import load_dataset

In [3]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
import gc

import torch as t
from openai import OpenAIError
from tabulate import tabulate


def get_tensor_size(obj):
    size = 0
    if t.is_tensor(obj):
        size += obj.element_size() * obj.nelement()
    return size


def get_tensors_size(obj):
    if isinstance(obj, t.nn.Module):
        return sum(get_tensor_size(p) for p in obj.parameters())
    if hasattr(obj, "state_dict"):
        return sum(get_tensor_size(t) for t in obj.state_dict().values())
    return get_tensor_size(obj)


def get_device(obj):
    if t.is_tensor(obj):
        return str(obj.device)
    if isinstance(obj, t.nn.Module):
        try:
            return str(next(iter(obj.parameters())).device)
        except StopIteration:
            return "N/A"
    return "N/A"


def print_memory_status():
    t.cuda.synchronize()
    allocated = t.cuda.memory_allocated(0)
    total = t.cuda.get_device_properties(0).total_memory
    free = total - allocated
    print(f"Allocated: {allocated / 1024**3:.2f} GB")
    print(f"Total:  {total / 1024**3:.2f} GB")
    print(f"Free:  {free / 1024**3:.2f} GB")


def profile_pytorch_memory(namespace: dict, n_top: int = 10, filter_device: str = None):
    print_memory_status()

    object_sizes = {}
    for name, obj in namespace.items():
        try:
            obj_type = (
                type(obj).__name__
                if isinstance(obj, t.nn.Module)
                else f"Tensor {tuple(obj.shape)}"
                if t.is_tensor(obj)
                else None
            )
            if obj_type is None:
                continue
            device = get_device(obj)
            if filter_device and device != filter_device:
                continue
            size = get_tensors_size(obj)
            object_sizes[name] = (obj_type, device, size / (1024**3))
        except (OpenAIError, ReferenceError):
            # OpenAIError: we can't inspect the type of certain objects without triggering API request
            # ReferenceError: this object might have been garbage collected, so we don't care about it
            continue

    # Convert bytes to GB, sort by size & print
    sorted_sizes = sorted(object_sizes.items(), key=lambda x: x[1][2], reverse=True)[:n_top]
    table_data = [(name, obj_type, device, size) for name, (obj_type, device, size) in sorted_sizes]
    print(
        tabulate(
            table_data, headers=["Name", "Object", "Device", "Size (GB)"], floatfmt=".2f", tablefmt="simple_outline"
        )
    )


def find_cuda_tensors():
    cuda_tensors = []
    for obj in gc.get_objects():
        try:
            if t.is_tensor(obj) and obj.is_cuda:
                cuda_tensors.append(obj)
        except:
            pass
    return cuda_tensors


In [10]:
# Profile memory usage, and delete gemma models if we've loaded them in
namespace = globals().copy() | locals()
profile_pytorch_memory(namespace=namespace, filter_device="cuda:0")

Allocated: 4.29 GB
Total:  6.00 GB
Free:  1.71 GB
┌────────────┬───────────────────┬──────────┬─────────────┐
│ Name       │ Object            │ Device   │   Size (GB) │
├────────────┼───────────────────┼──────────┼─────────────┤
│ base_model │ Gemma2ForCausalLM │ cuda:0   │        4.87 │
│ _          │ Embedding         │ cuda:0   │        1.10 │
│ _9         │ Embedding         │ cuda:0   │        1.10 │
└────────────┴───────────────────┴──────────┴─────────────┘


In [6]:


####################################
# 1) LOAD THE TEST DATA
####################################
# Assume you have the same multi_label CSV: multi_label_patent_data.csv
# which has columns: ["text", "ipc_codes", "APPLN_YR"] (for example)

raw_dataset = load_dataset("csv", data_files="multi_label_patent_data.csv")["train"]

# Shuffle the dataset once
raw_dataset = raw_dataset.shuffle(seed=42)

# Compute how many samples = 10%
sample_size = int(0.1 * len(raw_dataset))

# Select the first 10% of shuffled samples
sampled_dataset = raw_dataset.select(range(sample_size))

# e.g., 90% train, 10% test
split = sampled_dataset.train_test_split(test_size=0.1, seed=42)
test_ds  = split["test"]

# NOTE: Make sure your CSV still has "text" and "ipc_codes" columns 
# (i.e., you didn't remove them in preprocessing).


FileNotFoundError: Unable to find 'C:/Users/valen/Documents/GitHub/TesiCS2024/_newNewCode/giordanoCode\multi_label_patent_data.csv'

In [7]:
####################################
# 2) LOAD THE TOKENIZER & FINE-TUNED MODEL
####################################
model_name = "google/gemma-2-2b"

# 1) Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2) Add the same special tokens you added during training
special_tokens_dict = {"additional_special_tokens": ["<|start_of_text|>", "<|end_of_text|>"]}
tokenizer.add_special_tokens(special_tokens_dict)

2

In [None]:
# 3) Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# 4) IMPORTANT: resize the model’s embeddings to match the new tokenizer length
base_model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
# Load LoRA adapter (saved from training)
# e.g. trainer.save_model("gemma2b-lora-patents-multilabel") -> we pass that directory here:
fine_tuned_model = PeftModel.from_pretrained(
    base_model,
    "gemma2b-lora-patents-multilabel-v2",
    device_map="auto"    # put it on GPU
)

####################################
# 3) DEFINE A PREDICTION FUNCTION
####################################
def predict_ipc_codes(model, tokenizer, text, max_new_tokens=50):
    """
    Given a patent text, build the same style of prompt used in training.
    Generate predicted IPC codes from the fine-tuned model.
    """
    # Match the same prompt format you used during training
    prompt = f"<|start_of_text|>Patent Text: {text}\nPredict the IPC codes:\n"

    # Tokenize and move inputs to the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,   # or True for sampling
            top_p=0.9,         # sampling hyperparam if do_sample=True
            temperature=0.8,
        )

    # Decode the model's output
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text

####################################
# 4) OPTIONAL: PARSE PREDICTED CODES
####################################
def parse_predicted_codes(generated_text):
    """
    A simple parser that tries to extract comma-separated IPC codes.
    Adjust it to your prompt's exact output format.
    """
    lines = generated_text.split("\n")
    # Look for the line that might contain the codes
    for line in lines:
        if "," in line:  # naive approach
            return [code.strip() for code 
            in line.split(",")]
    return []





ValueError: We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules need to be offloaded: base_model.model.model.layers.16, base_model.model.model.layers.17, base_model.model.model.layers.18, base_model.model.model.layers.19, base_model.model.model.layers.20, base_model.model.model.layers.21, base_model.model.model.layers.22, base_model.model.model.layers.23, base_model.model.model.layers.24, base_model.model.model.layers.25, base_model.model.model.norm.

In [2]:
####################################
# 5) QUALITATIVE CHECK: PRINT PREDICTIONS
####################################
import random

n_samples_to_check = 20
N_test = len(test_ds)

for i in range(n_samples_to_check):
    #idx = random.randint(0, N_test-1)
    example = test_ds[i]
    
    # "text" and "ipc_codes" should exist in your CSV
    patent_text = example["text"]
    true_codes_str = example["ipc_codes"]  # e.g., "C08F2, B65G65"

    # Generate predictions
    predicted_output = predict_ipc_codes(fine_tuned_model, tokenizer, patent_text)
    predicted_codes_list = parse_predicted_codes(predicted_output)

    # Print a short preview
    print("=" * 60)
    print(f"Example #{i+1}")
    print("Patent Text (truncated):", patent_text[:200], "...")
    print("True IPC Codes:", true_codes_str)
    print("Model Predicted Raw Output:")
    print(predicted_output)
    print("Parsed IPC Codes:", predicted_codes_list)
    print("=" * 60, "\n")


Example #1
Patent Text (truncated): REMEDY FOR ALLERGIC DISEASES IN THE REGION OF THE NOSE An anti-allergic pharmaceutical composition for nasal topical administration comprising an IgE antibody production inhibitor as an active ingredi ...
True IPC Codes: A61K45, A61K9, A61K31
Model Predicted Raw Output:
Patent Text: REMEDY FOR ALLERGIC DISEASES IN THE REGION OF THE NOSE An anti-allergic pharmaceutical composition for nasal topical administration comprising an IgE antibody production inhibitor as an active ingredient, which is effective and safe in its nasal topical administration.
Predict the IPC codes:
A61K38, A61K39
 ผมขอร้องว่าอาจจะไม่น่าสนใจให้ทราบบนเรื่องที่น่าสนใจดังนี้ แต่ไม่ใช่การรวบรวมและปูพื้นฐานการด
Parsed IPC Codes: ['Patent Text: REMEDY FOR ALLERGIC DISEASES IN THE REGION OF THE NOSE An anti-allergic pharmaceutical composition for nasal topical administration comprising an IgE antibody production inhibitor as an active ingredient', 'which is effective and safe in its nas



Example #2
Patent Text (truncated): Co-cured rubber-thermoplastic elastomer compositions A thermoplastic elastomer composition has a plastic phase or matrix of an engineering thermoplastic such as a polyamide and a crosslinked phase of  ...
True IPC Codes: C08L23, C08L77, C08L33
Model Predicted Raw Output:
Patent Text: Co-cured rubber-thermoplastic elastomer compositions A thermoplastic elastomer composition has a plastic phase or matrix of an engineering thermoplastic such as a polyamide and a crosslinked phase of cured acrylic rubbers. The rubbers contain at least two functionalized acrylic rubbers wherein the functional group such as a carboxyl, an epoxy, or a hydroxyl can be the same or different. The curing agents generally react via covalent bonding with the reactive functional groups of the rubbers. The composition can be dynamically vulcanized and when one of the rubbers is a terpolymer of ethylene-alkyl acrylate-unsaturated carboxylic acid, the composition has a single low tem