# Green Patent Detection (PatentSBERTa): Active Learning + LLM→Human HITL

## 🧱 Part A: Baseline Model (Frozen Embeddings)

In [1]:
# Load dependencies
import pandas as pd
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pickle
from pathlib import Path

# Single random seed shared by the sampling, splitting and classifier steps below,
# so that repeated runs produce the same sample, splits and model
SEED = 42

In [2]:
# Load the "train" split of the patent-claims dataset and materialise it as a DataFrame
dataset = load_dataset(
    "AI-Growth-Lab/patents_claims_1.5m_traim_test",
    split="train",
)
df = dataset.to_pandas()

# Quick look at the schema (column names) followed by the first rows
print(list(df.columns))
print(df.head())

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

['id', 'date', 'text', 'A01B', 'A01C', 'A01D', 'A01F', 'A01G', 'A01H', 'A01J', 'A01K', 'A01L', 'A01M', 'A01N', 'A21B', 'A21C', 'A21D', 'A22B', 'A22C', 'A23B', 'A23C', 'A23D', 'A23F', 'A23G', 'A23J', 'A23K', 'A23L', 'A23N', 'A23P', 'A23V', 'A23Y', 'A24B', 'A24C', 'A24D', 'A24F', 'A41B', 'A41C', 'A41D', 'A41F', 'A41G', 'A41H', 'A42B', 'A42C', 'A43B', 'A43C', 'A43D', 'A44B', 'A44C', 'A44D', 'A45B', 'A45C', 'A45D', 'A45F', 'A46B', 'A46D', 'A47B', 'A47C', 'A47D', 'A47F', 'A47G', 'A47H', 'A47J', 'A47K', 'A47L', 'A61B', 'A61C', 'A61D', 'A61F', 'A61G', 'A61H', 'A61J', 'A61K', 'A61L', 'A61M', 'A61N', 'A61P', 'A61Q', 'A62B', 'A62C', 'A62D', 'A63B', 'A63C', 'A63D', 'A63F', 'A63G', 'A63H', 'A63J', 'A63K', 'B01B', 'B01D', 'B01F', 'B01J', 'B01L', 'B02B', 'B02C', 'B03B', 'B03C', 'B03D', 'B04B', 'B04C', 'B05B', 'B05C', 'B05D', 'B06B', 'B07B', 'B07C', 'B08B', 'B09B', 'B09C', 'B21B', 'B21C', 'B21D', 'B21F', 'B21G', 'B21H', 'B21J', 'B21K', 'B21L', 'B22C', 'B22D', 'B22F', 'B23B', 'B23C', 'B23D', 'B23F', '

In [3]:
# Silver label: a patent counts as green when the row-wise sum over all columns
# whose name starts with "Y02" is positive
Y02_COLS = [col for col in df.columns if col.startswith("Y02")]
df["is_green_silver"] = df[Y02_COLS].sum(axis=1).gt(0).astype(int)

# Class-balanced sample: 25k rows per label, both draws seeded with SEED
green     = df.loc[df["is_green_silver"] == 1].sample(n=25000, random_state=SEED)
not_green = df.loc[df["is_green_silver"] == 0].sample(n=25000, random_state=SEED)

# Stack the two classes, shuffle all rows and renumber them from 0
df50k = (
    pd.concat([green, not_green])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
# doc_id is the row position within the shuffled 50k frame
df50k["doc_id"] = df50k.index

# Make sure the output folder exists, then persist the sample
Path("parquet").mkdir(parents=True, exist_ok=True)
df50k.to_parquet("parquet/patents_50k_green.parquet", index=False)

# Class balance check
print(df50k["is_green_silver"].value_counts())

is_green_silver
0    25000
1    25000
Name: count, dtype: int64


In [4]:
# Reload the balanced 50k sample from disk
df50k = pd.read_parquet("parquet/patents_50k_green.parquet")

# Two-stage stratified split (both stages seeded with SEED):
#   stage 1 -> 70% train_silver, 30% held out
#   stage 2 -> the held-out 30% is divided 2:1 into pool_unlabeled (20% of the
#              total) and eval_silver (10% of the total)
train_silver, temp = train_test_split(
    df50k,
    test_size=0.30,
    stratify=df50k["is_green_silver"],
    random_state=SEED,
)
pool_unlabeled, eval_silver = train_test_split(
    temp,
    test_size=1 / 3,
    stratify=temp["is_green_silver"],
    random_state=SEED,
)

# Report the size of every split
print(f"train_silver:   {len(train_silver)}")
print(f"pool_unlabeled: {len(pool_unlabeled)}")
print(f"eval_silver:    {len(eval_silver)}")

# Persist each split so later parts can reload it without re-splitting
train_silver.to_parquet("parquet/train_silver.parquet", index=False)
pool_unlabeled.to_parquet("parquet/pool_unlabeled.parquet", index=False)
eval_silver.to_parquet("parquet/eval_silver.parquet", index=False)

train_silver:   35000
pool_unlabeled: 10000
eval_silver:    5000


In [5]:
# Load the pre-trained PatentSBERTa sentence encoder.
# In Part A it is only used to produce embeddings of the claim text; its weights are not trained here.
model = SentenceTransformer("AI-Growth-Lab/PatentSBERTa")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mMPNetModel LOAD REPORT[0m from: AI-Growth-Lab/PatentSBERTa
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [6]:
# Silver labels for the train and eval splits.
# The pool split gets no label vector: it plays the role of unlabeled data for active learning.
y_train = train_silver["is_green_silver"].values
y_eval  = eval_silver["is_green_silver"].values


def ensure_embeddings(split_name, split_df, out_dir="embeddings"):
    """Encode a split's "text" column with `model` unless a matching cache file exists.

    This replaces the former "uncomment to compute" workflow so the notebook runs
    from a fresh kernel: embeddings are computed once and reused afterwards.

    Parameters
    ----------
    split_name : str
        Suffix of the cache file, e.g. "train" -> "<out_dir>/X_train.npy".
    split_df : pandas.DataFrame
        Frame with a "text" column holding the patent claims.
    out_dir : str or pathlib.Path
        Folder for the .npy cache files (created if missing).

    Returns
    -------
    pathlib.Path
        Path of the existing or newly written .npy file.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    emb_path = out_dir / f"X_{split_name}.npy"

    # Reuse the cache only when its row count matches the split; a different
    # row count means the file belongs to an older split and must be rebuilt
    if emb_path.exists() and np.load(emb_path, mmap_mode="r").shape[0] == len(split_df):
        return emb_path

    # Encoding is the slow step of Part A (this can take some time)
    embeddings = model.encode(split_df["text"].tolist(), batch_size=64, show_progress_bar=True)
    np.save(emb_path, embeddings)
    return emb_path


# train/eval embeddings feed the baseline classifier; pool embeddings feed
# the uncertainty sampling in Part B
ensure_embeddings("train", train_silver)
ensure_embeddings("pool", pool_unlabeled)
ensure_embeddings("eval", eval_silver);

In [7]:
# Load the cached train and eval embeddings from their .npy files.
# (The pool embeddings are loaded separately in Part B, where they are first needed.)
X_train = np.load("embeddings/X_train.npy")
X_eval  = np.load("embeddings/X_eval.npy")

In [8]:
# Train Logistic Regression on the frozen embeddings (only this classifier is fitted here)
# max_iter=1000 gives it enough steps to converge on a dataset this size
# random_state=SEED keeps the fit repeatable across runs
clf = LogisticRegression(max_iter=1000, random_state=SEED)
clf.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [9]:
# Predict on the eval set and print the classification report:
# precision / recall / F1 for both classes ("not green" and "green").
# target_names follows the label order: 0 = "not green", 1 = "green".
y_pred = clf.predict(X_eval)
print(classification_report(y_eval, y_pred, target_names=["not green", "green"]))

              precision    recall  f1-score   support

   not green       0.77      0.79      0.78      2500
       green       0.78      0.76      0.77      2500

    accuracy                           0.77      5000
   macro avg       0.77      0.77      0.77      5000
weighted avg       0.77      0.77      0.77      5000



In [10]:
# Persist the fitted baseline classifier to disk.
# Part B reloads this file to score the pool (p_green) for uncertainty
# sampling, which avoids fitting the model a second time.
with Path("baseline_clf.pkl").open("wb") as model_file:
    pickle.dump(clf, model_file)

### Part A – Summary

We built a baseline green patent classifier using frozen PatentSBERTa embeddings and Logistic Regression on a balanced 50k sample (25k green, 25k not-green) derived from CPC Y02* codes as silver labels.

The dataset was split into three parts: `train_silver` (70%) for training, `pool_unlabeled` (20%) for uncertainty sampling in Part B, and `eval_silver` (10%) for evaluation.

**Results on eval_silver:**
| | Precision | Recall | F1 |
|---|---|---|---|
| Not green | 0.77 | 0.79 | 0.78 |
| Green | 0.78 | 0.76 | 0.77 |
| **Overall accuracy** | | | **0.77** |

The balanced performance across both classes confirms the baseline is stable and unbiased — a good foundation for uncertainty sampling in Part B.

## 🎯 Part B: Identify High-Risk Examples (Uncertainty Sampling)

In [11]:
# Load the saved classifier and pool embeddings from Part A.
# The pickle is opened in a `with` block so the file handle is closed immediately
# (the previous pickle.load(open(...)) left it open).
# NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote itself.
with open("baseline_clf.pkl", "rb") as f:
    clf = pickle.load(f)
X_pool = np.load("embeddings/X_pool.npy")

# Compute p_green: probability that each claim is green.
# predict_proba returns [p_not_green, p_green] per row, so column 1 is the green probability.
p_green = clf.predict_proba(X_pool)[:, 1]

# Compute uncertainty score as defined in the assignment:
# u = 1 - 2 * |p - 0.5|  ->  u=1 means most uncertain (p = 0.5), u=0 means most confident (p = 0 or 1)
u = 1 - 2 * np.abs(p_green - 0.5)

print(f"Mean uncertainty: {u.mean():.3f}")
print(f"Max uncertainty:  {u.max():.3f}")

Mean uncertainty: 0.428
Max uncertainty:  1.000


In [12]:
# Attach the scores to the pool rows.
# Assumes the parquet rows are in the same order as the rows of X_pool - TODO confirm
# if the embeddings were produced from a different version of the split.
pool_unlabeled = pd.read_parquet("parquet/pool_unlabeled.parquet")
pool_unlabeled["p_green"] = p_green
pool_unlabeled["u"]       = u

# Select the 100 rows with the highest uncertainty score
top100 = pool_unlabeled.nlargest(100, "u")

# Create the output folder first: nothing earlier in the notebook creates "csv/",
# and to_csv does not create missing directories
Path("csv").mkdir(parents=True, exist_ok=True)

# Export the required columns plus empty labeling columns to be filled in Part C
top100[["doc_id", "text", "p_green", "u"]].assign(
    llm_green_suggested = "",
    llm_confidence      = "",
    llm_rationale       = "",
    is_green_human      = "",
    notes               = ""
).to_csv("csv/hitl_green_100.csv", index=False)

print(f"Exported {len(top100)} examples")
print(f"Uncertainty range: {top100['u'].min():.3f} ‚Äì {top100['u'].max():.3f}")

Exported 100 examples
Uncertainty range: 0.987 ‚Äì 1.000


### Part B – Summary

Using the baseline classifier from Part A, we computed a predicted green probability (`p_green`) for every example in `pool_unlabeled` via `predict_proba`. We then applied the uncertainty formula from the assignment (`u = 1 − 2 · |p − 0.5|`) to score how uncertain the model is about each claim.

**Results:**
- Mean uncertainty across pool: 0.428 — the baseline is reasonably confident on most examples
- The top 100 selected examples have uncertainty scores between 0.987 and 1.000, meaning the model was nearly at a 50/50 guess for all of them

The 100 most uncertain claims were exported to `csv/hitl_green_100.csv` with empty labeling columns ready for the LLM → Human workflow in Part C.

## 🤝 Part C: Implement LLM → Human HITL (Gold Labels)

Before continuing, make sure you have completed the LLM labeling step on the HPC:

1. Ran `slurm_llm.sh` which executed `llm_label.py` on the 100 uncertain examples
2. The script loaded `Mistral-7B-Instruct-v0.2` via vllm and generated a suggested label, confidence score, and rationale for each claim
3. The output was saved to `csv/hitl_llm_labeled.csv`

**Your task now:** Go through each claim and assign your own final gold label (`is_green_human`). For each row you will see:
- The raw patent claim text
- The LLM's suggested label and confidence
- The LLM's rationale citing phrases from the claim

You make the final call. If you disagree with the LLM, set `is_green_human` accordingly and add a short note explaining why.

In [13]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Load the review table.
# If a human-reviewed file already exists (save_and_advance rewrites it after every
# click), load that one so labels and notes from an earlier session are kept.
# Otherwise start from the raw LLM output. Always loading the LLM file would restart
# at claim 1 and overwrite the earlier human labels on the next click.
human_path = Path("csv/hitl_human_labeled.csv")
df_review = pd.read_csv(human_path if human_path.exists() else "csv/hitl_llm_labeled.csv")

# A completely empty "notes" column is parsed as float NaN; make it object-typed so
# free-text notes can be stored in it without a dtype conflict
df_review["notes"] = df_review["notes"].astype("object")

# Tracks which row (0-based position) we're currently reviewing
state = {"idx": 0}

In [14]:
def _readonly_textarea(height):
    """Return a full-width, disabled text area with the given CSS height."""
    return widgets.Textarea(
        layout=widgets.Layout(width="100%", height=height),
        disabled=True,
    )


# Read-only elements that show the current claim and the LLM output
progress   = widgets.Label()
claim_text = _readonly_textarea("150px")
llm_label  = widgets.Label()
llm_conf   = widgets.Label()
llm_rat    = _readonly_textarea("80px")

# Editable field for the reviewer's own note
notes_box = widgets.Textarea(
    placeholder="Optional: add a note (especially if you disagree)",
    layout=widgets.Layout(width="100%", height="60px"),
)

# Label / navigation buttons and an output area for status messages
btn_green    = widgets.Button(description="1 - Green", button_style="success")
btn_notgreen = widgets.Button(description="0 - Not Green", button_style="danger")
btn_prev     = widgets.Button(description="‚Üê Previous")
out          = widgets.Output()

In [15]:
# Destination for the human-reviewed table; kept separate from the LLM output file
# so the original LLM suggestions are never overwritten
OUTPUT_PATH = "csv/hitl_human_labeled.csv"

def show_row(idx):
    """Populate the review widgets with the claim at position `idx` of `df_review`.

    Missing values (NaN) in the LLM columns or in `notes` are shown as "n/a" or as
    empty text instead of raising (int(NaN) fails) or displaying the literal "nan".

    Parameters
    ----------
    idx : int
        Zero-based row position in `df_review`.
    """
    row = df_review.iloc[idx]

    def text_or(value, fallback=""):
        # str(value) for real values, `fallback` for NaN/None
        return str(value) if pd.notna(value) else fallback

    suggested = row["llm_green_suggested"]
    suggested_text = int(suggested) if pd.notna(suggested) else "n/a"

    progress.value   = f"Claim {idx + 1} / {len(df_review)}"
    claim_text.value = str(row["text"])
    llm_label.value  = f"LLM suggested: {suggested_text}"
    llm_conf.value   = f"LLM confidence: {text_or(row['llm_confidence'], 'n/a')}"
    llm_rat.value    = text_or(row["llm_rationale"])
    notes_box.value  = text_or(row["notes"])

def save_and_advance(label):
    """Record the reviewer's label and note for the current claim, save, and move on.

    The whole table is written to OUTPUT_PATH after every click, so an interrupted
    session loses at most the claim currently on screen.

    Parameters
    ----------
    label : int
        Gold label chosen by the reviewer (the buttons pass 1 for green, 0 for not green).
    """
    idx = state["idx"]
    # state["idx"] is a row *position* (show_row reads with .iloc). .at is label-based,
    # so translate the position to its index label first; writing .at[idx] directly
    # would append a new row whenever the index is not 0..n-1.
    row_label = df_review.index[idx]
    df_review.at[row_label, "is_green_human"] = label
    df_review.at[row_label, "notes"]          = notes_box.value
    # Save to new file instead of overwriting the original LLM output
    df_review.to_csv(OUTPUT_PATH, index=False)

    last_idx = len(df_review) - 1
    with out:
        clear_output()
        print(f"Saved: claim {idx + 1} ‚Üí {label}")
        if idx >= last_idx:
            # Without this hint the last claim simply stays on screen after saving
            print("Reached the last claim - use Previous to revisit earlier ones.")

    # Advance by one, but never past the final row
    state["idx"] = min(idx + 1, last_idx)
    show_row(state["idx"])

In [16]:
def go_previous(_button):
    """Step back one claim (stopping at the first one) and redisplay it."""
    state["idx"] = max(state["idx"] - 1, 0)
    show_row(state["idx"])


# Connect buttons to logic.
# NOTE(review): each run of this cell registers the handlers again on the existing
# buttons; re-run the widget-construction cell first to avoid duplicate handlers.
btn_green.on_click(lambda _: save_and_advance(1))
btn_notgreen.on_click(lambda _: save_and_advance(0))
btn_prev.on_click(go_previous)

# Start at the first row that has no human label yet (row 0 if every row is labelled).
# The value is a row *position*, matching the .iloc lookup in show_row.
unlabeled_positions = np.flatnonzero(df_review["is_green_human"].isna().to_numpy())
first_unlabeled = int(unlabeled_positions[0]) if unlabeled_positions.size else 0
state["idx"] = first_unlabeled
show_row(state["idx"])

# Render the widget
display(widgets.VBox([
    progress,
    widgets.Label("Claim text:"), claim_text,
    widgets.Label("LLM output:"), llm_label, llm_conf,
    widgets.Label("LLM rationale:"), llm_rat,
    widgets.Label("Your notes:"), notes_box,
    widgets.HBox([btn_notgreen, btn_green, btn_prev]),
    out
]))

VBox(children=(Label(value='Claim 1 / 100'), Label(value='Claim text:'), Textarea(value='1. A processor compri‚Ä¶

In [17]:
# Compare the human gold labels with the LLM suggestions and list the disagreements
df_human = pd.read_csv("csv/hitl_human_labeled.csv")

# Only rows where both labels are present can be compared. NaN != x is always True,
# so an unreviewed row would otherwise be counted as an override, and int(NaN)
# in the loop below would raise.
has_both = df_human["is_green_human"].notna() & df_human["llm_green_suggested"].notna()
n_skipped = int((~has_both).sum())
if n_skipped:
    print(f"Skipping {n_skipped} rows without a human label or LLM suggestion")

# Rows where the human label differs from the LLM suggestion
compared = df_human[has_both]
overrides = compared[compared["is_green_human"] != compared["llm_green_suggested"]]

print(f"Total overrides: {len(overrides)} / {len(df_human)}")
print()

# Print 3 examples for the README
for i, (_, row) in enumerate(overrides.head(3).iterrows(), start=1):
    print(f"--- Example {i} ---")
    print(f"Claim:          {row['text'][:200]}...")
    print(f"LLM suggested:  {int(row['llm_green_suggested'])} ({row['llm_confidence']} confidence)")
    print(f"LLM rationale:  {row['llm_rationale']}")
    print(f"Human label:    {int(row['is_green_human'])}")
    print(f"Notes:          {row['notes']}")
    print()

Total overrides: 6 / 100

--- Example 1 ---
Claim:          1. A method of detecting soil macronutrient, plant nutrient uptake, phosphate contaminant in ground water, phosphate contaminant in rivers and phosphate poison in agriculture products, comprising: int...
LLM suggested:  0 (low confidence)
LLM rationale:  The claim does not mention any specific green or sustainable technology. It describes a method for detecting phosphate contaminants in soil, water, and agriculture products.
Human label:    1
Notes:          The claim mentions plan nutrient uptake in ground water and also mentions agriculture which I think should be a green technology.

--- Example 2 ---
Claim:          1. A substrate processing method comprising: a coating step that applies a coating liquid to a substrate having a front surface on which a pattern is formed, thereby forming a coating film on the subs...
LLM suggested:  0 (medium confidence)
LLM rationale:  The patent claim describes a method for processing a s

### Part C – Summary

The HITL workflow was implemented as a two-step pipeline: first an LLM step using Mistral-7B-Instruct-v0.2 via vllm on HPC, followed by a human review step in the notebook.

**LLM step:** Mistral evaluated all 100 uncertain claims and output a suggested label, confidence score, and rationale citing phrases from the claim text. The LLM labeled 95 claims as not green and 5 as green, with 72% low confidence — reflecting the genuine ambiguity of these examples selected by uncertainty sampling.

**Human step:** All 100 claims were manually reviewed using an interactive widget. The human label overrode the LLM suggestion in 6 out of 100 cases, all from 0 → 1 (not green → green).

**Override examples:**
1. A phosphate detection method in soil and groundwater — LLM labeled not green (low confidence), human labeled green as it relates to monitoring agricultural and water contamination
2. A substrate coating method reducing film thickness — LLM labeled not green (medium confidence), human labeled green as material reduction can be considered a sustainable practice
3. A method for removing surfactants using electrochemical oxidation — LLM labeled not green (low confidence), human labeled green as surfactant removal is associated with clean chemistry

**Output files:**
- `csv/hitl_llm_labeled.csv` — original LLM suggestions
- `csv/hitl_human_labeled.csv` — final gold labels after human review

## 🚀 Part D: Final Model (Fine-Tune PatentSBERTa Once)

In [18]:
# Load the human reviewed labels and train_silver
df_gold  = pd.read_csv("csv/hitl_human_labeled.csv")
df_train = pd.read_parquet("parquet/train_silver.parquet")

# Rename human label to is_green_gold for consistency
df_gold["is_green_gold"] = df_gold["is_green_human"]

# Give train_silver the same column name
df_train["is_green_gold"] = df_train["is_green_silver"]

# Concatenate train_silver + gold_100 into one training set
# The 100 examples came from the pool split which is different from the training split
# This means that the 100 examples gets added to the training set because it has no doc id's to overwrite
df_combined = pd.concat(
    [df_train, df_gold[["doc_id", "text", "is_green_gold"]]],
    ignore_index=True
)

print(f"train_silver rows: {len(df_train)}")
print(f"gold_100 rows:     {len(df_gold)}")
print(f"combined rows:     {len(df_combined)}")
print(df_combined["is_green_gold"].value_counts())

df_combined.to_parquet("parquet/train_gold.parquet", index=False)

train_silver rows: 35000
gold_100 rows:     100
combined rows:     35100
is_green_gold
0.0    17589
1.0    17511
Name: count, dtype: int64


Now that `train_gold.parquet` has been created combining `train_silver` and the 100 gold labels, 
the fine-tuning step needs to be run on HPC. Follow these steps before continuing:

1. Make sure `train_gold.parquet` and `hitl_human_labeled.csv` are available in your HPC project folder
2. Submit the SLURM job: `sbatch slurm_finetune.sh`
3. Once complete, copy the saved model folder back to your local project: `models/patentsberta-finetuned`

Then continue with the HuggingFace upload cells below.

In [22]:
# Results from the fine-tuning process.
# The two reports below are hard-coded text copied from the HPC run (finetune.py output);
# nothing is computed in this cell.
print("""
--- eval_silver ---
              precision    recall  f1-score   support
   not green       0.81      0.80      0.80      2500
       green       0.80      0.81      0.81      2500
    accuracy                           0.81      5000
   macro avg       0.81      0.81      0.81      5000
weighted avg       0.81      0.81      0.81      5000

--- gold_100 ---
              precision    recall  f1-score   support
   not green       0.95      0.61      0.74        89
       green       0.19      0.73      0.30        11
    accuracy                           0.62       100
   macro avg       0.57      0.67      0.52       100
weighted avg       0.86      0.62      0.69       100
""")


--- eval_silver ---
              precision    recall  f1-score   support
   not green       0.81      0.80      0.80      2500
       green       0.80      0.81      0.81      2500
    accuracy                           0.81      5000
   macro avg       0.81      0.81      0.81      5000
weighted avg       0.81      0.81      0.81      5000

--- gold_100 ---
              precision    recall  f1-score   support
   not green       0.95      0.61      0.74        89
       green       0.19      0.73      0.30        11
    accuracy                           0.62       100
   macro avg       0.57      0.67      0.52       100
weighted avg       0.86      0.62      0.69       100



In [None]:
from huggingface_hub import HfApi
from dotenv import load_dotenv
import os

# HF_TOKEN is read from the environment; load_dotenv() is called first
# (presumably so the token can come from a local .env file - confirm for your setup)
load_dotenv()
TOKEN    = os.getenv("HF_TOKEN")
USERNAME = "alexchrander"

# Safety switch: set to True only when you actually want to upload,
# so re-running the notebook does not trigger accidental re-uploads
UPLOAD_TO_HF = False

api = HfApi()

if UPLOAD_TO_HF:
    # Create the model repo (no error if it already exists), then push the fine-tuned model folder
    api.create_repo(
        repo_id=f"{USERNAME}/patent-sberta-green-finetuned",
        token=TOKEN,
        exist_ok=True
    )
    api.upload_folder(
        folder_path="models/patentsberta-finetuned",
        repo_id=f"{USERNAME}/patent-sberta-green-finetuned",
        token=TOKEN
    )
    print("Model uploaded successfully")
else:
    # Fixed: this line previously ended with an extra ")" and made the whole cell a SyntaxError
    print("Skipping upload ‚Äî set UPLOAD_TO_HF = True to upload")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Model uploaded successfully


In [None]:
# Publish the human-reviewed gold labels as a dataset repo.
# Reuses api, USERNAME, TOKEN and the UPLOAD_TO_HF switch defined in the model-upload cell.
if not UPLOAD_TO_HF:
    print("Skipping upload ‚Äî set UPLOAD_TO_HF = True to upload")
else:
    dataset_repo = f"{USERNAME}/patents-green-gold-dataset"
    # Create the dataset repo (no error if it already exists), then push the CSV
    api.create_repo(
        repo_id=dataset_repo,
        repo_type="dataset",
        token=TOKEN,
        exist_ok=True,
    )
    api.upload_file(
        path_or_fileobj="csv/hitl_human_labeled.csv",
        path_in_repo="hitl_human_labeled.csv",
        repo_id=dataset_repo,
        repo_type="dataset",
        token=TOKEN,
    )
    print("Dataset uploaded successfully")

Dataset uploaded successfully
