# M4: Assignment 2

## Part A: Baseline Model (Frozen Embeddings)

In [1]:
# !pip install sentence-transformers scikit-learn pandas pyarrow

In [2]:
# !pip install --upgrade transformers sentence-transformers scikit-learn pandas pyarrow

In [3]:
# !pip install "huggingface-hub>=0.34.0,<1.0" --upgrade

In [4]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [5]:
# Load data from HuggingFace
from datasets import load_dataset

dataset = load_dataset("AI-Growth-Lab/patents_claims_1.5m_traim_test")

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'date', 'text', 'A01B', 'A01C', 'A01D', 'A01F', 'A01G', 'A01H', 'A01J', 'A01K', 'A01L', 'A01M', 'A01N', 'A21B', 'A21C', 'A21D', 'A22B', 'A22C', 'A23B', 'A23C', 'A23D', 'A23F', 'A23G', 'A23J', 'A23K', 'A23L', 'A23N', 'A23P', 'A23V', 'A23Y', 'A24B', 'A24C', 'A24D', 'A24F', 'A41B', 'A41C', 'A41D', 'A41F', 'A41G', 'A41H', 'A42B', 'A42C', 'A43B', 'A43C', 'A43D', 'A44B', 'A44C', 'A44D', 'A45B', 'A45C', 'A45D', 'A45F', 'A46B', 'A46D', 'A47B', 'A47C', 'A47D', 'A47F', 'A47G', 'A47H', 'A47J', 'A47K', 'A47L', 'A61B', 'A61C', 'A61D', 'A61F', 'A61G', 'A61H', 'A61J', 'A61K', 'A61L', 'A61M', 'A61N', 'A61P', 'A61Q', 'A62B', 'A62C', 'A62D', 'A63B', 'A63C', 'A63D', 'A63F', 'A63G', 'A63H', 'A63J', 'A63K', 'B01B', 'B01D', 'B01F', 'B01J', 'B01L', 'B02B', 'B02C', 'B03B', 'B03C', 'B03D', 'B04B', 'B04C', 'B05B', 'B05C', 'B05D', 'B06B', 'B07B', 'B07C', 'B08B', 'B09B', 'B09C', 'B21B', 'B21C', 'B21D', 'B21F', 'B21G', 'B21H', 'B21J', 'B21K', 'B21L', 'B22

In [7]:
# Create the green label (is_green_silver)

train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

y02_columns = ['Y02A', 'Y02B', 'Y02C', 'Y02D', 'Y02E', 'Y02P', 'Y02T', 'Y02W']

# Create green label: 1 if any Y02 column = 1, else 0
train_df['is_green_silver'] = train_df[y02_columns].max(axis=1)
test_df['is_green_silver'] = test_df[y02_columns].max(axis=1)

In [8]:
# Sample balanced data (25k green + 25k not green)

# Separate green and not green patents from train set
green_patents = train_df[train_df['is_green_silver'] == 1]
not_green_patents = train_df[train_df['is_green_silver'] == 0]

# Sample 25k from each
green_sample = green_patents.sample(n=25000, random_state=42)
not_green_sample = not_green_patents.sample(n=25000, random_state=42)

# Combine into 50k balanced dataset
patents_50k_green = pd.concat([green_sample, not_green_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
# Create custom splits and prepare columns

# Split into train_silver (40k) and eval_silver (10k)
patents_50k_green['split'] = (
    ['train_silver'] * 40000 +
    ['pool_unlabeled'] * 5000 +
    ['eval_silver'] * 5000 
)

# Keep only needed columns
patents_50k_green = patents_50k_green[['id', 'date', 'text', 'is_green_silver', 'split']]

In [10]:
# Save as parquet file
patents_50k_green.to_parquet('patents_50k_green.parquet', index=False)

In [11]:
# Load the prepared dataset
df = pd.read_parquet('patents_50k_green.parquet')

# Split into train, pool unlabeled and eval
train_df = df[df['split'] == 'train_silver'].reset_index(drop=True)
pool_unlabeled_df = df[df['split'] == 'pool_unlabeled'].reset_index(drop=True)
eval_df = df[df['split'] == 'eval_silver'].reset_index(drop=True)

In [12]:
# Load the model
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

In [None]:
# Uncomment to create embeddings. This can take a while, so I've have saved the embeddings as .npy files in the repository.

# Create embeddings. Converts each text into a 768 dimensional vector
# train_embeddings = model.encode(train_df['text'].tolist(), show_progress_bar=True)
# pool_unlabeled_embeddings = model.encode(pool_unlabeled_df['text'].tolist(), show_progress_bar=True)
# eval_embeddings = model.encode(eval_df['text'].tolist(), show_progress_bar=True)

In [None]:
# Uncomment to save embeddings to file.

# Save embeddings to file
# np.save('train_embeddings.npy', train_embeddings)
# np.save('pool_unlabeled_embeddings.npy', pool_unlabeled_embeddings)
# np.save('eval_embeddings.npy', eval_embeddings)

In [15]:
# Load pre-computed embeddings
train_embeddings = np.load('train_embeddings.npy')
pool_unlabeled_embeddings = np.load('pool_unlabeled_embeddings.npy')
eval_embeddings = np.load('eval_embeddings.npy')

In [16]:
# Print the shape of the embeddings
print(train_embeddings.shape)
print(pool_unlabeled_embeddings.shape)
print(eval_embeddings.shape)

(40000, 768)
(5000, 768)
(5000, 768)


In [17]:
# Train Logistic Regression classifier on frozen embeddings
classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier.fit(train_embeddings, train_df['is_green_silver'])

# Evaluate on eval set
predictions = classifier.predict(eval_embeddings)
probabilities = classifier.predict_proba(eval_embeddings)

# Report metrics
print(classification_report(eval_df['is_green_silver'], predictions, 
                          target_names=['Not Green', 'Green']))

              precision    recall  f1-score   support

   Not Green       0.78      0.80      0.79      2466
       Green       0.80      0.78      0.79      2534

    accuracy                           0.79      5000
   macro avg       0.79      0.79      0.79      5000
weighted avg       0.79      0.79      0.79      5000



## Part B: Identify High-Risk Examples (Uncertainty Sampling)

In [18]:
# Load pool_unlabeled dataframe
pool_df = df[df['split'] == 'pool_unlabeled'].reset_index(drop=True)

# Get predicted probabilities for pool_unlabeled
pool_probabilities = classifier.predict_proba(pool_unlabeled_embeddings)
p_green = pool_probabilities[:, 1]

# Compute uncertainty score: u = 1 - 2 * |p - 0.5|
u = 1 - 2 * np.abs(p_green - 0.5)

# Add to dataframe
pool_df['p_green'] = p_green
pool_df['u'] = u

# Select top 100 highest uncertainty examples
top_100 = pool_df.nlargest(100, 'u').reset_index(drop=True)

# Export with empty labeling columns
hitl_df = top_100[['id', 'text', 'p_green', 'u']].copy()
hitl_df['human_label'] = ''
hitl_df['notes'] = ''

hitl_df.to_csv('hitl_green_100.csv', index=False)

In [19]:
hitl_df.head().sort_values(by='u', ascending=False)

Unnamed: 0,id,text,p_green,u,human_label,notes
0,9647788,"1. A system, comprising: a memory that stores ...",0.499949,0.999898,,
1,8999166,1. A method for allowing access to the bottom ...,0.499604,0.999209,,
2,8555568,1. A drain inlet vault comprising a plurality ...,0.499371,0.998741,,
3,9637231,1. A method comprising: operating an aerial ve...,0.499266,0.998531,,
4,9698560,1. A laser ignition system for an internal com...,0.49921,0.998419,,


## Part C: Implement LLM → Human HITL (Gold Labels)

In [20]:
## Part C: Implement LLM → Human HITL (Gold Labels)

# Load the CSV from Part B
hitl_df = pd.read_csv('hitl_green_100.csv')

# Add new columns with correct data types
hitl_df['llm_green_suggested'] = 0  # Initialize as integer
hitl_df['llm_confidence'] = ''
hitl_df['llm_rationale'] = ''
hitl_df['is_green_human'] = 0  # Initialize as integer
hitl_df['override'] = 0  # Initialize as integer

# Save template for manual labeling
hitl_df.to_csv('hitl_green_100_for_labeling.csv', index=False)

In [21]:
# LLM evaluation using Ollama
import requests
import re

def llm_evaluate_patent(text):
    """
    Send patent text to Ollama and get green classification suggestion.
    """
    prompt = f"""You are an expert in green technology patents. Analyze this patent claim and determine if it relates to climate change mitigation or green technology (Y02 classification).

Patent Claim:
{text}

Provide your response in exactly this format:
Classification: [YES/NO]
Confidence: [low/medium/high]
Rationale: [1-3 sentences citing specific phrases from the claim]
"""
    
    response = requests.post('http://localhost:11434/api/generate',
                            json={
                                "model": "gemma3:1b",  # or llama2, mistral, etc.
                                "prompt": prompt,
                                "stream": False,
                                "temperature": 0.3
                            })
    
    return response.json()['response']

def parse_llm_response(response):
    """
    Parse LLM response to extract structured fields.
    """
    # Extract classification
    classification_match = re.search(r'Classification:\s*(YES|NO)', response, re.IGNORECASE)
    llm_green = 1 if classification_match and classification_match.group(1).upper() == 'YES' else 0
    
    # Extract confidence
    confidence_match = re.search(r'Confidence:\s*(low|medium|high)', response, re.IGNORECASE)
    confidence = confidence_match.group(1).lower() if confidence_match else 'unknown'
    
    # Extract rationale
    rationale_match = re.search(r'Rationale:\s*(.+)', response, re.IGNORECASE | re.DOTALL)
    rationale = rationale_match.group(1).strip() if rationale_match else response
    
    return llm_green, confidence, rationale

# Process each row with LLM
for idx, row in hitl_df.iterrows():
    print(f"Processing {idx+1}/100...")
    llm_response = llm_evaluate_patent(row['text'])
    llm_green, confidence, rationale = parse_llm_response(llm_response)
    
    hitl_df.at[idx, 'llm_green_suggested'] = llm_green
    hitl_df.at[idx, 'llm_confidence'] = confidence
    hitl_df.at[idx, 'llm_rationale'] = rationale

# Save with LLM suggestions
hitl_df.to_csv('hitl_green_100_with_llm.csv', index=False)
print("LLM evaluation complete. Review hitl_green_100_with_llm.csv and add human labels.")

Processing 1/100...
Processing 2/100...
Processing 3/100...
Processing 4/100...
Processing 5/100...
Processing 6/100...
Processing 7/100...
Processing 8/100...
Processing 9/100...
Processing 10/100...
Processing 11/100...
Processing 12/100...
Processing 13/100...
Processing 14/100...
Processing 15/100...
Processing 16/100...
Processing 17/100...
Processing 18/100...
Processing 19/100...
Processing 20/100...
Processing 21/100...
Processing 22/100...
Processing 23/100...
Processing 24/100...
Processing 25/100...
Processing 26/100...
Processing 27/100...
Processing 28/100...
Processing 29/100...
Processing 30/100...
Processing 31/100...
Processing 32/100...
Processing 33/100...
Processing 34/100...
Processing 35/100...
Processing 36/100...
Processing 37/100...
Processing 38/100...
Processing 39/100...
Processing 40/100...
Processing 41/100...
Processing 42/100...
Processing 43/100...
Processing 44/100...
Processing 45/100...
Processing 46/100...
Processing 47/100...
Processing 48/100...
P

In [22]:
# Human review step
# After manually adding 'is_green_human' column to the CSV, run this:

hitl_final = pd.read_csv('hitl_green_100_with_llm.csv')

# Calculate overrides
hitl_final['override'] = (hitl_final['llm_green_suggested'] != hitl_final['is_green_human']).astype(int)
override_count = hitl_final['override'].sum()

print(f"Human overrode LLM in {override_count} out of 100 cases ({override_count}%)")

# Show examples of overrides
override_examples = hitl_final[hitl_final['override'] == 1][['id', 'text', 'llm_green_suggested', 'is_green_human', 'llm_rationale', 'notes']].head(3)
print("\nExample overrides:")
print(override_examples)

# Save final labeled dataset
hitl_final.to_csv('hitl_green_100_final.csv', index=False)

Human overrode LLM in 89 out of 100 cases (89%)

Example overrides:
        id                                               text  \
1  8999166  1. A method for allowing access to the bottom ...   
2  8555568  1. A drain inlet vault comprising a plurality ...   
3  9637231  1. A method comprising: operating an aerial ve...   

   llm_green_suggested  is_green_human  \
1                    1               0   
2                    1               0   
3                    1               0   

                                       llm_rationale  notes  
1  The claim details a process for accessing a wa...    NaN  
2  The claim describes a structure designed for w...    NaN  
3  The claim details a method for aerial vehicle ...    NaN  


## Part D: Final Model (Fine-Tune PatentSBERTa Once)

In [23]:
## Part D: Final Model (Fine-Tune PatentSBERTa Once)

# Load HITL gold labels
hitl_gold = pd.read_csv('hitl_green_100_final.csv')

# Create is_green_gold column for the full dataset
# Start with silver labels
df['is_green_gold'] = df['is_green_silver']

# Override with gold labels for the 100 HITL examples
for _, row in hitl_gold.iterrows():
    df.loc[df['id'] == row['id'], 'is_green_gold'] = row['is_green_human']

# Prepare training data (train_silver + gold_100)
train_gold_df = df[df['split'].isin(['train_silver', 'pool_unlabeled'])].copy()
train_gold_df = train_gold_df[train_gold_df['id'].isin(df[df['split'] == 'train_silver']['id']) | 
                               train_gold_df['id'].isin(hitl_gold['id'])]

eval_df = df[df['split'] == 'eval_silver'].reset_index(drop=True)

In [24]:
# Fine-tune PatentSBERTa
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('AI-Growth-Lab/PatentSBERTa')
model = AutoModelForSequenceClassification.from_pretrained('AI-Growth-Lab/PatentSBERTa', num_labels=2)

# Prepare datasets
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)

train_dataset = Dataset.from_pandas(train_gold_df[['text', 'is_green_gold']])
train_dataset = train_dataset.rename_column('is_green_gold', 'labels')
train_dataset = train_dataset.map(tokenize_function, batched=True)

eval_dataset = Dataset.from_pandas(eval_df[['text', 'is_green_gold']])
eval_dataset = eval_dataset.rename_column('is_green_gold', 'labels')
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# ADD THIS FUNCTION - compute metrics for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

# Trainer - ADD compute_metrics parameter
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune
trainer.train()

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at AI-Growth-Lab/PatentSBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/40100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


  super().__init__(loader)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4166,0.403135,0.8194,0.816699,0.829913,0.823253


TrainOutput(global_step=2507, training_loss=0.44703318666641484, metrics={'train_runtime': 2580.3216, 'train_samples_per_second': 15.541, 'train_steps_per_second': 0.972, 'total_flos': 5275376659968000.0, 'train_loss': 0.44703318666641484, 'epoch': 1.0})

In [26]:
# Evaluate on eval_silver
eval_results = trainer.evaluate(eval_dataset)
print("Results on eval_silver:")
print(eval_results)

# Evaluate on gold_100
gold_dataset = Dataset.from_pandas(hitl_gold[['text', 'is_green_human']])
gold_dataset = gold_dataset.rename_column('is_green_human', 'labels')
gold_dataset = gold_dataset.map(tokenize_function, batched=True)

gold_results = trainer.evaluate(gold_dataset)
print("\nResults on gold_100:")
print(gold_results)

# Save fine-tuned model
model.save_pretrained('./patent_sberta_finetuned')
tokenizer.save_pretrained('./patent_sberta_finetuned')

  super().__init__(loader)


Results on eval_silver:
{'eval_loss': 0.4031352400779724, 'eval_accuracy': 0.8194, 'eval_precision': 0.8166990291262136, 'eval_recall': 0.82991318074191, 'eval_f1': 0.8232530827950675, 'eval_runtime': 97.1147, 'eval_samples_per_second': 51.485, 'eval_steps_per_second': 3.223, 'epoch': 1.0}


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  super().__init__(loader)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



Results on gold_100:
{'eval_loss': 0.8510827422142029, 'eval_accuracy': 0.44, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.9399, 'eval_samples_per_second': 51.548, 'eval_steps_per_second': 3.608, 'epoch': 1.0}


('./patent_sberta_finetuned/tokenizer_config.json',
 './patent_sberta_finetuned/special_tokens_map.json',
 './patent_sberta_finetuned/vocab.txt',
 './patent_sberta_finetuned/added_tokens.json',
 './patent_sberta_finetuned/tokenizer.json')