# M4: Assignment 2

## Part A: Baseline Model (Frozen Embeddings)

In [82]:
# !pip install sentence-transformers scikit-learn pandas pyarrow

In [83]:
# !pip install --upgrade transformers sentence-transformers scikit-learn pandas pyarrow

In [84]:
# !pip install "huggingface-hub>=0.34.0,<1.0" --upgrade

In [85]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [86]:
# Fetch the patent-claims dataset from HuggingFace.
from datasets import load_dataset

hf_dataset_id = "AI-Growth-Lab/patents_claims_1.5m_traim_test"
dataset = load_dataset(hf_dataset_id)

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [87]:
# Inspect the loaded DatasetDict (its splits and their feature columns).
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'date', 'text', 'A01B', 'A01C', 'A01D', 'A01F', 'A01G', 'A01H', 'A01J', 'A01K', 'A01L', 'A01M', 'A01N', 'A21B', 'A21C', 'A21D', 'A22B', 'A22C', 'A23B', 'A23C', 'A23D', 'A23F', 'A23G', 'A23J', 'A23K', 'A23L', 'A23N', 'A23P', 'A23V', 'A23Y', 'A24B', 'A24C', 'A24D', 'A24F', 'A41B', 'A41C', 'A41D', 'A41F', 'A41G', 'A41H', 'A42B', 'A42C', 'A43B', 'A43C', 'A43D', 'A44B', 'A44C', 'A44D', 'A45B', 'A45C', 'A45D', 'A45F', 'A46B', 'A46D', 'A47B', 'A47C', 'A47D', 'A47F', 'A47G', 'A47H', 'A47J', 'A47K', 'A47L', 'A61B', 'A61C', 'A61D', 'A61F', 'A61G', 'A61H', 'A61J', 'A61K', 'A61L', 'A61M', 'A61N', 'A61P', 'A61Q', 'A62B', 'A62C', 'A62D', 'A63B', 'A63C', 'A63D', 'A63F', 'A63G', 'A63H', 'A63J', 'A63K', 'B01B', 'B01D', 'B01F', 'B01J', 'B01L', 'B02B', 'B02C', 'B03B', 'B03C', 'B03D', 'B04B', 'B04C', 'B05B', 'B05C', 'B05D', 'B06B', 'B07B', 'B07C', 'B08B', 'B09B', 'B09C', 'B21B', 'B21C', 'B21D', 'B21F', 'B21G', 'B21H', 'B21J', 'B21K', 'B21L', 'B22

In [88]:
# Derive the binary green label (is_green_silver) for the train and test splits.

train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

# Expands to: Y02A, Y02B, Y02C, Y02D, Y02E, Y02P, Y02T, Y02W
y02_columns = [f'Y02{suffix}' for suffix in 'ABCDEPTW']

# Row-wise max over the Y02 columns: 1 if any Y02 column = 1, else 0
# (assumes the Y02 columns are 0/1 indicators).
for frame in (train_df, test_df):
    frame['is_green_silver'] = frame[y02_columns].max(axis=1)

In [89]:
# Build a class-balanced sample: 25k green + 25k not green rows.

# Partition the train set by label
is_green = train_df['is_green_silver'] == 1
is_not_green = train_df['is_green_silver'] == 0
green_patents = train_df.loc[is_green]
not_green_patents = train_df.loc[is_not_green]

# Draw 25k rows from each class with a fixed seed
green_sample = green_patents.sample(n=25000, random_state=42)
not_green_sample = not_green_patents.sample(n=25000, random_state=42)

# Stack both classes, shuffle all rows, and renumber the index from 0
patents_50k_green = (
    pd.concat([green_sample, not_green_sample])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [90]:
# Create custom splits and prepare columns

# Splits are assigned by row position, in this order:
# train_silver (40k), pool_unlabeled (5k), eval_silver (5k).
split_sizes = {
    'train_silver': 40000,
    'pool_unlabeled': 5000,
    'eval_silver': 5000,
}

# The label list must have exactly one entry per row; check this up front so a
# size mismatch is reported with the actual numbers.
total_split_rows = sum(split_sizes.values())
if total_split_rows != len(patents_50k_green):
    raise ValueError(
        f"split sizes sum to {total_split_rows}, "
        f"but the dataset has {len(patents_50k_green)} rows"
    )

# Build one label per row, preserving the order of split_sizes
split_labels = []
for split_name, split_size in split_sizes.items():
    split_labels.extend([split_name] * split_size)
patents_50k_green['split'] = split_labels

# Keep only needed columns
patents_50k_green = patents_50k_green[['id', 'date', 'text', 'is_green_silver', 'split']]

In [91]:
# Write the prepared dataset to a parquet file (row index not stored)
parquet_path = 'patents_50k_green.parquet'
patents_50k_green.to_parquet(parquet_path, index=False)

In [92]:
# Read the prepared dataset back from disk
df = pd.read_parquet('patents_50k_green.parquet')

# One frame per split (train / unlabeled pool / eval), each re-indexed from 0
train_df, pool_unlabeled_df, eval_df = (
    df.loc[df['split'] == split_name].reset_index(drop=True)
    for split_name in ('train_silver', 'pool_unlabeled', 'eval_silver')
)

In [93]:
# Load the PatentSBERTa sentence-embedding model
from sentence_transformers import SentenceTransformer

model_name = 'AI-Growth-Lab/PatentSBERTa'
model = SentenceTransformer(model_name)

In [None]:
# Create embeddings. Converts each text into a 768 dimensional vector.
# These calls must stay active: the cells below read train_embeddings,
# pool_unlabeled_embeddings and eval_embeddings, so leaving them commented out
# raises NameError on a fresh kernel (Restart & Run All).
def embed_texts(frame):
    """Encode the 'text' column of `frame` with the loaded `model`.

    Parameters:
        frame: DataFrame with a 'text' column.

    Returns:
        Whatever `model.encode` returns for the list of texts, one embedding
        per row of `frame`.
    """
    return model.encode(frame['text'].tolist(), show_progress_bar=True)


train_embeddings = embed_texts(train_df)
pool_unlabeled_embeddings = embed_texts(pool_unlabeled_df)
eval_embeddings = embed_texts(eval_df)

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [95]:
# Show the (rows, dimensions) shape of each embedding matrix: train, pool, eval
for embeddings in (train_embeddings, pool_unlabeled_embeddings, eval_embeddings):
    print(embeddings.shape)

(40000, 768)
(5000, 768)
(5000, 768)


In [97]:
# Fit a Logistic Regression classifier on the frozen embeddings
y_train = train_df['is_green_silver']
classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier.fit(train_embeddings, y_train)

# Score the eval set: hard labels and per-class probabilities
predictions = classifier.predict(eval_embeddings)
probabilities = classifier.predict_proba(eval_embeddings)

# Print per-class precision / recall / F1 against the silver eval labels
y_eval = eval_df['is_green_silver']
report = classification_report(
    y_eval,
    predictions,
    target_names=['Not Green', 'Green'],
)
print(report)

              precision    recall  f1-score   support

   Not Green       0.78      0.80      0.79      2466
       Green       0.80      0.78      0.79      2534

    accuracy                           0.79      5000
   macro avg       0.79      0.79      0.79      5000
weighted avg       0.79      0.79      0.79      5000



## Part B: Identify High-Risk Examples (Uncertainty Sampling)

## Part C: Implement LLM → Human HITL (Gold Labels)

## Part D: Final Model (Fine-Tune PatentSBERTa Once)