# Part A — Baseline (Frozen PatentSBERTa Embeddings)

Train a Logistic Regression classifier on frozen embeddings using the `train_silver` split, then evaluate it on the `eval_silver` split.

In [1]:
# Optional setup (e.g. on Colab): uncomment the line below to install the project requirements.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install -q -r ../requirements.txt

In [2]:
import os
import sys

# Make the notebook's parent directory importable so the `src` package below resolves.
# Guarded so that re-running this cell does not keep appending duplicate sys.path entries.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from src.config import CFG
from src.data_tools import load_parquet_or_dummy

# NOTE(review): judging by its name, the loader presumably falls back to dummy data when the
# parquet file is unavailable — confirm in src.data_tools before trusting downstream metrics.
df = load_parquet_or_dummy(CFG.parquet_path)
df.head()

Unnamed: 0,doc_id,text,is_green_silver,split
0,train_0,Claim about solar panel efficiency 0.,1,train_silver
1,train_1,Claim about solar panel efficiency 1.,1,train_silver
2,train_2,Claim about solar panel efficiency 2.,1,train_silver
3,train_3,Claim about solar panel efficiency 3.,1,train_silver
4,train_4,Claim about solar panel efficiency 4.,1,train_silver


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split

# 1) Try the provided split values from CFG
train_df = df[df[CFG.split_col] == CFG.train_split].copy()
eval_df  = df[df[CFG.split_col] == CFG.eval_split].copy()

# 2) The classifier needs both classes somewhere in the data; no split can repair one-class labels.
y_all = df[CFG.silver_label_col].astype(int).to_numpy()
unique_all = np.unique(y_all)

if unique_all.size < 2:
    raise ValueError(
        f"Dataset has only one class overall in column '{CFG.silver_label_col}': {unique_all}. "
        "Logistic Regression needs at least 2 classes (0 and 1). Check your labels / dataset."
    )

# 3) Rebuild a stratified split when the provided one is unusable:
#    - either side is empty (split values not present in the data), or
#    - the training side contains fewer than two classes.
#    Reusing the full frame for BOTH train and eval would leak training rows into the
#    evaluation set, so an empty split is re-split instead of duplicated.
split_missing = len(train_df) == 0 or len(eval_df) == 0
n_train_classes = np.unique(train_df[CFG.silver_label_col].astype(int).to_numpy()).size

if split_missing or n_train_classes < 2:
    train_part, eval_part = train_test_split(
        df,
        test_size=0.25,                         # 75% train / 25% eval
        random_state=getattr(CFG, "seed", 42),  # fixed seed keeps the fallback split reproducible
        stratify=y_all,                         # preserve the class ratio on both sides
    )
    train_df, eval_df = train_part.copy(), eval_part.copy()

print("Train size:", len(train_df), "| Eval size:", len(eval_df))
print("Train label counts:", train_df[CFG.silver_label_col].value_counts().to_dict())
print("Eval label counts :", eval_df[CFG.silver_label_col].value_counts().to_dict())

train_df.shape, eval_df.shape

Train size: 337 | Eval size: 113
Train label counts: {1: 225, 0: 112}
Eval label counts : {1: 75, 0: 38}


((337, 4), (113, 4))

In [4]:
from src.embeddings import load_encoder, encode
import numpy as np

# Load the encoder named in the config; it is only used to produce embeddings in this cell.
encoder = load_encoder(CFG.encoder_name)

# Texts per split, cast to str before being handed to the encoder.
train_texts = train_df[CFG.text_col].astype(str).tolist()
eval_texts = eval_df[CFG.text_col].astype(str).tolist()

# Feature matrices: one embedding per text, encoded in batches of CFG.embed_batch.
X_train = encode(encoder, train_texts, batch_size=CFG.embed_batch)
X_eval = encode(encoder, eval_texts, batch_size=CFG.embed_batch)

# Integer silver labels as flat arrays, aligned row-for-row with the texts above.
y_train = train_df[CFG.silver_label_col].astype(int).to_numpy().ravel()
y_eval = eval_df[CFG.silver_label_col].astype(int).to_numpy().ravel()

print("X_train:", X_train.shape, "X_eval:", X_eval.shape)
print("y_train classes:", np.unique(y_train, return_counts=True))
print("y_eval  classes:", np.unique(y_eval, return_counts=True))

X_train.shape, X_eval.shape

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mMPNetModel LOAD REPORT[0m from: AI-Growth-Lab/PatentSBERTa
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

X_train: (337, 768) X_eval: (113, 768)
y_train classes: (array([0, 1]), array([112, 225], dtype=int64))
y_eval  classes: (array([0, 1]), array([38, 75], dtype=int64))


((337, 768), (113, 768))

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from joblib import dump
from src.metrics import prf1

# Baseline classifier on top of the embeddings: scale features, then logistic regression.
scaler = StandardScaler(with_mean=False)  # with_mean=False: scale to unit variance without centering
log_reg = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",  # reweight classes inversely to their frequency in y_train
)
clf = Pipeline(steps=[("scaler", scaler), ("lr", log_reg)])
clf.fit(X_train, y_train)

# Score the held-out split with the project's precision/recall/F1 helper.
# NOTE(review): the recorded run reports perfect scores — confirm whether it ran on
# dummy/synthetic data before citing these numbers.
y_pred = clf.predict(X_eval)
metrics = prf1(y_eval, y_pred)
metrics

{'precision': 1.0, 'recall': 1.0, 'f1': 1.0}

In [6]:
# Persist the fitted baseline pipeline to disk, creating the target directory if needed.
model_path = "../models/baseline_clf.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(clf, model_path)
print("Saved baseline to models/baseline_clf.joblib")

Saved baseline to models/baseline_clf.joblib


In this part, I built a baseline model to detect whether a patent is green or not. I used a pre-trained language model called PatentSBERTa to convert patent text into numerical embeddings. These embeddings capture the meaning of the text in a vector format.

Rather than training or fine-tuning the language model, I kept it frozen and used it only to generate embeddings. I then trained a Logistic Regression classifier on top of these embeddings using the silver labels provided in the dataset.

The dataset was split into training and evaluation sets. The model was trained on the training set and tested on the evaluation set. This baseline helps us understand how well the model performs before applying advanced techniques like active learning or human-in-the-loop improvements.