
# Comment Classification, Summarization, and Response Notebook

End-to-end workflow for training the substantive/non-substantive comment classifier, summarizing public comments, and drafting agency responses with Azure OpenAI.



## Workflow Outline
1. Load the labeled CSV (`northmet-feis-adequacy-exhibit-a.csv`) and clean columns/labels.
2. Parse the Appendix PDF (`014_appendix_a_response_to_comments_on_the_NorthMet_EIS.pdf`) to capture comment/response pairs.
3. Train a classifier (Azure embedding + logistic regression) on the labeled dataset.
4. Evaluate results and export the classifier for the FastAPI service.
5. Demonstrate summarization + response drafting using the appendix dataset.


In [1]:

import os
import re
from pathlib import Path
from typing import List, Dict

import joblib
import numpy as np
import pandas as pd
import pdfplumber
from dotenv import load_dotenv
from openai import AzureOpenAI
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:

# Locate the repository root: use the working directory when it contains
# `data/`, otherwise step up one level (the notebook usually runs from /notebooks).
ROOT = Path().resolve()
ROOT = ROOT if (ROOT / "data").exists() else ROOT.parent

# Input/output directories; the model directory is created on demand.
DATA_DIR = ROOT / "data"
MODEL_DIR = ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Inputs read by the notebook.
LABELED_PATH = DATA_DIR / "northmet-feis-adequacy-exhibit-a.csv"
APPENDIX_PATH = DATA_DIR / "014_appendix_a_response_to_comments_on_the_NorthMet_EIS.pdf"

# Artifacts written by the notebook.
MODEL_PATH = MODEL_DIR / "classifier.joblib"
EMBED_CACHE = DATA_DIR / "comment_embeddings.npy"

# The second line reports whether each input file exists (True/False).
csv_found, pdf_found = LABELED_PATH.exists(), APPENDIX_PATH.exists()
print(f"Using root: {ROOT}")
print(f"CSV path: {csv_found} | PDF path: {pdf_found}")


Using root: /Users/samuelsetsofia/dev/law-and-governance-project
CSV path: True | PDF path: True


In [4]:

# Load environment variables (expects ../.env when running from notebooks)
from pathlib import Path as _Path  # alias is unused in the visible cells; kept in case other cells rely on it
env_path = ROOT / ".env"
# load_dotenv returns False when it set no variables (e.g. the file is missing),
# so only claim success when something was actually loaded.
if load_dotenv(env_path):
    print(f"Loaded env from: {env_path}")
else:
    print(f"No variables loaded from {env_path}; relying on the existing environment")

# Every key below must be non-empty before the Azure client can be built.
required_keys = [
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT",
    "AZURE_OPENAI_CHAT_DEPLOYMENT",
]
missing = [key for key in required_keys if not os.getenv(key)]
if missing:
    # Adjacent literals are concatenated, so the first one must end with a space.
    raise ValueError(
        f"Missing Azure OpenAI environment variables: {', '.join(missing)}. "
        "Create a .env file (see .env.example) or export these variables before running."
    )

# NOTE(review): the summarization helpers call `AZURE_CLIENT.responses.create`;
# confirm that this default API version supports the Responses API on your Azure
# resource, or override it via AZURE_OPENAI_API_VERSION.
API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-01")
AZURE_CLIENT = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=API_VERSION,
)
# Deployment names (not model names) used for embeddings and chat requests.
EMBED_MODEL = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
CHAT_MODEL = os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"]


Loaded env from: /Users/samuelsetsofia/dev/law-and-governance-project/.env


In [5]:

# Helper: parse labeled CSV with substantive flag
# Column names as they appear in the file header (some carry a leading space).
cols = [
    "Name of Sender",
    " Comment",
    " Issue",
    "Substantive / Non-Substantive",
    "Old / New",
    "Response ID",
    " RGU Consideration",
]

# Header names -> snake_case names used in the rest of the notebook.
rename_map = {
    "Name of Sender": "sender",
    " Comment": "comment",
    " Issue": "issue",
    "Substantive / Non-Substantive": "label_raw",
    "Old / New": "comment_type",
    "Response ID": "response_id",
    " RGU Consideration": "response_text",
}

# Normalize labels: raw codes (upper-cased, stripped) -> canonical class names.
label_map = {
    "S": "substantive",
    "NS": "non-substantive",
    "SUBSTANTIVE": "substantive",
    "NON-SUBSTANTIVE": "non-substantive",
}

# header=0 discards the file's own header row in favour of `cols`.
raw_df = pd.read_csv(LABELED_PATH, names=cols, header=0, engine="python").rename(columns=rename_map)
original_count = len(raw_df)

# Clean in a single chain so every step works on a fresh frame; this avoids
# assigning columns on a filtered slice (SettingWithCopyWarning).
df = (
    raw_df
    .dropna(subset=["comment", "label_raw"])
    .assign(
        comment=lambda d: d["comment"].astype(str).str.strip(),
        label_key=lambda d: d["label_raw"].astype(str).str.strip().str.upper(),
    )
    # Keep comments longer than 20 characters whose label is a known code.
    .loc[lambda d: (d["comment"].str.len() > 20) & d["label_key"].isin(label_map)]
    .assign(
        label=lambda d: d["label_key"].map(label_map),
        # astype(str) keeps non-string cells as text instead of turning them into NaN.
        response_text=lambda d: d["response_text"].fillna("").astype(str).str.strip(),
    )
    .drop(columns=["label_raw", "label_key"])
)

print(f"Rows after cleaning: {len(df)} (dropped {original_count - len(df)} rows)")
print(df["label"].value_counts())


Rows after cleaning: 4389 (dropped 1166 rows)
label
non-substantive    2767
substantive        1622
Name: count, dtype: int64


In [6]:

# Parser for Appendix PDF comment/response pairs

def parse_appendix_pdf(
    pdf_path: Path,
    *,
    id_max_x: float = 110,
    gutter_max_x: float = 140,
    comment_max_x: float = 360,
    response_max_x: float = 540,
) -> pd.DataFrame:
    """Extract comment/response/theme rows from the appendix PDF.

    Words are read page by page with ``pdfplumber`` and assigned to a column by
    their left x-coordinate (``x0``). A new record starts at a numeric word of
    three or more digits left of ``id_max_x`` that has the word "Comment"
    among the next five words.

    Args:
        pdf_path: Path of the PDF to parse.
        id_max_x: Words left of this x-coordinate may be comment IDs.
        gutter_max_x: Words left of this x-coordinate are discarded
            (comment numbering / artifacts).
        comment_max_x: Upper x bound (exclusive) of the comment column.
        response_max_x: Upper x bound (exclusive) of the response column;
            anything at or beyond it goes to the themes column.

    Returns:
        DataFrame with columns ``comment_id``, ``comment_text``,
        ``response_text`` and ``themes``. Only records whose comment text
        contains "Comment #" are kept, with that prefix removed and whitespace
        collapsed. An empty frame with these columns is returned when nothing
        could be parsed.
    """
    columns = ["comment_id", "comment_text", "response_text", "themes"]
    records: List[Dict[str, str]] = []
    current = None

    def _has_text(record: Dict[str, str]) -> bool:
        # A record is worth keeping once it holds comment or response text.
        return bool(record["comment_text"].strip() or record["response_text"].strip())

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            words = page.extract_words(use_text_flow=True)
            for idx, word in enumerate(words):
                text = word["text"].strip()
                if not text:
                    continue
                x0 = word["x0"]
                if text.isdigit() and len(text) >= 3 and x0 < id_max_x:
                    # Only treat the number as a comment ID when "Comment"
                    # follows shortly; otherwise drop the word.
                    lookahead = words[idx + 1 : idx + 6]
                    if not any("Comment" in w.get("text", "") for w in lookahead):
                        continue
                    if current and _has_text(current):
                        records.append(current)
                    current = dict.fromkeys(columns, "")
                    current["comment_id"] = text
                    continue
                if current is None:
                    # Text before the first comment ID belongs to no record.
                    continue
                if x0 < gutter_max_x:
                    # comment numbering / artifacts
                    continue
                if x0 < comment_max_x:
                    current["comment_text"] += " " + text
                elif x0 < response_max_x:
                    current["response_text"] += " " + text
                else:
                    current["themes"] += " " + text
    if current and _has_text(current):
        records.append(current)

    if not records:
        # Without this guard the column lookups below raise KeyError on an
        # empty frame that has no columns.
        return pd.DataFrame(columns=columns)

    appendix_df = pd.DataFrame(records, columns=columns)
    appendix_df = appendix_df[appendix_df["comment_text"].str.contains("Comment #", na=False)].copy()
    appendix_df["comment_text"] = (
        appendix_df["comment_text"]
        .str.replace(r"Comment #\s*\d+\.\s*", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    appendix_df["response_text"] = appendix_df["response_text"].str.replace(r"\s+", " ", regex=True).str.strip()
    appendix_df["themes"] = appendix_df["themes"].str.replace(r"\s+", " ", regex=True).str.strip()
    return appendix_df

# Parse the appendix once; the frame is reused by the demo cell further down.
appendix_df = parse_appendix_pdf(pdf_path=APPENDIX_PATH)
print(f"Parsed {appendix_df.shape[0]} appendix comment/response pairs")
# Preview the first three parsed rows.
appendix_df.head(3)


Parsed 33 appendix comment/response pairs


Unnamed: 0,comment_id,comment_text,response_text,themes
0,2981,Spill prevention is an important part of the m...,To guard against possible adverse effects from...,"spilled ore, PolyMet plans the couplings and l..."
1,2982,Pages 5-50 forward describe how the company ha...,Mine waste rock would be sorted and stored its...,into four categories based on not produce acid...
2,2983,"Page 5-157, Section 5.2.2.3.3, 2nd Paragraph: ...","FEIS Section 5.2.14.2.3, which expands upon SD...",the discussion from the Hydrometallurgical Res...


In [7]:

# Hold out 20% of the labeled comments for evaluation. Stratifying on the label
# keeps the class balance in both parts; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
n_train, n_test = len(train_df), len(test_df)
print(f"Train rows: {n_train} | Test rows: {n_test}")


Train rows: 3511 | Test rows: 878


In [8]:

def embed_batch(texts: List[str]) -> List[List[float]]:
    """Embed one batch of texts with the configured Azure OpenAI deployment.

    Args:
        texts: Strings to embed in a single request.

    Returns:
        One embedding vector per input text, in input order. An empty input
        yields an empty list without calling the service.
    """
    if len(texts) == 0:
        # Nothing to embed; skip the request instead of sending an empty input.
        return []
    response = AZURE_CLIENT.embeddings.create(input=texts, model=EMBED_MODEL)
    # Each returned item carries the position of its input; sort on it so the
    # vectors line up with `texts` regardless of response ordering.
    ordered = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]


def build_embeddings(texts: List[str], batch_size: int = 64) -> np.ndarray:
    """Embed ``texts`` in consecutive slices of ``batch_size`` and stack the result.

    Args:
        texts: Strings to embed.
        batch_size: Number of texts passed to ``embed_batch`` per call.

    Returns:
        ``np.array`` built from the concatenated per-batch vectors, in input order.
    """
    offsets = range(0, len(texts), batch_size)
    # Lazily request one batch at a time, in order.
    per_batch = (embed_batch(texts[lo : lo + batch_size]) for lo in offsets)
    flattened: List[List[float]] = [vec for chunk in per_batch for vec in chunk]
    return np.array(flattened)


# Reuse cached embeddings only when they match the current train/test split;
# a stale cache would otherwise pair vectors with the wrong labels or fail
# later with a shape mismatch.
train_X = None
test_X = None
if EMBED_CACHE.exists():
    print("Loading cached embeddings...")
    # NOTE: allow_pickle=True unpickles the file and can execute arbitrary code;
    # only load cache files written by this notebook.
    cached = np.load(EMBED_CACHE, allow_pickle=True).item()
    if len(cached["train_X"]) == len(train_df) and len(cached["test_X"]) == len(test_df):
        train_X = cached["train_X"]
        test_X = cached["test_X"]
    else:
        print("Cached embeddings do not match the current split; recomputing...")

if train_X is None or test_X is None:
    print("Computing embeddings from Azure OpenAI (may take several minutes)...")
    train_X = build_embeddings(train_df["comment"].tolist())
    test_X = build_embeddings(test_df["comment"].tolist())
    # Stored as a pickled dict inside a .npy file (hence allow_pickle on load).
    np.save(EMBED_CACHE, {"train_X": train_X, "test_X": test_X})

# Label arrays aligned row-for-row with the embedding matrices above.
train_y = train_df["label"].values
test_y = test_df["label"].values


Computing embeddings from Azure OpenAI (may take several minutes)...


In [9]:

# Fit logistic regression on the precomputed embedding vectors.
# `fit` returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(max_iter=200).fit(train_X, train_y)

# Hard predictions feed the reports below; the class probabilities are
# computed as well but not used by the visible cells.
pred_y = clf.predict(test_X)
pred_probs = clf.predict_proba(test_X)

# Per-class precision/recall first, then the raw confusion matrix.
for report in (classification_report(test_y, pred_y), confusion_matrix(test_y, pred_y)):
    print(report)

# Persist the fitted model to MODEL_PATH.
joblib.dump(clf, MODEL_PATH)
print(f"Saved classifier to {MODEL_PATH}")


                 precision    recall  f1-score   support

non-substantive       0.87      0.87      0.87       554
    substantive       0.77      0.77      0.77       324

       accuracy                           0.83       878
      macro avg       0.82      0.82      0.82       878
   weighted avg       0.83      0.83      0.83       878

[[480  74]
 [ 74 250]]
Saved classifier to /Users/samuelsetsofia/dev/law-and-governance-project/models/classifier.joblib


In [10]:

# Helper functions for summarization + response generation
def summarize_comment(comment: str, response: str | None = None) -> str:
    if response:
        prompt = (
            "Summarize the public comment and note how the existing agency response addresses it.\n"
            f"Comment: {comment}\n"
            f"Agency Response: {response}"
        )
    else:
        prompt = (
            "Summarize the public comment in 2 sentences, highlighting issues and requested actions.\n"
            f"Comment: {comment}"
        )
    completion = AZURE_CLIENT.responses.create(
        model=CHAT_MODEL,
        input=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_output_tokens=200,
    )
    return completion.output[0].content[0].text.strip()


def draft_response(comment: str, summary: str, label: str, response: str | None = None) -> str:
    prompt_lines = [
        "Draft a short agency response (<=150 words).",
        "Acknowledge the comment, reference commitments, and stay neutral.",
        f"Summary: {summary}",
        f"Classification: {label}",
        f"Comment: {comment}",
    ]
    if response:
        prompt_lines.append(f"Existing response: {response}")
        prompt_lines.append("Only add new info if necessary; otherwise reiterate commitments succinctly.")

    completion = AZURE_CLIENT.responses.create(
        model=CHAT_MODEL,
        input=[{"role": "user", "content": "\n".join(prompt_lines)}],
        temperature=0.4,
        max_output_tokens=300,
    )
    return completion.output[0].content[0].text.strip()


In [11]:

# Demo on the first three appendix rows.
# demo_mode=True only previews the parsed text (no Azure calls);
# demo_mode=False runs summarization, classification and response drafting.
demo_mode = True
results = []

sample_comments = appendix_df.head(3).to_dict(orient="records")
for item in sample_comments:
    if not demo_mode:
        # Full pipeline: summarize, classify the embedded comment, then draft a reply.
        summary = summarize_comment(item["comment_text"], item["response_text"])
        probs = clf.predict_proba(np.array(embed_batch([item["comment_text"]])))
        label_idx = int(np.argmax(probs[0]))
        label = clf.classes_[label_idx]
        generated = draft_response(item["comment_text"], summary, label, item["response_text"])
        row = {
            "comment_id": item["comment_id"],
            "summary": summary,
            "label": label,
            "probability": float(probs[0][label_idx]),
            "generated_response": generated,
        }
    else:
        # Offline preview: first 200 characters of each text plus an ellipsis.
        row = {
            "comment_id": item["comment_id"],
            "comment": item["comment_text"][:200] + "...",
            "response_excerpt": item["response_text"][:200] + "...",
        }
    results.append(row)

pd.DataFrame(results)


Unnamed: 0,comment_id,comment,response_excerpt
0,2981,Spill prevention is an important part of the m...,To guard against possible adverse effects from...
1,2982,Pages 5-50 forward describe how the company ha...,Mine waste rock would be sorted and stored its...
2,2983,"Page 5-157, Section 5.2.2.3.3, 2nd Paragraph: ...","FEIS Section 5.2.14.2.3, which expands upon SD..."
