In [1]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive.
%cd '/content/drive/MyDrive/Field-Key-Analysis'

/content/drive/MyDrive/Field-Key-Analysis


## Setup

### Import Packages

In [None]:
import logging
import os
import pickle
import re
import sys
import pathlib
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from typing import List, Tuple
from tqdm.auto import tqdm
from beam_module import title_tokens,beam_search_title_only, style_ok
from scoring_utils import (
    cosine_score,
    nli_entailment_prob_batch,
    nli_entailment_prob,
    combined_cosine_nli_score,
    clean_title_for_cosine,
    )
from suggest_pipeline import suggest_for_row

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Development aid: re-execute scoring_utils so edits to the module are picked
# up without restarting the kernel.
import importlib
import scoring_utils

importlib.reload(scoring_utils)

# reload() re-runs the module in place but does not touch names that an earlier
# `from scoring_utils import ...` copied into this namespace. Re-import them so
# the notebook uses the reloaded implementations rather than the stale ones.
from scoring_utils import (
    cosine_score,
    nli_entailment_prob_batch,
    nli_entailment_prob,
    combined_cosine_nli_score,
    clean_title_for_cosine,
)

# Show the compute settings held in the module's private attributes
# (read here for diagnostics only).
print("Device:", scoring_utils._DEVICE)
print("AMP enabled:", scoring_utils._USE_AMP)

Device: cuda
AMP enabled: True


In [None]:
# ---- Paths ----
# All project folders hang off the current working directory.
PROJ_ROOT = Path.cwd()
DATA_DIR, CACHE_DIR, OUT_DIR = (
    PROJ_ROOT / subdir for subdir in ("data", "cache", "output")
)

# Create the folders up front; a no-op for any that already exist.
for _folder in (DATA_DIR, CACHE_DIR, OUT_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

# ---- Reproducibility / Display ----
RNG_SEED = 42



### Helper Functions

In [None]:
def to_snake(text: str) -> str:
    """Normalize a field key or title into snake_case.

    Steps:
      1. Missing values (``None``/NaN) become ``""``.
      2. Surrounding whitespace is stripped.
      3. An underscore is inserted at camelCase/PascalCase boundaries,
         including the end of an acronym run ("HTTPServer" -> "HTTP_Server").
      4. The text is lowercased.
      5. Every run of characters outside ``[a-z0-9_]`` (spaces, hyphens,
         punctuation, non-ASCII letters) is replaced by a single underscore.
      6. Repeated underscores are collapsed; leading/trailing ones are removed.

    Args:
      text: Raw key or title text; may be ``None`` or NaN.

    Returns:
      The normalized string. It is empty when the input is missing or holds no
      ASCII letters or digits (e.g. a title written entirely in Hebrew).
    """
    if pd.isna(text):
        return ""
    s = str(text).strip()
    # Acronym followed by a capitalized word: "HTTPServer" -> "HTTP_Server".
    s = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", "_", s)
    # Lower-case letter or digit followed by an upper-case letter: "camelCase".
    s = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", s)
    s = s.lower()
    s = re.sub(r"[^a-z0-9_]+", "_", s)              # anything else -> _
    s = re.sub(r"_{2,}", "_", s).strip("_")         # collapse/trim _
    return s




### Load Data

In [None]:
# Read final_df.csv from the output directory.
field_keys = pd.read_csv(OUT_DIR / "final_df.csv")

# Work on an independent, freshly indexed copy of the raw frame.
field_keys_df = field_keys.reset_index(drop=True).copy()

# Discard every "*_y" column first, then strip the "_x" suffix from the
# columns that remain (presumably leftovers of an earlier merge).
cols_ending_in_y = [name for name in field_keys_df.columns if name.endswith("_y")]
field_keys_df = field_keys_df.drop(columns=cols_ending_in_y)

cols_ending_in_x = [name for name in field_keys_df.columns if name.endswith("_x")]
field_keys_df = field_keys_df.rename(
    columns={name: name[: -len("_x")] for name in cols_ending_in_x}
)

# Record the frame's dimensions, then preview the first rows.
n_rows, n_cols = field_keys_df.shape
logging.info("fields_keys_df Rows: %d, Columns: %d", n_rows, n_cols)

field_keys_df.head()

Unnamed: 0,report_class_id,organization_name,field_key,field_title,field_type,field_key_definition,row_id,field_key_exists_in_library_flag,structural_severity,semantic_severity,...,nli_axis,nli_axis_label,len_ratio,facet_title_missing,facet_hard_mismatch,facet_partial_mismatch,facet_missing_context,facet_hidden_agent,facet_temporal_mismatch,facet_token_validity_issues
0,AmalReport,Amal,meeting_summary,סיכום הביקור,textarea,,0,False,1.0,0.30262,...,-0.222508,Partial Contradiction,0.0,False,True,True,False,False,False,False
1,APNFamilyTherapyReport,Hillsides,data,Data:,textarea,The Field ‘data' contains the main topics cove...,1,True,,,...,,,,,,,,,,
2,APNFamilyTherapyReport,Hillsides,assessment,Assessment:,textarea,This field captures the thorough analysis or e...,2,True,,,...,,,,,,,,,,
3,APNFamilyTherapyReport,Hillsides,plan,Plan:,textarea,This field describes the future steps or actio...,3,True,,,...,,,,,,,,,,
4,BarryCountyCMHAPeerSupportNoteReport,Barry County CMHA,purpose,Purpose of Contact,textarea,,9,False,1.0,0.633742,...,0.340272,Weak Match,1.5,False,False,False,True,False,False,False


In [None]:
# Display the frame ordered by row_id (ascending is the default); the result is not stored.
field_keys_df.sort_values(by="row_id")

Unnamed: 0,report_class_id,organization_name,field_key,field_title,field_type,field_key_definition,row_id,field_key_exists_in_library_flag,structural_severity,semantic_severity,...,nli_axis,nli_axis_label,len_ratio,facet_title_missing,facet_hard_mismatch,facet_partial_mismatch,facet_missing_context,facet_hidden_agent,facet_temporal_mismatch,facet_token_validity_issues
0,AmalReport,Amal,meeting_summary,סיכום הביקור,textarea,,0,False,1.0,0.302620,...,-0.222508,Partial Contradiction,0.0,False,True,True,False,False,False,False
1,APNFamilyTherapyReport,Hillsides,data,Data:,textarea,The Field ‘data' contains the main topics cove...,1,True,,,...,,,,,,,,,,
2,APNFamilyTherapyReport,Hillsides,assessment,Assessment:,textarea,This field captures the thorough analysis or e...,2,True,,,...,,,,,,,,,,
3,APNFamilyTherapyReport,Hillsides,plan,Plan:,textarea,This field describes the future steps or actio...,3,True,,,...,,,,,,,,,,
4,BarryCountyCMHAPeerSupportNoteReport,Barry County CMHA,purpose,Purpose of Contact,textarea,,9,False,1.0,0.633742,...,0.340272,Weak Match,1.5,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284,ZepfIndividualTherapyServiceNote,Zepf,interventions,Describe the interventions provided.,textarea,The Field ‘interventions' refers to the variou...,2820,True,,,...,,,,,,,,,,
2285,ZepfIndividualTherapyServiceNote,Zepf,client_response,Describe the client's response to the interven...,textarea,,2821,False,1.0,0.582165,...,0.540515,Weak Match,1.5,False,True,False,False,False,False,False
2286,ZepfIndividualTherapyServiceNote,Zepf,plan_document,Document the plan. If there were barriers desc...,textarea,,2822,False,1.0,0.726039,...,0.755276,Exact Match,1.5,False,False,True,False,False,False,False
2287,ZepfTbsOutreachReport,Zepf,intervention_description_and_narrative,Describe Interventions Delivered:,textarea,,2823,False,1.0,0.500976,...,0.008771,Uncertain,1.0,False,False,True,False,False,False,False


## Deterministic Baseline

In [None]:
# Keep rows whose field_type is "textarea" and whose
# field_key_exists_in_library_flag is False.
is_textarea = field_keys_df["field_type"] == "textarea"
not_in_library = field_keys_df["field_key_exists_in_library_flag"] == False  # elementwise comparison, not `is False`
condition = is_textarea & not_in_library

# .copy() makes filtered_df an independent frame. Without it, adding columns
# to filtered_df later assigns into a selection of field_keys_df and triggers
# pandas' SettingWithCopyWarning.
filtered_df = field_keys_df[condition].copy()

# Save the filtered DataFrame
with open(CACHE_DIR / "filtered_field_keys_df.pkl", "wb") as f:
    pickle.dump(filtered_df, f)

filtered_df.head()

Unnamed: 0,report_class_id,organization_name,field_key,field_title,field_type,field_key_definition,row_id,field_key_exists_in_library_flag,structural_severity,semantic_severity,...,nli_axis,nli_axis_label,len_ratio,facet_title_missing,facet_hard_mismatch,facet_partial_mismatch,facet_missing_context,facet_hidden_agent,facet_temporal_mismatch,facet_token_validity_issues
0,AmalReport,Amal,meeting_summary,סיכום הביקור,textarea,,0,False,1.0,0.30262,...,-0.222508,Partial Contradiction,0.0,False,True,True,False,False,False,False
4,BarryCountyCMHAPeerSupportNoteReport,Barry County CMHA,purpose,Purpose of Contact,textarea,,9,False,1.0,0.633742,...,0.340272,Weak Match,1.5,False,False,False,True,False,False,False
5,BarryCountyCMHAPeerSupportNoteReport,Barry County CMHA,response,Client Response,textarea,,10,False,1.0,0.661814,...,0.475051,Weak Match,1.5,False,False,False,True,True,False,False
8,BarryCountyCMHAPeerSupportNoteReport,Barry County CMHA,explanation,"Explanation (use direct quotes from Client, wh...",textarea,,13,False,1.0,0.484134,...,0.434662,Weak Match,1.5,False,True,False,False,True,False,False
11,BestPointCareCoordination,Best Point,observed_reported_mood_affect_behavior,Functioning - Observed or Reported (may includ...,textarea,,16,False,1.0,0.827652,...,0.789676,Exact Match,1.5,False,False,True,False,False,False,False


### Apply to_snake function  

In [None]:
# Smoke-test to_snake on representative inputs (casing styles, stray
# whitespace, punctuation and a missing value).
test_cases = [
    "camelCaseExample",
    "PascalCaseExample",
    "snake_case_example",
    "kebab-case-example",
    "   extra   spaces   ",
    "special@characters!#$%^&*()",
    None
]

for case in test_cases:
    print(f"Input: {case}\nNormalized: {to_snake(case)}\n")

# Normalize field keys and titles.
# assign() returns a new frame, so no column is written into a selection of
# another DataFrame and there is no chained-assignment warning to silence.
# This replaces the former global warnings.filterwarnings("ignore"), which
# also hid every unrelated warning raised by later cells.
filtered_df = filtered_df.assign(
    norm_key=filtered_df["field_key"].map(to_snake),
    norm_title=filtered_df["field_title"].map(to_snake),
)

# quick sanity preview
filtered_df[["field_key","norm_key","field_title","norm_title"]].head()

Input: camelCaseExample
Normalized: camel_case_example

Input: PascalCaseExample
Normalized: pascal_case_example

Input: snake_case_example
Normalized: snake_case_example

Input: kebab-case-example
Normalized: kebab_case_example

Input:    extra   spaces   
Normalized: extra_spaces

Input: special@characters!#$%^&*()
Normalized: special_characters

Input: None
Normalized: 



Unnamed: 0,field_key,norm_key,field_title,norm_title
0,meeting_summary,meeting_summary,סיכום הביקור,
4,purpose,purpose,Purpose of Contact,purpose_of_contact
5,response,response,Client Response,client_response
8,explanation,explanation,"Explanation (use direct quotes from Client, wh...",explanation_use_direct_quotes_from_client_when...
11,observed_reported_mood_affect_behavior,observed_reported_mood_affect_behavior,Functioning - Observed or Reported (may includ...,functioning_observed_or_reported_may_include_m...


### Candidate Generator

In [None]:
# Development aid: re-execute the project modules so edits to their source
# files take effect without restarting the kernel.
import importlib, scoring_utils, beam_module, suggest_pipeline, key_cleaning

# NOTE(review): if these modules import from one another, the dependencies
# should be reloaded before their dependents - confirm the order against the
# modules' own imports.
importlib.reload(scoring_utils)
importlib.reload(beam_module)
importlib.reload(suggest_pipeline)
importlib.reload(key_cleaning)

# reload() does not update names that were bound earlier with
# `from <module> import ...`; without re-importing, later cells would keep
# calling the pre-reload suggest_for_row / title_tokens / beam_search_title_only.
from beam_module import title_tokens, beam_search_title_only, style_ok
from scoring_utils import (
    cosine_score,
    nli_entailment_prob_batch,
    nli_entailment_prob,
    combined_cosine_nli_score,
    clean_title_for_cosine,
)
from suggest_pipeline import suggest_for_row

<module 'key_cleaning' from '/content/drive/MyDrive/Field-Key-Analysis/key_cleaning.py'>

In [None]:
# Working view holding only the title, key and row id columns.
work = filtered_df.loc[:, ["field_title", "field_key", "row_id"]]
work.head()

Unnamed: 0,field_title,field_key,row_id
0,סיכום הביקור,meeting_summary,0
4,Purpose of Contact,purpose,9
5,Client Response,response,10
8,"Explanation (use direct quotes from Client, wh...",explanation,13
11,Functioning - Observed or Reported (may includ...,observed_reported_mood_affect_behavior,16


In [None]:
def suggest_for_row_debug(row, log=print):
    """Run ``suggest_for_row`` and log diagnostics when it yields no key.

    Args:
      row: Record exposing ``.get`` (e.g. a pandas Series or a dict);
        ``field_title`` and ``row_id`` are read for the log output.
      log: Callable taking a single string; defaults to ``print``.

    Returns:
      Whatever ``suggest_for_row(row)`` returned, unchanged.
    """
    res = suggest_for_row(row)

    # Nothing to report when a key was produced.
    if res.get("suggested_key"):
        return res

    title = row.get("field_title", "")
    log("⚠️ Empty suggestion")
    log(f"  row_id: {row.get('row_id', '<no id>')}")
    log(f"  field_title: {title}")
    log(f"  diag_reason: {res.get('diag_reason', 'unknown')}")

    # A missing title (None/NaN) must not be stringified into "None"/"nan" and
    # then beam-searched as if it were real text.
    if pd.api.types.is_scalar(title) and pd.isna(title):
        return res

    # Optionally: rerun beam manually to peek at first candidates
    title_text = str(title)
    toks = title_tokens(title_text.lower().replace(" ", "_"))
    if toks:
        try:
            best_key, best_score, best_seq = beam_search_title_only(
                toks, title_text, beam_width=3, max_len=3
            )
            log(f"  beam peek → {best_key} (score={best_score:.3f}) tokens={best_seq}")
        except Exception as e:
            # Best-effort diagnostics: a failing peek is reported, never raised.
            log(f"  beam peek error: {e}")

    return res

In [None]:
# Run the suggestion pipeline once per row and collect one record per row.
rows = []
progress = tqdm(work.iterrows(), total=len(work), desc="Suggesting keys")
for _index, row in progress:
    suggestion = suggest_for_row(row)   # plain version, no debug logging
    record = {"field_title": row["field_title"], "row_id": row.get("row_id")}
    record.update(suggestion)           # pipeline fields win on key clashes
    rows.append(record)

# Build the frame once from the accumulated records.
review = pd.DataFrame(rows)

Suggesting keys:   0%|          | 0/1065 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:

# Persist the suggestion results (the `review` frame) to the cache directory.
suggestions_cache_path = CACHE_DIR / "suggested_field_keys_df.pkl"
with suggestions_cache_path.open("wb") as cache_file:
    pickle.dump(review, cache_file)

## Summary (key metrics + artifact paths)

In [None]:
# Reload the cached suggestions (a pickle written by this notebook) and
# export them as CSV.
suggestions_cache_path = CACHE_DIR / "suggested_field_keys_df.pkl"
with suggestions_cache_path.open("rb") as cache_file:
    suggested_field_keys_df = pickle.load(cache_file, encoding="utf-8")

analytics_csv_path = OUT_DIR / "field_key_suggestions_analytics.csv"
suggested_field_keys_df.to_csv(analytics_csv_path, index=False)



In [None]:
# Flag rows that have no suggestion or that carry a diagnostic reason.
suggested_key = suggested_field_keys_df["suggested_key"]
diag_reason = suggested_field_keys_df["diag_reason"]

missing_key = suggested_key.isna() | (suggested_key == "")
# Nulls are excluded before the string comparison: astype(str) alone turns
# NaN/None into "nan"/"None", which would flag every row that has no reason.
has_diag_reason = diag_reason.notna() & (diag_reason.astype(str) != "")
mask = missing_key | has_diag_reason

# Filter rows
problematic = suggested_field_keys_df.loc[mask, ['field_title','suggested_key','diag_reason']]

print(problematic['diag_reason'].value_counts())
problematic.head()

diag_reason
beam_empty_or_style_fail    21
empty_title                  8
Name: count, dtype: int64


Unnamed: 0,field_title,suggested_key,diag_reason
0,סיכום הביקור,,beam_empty_or_style_fail
205,Response to Intervention/Progress Toward Goals...,,beam_empty_or_style_fail
225,Psychiatric Recommendations including Justific...,,beam_empty_or_style_fail
375,Person's Response to Intervention/Progress Tow...,,beam_empty_or_style_fail
377,Person's Response to Intervention/Progress Tow...,,beam_empty_or_style_fail


In [None]:
# Re-read final_df.csv from disk; this replaces the frame previously bound to
# field_keys_df. The shape is shown as the cell output.
final_df_path = OUT_DIR / "final_df.csv"
field_keys_df = pd.read_csv(final_df_path)
field_keys_df.shape

(2289, 37)

In [None]:
# Reduce the suggestions to the join key plus the suggested key.
columns_to_keep = ["row_id", "suggested_key"]
keep_flags = suggested_field_keys_df.columns.isin(columns_to_keep)
cols_to_drop = suggested_field_keys_df.columns[~keep_flags].tolist()
suggested_field_keys_df = suggested_field_keys_df.drop(columns=cols_to_drop)

# Left-join onto the full table so every original row is kept; a clashing
# column coming from the suggestions side gets a "_dup" suffix and is dropped.
field_keys_df = pd.read_csv(OUT_DIR / "final_df.csv")
merged = pd.merge(
    field_keys_df,
    suggested_field_keys_df,
    how="left",
    on="row_id",
    suffixes=("", "_dup"),
)
duplicate_cols = [name for name in merged.columns if name.endswith("_dup")]
merged = merged.drop(columns=duplicate_cols)

# Explicit output column order, with suggested_key placed right after field_key.
new_order = [
    'report_class_id', 'organization_name', 'field_key', 'suggested_key',
    'field_title', 'field_type', 'field_key_definition', 'row_id',
    'field_key_exists_in_library_flag',
    'structural_severity', 'semantic_severity', 'final_severity', 'reasons',
    'normalized_field_key', 'validity_stats',
    'token_valid_ratio', 'char_valid_ratio',
    'valid_tokens', 'invalid_tokens', 'tokens',
    'token_validity_ratio_label',
    'containment_title', 'matched_tokens', 'key_tokens', 'title_tokens',
    'containment_title_label',
    'cosine_sim', 'cosine_verdict',
    'nli_axis', 'nli_axis_label',
    'len_ratio',
    'facet_title_missing', 'facet_hard_mismatch', 'facet_partial_mismatch',
    'facet_missing_context', 'facet_hidden_agent',
    'facet_temporal_mismatch', 'facet_token_validity_issues',
]

# Label-based selection raises KeyError if any listed column is absent.
merged = merged.loc[:, new_order]

merged.to_csv(OUT_DIR / "field_key_df_with_suggestions.csv", index=False)



In [None]:
# Show (rows, columns) of the merged frame as the cell output.
merged.shape

(2289, 38)

### Artifact Paths

In [None]:
# Files this notebook is expected to have written.
artifacts = [
    OUT_DIR / "field_key_suggestions_analytics.csv",
    OUT_DIR / "field_key_df_with_suggestions.csv",
]

# One line per artifact, tagged by whether the file exists on disk.
manifest_lines = [
    f"- {p}  [{'OK' if p.exists() else 'MISSING'}]" for p in artifacts
]

# Printed rather than sent through logging.info: no logging configuration is
# visible in this notebook, and INFO records are dropped at the root logger's
# default WARNING level, so the manifest would otherwise never be shown.
print("Artifact manifest:\n" + "\n".join(manifest_lines))