In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd '/content/drive/MyDrive/Field-Key-Analysis'

/content/drive/MyDrive/Field-Key-Analysis


In [None]:
!git status



## Setup

In [4]:
import importlib, scoring_utils
importlib.reload(scoring_utils)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

<module 'scoring_utils' from '/content/drive/MyDrive/Field-Key-Analysis/scoring_utils.py'>

### Install Packages

In [5]:
import logging
import os
import pickle
import re
import sys
import pathlib
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from typing import List, Tuple
from beam_module import title_tokens, beam_search_title_only, style_ok
from scoring_utils import (
    cosine_score,
    nli_entailment_prob_batch,
    nli_entailment_prob,
    combined_cosine_nli_score,
    clean_title_for_cosine,
    )
from tqdm.auto import tqdm


In [6]:
import importlib, scoring_utils
importlib.reload(scoring_utils)

print("Device:", scoring_utils._DEVICE)
print("AMP enabled:", scoring_utils._USE_AMP)

Device: cuda
AMP enabled: True


In [7]:
# ---- Paths ----
PROJ_ROOT = Path.cwd()
DATA_DIR = PROJ_ROOT / "data"
CACHE_DIR = PROJ_ROOT / "cache"
OUT_DIR = PROJ_ROOT / "output"

DATA_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- Reproducibility / Displayxs ----
RNG_SEED = 42



### Helper Functions

In [8]:
def to_snake(text: str) -> str:
  """Normalize a field key into a snake_case.

    Steps: substitute camelCase/PascalCase/kebab-case with snake_case,
    lowercase,
    strip non-alphanumerics (keep spaces), collapse whitespace.

    Args:
      text: Raw key text.

    Returns:
      Normalized string (possibly empty).
    """
  if pd.isna(text):
      return ""
  s = str(text).strip()
  s = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", s)   # split camelCase
  s = s.lower()
  s = re.sub(r"[^a-z0-9_]+", "_", s)              # non-word -> _
  s = re.sub(r"_{2,}", "_", s).strip("_")         # collapse/trim _
  return s




### Load Data

In [9]:
field_keys = pd.read_csv(
    OUT_DIR / "final_df.csv"
)
field_keys_df = field_keys.reset_index(drop=True).copy()
cols_ending_in_y = [col for col in field_keys_df.columns if col.endswith("_y")]
field_keys_df.drop(columns=cols_ending_in_y, inplace=True)

cols_ending_in_x = [col for col in field_keys_df.columns if col.endswith("_x")]
field_keys_df.rename(columns={col: col[:-2] for col in cols_ending_in_x}, inplace=True)

# Display the shape and head of the DataFrame
logging.info(
    "fields_keys_df Rows: %d, Columns: %d",
    field_keys_df.shape[0],
    field_keys_df.shape[1],
)

field_keys_df.head()

Unnamed: 0,report_class_id,field_key,field_title,field_type,field_key exists in field_key_library?,field_key_definition,row_id,field_key_exists_in_library_flag,organization_name,normalized_field_key,...,facet_hard_mismatch,facet_partial_mismatch,facet_missing_context,facet_hidden_agent,facet_temporal_mismatch,facet_token_validity_issues,semantic_severity,structural_severity,final_severity,reasons
0,AmalReport,meeting_summary,סיכום הביקור,textarea,False,,0,False,Amal,meeting summary,...,True,True,False,False,False,False,0.30262,1.0,0.790786,"hard_mismatch, partial_mismatch"
1,APNFamilyTherapyReport,data,Data:,textarea,True,The Field ‘data' contains the main topics cove...,1,,,,...,,,,,,,,,,
2,APNFamilyTherapyReport,assessment,Assessment:,textarea,True,This field captures the thorough analysis or e...,2,,,,...,,,,,,,,,,
3,APNFamilyTherapyReport,plan,Plan:,textarea,True,This field describes the future steps or actio...,3,,,,...,,,,,,,,,,
4,ARCAIndividualTherapyNoteReport,Patient_States,Patient states,textarea,False,,4,False,,patient states,...,False,False,False,False,False,False,0.922214,1.0,0.976664,


In [10]:
field_keys_df.sort_values("row_id", ascending = True)

Unnamed: 0,report_class_id,field_key,field_title,field_type,field_key exists in field_key_library?,field_key_definition,row_id,field_key_exists_in_library_flag,organization_name,normalized_field_key,...,facet_hard_mismatch,facet_partial_mismatch,facet_missing_context,facet_hidden_agent,facet_temporal_mismatch,facet_token_validity_issues,semantic_severity,structural_severity,final_severity,reasons
0,AmalReport,meeting_summary,סיכום הביקור,textarea,False,,0,False,Amal,meeting summary,...,True,True,False,False,False,False,0.302620,1.0,0.790786,"hard_mismatch, partial_mismatch"
1,APNFamilyTherapyReport,data,Data:,textarea,True,The Field ‘data' contains the main topics cove...,1,,,,...,,,,,,,,,,
2,APNFamilyTherapyReport,assessment,Assessment:,textarea,True,This field captures the thorough analysis or e...,2,,,,...,,,,,,,,,,
3,APNFamilyTherapyReport,plan,Plan:,textarea,True,This field describes the future steps or actio...,3,,,,...,,,,,,,,,,
4,ARCAIndividualTherapyNoteReport,Patient_States,Patient states,textarea,False,,4,False,,patient states,...,False,False,False,False,False,False,0.922214,1.0,0.976664,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4341,ZepfIndividualTherapyServiceNote,interventions,Describe the interventions provided.,textarea,True,The Field ‘interventions' refers to the variou...,2820,,,,...,,,,,,,,,,
4342,ZepfIndividualTherapyServiceNote,client_response,Describe the client's response to the interven...,textarea,False,,2821,False,Zepf,client response,...,True,False,False,False,False,False,0.579451,1.0,0.873835,hard_mismatch
4343,ZepfIndividualTherapyServiceNote,plan_document,Document the plan. If there were barriers desc...,textarea,False,,2822,False,Zepf,plan document,...,False,True,False,False,False,False,0.694796,1.0,0.908439,partial_mismatch
4344,ZepfTbsOutreachReport,intervention_description_and_narrative,Describe Interventions Delivered:,textarea,False,,2823,False,Zepf,intervention description and narrative,...,False,True,False,False,False,False,0.527066,1.0,0.858120,partial_mismatch


## Deterministic Baseline

In [11]:
# Filter the DataFrame
condition = ((field_keys_df["field_type"] == "textarea") & (field_keys_df["field_key exists in field_key_library?"] == False))
filtered_df = field_keys_df[condition]

# Save the filtered DataFrame
with open(CACHE_DIR / "filtered_field_keys_df.pkl", "wb") as f:
    pickle.dump(filtered_df, f)

filtered_df.head()

Unnamed: 0,report_class_id,field_key,field_title,field_type,field_key exists in field_key_library?,field_key_definition,row_id,field_key_exists_in_library_flag,organization_name,normalized_field_key,...,facet_hard_mismatch,facet_partial_mismatch,facet_missing_context,facet_hidden_agent,facet_temporal_mismatch,facet_token_validity_issues,semantic_severity,structural_severity,final_severity,reasons
0,AmalReport,meeting_summary,סיכום הביקור,textarea,False,,0,False,Amal,meeting summary,...,True,True,False,False,False,False,0.30262,1.0,0.790786,"hard_mismatch, partial_mismatch"
4,ARCAIndividualTherapyNoteReport,Patient_States,Patient states,textarea,False,,4,False,,patient states,...,False,False,False,False,False,False,0.922214,1.0,0.976664,
7,ARCAIndividualTherapyNoteReport,Comments_Concerns,Comments or concerns about UDS,textarea,False,,7,False,,comments concerns,...,False,False,True,False,False,False,0.599356,1.0,0.879807,missing_context
8,ARCAIndividualTherapyNoteReport,Narrative_Summary,Narrative summary,textarea,False,,8,False,,narrative summary,...,False,False,False,False,False,False,0.831905,1.0,0.949572,
9,BarryCountyCMHAPeerSupportNoteReport,purpose,Purpose of Contact,textarea,False,,9,False,Barry County CMHA,purpose,...,False,False,True,False,False,False,0.643245,1.0,0.892973,missing_context


### Apply to_snake function  

In [12]:
# Test key_normalize function
test_cases = [
    "camelCaseExample",
    "PascalCaseExample",
    "snake_case_example",
    "kebab-case-example",
    "   extra   spaces   ",
    "special@characters!#$%^&*()",
    None
]

for case in test_cases:
    print(f"Input: {case}\nNormalized: {to_snake(case)}\n")


warnings.filterwarnings("ignore")

# Normalize field keys and titles
filtered_df["norm_key"]   = filtered_df["field_key"].map(to_snake)
filtered_df["norm_title"] = filtered_df["field_title"].map(to_snake)

# quick sanity preview
filtered_df[["field_key","norm_key","field_title","norm_title"]].head()

Input: camelCaseExample
Normalized: camel_case_example

Input: PascalCaseExample
Normalized: pascal_case_example

Input: snake_case_example
Normalized: snake_case_example

Input: kebab-case-example
Normalized: kebab_case_example

Input:    extra   spaces   
Normalized: extra_spaces

Input: special@characters!#$%^&*()
Normalized: special_characters

Input: None
Normalized: 



Unnamed: 0,field_key,norm_key,field_title,norm_title
0,meeting_summary,meeting_summary,סיכום הביקור,
4,Patient_States,patient_states,Patient states,patient_states
7,Comments_Concerns,comments_concerns,Comments or concerns about UDS,comments_or_concerns_about_uds
8,Narrative_Summary,narrative_summary,Narrative summary,narrative_summary
9,purpose,purpose,Purpose of Contact,purpose_of_contact


### Candidate Generator

In [13]:
import importlib, scoring_utils, beam_module
importlib.reload(scoring_utils)
importlib.reload(beam_module)

<module 'beam_module' from '/content/drive/MyDrive/Field-Key-Analysis/beam_module.py'>

In [14]:
work = filtered_df[["field_title", "field_key", "row_id"]]
work.head()

Unnamed: 0,field_title,field_key,row_id
0,סיכום הביקור,meeting_summary,0
4,Patient states,Patient_States,4
7,Comments or concerns about UDS,Comments_Concerns,7
8,Narrative summary,Narrative_Summary,8
9,Purpose of Contact,purpose,9


In [15]:


def suggest_for_row(row):
    raw_title = str(row["field_title"])
    # Use your normalizer for tokenization input (the scorer sees raw/cleaned text)
    norm_title = raw_title.lower().replace(" ", "_")
    toks = title_tokens(norm_title)
    if not toks:
        return {"suggested_key": "", "suggested_cosine": 0.0, "suggested_nli": 0.0, "suggested_combined": 0.0}

    best_key, best_score, best_seq = beam_search_title_only(
        toks, raw_title, beam_width=5, max_len=5
    )
    if not best_key or not style_ok(best_key):
        return {"suggested_key": "", "suggested_cosine": 0.0, "suggested_nli": 0.0, "suggested_combined": 0.0}

    # Scores for the suggestion
    cos = cosine_score(best_key, raw_title)
    cos01 = 0.5 * (cos + 1.0)
    ent = nli_entailment_prob(clean_title_for_cosine(raw_title), best_key.replace("_", " "))
    comb = combined_cosine_nli_score(best_key, raw_title, alpha=0.6, beta=0.4)

    out = {
        "suggested_key": best_key,
        "suggested_cosine": float(cos01),
        "suggested_nli": float(ent),
        "suggested_combined": float(comb),
    }

    # Optional: compare to original key if present
    if "field_key" in row and isinstance(row["field_key"], str) and row["field_key"]:
        orig = row["field_key"]
        o_cos = cosine_score(orig, raw_title); o_cos01 = 0.5 * (o_cos + 1.0)
        o_ent = nli_entailment_prob(clean_title_for_cosine(raw_title), orig.replace("_", " "))
        o_comb = combined_cosine_nli_score(orig, raw_title, alpha=0.6, beta=0.4)

        out.update({
            "original_key": orig,
            "original_cosine": float(o_cos01),
            "original_nli": float(o_ent),
            "original_combined": float(o_comb),
            "delta_cosine": float(cos01 - o_cos01),
            "delta_nli": float(ent - o_ent),
            "delta_combined": float(comb - o_comb),
        })
    return out

In [16]:
rows = []
for _, r in tqdm(work.iterrows(), total=len(work), desc="Suggesting keys"):
    res = suggest_for_row(r)  # your existing per-row function
    rows.append({
        "row_id": r["row_id"],           # keep the unique identifier
        "field_title": r["field_title"],
        **({"field_key": r["field_key"]} if "field_key" in r else {}),
        **res
    })

review = pd.DataFrame(rows)

# Sort by biggest improvement (if you kept original comparison)
if "delta_combined" in review.columns:
    review = review.sort_values("delta_combined", ascending=False)

review.to_csv("field_key_suggestions_review.csv", index=False)
review.head(10)

Suggesting keys:   0%|          | 0/2848 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,row_id,field_title,field_key,suggested_key,suggested_cosine,suggested_nli,suggested_combined,original_key,original_cosine,original_nli,original_combined,delta_cosine,delta_nli,delta_combined
148,172,,Progress_toward_goals_textbox,,1.0,0.880859,0.952344,Progress_toward_goals_textbox,0.523835,0.054596,0.336139,0.476165,0.826263,0.616204
1548,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1549,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1550,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1555,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1547,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1546,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1545,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1544,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1542,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473


In [17]:

# Save the filtered DataFrame
with open(CACHE_DIR / "suggested_field_keys_df.pkl", "wb") as f:
    pickle.dump(review, f)

## Artificats

In [18]:
with open(CACHE_DIR / "suggested_field_keys_df.pkl", "rb") as f:
    suggested_field_keys_df = pickle.load(f, encoding="utf-8")

suggested_field_keys_df.head()

Unnamed: 0,row_id,field_title,field_key,suggested_key,suggested_cosine,suggested_nli,suggested_combined,original_key,original_cosine,original_nli,original_combined,delta_cosine,delta_nli,delta_combined
148,172,,Progress_toward_goals_textbox,,1.0,0.880859,0.952344,Progress_toward_goals_textbox,0.523835,0.054596,0.336139,0.476165,0.826263,0.616204
1548,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1549,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1550,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473
1555,1304,,homicidal_ideation_text,,1.0,0.880859,0.952344,homicidal_ideation_text,0.523268,0.082275,0.346871,0.476732,0.798584,0.605473


In [22]:
merged = field_keys_df.merge(suggested_field_keys_df, on="row_id", how="left", suffixes=("", "_dup"))
merged = merged.drop(columns=[c for c in merged.columns if c.endswith("_dup")])

columns_to_keep = [
    "row_id",
    "field_title",
    "field_key",
    "suggested_key",
    "suggested_cosine",
    "suggested_nli",
    "suggested_combined",
    "delta_cosine",
    "delta_nli",
    "delta_combined"
]
cols_to_drop = [col for col in merged.columns if col not in columns_to_keep]
merged = merged.drop(columns=cols_to_drop)

merged.to_csv(OUT_DIR / "field_key_suggestions.csv", index=False)
merged.head()
