## Setup

### Import Packages

In [None]:
import logging
import os
import pickle
import re
import sys
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from typing import List, Tuple
from beam_module import title_tokens, beam_search_title_only, style_ok
from scoring_utils import (
    cosine_score,
    nli_entailment_prob,
    combined_cosine_nli_score,
    clean_title_for_cosine,
    )
from tqdm.auto import tqdm


In [None]:
# ---- Paths ----
# Everything is resolved relative to the directory the notebook is run from.
PROJ_ROOT = Path.cwd()
DATA_DIR = PROJ_ROOT / "data"
CACHE_DIR = PROJ_ROOT / "cache"
OUT_DIR = PROJ_ROOT / "output"

# Create the working directories up front; existing ones are left untouched.
for _project_dir in (DATA_DIR, CACHE_DIR, OUT_DIR):
    _project_dir.mkdir(parents=True, exist_ok=True)

# ---- Reproducibility / Display ----
RNG_SEED = 42



### Helper Functions

In [None]:
def to_snake(text: str) -> str:
    """Normalize a raw field key or title into snake_case.

    The value is stringified and stripped, an underscore is inserted at every
    lowercase/digit -> uppercase boundary (camelCase / PascalCase), the result
    is lowercased, every run of characters outside ``[a-z0-9_]`` (spaces,
    hyphens, punctuation, ...) becomes a single underscore, and finally
    repeated underscores are collapsed and leading/trailing ones removed.

    Args:
      text: Raw key text. Missing values (anything ``pd.isna`` reports as
        missing, e.g. ``None`` or NaN) are accepted.

    Returns:
      The normalized string; empty for missing input or input that contains
      no ASCII letters or digits.
    """
    if pd.isna(text):
        return ""
    # Mark camelCase boundaries before lowercasing destroys the case signal.
    with_boundaries = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", str(text).strip())
    # Anything that is not a lowercase letter, digit or underscore -> "_".
    underscored = re.sub(r"[^a-z0-9_]+", "_", with_boundaries.lower())
    # Collapse runs of "_" and trim them from both ends.
    return re.sub(r"_{2,}", "_", underscored).strip("_")


def join_tokens(tokens: List[str]) -> str:
    """Build a snake_case key from ``tokens`` and validate it with ``style_ok``.

    Tokens are joined with "_", runs of underscores are collapsed to one and
    leading/trailing underscores are removed.

    Returns:
      The joined key, or "" when ``style_ok`` rejects it.
    """
    candidate = re.sub(r"_{2,}", "_", "_".join(tokens)).strip("_")
    if not style_ok(candidate):
        return ""
    return candidate




def token_overlap(a: List[str], b: List[str]) -> float:
    """Return the overlap coefficient of two token lists.

    Computed on the distinct tokens: ``|A & B| / min(|A|, |B|)``.
    If either list is empty the result is 0.0.
    """
    left, right = set(a), set(b)
    if not left or not right:
        return 0.0
    smaller = min(len(left), len(right))
    return len(left & right) / smaller

def jaccard(a: List[str], b: List[str]) -> float:
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two token lists.

    Two empty lists are treated as identical (1.0); exactly one empty list
    yields 0.0.
    """
    first, second = set(a), set(b)
    union = first | second
    if not union:
        # Both inputs are empty.
        return 1.0
    # If only one side is empty the intersection is empty, giving 0.0.
    return len(first & second) / len(union)


# TODO: replace with the actual candidate-scoring function.
def score_candidate_cosine(candidate_tokens: list[str], original_title: str) -> float:
    """Score a candidate token sequence against the original title.

    Placeholder scorer: the tokens are joined with "_" and the resulting key
    is passed to ``cosine_score`` together with the title. The intended
    replacement is:
        α * cosine(candidate, title) + β * nli_entailment(candidate|title) - γ * penalties

    Args:
      candidate_tokens: Tokens making up the candidate key, in order.
      original_title: Title text the candidate is compared with.

    Returns:
      0.0 when the joined key is empty, otherwise the value returned by
      ``cosine_score``.
    """
    candidate_key = "_".join(candidate_tokens)
    if not candidate_key:
        return 0.0
    # NOTE: an unreachable, header-less beam-search body used to follow this
    # return (it referenced undefined names such as `title_tokens_clean` and
    # `score_candidate`). It was removed; the search used by this file is
    # `beam_search_title_only`, imported from `beam_module`.
    return cosine_score(candidate_key, original_title)


def cosine_score(candidate_key: str, title_text: str) -> float:
    """Return the dot product of the embeddings of a key and a title.

    The key is converted with ``key_to_text`` and the title with
    ``clean_title_for_cosine``; if either result is empty the score is 0.0.

    NOTE(review): this definition shadows the ``cosine_score`` imported from
    ``scoring_utils`` at the top of the file, and ``key_to_text`` / ``embed``
    are not defined or imported in the visible part of the file — confirm
    which implementation is meant to be used.
    """
    candidate_text = key_to_text(candidate_key)
    cleaned_title = clean_title_for_cosine(title_text)
    if not (candidate_text and cleaned_title):
        return 0.0
    # Embed both texts in one call; row 0 is the candidate, row 1 the title.
    vectors = embed([candidate_text, cleaned_title])
    # Assumes `embed` returns unit-normalised vectors, in which case the dot
    # product equals the cosine similarity — TODO confirm.
    similarity = np.dot(vectors[0], vectors[1])
    return float(similarity)

### Load Data

In [None]:
# Read the field-key table from the output directory.
field_keys = pd.read_csv(OUT_DIR / "final_df.csv")
field_keys_df = field_keys.reset_index(drop=True).copy()

# Remove every column whose name ends in "_y" ...
cols_ending_in_y = [name for name in field_keys_df.columns if name.endswith("_y")]
field_keys_df = field_keys_df.drop(columns=cols_ending_in_y)

# ... and strip the "_x" suffix from the columns that carry it.
# (These look like pandas merge suffixes — TODO confirm against the upstream step.)
cols_ending_in_x = [name for name in field_keys_df.columns if name.endswith("_x")]
field_keys_df = field_keys_df.rename(
    columns={name: name.removesuffix("_x") for name in cols_ending_in_x}
)

# Log the shape (rows, columns) of the cleaned frame.
logging.info("fields_keys_df Rows: %d, Columns: %d", *field_keys_df.shape)

field_keys_df.head()

## Deterministic Baseline

In [None]:
# Keep only rows whose field_type is "textarea" and whose
# "field_key exists in field_key_library?" flag equals False.
# `== False` is kept on purpose: switching to `~` could change the result if
# the column is not a pure boolean column (e.g. contains NaN).
condition = (
    (field_keys_df["field_type"] == "textarea")
    & (field_keys_df["field_key exists in field_key_library?"] == False)  # noqa: E712
)
# Take an explicit copy: new columns are assigned to `filtered_df` later in
# this file, and assigning to the result of a boolean-mask selection raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
filtered_df = field_keys_df[condition].copy()

# Cache the filtered frame so later runs can reload it without re-filtering.
with open(CACHE_DIR / "filtered_field_keys_df.pkl", "wb") as f:
    pickle.dump(filtered_df, f)

filtered_df.head()

### Apply to_snake function  

In [None]:
# Smoke-test to_snake on representative spellings: camelCase, PascalCase,
# snake_case, kebab-case, padded whitespace, punctuation and a missing value.
test_cases = [
    "camelCaseExample",
    "PascalCaseExample",
    "snake_case_example",
    "kebab-case-example",
    "   extra   spaces   ",
    "special@characters!#$%^&*()",
    None,
]

# Print each input next to its normalized form for visual inspection.
for case in test_cases:
    message = f"Input: {case}\nNormalized: {to_snake(case)}\n"
    print(message)


# NOTE(review): this silences *every* warning for the rest of the session,
# not only the pandas one presumably triggered by the assignments below —
# consider narrowing it.
warnings.filterwarnings(action="ignore")

# Add a to_snake-normalized companion column for the key and for the title.
for _source_col, _norm_col in (("field_key", "norm_key"), ("field_title", "norm_title")):
    filtered_df[_norm_col] = filtered_df[_source_col].map(to_snake)

# Quick side-by-side preview of raw vs. normalized values.
filtered_df[["field_key", "norm_key", "field_title", "norm_title"]].head()

### Candidate Generator

In [None]:
# Re-execute the local helper modules so edits made on disk are picked up
# without restarting the kernel.
# NOTE(review): importlib.reload does not rebind names that were brought in
# with `from module import name`; the functions imported that way at the top
# of the file keep pointing at the pre-reload objects until re-imported.
import importlib
import scoring_utils
import beam_module

importlib.reload(scoring_utils)
importlib.reload(beam_module)

In [None]:
# Working frame for the suggestion step: only the title, the current key and
# the row identifier are carried forward.
work = filtered_df.loc[:, ["field_title", "field_key", "row_id"]]
work.head()

In [None]:


def _empty_suggestion() -> dict:
    """Return the blank result used when no valid key can be suggested."""
    return {
        "suggested_key": "",
        "suggested_cosine": 0.0,
        "suggested_nli": 0.0,
        "suggested_combined": 0.0,
    }


def _score_key(key: str, raw_title: str, alpha: float = 0.6, beta: float = 0.4):
    """Score one key against the title: (rescaled cosine, NLI, combined)."""
    cos = cosine_score(key, raw_title)
    # Linear map from [-1, 1] to [0, 1].
    cos01 = 0.5 * (cos + 1.0)
    # Premise is the cleaned title, hypothesis is the key with "_" -> " ".
    ent = nli_entailment_prob(clean_title_for_cosine(raw_title), key.replace("_", " "))
    comb = combined_cosine_nli_score(key, raw_title, alpha=alpha, beta=beta)
    return cos01, ent, comb


def suggest_for_row(row):
    """Suggest a snake_case key for one row and score it against the title.

    Args:
      row: Mapping-like record (e.g. a pandas Series) with a "field_title"
        entry and, optionally, a "field_key" entry holding the current key.

    Returns:
      A dict with "suggested_key", "suggested_cosine", "suggested_nli" and
      "suggested_combined". When the row has a non-empty string "field_key",
      the dict also carries "original_*" scores for that key and "delta_*"
      values (suggested minus original). If the title is missing, yields no
      tokens, or the search produces no key accepted by ``style_ok``, the
      blank result (empty key, zero scores) is returned.
    """
    title_value = row["field_title"]
    # A missing title (None / NaN) must not be stringified into "None"/"nan"
    # and then tokenized and scored as if it were real text.
    if pd.isna(title_value):
        return _empty_suggestion()

    raw_title = str(title_value)
    # Tokenizer input: lowercased title with spaces replaced by underscores.
    # The scoring functions receive the raw title.
    norm_title = raw_title.lower().replace(" ", "_")
    toks = title_tokens(norm_title)
    if not toks:
        return _empty_suggestion()

    best_key, _best_score, _best_seq = beam_search_title_only(
        toks, raw_title, beam_width=5, max_len=5
    )
    if not best_key or not style_ok(best_key):
        return _empty_suggestion()

    # Scores for the suggestion.
    cos01, ent, comb = _score_key(best_key, raw_title)
    out = {
        "suggested_key": best_key,
        "suggested_cosine": float(cos01),
        "suggested_nli": float(ent),
        "suggested_combined": float(comb),
    }

    # Optional: compare with the existing key when one is present.
    if "field_key" in row and isinstance(row["field_key"], str) and row["field_key"]:
        orig = row["field_key"]
        o_cos01, o_ent, o_comb = _score_key(orig, raw_title)
        out.update({
            "original_key": orig,
            "original_cosine": float(o_cos01),
            "original_nli": float(o_ent),
            "original_combined": float(o_comb),
            "delta_cosine": float(cos01 - o_cos01),
            "delta_nli": float(ent - o_ent),
            "delta_combined": float(comb - o_comb),
        })
    return out

In [None]:
# Run the per-row suggester over the working frame and collect one flat
# record per row: title, current key (when present), then the scores.
rows = []
for _, r in tqdm(work.iterrows(), total=len(work), desc="Suggesting keys"):
    res = suggest_for_row(r)
    record = {"field_title": r["field_title"]}
    if "field_key" in r:
        record["field_key"] = r["field_key"]
    record.update(res)
    rows.append(record)

review = pd.DataFrame(rows)

# When original-key comparisons are available, list the largest improvements first.
if "delta_combined" in review.columns:
    review = review.sort_values(by="delta_combined", ascending=False)

# Written relative to the current working directory.
review.to_csv("field_key_suggestions_review.csv", index=False)
review.head(10)