## Setup

### Import Packages

In [None]:
import logging
import os
import pickle
import re
import sys
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

In [None]:
# ---- Paths ----
# All working directories hang off the current working directory.
PROJ_ROOT = Path.cwd()
DATA_DIR, CACHE_DIR, OUT_DIR = (
    PROJ_ROOT / folder for folder in ("data", "cache", "output")
)

# Create each directory up front; existing directories are left untouched.
for _workdir in (DATA_DIR, CACHE_DIR, OUT_DIR):
    _workdir.mkdir(parents=True, exist_ok=True)

# ---- Reproducibility / Display ----
RNG_SEED = 42



### Load Data

In [None]:
# Read the field-key table from the output directory and work on a
# freshly indexed copy so the raw frame stays untouched.
field_keys = pd.read_csv(OUT_DIR / "final_df.csv")
field_keys_df = field_keys.reset_index(drop=True).copy()

# Columns ending in "_y" are discarded; columns ending in "_x" keep their
# data but lose the suffix (presumably leftovers of an earlier merge --
# TODO confirm against the step that writes final_df.csv).
cols_ending_in_y = [name for name in field_keys_df.columns if name.endswith("_y")]
field_keys_df = field_keys_df.drop(columns=cols_ending_in_y)

cols_ending_in_x = [name for name in field_keys_df.columns if name.endswith("_x")]
field_keys_df = field_keys_df.rename(
    columns={name: name[:-2] for name in cols_ending_in_x}
)

# Report the frame's dimensions.
# NOTE(review): no logging configuration is visible in this section; with
# the default root-logger level an INFO record is not emitted -- confirm.
logging.info("fields_keys_df Rows: %d, Columns: %d", *field_keys_df.shape)

field_keys_df.head()

### Helper Functions

In [None]:
def to_snake(text: str) -> str:
    """Convert a raw field key or title to snake_case.

    An underscore is inserted wherever a lowercase letter or digit is
    directly followed by an uppercase letter, the text is lower-cased,
    every run of characters outside ``[a-z0-9_]`` becomes one underscore,
    repeated underscores are collapsed, and leading/trailing underscores
    are removed.

    Args:
      text: Raw key text. Values flagged by ``pd.isna`` (``None``, NaN)
        are accepted.

    Returns:
      The normalized string; empty for missing input or input that
      contains no ASCII letters or digits.
    """
    if pd.isna(text):
        return ""
    trimmed = str(text).strip()
    # Mark camelCase boundaries before lower-casing erases them.
    lowered = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", trimmed).lower()
    # Anything that is not a lowercase letter, digit or underscore -> "_".
    underscored = re.sub(r"[^a-z0-9_]+", "_", lowered)
    # Squeeze repeated underscores and trim the ends.
    return re.sub(r"_{2,}", "_", underscored).strip("_")

def style_ok(s: str) -> bool:
    """Return whether ``s`` satisfies the key style rules held in ``STYLE``.

    A key passes when it is a non-empty ``str`` no longer than
    ``STYLE["max_len"]``, has no leading, trailing or doubled underscore,
    and consists entirely of characters accepted by
    ``STYLE["allowed_charset"]``. ``STYLE["banned_tokens"]`` is not
    consulted here.

    Args:
      s: Candidate key.

    Returns:
      True if every check passes, otherwise False.
    """
    if not isinstance(s, str) or not s:
        return False
    if len(s) > STYLE["max_len"]:
        return False
    if s.startswith("_") or s.endswith("_") or "__" in s:
        return False
    # fullmatch rather than match: a trailing `$` in the pattern also matches
    # just before a final newline, so re.match would accept "key\n".
    return re.fullmatch(STYLE["allowed_charset"], s) is not None

## Deterministic Baseline

In [None]:
# Keep only textarea fields whose key is flagged as absent from the
# field-key library. `.eq(False)` is an element-wise comparison on the
# column (not a Python truthiness test), so it is kept as an explicit
# equality check.
condition = (
    field_keys_df["field_type"].eq("textarea")
    & field_keys_df["field_key exists in field_key_library?"].eq(False)
)
# Take an explicit copy: later cells add columns to filtered_df, and doing
# that on a boolean-mask selection triggers pandas' chained-assignment
# warning.
filtered_df = field_keys_df[condition].copy()

# Save the filtered DataFrame to the cache directory.
with open(CACHE_DIR / "filtered_field_keys_df.pkl", "wb") as f:
    pickle.dump(filtered_df, f)

filtered_df.head()

### Test to_snake function  

In [None]:
# Smoke-test to_snake: print each raw input next to its normalized form.
test_cases = [
    # naming conventions
    "camelCaseExample",
    "PascalCaseExample",
    "snake_case_example",
    "kebab-case-example",
    # messy input
    "   extra   spaces   ",
    "special@characters!#$%^&*()",
    # missing value
    None,
]

for case in test_cases:
    print(f"Input: {case}\nNormalized: {to_snake(case)}\n")



### Apply to_snake

In [None]:
# NOTE(review): this silences every warning for the rest of the session.
# It is kept so later cells behave as before, but the assignment below no
# longer needs it -- consider removing or narrowing it.
warnings.filterwarnings("ignore")

# Normalize field keys and titles. assign() builds a new frame instead of
# writing columns into a filtered selection in place, which avoids pandas'
# chained-assignment warning.
filtered_df = filtered_df.assign(
    norm_key=filtered_df["field_key"].map(to_snake),
    norm_title=filtered_df["field_title"].map(to_snake),
)

# quick sanity preview
filtered_df[["field_key", "norm_key", "field_title", "norm_title"]].head(10)

### Candidate Generator

In [None]:
import re

# Style constraints for field keys.
STYLE = dict(
    # Whole-key pattern: lowercase letters, digits and underscores only.
    allowed_charset=r"^[a-z0-9_]+$",
    # Longest key length that is accepted.
    max_len=40,
    # NOTE(review): not referenced by any code visible in this section;
    # presumably tokens a generated key should not contain -- confirm.
    banned_tokens={"new", "sample", "test", "tmp", "misc", "other"},
)



In [None]:
# NOTE(review): stray scratch expression -- bitwise OR of 5 and 3
# (evaluates to 7). Nothing else in this section uses it; consider removing.
5|3