In [1]:
import re
import math
import pandas as pd
from pathlib import Path

# ---------- Parsing utilities ----------
RULE_RE = re.compile(r"\s*([A-Za-z_][A-Za-z0-9_]*)\s*(<=|>=|<|>|==)\s*([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*")

def _f(x):
    return f"{x:.6g}"

def consolidate_path(path: str):
    """Consolidate multiple constraints into a final tight range per feature."""
    slots = {}
    for part in filter(None, (p.strip() for p in str(path).split("--"))):
        m = RULE_RE.fullmatch(part)
        if not m:
            continue
        feat, op, v = m.groups()
        v = float(v)
        s = slots.setdefault(feat, {"lower": None, "upper": None, "lower_incl": False, "upper_incl": False})
        if op in ("<", "<="):
            if s["upper"] is None or v < s["upper"] or math.isclose(v, s["upper"]):
                s["upper"] = v
                s["upper_incl"] = (op == "<=")
        elif op in (">", ">="):
            if s["lower"] is None or v > s["lower"] or math.isclose(v, s["lower"]):
                s["lower"] = v
                s["lower_incl"] = (op == ">=")
        else:  # ==
            s.update(lower=v, upper=v, lower_incl=True, upper_incl=True)

    out = {}
    for feat, s in slots.items():
        lo, hi = s["lower"], s["upper"]
        if lo is not None and hi is not None:
            if math.isclose(lo, hi):
                out[feat] = f"X=={_f(hi)}"
            else:
                lo_sym = ">=" if s["lower_incl"] else ">"
                out[feat] = f"{_f(hi)}=>X{lo_sym}{_f(lo)}"
        elif lo is not None:
            out[feat] = f"X{'>=' if s['lower_incl'] else '>'}{_f(lo)}"
        elif hi is not None:
            out[feat] = f"X{'<=' if s['upper_incl'] else '<'}{_f(hi)}"
        else:
            out[feat] = ""
    return out


def group_to_profile_name(gv: int) -> str:
    """Map 1->Profile_A, 2->Profile_B, ..."""
    letter =chr(ord('A') + gv) if 0 <= gv <= 5 else str(gv)
    return f"Profile_{letter}"


# ---------- Core builders ----------
def single_profile_column_for_target(df: pd.DataFrame, target_value: int,group_number) -> pd.DataFrame:
    """Pick the row whose 'value' is closest to target_value, and return a one-column DataFrame."""
    required = {"value", "path", "group"}
    if not required.issubset(df.columns):
        missing = required - set(df.columns)
        raise ValueError(f"Missing required columns: {missing}")

    sorted_by_dist = (pd.to_numeric(df["value"], errors="coerce") - group_number).abs().sort_values()
    idx = sorted_by_dist.index[target_value-1]
    row = df.loc[idx]

    meta_cols = [c for c in df.columns if c not in ["group", "path"]]
    meta_items = [(m, row[m]) for m in meta_cols if m in row.index]

    final_ranges = consolidate_path(row["path"])
    feature_items = sorted(final_ranges.items())
    rows = meta_items + feature_items
    profile_col = group_to_profile_name(group_number)

    out_df = pd.DataFrame(rows, columns=["FEATURE", profile_col]).set_index("FEATURE")
    out_df.index.name = "FEATURE"
    return out_df


def process_folder_for_target(folder: str, target_value: int, out_root: str):
    """Process all CSVs in folder for a given target value, saving in the shared 'new_format' folder."""
    folder_path = Path(folder)
    out_root = Path(out_root)
    out_root.mkdir(parents=True, exist_ok=True)

    csv_files = sorted(folder_path.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    collected = []
    for f in csv_files:
        try:
            df = pd.read_csv(f)
            match = re.search(r"group_(\d+)", f.name)
            group_num = int(match.group(1))
            col_df = single_profile_column_for_target(df, target_value,group_num)
            collected.append(col_df)
        except Exception as e:
            print(f"[WARN] Skipping {f.name}: {e}")

    if not collected:
        raise RuntimeError("No valid CSVs produced profile columns.")

    merged = pd.concat(collected, axis=1, join="outer")

    # Sort columns alphabetically (Profile_A, Profile_B, ...)
    merged = merged.reindex(sorted(merged.columns), axis=1)

    out_path = out_root / f"new_format_{target_value}.csv"
    merged.to_csv(out_path, index=True, encoding="utf-8-sig")
    print(f"✅ Saved: {out_path.resolve()}")
    return out_path


def process_folder_for_targets_1_to_5(folder: str):
    """Main loop: create new_format_1 ... new_format_5 in a shared folder."""
    out_root = Path(folder) / "new_format"
    for i in range(1, 6):
        process_folder_for_target(folder, i, out_root)


# ---------- Run ----------
# Example:
# process_folder_for_targets_1_to_5("ml/group_nodes_5")


In [2]:
for  folder in ['group_nodes_5','group_nodes_10','group_nodes_20','group_nodes_all','group_nodes_None']:
    process_folder_for_targets_1_to_5(f"ml/{folder}")

✅ Saved: C:\Users\achit\OneDrive - ariel.ac.il\Current_research\ASC2\pythonProject\places\tel_aviv\ml\group_nodes_5\new_format\new_format_1.csv
✅ Saved: C:\Users\achit\OneDrive - ariel.ac.il\Current_research\ASC2\pythonProject\places\tel_aviv\ml\group_nodes_5\new_format\new_format_2.csv
✅ Saved: C:\Users\achit\OneDrive - ariel.ac.il\Current_research\ASC2\pythonProject\places\tel_aviv\ml\group_nodes_5\new_format\new_format_3.csv
✅ Saved: C:\Users\achit\OneDrive - ariel.ac.il\Current_research\ASC2\pythonProject\places\tel_aviv\ml\group_nodes_5\new_format\new_format_4.csv
✅ Saved: C:\Users\achit\OneDrive - ariel.ac.il\Current_research\ASC2\pythonProject\places\tel_aviv\ml\group_nodes_5\new_format\new_format_5.csv
✅ Saved: C:\Users\achit\OneDrive - ariel.ac.il\Current_research\ASC2\pythonProject\places\tel_aviv\ml\group_nodes_10\new_format\new_format_1.csv
✅ Saved: C:\Users\achit\OneDrive - ariel.ac.il\Current_research\ASC2\pythonProject\places\tel_aviv\ml\group_nodes_10\new_format\new_for

In [17]:
# test area

['node_id', 'leaf_id', 'value', 'impurity', 'n_node_samples']

In [60]:
folder = 'group_nodes_10'
folder_path = Path(f"ml/{folder}")
csv_files = sorted(folder_path.glob("*.csv"))
f = csv_files[1]
m = re.search(r"group_(\d+)", f.name)
group_number = int(m.group(1)) if m else None
df = pd.read_csv(f)
df

Unnamed: 0,node_id,leaf_id,value,group,impurity,n_node_samples,path
0,10426,-1,1.06314,2,0.844136,586,day <= 0.500000--closeness > 0.018784--SEleve1...
1,10393,-1,1.054198,2,0.920548,941,day <= 0.500000--closeness > 0.018784--SEleve1...
2,26843,-1,1.034884,2,0.937321,602,day > 0.500000--closeness > 0.017555--SEleve1_...
3,24421,-1,1.024433,2,0.938321,573,day > 0.500000--closeness > 0.017555--SEleve1_...
4,18401,-1,1.031593,2,0.972903,728,day > 0.500000--closeness > 0.017555--SEleve1_...


In [107]:
idx = (pd.to_numeric(df["value"], errors="coerce") - group_number).abs().sort_values()
idx.index[1]

np.int64(1)

In [98]:

row = df.loc[idx]
row

KeyError: "None of [Index([0.9368600682593855, 0.9458023379383635, 0.9651162790697674,\n       0.9755671902268761, 0.9684065934065933],\n      dtype='float64')] are in the [index]"

[WindowsPath('ml/group_nodes_10/group_1.csv'),
 WindowsPath('ml/group_nodes_10/group_2.csv'),
 WindowsPath('ml/group_nodes_10/group_3.csv'),
 WindowsPath('ml/group_nodes_10/group_4.csv'),
 WindowsPath('ml/group_nodes_10/group_5.csv')]

In [19]:
# 3) Collect metadata values from the selected row
meta_items = [(m, target_row[m]) for m in META_ORDER if m in target_row.index]
meta_items

[('node_id', np.int64(87376)),
 ('leaf_id', np.int64(-1)),
 ('value', np.float64(0.8343777976723367)),
 ('impurity', np.float64(0.8642434132223655)),
 ('n_node_samples', np.int64(2234))]

In [20]:
# 4) Append consolidated feature ranges (sorted alphabetically)
feature_items = sorted(final_ranges.items())
# 5) Combine everything
rows = meta_items + feature_items
# 6) Build the single-column DataFrame
out_df = pd.DataFrame(rows, columns=["FEATURE", profile_col]).set_index("FEATURE")
out_df.index.name = "FEATURE"
print(df)
print(path)

{'day': 'X>0.5',
 'SEleve1_10': 'X<=6.5',
 'closeness': 'X>0.023692',
 'green_canopy': 'X<=0.095',
 'synagogues': 'X<=12.5',
 'pop_dens': '0.020401=>X>0.012547',
 'sidewalk_width': 'X>1.71593',
 'shadows': 'X>127.629',
 'betweennes': 'X>0.00015'}