# **This sbs array_dict with pickle v1**

# **00_setup.py** | *Imports, glossary, config flags*

**Modular Workbook for Array Dictionary Implementation**
_
https://copilot.microsoft.com/shares/pages/3HV5tomNJNNZi9VReAwN9

In [32]:
import pandas as pd
import numpy as np
import tkinter as tk
import pickle
import os
from datetime import datetime
# NOTE(review): neither constant is referenced by the cells visible in this section of the workbook.
xl_pathway = "xl_ad_sn.csv"          # CSV (Excel-export) versions live in the repo: data\xl_ad_sn.csv
pkl_pathway = "ad_sn"                # pickle versions are kept in the data folder, outside git (original note was truncated at "version free bu" - presumably "backup"; confirm)

# **01_array_loader.py** |  *Load flat arrays (CSV, Excel) into base structures*

In [33]:
# Retrieve xl_ad_sn from the .csv file
# Build the path from the home directory (original note: reset the path for WSL to prevent possible path errors)
base_path = os.path.expanduser("~")  # User's home directory (original note: /home/bhuns in WSL)
csv_path = os.path.join(base_path, "JL_1/data/xl_ad_sn.csv")

# Load the CSV; the first line of the file supplies the column headers
df = pd.read_csv(csv_path)

# Convert to a NumPy array (the header row is not part of the array)
xl_ad_sn = df.to_numpy()
#xl_ad_sn

# **02_array_dict_builder.py**   |  *Transform arrays into array_dict with metadata*
_ **Script Update for Data Dictionary Creation** — link: *https://copilot.microsoft.com/shares/pages/jLLrDDP9aZqQjatwkyFfr*

In [36]:
# Read and isolate the raw headers from "xl_ad_sn.csv"
# Load CSV (re-reads the same home-relative path, JL_1/data/xl_ad_sn.csv)
csv_path = os.path.join(os.path.expanduser("~"), "JL_1/data/xl_ad_sn.csv")
df = pd.read_csv(csv_path)

# Inspect raw headers: the column labels as a plain Python list
raw_headers = df.columns.tolist()
#print("Raw headers:", raw_headers)


In [None]:
# ✅ Full Setup >> headers >> dtv_index >> array_dict
#
# Reads the CSV, then builds `array_dict`: one entry per data row, keyed by the
# row's value in the 'dtv' column. Each entry holds the remaining row values,
# the matching feature (column) names, and empty 'flags' / 'classification'
# slots.
csv_path = os.path.join(os.path.expanduser("~"), "JL_1/data/xl_ad_sn.csv")
df = pd.read_csv(csv_path)         # read array from "csv_path"

# Extract headers and data
headers = df.columns.tolist()
xl_ad_sn = df.to_numpy()

# Locate the index of the 'dtv' column (raises ValueError if the column is absent)
dtv_index = headers.index('dtv')

# The feature names are the same for every row, so compute them once here
# instead of rebuilding the list on each loop iteration.
feature_names = [h for i, h in enumerate(headers) if i != dtv_index]

# Build array_dict
array_dict = {}

for row in xl_ad_sn:
    dtv_key = row[dtv_index]
    raw_values = np.delete(row, dtv_index)   # row without its key column

    # NOTE: rows sharing the same dtv value overwrite earlier entries.
    array_dict[dtv_key] = {
        'raw_values': raw_values.tolist(),
        'flags': {},
        'attributes': {
            # Per-entry copy so editing one entry's list never affects another.
            'features': list(feature_names)
        },
        'classification': {}
    }


In [None]:
# 🧪 Verify the construction: print the first three array_dict entries

sample_entries = list(array_dict.items())[:3]
divider = "-" * 40

for dtv, entry in sample_entries:
    print(f"DTV: {dtv}")
    print("Raw:", entry['raw_values'])
    print("Features:", entry['attributes']['features'])
    print(divider)


In [38]:
# Step 3: Continue Building array_dict
#
# Rebuilds `array_dict` from the current `df`: one entry per row, keyed by the
# row's 'dtv' value, with feature names taken from `raw_headers`.
xl_ad_sn = df.to_numpy()

# Take the key-column position from the same header list that supplies the
# feature names below, so the index and the names cannot come from different
# lists (raises ValueError if 'dtv' is absent).
dtv_index = raw_headers.index('dtv')

# Identical for every row, so compute once rather than inside the loop.
feature_names = [h for i, h in enumerate(raw_headers) if i != dtv_index]

array_dict = {}
for row in xl_ad_sn:
    dtv_key = row[dtv_index]
    raw_values = np.delete(row, dtv_index)   # row without its key column

    # NOTE: rows sharing the same dtv value overwrite earlier entries.
    array_dict[dtv_key] = {
        'raw_values': raw_values.tolist(),
        'flags': {},
        'attributes': {
            # Per-entry copy so editing one entry's list never affects another.
            'features': list(feature_names)
        },
        'classification': {}
    }



In [40]:
# Show the first three array_dict entries: key, raw values and feature names
sample_entries = list(array_dict.items())[:3]
divider = "-" * 40

for dvt, entry in sample_entries:
    print(f"DVT: {dvt}")
    print("Raw:", entry['raw_values'])
    print("Features:", entry['attributes']['features'])
    print(divider)

DVT: 45860.0
Raw: [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Features: ['s000', 's001', 's002', 's003', 's004', 's005', 's006', 's007', 's008', 's009', 's010', 's011', 's012', 's013', 's014', 's015']
----------------------------------------
DVT: 45861.0
Raw: [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Features: ['s000', 's001', 's002', 's003', 's004', 's005', 's006', 's007', 's008', 's009', 's010', 's011', 's012', 's013', 's014', 's015']
----------------------------------------
DVT: 45862.0
Raw: [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Features: ['s000', 's001', 's002', 's003', 's004', 's005', 's006', 's007', 's008', 's009', 's010', 's011', 's012', 's013', 's014', 's015']
----------------------------------------


# **03_classification_registry.py**  |  *Define classification args, flags, and validation logic*

**Classification Registry Implementation** 
_ *https://copilot.microsoft.com/shares/pages/P91viiY6QzR3Mp4WUnbFf*

## 🧠 Classification Registry Scaffold (classification_registry.py)

### Perfect, Bill. You’ve got a classification matrix scaffolded—rows as attribute_sn (e.g. a000, a001, ...) and columns as dat_col.sn (e.g. s000, s001, ...).

That’s a great foundation for a modular classification registry.
Let’s build a separate file—say, classification_registry.py—that can:

1. Load and validate the matrix
2. Apply classification logic per attribute or per column
3. Track provenance and confidence
4. Return structured results for embedding into array_dict

In [41]:
#🔹 1. Load the Matrix

def load_classification_matrix(path):
    """Read the classification matrix from a CSV source.

    Parameters
    ----------
    path
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with its row index *named* ``'attribute_sn'``.
        Only the index name is set; no column is promoted to the index.
    """
    matrix = pd.read_csv(path)  # or pd.read_excel
    return matrix.rename_axis('attribute_sn')


In [42]:
# 🔹 2. Define Classification Methods
def threshold_classifier(row, threshold=0.5):
    """Label a row 'A' or 'B' by comparing its mean with a cut-off.

    Parameters
    ----------
    row
        Object exposing ``.mean()`` (e.g. a pandas Series).
    threshold
        Cut-off; a mean strictly greater than it yields 'A', anything else
        (including a mean equal to the cut-off) yields 'B'.

    Returns
    -------
    tuple
        ``(group, confidence)`` where confidence is the absolute distance
        between the mean and the cut-off, rounded to 3 decimals.
    """
    mean_value = row.mean()
    margin = round(abs(mean_value - threshold), 3)
    if mean_value > threshold:
        return 'A', margin
    return 'B', margin

def zscore_classifier(row):
    """Label a row 'High' or 'Low' from its own standardised values.

    The row is standardised against its own mean and standard deviation;
    the group is 'High' when the mean of those z-scores exceeds 1, else
    'Low'. Confidence is the standard deviation of the z-scores, rounded
    to 3 decimals.

    NOTE(review): because the row is standardised against itself, the
    z-scores have mean ~0 and std ~1 by construction, so this returns
    ('Low', 1.0) for any row with non-zero spread (and a NaN confidence
    when the spread is zero or undefined). A reference population is
    probably needed for a meaningful result; confirm the intent.

    Parameters
    ----------
    row
        Object exposing ``.mean()`` / ``.std()`` and element-wise
        arithmetic (e.g. a pandas Series).

    Returns
    -------
    tuple
        ``(group, confidence)``.
    """
    centre = row.mean()
    spread = row.std()
    standardised = (row - centre) / spread
    if standardised.mean() > 1:
        group = 'High'
    else:
        group = 'Low'
    return group, round(standardised.std(), 3)


In [43]:
# 🔹 3. Registry Dictionary
# Maps a method name (as passed to classify_matrix) to the classifier callable.
classification_registry = dict(
    threshold=threshold_classifier,
    zscore=zscore_classifier,
)


In [44]:
# 🔹 4. Apply Classification
def classify_matrix(df, method='threshold'):
    """Run one registered classifier over every row of a DataFrame.

    Parameters
    ----------
    df
        DataFrame whose rows are classified one at a time via ``iterrows``.
    method
        Key into ``classification_registry`` selecting the classifier.

    Returns
    -------
    dict
        Row index label -> ``{'group', 'confidence', 'method'}``.

    Raises
    ------
    KeyError
        If ``method`` is not a key of ``classification_registry``.
    """
    classify = classification_registry[method]

    def _describe(values):
        # Each classifier returns a (group, confidence) pair.
        group, confidence = classify(values)
        return {'group': group, 'confidence': confidence, 'method': method}

    return {attr_sn: _describe(values) for attr_sn, values in df.iterrows()}


In [45]:
# 🔹 5. Embed into array_dict
def embed_classification(array_dict, classification_results):
    """Attach classification results to matching array_dict entries, in place.

    For every key of ``classification_results`` that also exists in
    ``array_dict``, the entry's ``'classification'`` slot is replaced by the
    result object. Keys with no matching entry are skipped silently.

    Parameters
    ----------
    array_dict
        Mapping of key -> entry dict; mutated in place.
    classification_results
        Mapping of key -> result to store.

    Returns
    -------
    None
    """
    for key, outcome in classification_results.items():
        if key not in array_dict:
            continue
        array_dict[key]['classification'] = outcome


In [46]:
# 🧪 Optional Diagnostic
def preview_classification(results, n=5):
    """Print the first ``n`` entries of a results mapping as ``key: value`` lines.

    Parameters
    ----------
    results
        Mapping of key -> classification result.
    n
        Number of leading keys to show (applied as a list slice).
    """
    leading_keys = [*results.keys()][:n]
    for key in leading_keys:
        print(f"{key}: {results[key]}")


# **04_gui_selector.py**  |  *Operator-driven GUI for selecting and grouping entries*

# **05_storage_utils.py**  |  *Save/load using Pickle, Excel, HDF5*

# **06_diagnostics.py**  |  *Overlay tools for inspecting metadata and classification*

# **07_tests.py**  |  *Reproducible test cases and versioning checks*

### 