In [73]:
#>>Import
#  Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [74]:
# Load INI configuration from lang_pipeline.ini
import configparser
import ast
from pathlib import Path


def find_config(name="lang_pipeline.ini"):
    # Search up to 6 levels up then fallback to rglob
    p = Path.cwd()
    for _ in range(6):
        candidate = p / name
        if candidate.exists():
            return candidate
        p = p.parent
    found = next(Path.cwd().rglob(name), None)
    return found


def _parse_list_option(val):
    if not val:
        return []
    try:
        parsed = ast.literal_eval(val)
        if isinstance(parsed, (list, tuple)):
            return [str(x).strip() for x in parsed]
    except Exception:
        return [c.strip() for c in str(val).split(',') if c.strip()]
    return []


cfg_path = find_config()
CONFIG = {}
if cfg_path:
    cp = configparser.ConfigParser()
    cp.read(cfg_path)
    if 'pipeline' in cp:
        sec = cp['pipeline']
        CONFIG['source'] = sec.get('source', None)
        CONFIG['source_browser'] = sec.getboolean('source_browser', fallback=True)
        CONFIG['preview'] = sec.getboolean('preview', fallback=True)
        CONFIG['preview_rows'] = sec.getint('preview_rows', fallback=20)
        CONFIG['fix_nbsp'] = sec.getboolean('fix_nbsp', fallback=True)
        CONFIG['transliterate'] = sec.getboolean('transliterate', fallback=False)
        CONFIG['normalize'] = sec.getboolean('normalize', fallback=False)
        CONFIG['custom_prompt'] = sec.get('custom_prompt', '')
        CONFIG['chunk_size'] = sec.getint('chunk_size', fallback=10000)
        CONFIG['columns_preprocess'] = _parse_list_option(sec.get('columns_preprocess', sec.get('columns_process', '')))
        CONFIG['columns_agent'] = _parse_list_option(sec.get('columns_agent', ''))
        CONFIG['columns_postprocess'] = _parse_list_option(sec.get('columns_postprocess', ''))
        # New export-related options
        CONFIG['columns_export_filter'] = sec.getboolean('columns_export_filter', fallback=False)
        CONFIG['columns_export'] = _parse_list_option(sec.get('columns_export', ''))
        # Validation options
        CONFIG['columns_validate_active'] = sec.getboolean('columns_validate_active', fallback=False)
        CONFIG['columns_validate_content'] = _parse_list_option(sec.get('columns_validate_content', ''))
    print(f"Loaded config from: \n{cfg_path}")
else:
    print("No lang_pipeline.ini found; using defaults")


# Back-compat variables for convenience
SOURCE_BROWSER = CONFIG.get('source_browser', True)
PREVIEW_ENABLED = CONFIG.get('preview', True)
PREVIEW_ROWS = CONFIG.get('preview_rows', 20)
COLUMNS_PREPROCESS = CONFIG.get('columns_preprocess', ['translation'])
COLUMNS_AGENT = CONFIG.get('columns_agent', [])
COLUMNS_POSTPROCESS = CONFIG.get('columns_postprocess', ['translation'])
# Export settings
COLUMNS_EXPORT_FILTER = CONFIG.get('columns_export_filter', False)
COLUMNS_EXPORT = CONFIG.get('columns_export', [])
# Validation settings
COLUMNS_VALIDATE_ACTIVE = CONFIG.get('columns_validate_active', False)
COLUMNS_VALIDATE_CONTENT = CONFIG.get('columns_validate_content', [])

# # Lightweight pandas display settings (mitigate heavy HTML rendering in Jupyter/Electron/Chrome)
# import pandas as pd
# pd.set_option("display.html.table_schema", False)
# pd.set_option("display.max_rows", 20)
# pd.set_option("display.max_columns", 20)
# # Force plain-text notebook repr to reduce CPU and memory usage
# pd.set_option("display.notebook_repr_html", False)
# # Also set a conservative width for column display
# pd.set_option("display.width", 120)


Loaded config from: 
/Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/Notebooks/lang_pipeline.ini


In [75]:
# File selection (interactive or from INI)
import platform, subprocess
from pathlib import Path

# If config forces using source directly, use it
if not globals().get('SOURCE_BROWSER', True) and CONFIG.get('source'):
    input_path = Path(CONFIG['source'])
    if not input_path.exists():
        raise FileNotFoundError(f"Configured (INI) source not found: {input_path}")
    print("Configured (INI) source used:", input_path)
else:
    # Try existing picker; otherwise fallback to OS-specific implementation
    try:
        from tools.ui_native import pick_file
    except Exception:
        def pick_file(filter_str=None):
            system = platform.system()
            if system == "Darwin":
                script = '''
                set theFile to choose file
                POSIX path of theFile
                '''
                res = subprocess.run(["osascript", "-e", script], capture_output=True, text=True)
                return res.stdout.strip()
            elif system == "Windows":
                ps_script = r'''
                Add-Type -AssemblyName System.Windows.Forms
                $ofd = New-Object System.Windows.Forms.OpenFileDialog
                $ofd.Filter = "All files (*.*)|*.*"
                if ($ofd.ShowDialog() -eq "OK") { Write-Output $ofd.FileName }
                '''
                res = subprocess.run(["powershell", "-NoProfile", "-Command", ps_script], capture_output=True, text=True)
                return res.stdout.strip()
            else:
                raise NotImplementedError("No native file dialog for this OS")

    pick_path = pick_file("CSV files (*.csv)|*.csv")
    if not pick_path:
        raise FileNotFoundError("No file selected")

    input_path = Path(pick_path)
    print(f"Selected input file: \n{input_path}")

Configured (INI) source used: /Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered.csv


In [76]:
# Line-by-line import with malformed-line filtering
import csv

rows = []
bad_rows = []
total_lines = 0
with input_path.open("r", encoding="utf-8", errors="replace") as f:
    reader = csv.reader(f, delimiter="\t", quotechar='"', escapechar='\\')
    try:
        header = next(reader)
    except StopIteration:
        header = []
    for line_number, line in enumerate(reader, start=2):
        total_lines += 1
        if len(line) == len(header):
            rows.append(dict(zip(header, line)))
        elif len(line) > len(header) and len(line) % len(header) == 0:
            for i in range(0, len(line), len(header)):
                subline = line[i:i+len(header)]
                rows.append(dict(zip(header, subline)))
        else:
            bad_rows.append((line_number, line))

# Summary and fallout storage
expected_columns = header if 'header' in locals() else []
print(f"Header: \n{expected_columns}\n") 

print(f"Imported {len(rows)} rows ({total_lines} lines read).")
if bad_rows:
    # fallout file has same extension and stem with _fallout appended
    fallout_path = input_path.with_name(f"{input_path.stem}_fallout{input_path.suffix}")

    # Re-read original file and write raw lines corresponding to bad line numbers
    bad_line_numbers = {ln for ln, _ in bad_rows}
    with input_path.open("r", encoding="utf-8", errors="replace") as src, fallout_path.open("w", encoding="utf-8", errors="replace") as out:
        # write header for context if available
        if header:
            out.write('\t'.join(header) + '\n')
        for ln, raw in enumerate(src, start=1):
            if ln in bad_line_numbers:
                out.write(raw)

    print(f"{len(bad_rows)} malformed lines found. \nMalformed lines stored in: \n{fallout_path}")
else:
    print("No malformed lines found.")


Header: 
['ID', 'name', 'language', 'translation', 'confidence', 'breakdown', 'Remaining', 'Unallowed_characters', 'transliterations', 'Unmapped_in_transliterions']

Imported 277576 rows (277586 lines read).
10 malformed lines found. 
Malformed lines stored in: 
/Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_fallout.csv


In [77]:
# Validate required columns (from INI)
# If COLUMNS_VALIDATE_ACTIVE is True, ensure every row has non-empty values for listed columns
if globals().get('COLUMNS_VALIDATE_ACTIVE', False):
    required = COLUMNS_VALIDATE_CONTENT or []
    print(f"Validation active. Required columns: {required}")

    if not required:
        print("No columns specified in 'columns_validate_content'; skipping validation.")
    else:
        # Warn about columns that aren't present in the detected header
        missing_cols = [c for c in required if c not in expected_columns]
        if missing_cols:
            print(f"Warning: these columns specified in 'columns_validate_content' were not found and will be skipped: {missing_cols}")

        validate_cols = [c for c in required if c in expected_columns]
        if not validate_cols:
            print("No valid columns to validate after filtering; skipping validation.")
        else:
            # Collect rows that are missing one or more required columns
            rows_with_missing = []
            for i, row in enumerate(rows):
                # treat None or all-whitespace as missing
                empty = [c for c in validate_cols if not str(row.get(c, '')).strip()]
                if empty:
                    rows_with_missing.append((i, empty))

            if rows_with_missing:
                total = len(rows_with_missing)
                # Aggregate missing column counts
                from collections import Counter
                col_counts = Counter()
                for _, empties in rows_with_missing:
                    for c in empties:
                        col_counts[c] += 1

                print(f"Validation failed for {total} rows.")
                print("Columns missing (counts):")
                for c, cnt in col_counts.most_common():
                    print(f" - {c}: {cnt} rows")

                # Show a few examples
                N = 5
                print(f"First {min(N, total)} failing rows (index -> missing columns):")
                for idx, empties in rows_with_missing[:N]:
                    print(f" Row {idx}: {empties}")

                # Exclude invalid rows from further processing
                invalid_indexes = {idx for idx, _ in rows_with_missing}
                rows = [r for i, r in enumerate(rows) if i not in invalid_indexes]

            else:
                print("All rows passed validation.")

else:
    print("Column content validation disabled (COLUMNS_VALIDATE_ACTIVE=False)")

Validation active. Required columns: ['ID', 'name', 'language', 'translation']
Validation failed for 277576 rows.
Columns missing (counts):
 - ID: 277576 rows
First 5 failing rows (index -> missing columns):
 Row 0: ['ID']
 Row 1: ['ID']
 Row 2: ['ID']
 Row 3: ['ID']
 Row 4: ['ID']


In [78]:
# Preview of imported lines
import pandas as pd

df = pd.DataFrame(rows)
if globals().get('PREVIEW_ENABLED', True):
    print(f"Preview (first {PREVIEW_ROWS} rows):")
    # Print a lightweight plain-text preview to avoid heavy HTML rendering
    print(df.head(PREVIEW_ROWS).to_string(index=False))
else:
    print("Preview disabled by INI (PREVIEW_ENABLED=False)")


Preview (first 20 rows):
Empty DataFrame
Columns: []
Index: []


In [79]:
#  Import
#>>Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [80]:
# Determine the expected set of columns from the header
expected_columns = list(rows[0].keys())

bad_rows = []
for i, row in enumerate(rows):
    if set(row.keys()) != set(expected_columns):
        bad_rows.append((i, row))

if bad_rows:
    print("Bad rows:")
    for idx, row in bad_rows:
        print(f"Row {idx}: {row}")
else:
    print("No bad rows found.")

IndexError: list index out of range

In [None]:
# Exclude bad rows from further processing
bad_rows_list = []
good_rows = []
for i, row in enumerate(rows):
    if set(row.keys()) != set(expected_columns):
        bad_rows_list.append((i, row))
    else:
        good_rows.append(row)

if bad_rows_list:
    print("Excluding bad rows:")
    for idx, row in bad_rows_list:
        print(f"Row {idx}: {row}")
else:
    print("No bad rows to exclude.")

rows = good_rows
print(f"Proceeding with {len(rows)} good rows.")

In [None]:
# Ensure language column is correct as means of verifying the imported data
languages = {row.get("language") for row in rows}
primary_lang = next(iter(languages)) # pick the first
languages
if len(languages) > 1:
    print("Warning: multiple language values found:", languages)
else:
    print(f"Primary language: {primary_lang}")

In [None]:
# Fix spaces and NBSPs
import re

# Configuration for space normalization
NPC_AS_SPACE = ["\u00A0"]  # Characters to treat as spaces

# Build regex patterns
NPC_AS_SPACE_CLASS = r'\u00A0'  # Since only one char
SEQ_SPACES_NPC = re.compile(r'(?:[ ]|' + NPC_AS_SPACE_CLASS + r')+')
MULTI_SPACES = re.compile(r'[ ]{2,}')
NPC_RUN = re.compile(r'(?:' + NPC_AS_SPACE_CLASS + r'){1,}')

def normalize_text(s: str) -> str:
    """Normalize spaces and non-printable characters in text."""
    if s is None:
        return s
    # Replace any sequence of ASCII spaces and NBSP with a single ASCII space
    s = SEQ_SPACES_NPC.sub(' ', s)
    # Replace any remaining runs of NBSP with a single ASCII space
    s = NPC_RUN.sub(' ', s)
    # Collapse 2+ ASCII spaces to one
    s = MULTI_SPACES.sub(' ', s)
    return s

# Columns to process (driven by config 'columns_preprocess'; defaults to ['translation'])
# COLUMNS_PREPROCESS is loaded from lang_pipeline.ini if present
columns_to_fix = COLUMNS_PREPROCESS if 'COLUMNS_PREPROCESS' in globals() else ['translation']
# Support special string values like ['all'] to indicate 'process all columns'
if isinstance(columns_to_fix, list) and len(columns_to_fix) == 1 and str(columns_to_fix[0]).lower() == 'all':
    columns_to_fix = list(expected_columns) if expected_columns else []
# Fallback safety
if not columns_to_fix:
    columns_to_fix = ['translation']

changed_count = 0
for row in rows:
    for col in columns_to_fix:
        if col in row:
            original = row[col]
            new = normalize_text(row[col])
            row[col] = new
            if new != original:
                changed_count += 1

print(f"Normalized {changed_count} cells in columns: {columns_to_fix}")

In [None]:
# Preview table (lightweight)
import pandas as pd

df = pd.DataFrame(rows)
if globals().get('PREVIEW_ENABLED', True):
    print(f"Preview (first {PREVIEW_ROWS} rows):")
    # Use plain-text to_string to avoid HTML rendering; cap at PREVIEW_ROWS rows
    print(df.head(PREVIEW_ROWS).to_string(index=False))
else:
    print("Preview disabled by INI (PREVIEW_ENABLED=False)")


In [None]:
#  Import
#  Preprocess Python
#>>LLM processing
#  Postprocess Python
#  Export

In [None]:
# LLM processing placeholder
def llm_transform(records):
    """
    Placeholder for LLM transformation.
    Input: list of dicts
    Output: list of dicts (same schema or extended)
    """
    # TODO: replace with actual LLM call
    return records

processed_rows = llm_transform(rows)


In [None]:
#  Import
#  Preprocess Python
#  LLM processing
#>>Postprocess Python
#  Export

In [None]:
# Normalize CSV:
# - make cells proper strings
# - make dict cells into json

def normalize_for_csv(row):
    out = {}
    for k, v in row.items():
        if isinstance(v, (dict, list)):
            out[k] = json.dumps(v, ensure_ascii=False)
        else:
            out[k] = "" if v is None else str(v)
    return out

processed_rows = [normalize_for_csv(r) for r in processed_rows]


In [None]:
# Exclude columns from export
# Compute EXPORT_FIELDNAMES based on INI settings and detected columns
EXPORT_FIELDNAMES = list(expected_columns) if 'expected_columns' in globals() else []

try:
    if globals().get('COLUMNS_EXPORT_FILTER', False):
        cols = COLUMNS_EXPORT or []
        # Support a single token 'all' to mean export all columns
        if len(cols) == 1 and str(cols[0]).lower() == 'all':
            EXPORT_FIELDNAMES = list(expected_columns)
        else:
            # Preserve order from COLUMNS_EXPORT but only keep columns that exist
            EXPORT_FIELDNAMES = [c for c in cols if c in expected_columns]
            missing = [c for c in cols if c not in expected_columns]
            if missing:
                print(f"Warning: these columns from 'columns_export' were not found and will be skipped: {missing}")
        if not EXPORT_FIELDNAMES:
            print("Warning: no valid export columns found; exporting all columns instead.")
            EXPORT_FIELDNAMES = list(expected_columns)
    else:
        EXPORT_FIELDNAMES = list(expected_columns)
except Exception as e:
    print(f"Error determining export columns: {e}. Exporting all columns.")
    EXPORT_FIELDNAMES = list(expected_columns)

print(f"Exporting columns: {EXPORT_FIELDNAMES}")


In [None]:
# Export result
output_suffix = "_out"

# Extract stem (filename without extension) and extension
stem = input_path.stem          # "input"
ext = input_path.suffix         # ".csv"

# Build new filename
output_path = input_path.with_name(f"{stem}{output_suffix}{ext}")
print(f"Output to {output_path}")

with output_path.open("w", encoding="utf-8", newline="") as f:
    # Use computed EXPORT_FIELDNAMES so export can be filtered via INI
    # Allow extra keys in rows (ignore them) and write '' for missing fields
    writer = csv.DictWriter(f, fieldnames=EXPORT_FIELDNAMES, delimiter="\t", extrasaction='ignore', restval='')
    writer.writeheader()
    writer.writerows(processed_rows)

print("Export complete")
