In [1]:
#>>Import
#  Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [2]:
# File selection (interactive)
import platform, subprocess
from pathlib import Path

# Try existing picker; otherwise fallback to OS-specific implementation
try:
    from tools.ui_native import pick_file
except Exception:
    def pick_file(filter_str=None):
        system = platform.system()
        if system == "Darwin":
            script = '''
            set theFile to choose file
            POSIX path of theFile
            '''
            res = subprocess.run(["osascript", "-e", script], capture_output=True, text=True)
            return res.stdout.strip()
        elif system == "Windows":
            ps_script = r'''
            Add-Type -AssemblyName System.Windows.Forms
            $ofd = New-Object System.Windows.Forms.OpenFileDialog
            $ofd.Filter = "All files (*.*)|*.*"
            if ($ofd.ShowDialog() -eq "OK") { Write-Output $ofd.FileName }
            '''
            res = subprocess.run(["powershell", "-NoProfile", "-Command", ps_script], capture_output=True, text=True)
            return res.stdout.strip()
        else:
            raise NotImplementedError("No native file dialog for this OS")

pick_path = pick_file("CSV files (*.csv)|*.csv")
if not pick_path:
    raise FileNotFoundError("No file selected")

input_path = Path(pick_path)
print("Selected:", input_path)

Selected: /Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_mini.csv


In [3]:
# Blocking Pipeline UI: scan file, present columns/options, and wait for Start or Cancel
import asyncio
import csv
import configparser
from pathlib import Path
from datetime import datetime

PIPELINE_OPTIONS = globals().get("PIPELINE_OPTIONS", {})

try:
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
    WIDGETS_AVAILABLE = True
except Exception:
    WIDGETS_AVAILABLE = False

# INI helpers and location (notebook dir)
INI_FILENAME = "lcc_agentic.ini"
try:
    NOTEBOOK_DIR = Path(__file__).resolve().parent
except Exception:
    NOTEBOOK_DIR = Path.cwd()
INI_PATH = NOTEBOOK_DIR / INI_FILENAME

# Defaults
_defaults = dict(
    preview=True,
    fix_nbsp=True,
    fix_arabic_numerals=False,
    use_agent=False,
    custom_prompt="",
    chunk_size=10000,
    columns=None,
)
for k, v in _defaults.items():
    PIPELINE_OPTIONS.setdefault(k, v)

# Load previously saved settings if present
cp = configparser.ConfigParser()
if INI_PATH.exists():
    cp.read(INI_PATH, encoding='utf-8')
    if 'pipeline' in cp:
        sec = cp['pipeline']
        PIPELINE_OPTIONS['preview'] = sec.getboolean('preview', fallback=PIPELINE_OPTIONS['preview'])
        PIPELINE_OPTIONS['fix_nbsp'] = sec.getboolean('fix_nbsp', fallback=PIPELINE_OPTIONS['fix_nbsp'])
        PIPELINE_OPTIONS['fix_arabic_numerals'] = sec.getboolean('fix_arabic_numerals', fallback=PIPELINE_OPTIONS.get('fix_arabic_numerals', False))
        PIPELINE_OPTIONS['use_agent'] = sec.getboolean('use_agent', fallback=PIPELINE_OPTIONS.get('use_agent', False))
        PIPELINE_OPTIONS['custom_prompt'] = sec.get('custom_prompt', fallback=PIPELINE_OPTIONS['custom_prompt'])
        PIPELINE_OPTIONS['chunk_size'] = sec.getint('chunk_size', fallback=PIPELINE_OPTIONS['chunk_size'])
        cols = sec.get('columns', fallback='').strip()
        PIPELINE_OPTIONS['columns'] = [c.strip() for c in cols.split(',')] if cols else None

# Quick scan the file to get header and an approximate line count + preview
p = Path(input_path)
header = []
preview_rows = []
line_count = 0
with p.open('r', encoding='utf-8', errors='replace') as f:
    reader = csv.reader(f, delimiter='\t', quotechar='"', escapechar='\\')
    try:
        header = next(reader)
    except StopIteration:
        header = []
    for row in reader:
        line_count += 1
        if len(preview_rows) < 5:
            preview_rows.append(row)

# Default selection logic: prefer 'translation' if present, else first column
default_cols = [c for c in header if c == 'translation'] or (header[:1] if header else [])

if WIDGETS_AVAILABLE:
    col_sel = widgets.SelectMultiple(options=header, value=tuple(default_cols), description='Columns')
    chk_nbsp = widgets.Checkbox(value=PIPELINE_OPTIONS.get('fix_nbsp', True), description='Fix NBSPs')
    chk_arabnum = widgets.Checkbox(value=PIPELINE_OPTIONS.get('fix_arabic_numerals', False), description='Fix Arabic numerals')
    chk_agent = widgets.Checkbox(value=PIPELINE_OPTIONS.get('use_agent', False), description='Use Agent')
    chk_preview = widgets.Checkbox(value=PIPELINE_OPTIONS.get('preview', True), description='Enable preview')
    prompt = widgets.Textarea(value=PIPELINE_OPTIONS.get('custom_prompt', ''), placeholder='Custom instructions for the AI agent (optional)', rows=4, layout=widgets.Layout(width='100%'))

    start_btn = widgets.Button(description='Start', button_style='success')
    cancel_btn = widgets.Button(description='Cancel', button_style='danger')
    out = widgets.Output()

    # Display the UI
    display(HTML(f"<b>File:</b> {p.name} — lines (after header): {line_count}"))
    display(HTML(f"<b>Header:</b> {header}"))
    display(HTML("<b>Preview (first rows):</b>"))
    display(HTML("<pre>" + "\n".join(str(r) for r in preview_rows) + "</pre>"))
    display(widgets.VBox([col_sel, widgets.HBox([chk_nbsp, chk_arabnum, chk_agent, chk_preview]), prompt, widgets.HBox([start_btn, cancel_btn]), out]))

    def _on_load(_):
        # Reload settings from INI and update widgets
        loaded = load_settings()
        if loaded:
            col_sel.options = header
            col_sel.value = tuple(PIPELINE_OPTIONS['columns']) if PIPELINE_OPTIONS.get('columns') else tuple(default_cols)
            chk_nbsp.value = PIPELINE_OPTIONS.get('fix_nbsp', True)
            chk_arabnum.value = PIPELINE_OPTIONS.get('fix_arabic_numerals', False)
            chk_agent.value = PIPELINE_OPTIONS.get('use_agent', False)
            chk_preview.value = PIPELINE_OPTIONS.get('preview', True)
            prompt.value = PIPELINE_OPTIONS.get('custom_prompt', '')
        with out:
            out.clear_output()
            print('Reloaded settings from', INI_PATH if INI_PATH.exists() else '(no INI found)')

    def _on_start(_):
        # Collect options and save, don't block — downstream cells will check PIPELINE_OPTIONS['started']
        PIPELINE_OPTIONS['columns'] = list(col_sel.value) if col_sel.value else None
        PIPELINE_OPTIONS['fix_nbsp'] = bool(chk_nbsp.value)
        PIPELINE_OPTIONS['fix_arabic_numerals'] = bool(chk_arabnum.value)
        PIPELINE_OPTIONS['use_agent'] = bool(chk_agent.value)
        PIPELINE_OPTIONS['preview'] = bool(chk_preview.value)
        PIPELINE_OPTIONS['custom_prompt'] = str(prompt.value).strip()
        saved = save_settings()
        PIPELINE_OPTIONS['last_saved'] = str(saved)
        PIPELINE_OPTIONS['started'] = True
        PIPELINE_OPTIONS['start_time'] = datetime.utcnow().isoformat() + 'Z'
        with out:
            out.clear_output()
            print(f"Saved settings to: {saved}")
            print('Settings saved. Proceed by running the import cell (or continue Run All after Start).')

    def _on_cancel(_):
        with out:
            out.clear_output()
            print('User cancelled. Pipeline will not start.')
        PIPELINE_OPTIONS['started'] = False

    start_btn.on_click(_on_start)
    cancel_btn.on_click(_on_cancel)
    load_btn = widgets.Button(description='Load', button_style='')
    load_btn.on_click(_on_load)
    display(load_btn)

    # Informational note: this cell no longer blocks — it saves settings on Start; run import cell to continue
    with out:
        out.clear_output()
        print('UI ready — press Start to save settings. Then run the import cell to continue.')
else:
    # Non-interactive: proceed with existing settings (save defaults for reproducibility)
    try:
        saved = save_settings()
        print(f"Saved defaults to: {saved}")
    except Exception as e:
        print("Could not save defaults:", e)

# Print effective options for downstream cells
print('PIPELINE_OPTIONS (effective):', PIPELINE_OPTIONS)

VBox(children=(SelectMultiple(description='Columns', index=(0,), options=('ID', ' name                        …

Button(description='Load', style=ButtonStyle())

PIPELINE_OPTIONS (effective): {'preview': False, 'fix_nbsp': True, 'fix_arabic_numerals': False, 'use_agent': False, 'custom_prompt': 'aaa', 'chunk_size': 10000, 'columns': None}


NameError: name 'save_settings' is not defined

In [4]:
# TEST: Save settings to notebook dir and verify file exists + show contents
try:
    saved = save_settings()
    print("Saved settings file:", saved)
    print("Exists:", saved.exists())
    print("--- File contents ---")
    print(saved.read_text(encoding='utf-8'))
    print("--- Directory listing ---")
    for p in sorted(saved.parent.iterdir()):
        print(" ", p.name)
except Exception as e:
    print("INI save/verify failed:", e)

INI save/verify failed: name 'save_settings' is not defined


In [5]:
# Line-by-line import with malformed-line filtering
import csv

# Ensure user started the pipeline
if not PIPELINE_OPTIONS.get('started'):
    raise RuntimeError('Pipeline has not been started. Please press Start in the UI cell and re-run this cell.')

rows = []
bad_rows = []
total_lines = 0
with input_path.open("r", encoding="utf-8", errors="replace") as f:
    reader = csv.reader(f, delimiter="\t", quotechar='"', escapechar='\\')
    try:
        header = next(reader)
    except StopIteration:
        header = []
    for line_number, line in enumerate(reader, start=2):
        total_lines += 1
        if len(line) == len(header):
            rows.append(dict(zip(header, line)))
        elif len(line) > len(header) and len(line) % len(header) == 0:
            for i in range(0, len(line), len(header)):
                subline = line[i:i+len(header)]
                rows.append(dict(zip(header, subline)))
        else:
            bad_rows.append((line_number, line))
# Summary
print(f"Imported {len(rows)} rows ({total_lines} lines read), {len(bad_rows)} malformed lines.")

RuntimeError: Pipeline has not been started. Please press Start in the UI cell and re-run this cell.

In [None]:
# Preview and basic stats
import pandas as pd

expected_columns = header if 'header' in globals() else []
df = pd.DataFrame(rows)
print("Header:", expected_columns)
print("Preview (first 5 rows):")
display(df.head(5))
print("Total imported rows:", len(rows))
print("Malformed lines:", len(bad_rows))

# NBSP count for 'translation' column if present
if 'translation' in expected_columns:
    nbsp_count = sum(1 for r in rows if r.get('translation') and '\u00A0' in r.get('translation'))
    print("Rows with NBSP in 'translation':", nbsp_count)

Header: ['ID', ' name                                                          ', ' language', ' translation                                       ', ' confidence', ' breakdown                                                                                                                                                                                     ', ' Remaining                      ', ' Unallowed_characters', ' transliterations', ' Unmapped_in_transliterions']
Preview (first 5 rows):


Unnamed: 0,ID,name,language,translation,confidence,breakdown,Remaining,Unallowed_characters,transliterations,Unmapped_in_transliterions
0,,Achraf Rent Car ...,ARA,تأجير سيارات أشرف ...,85,Rent Car' translates to تأجير سيارات which cl...,,,,
1,,Achraf rohdine ...,ARA,أشرف روهدين ...,90,"Transliterated as a proper noun, maintaining ...",,,,
2,,Achraf Soft ...,ARA,أشرف سوفت ...,95,Transliteration of the brand name 'Soft' to p...,,,,
3,,Dar El Mizan دار الميزان ...,ARA,دار الميزان ...,90,The name 'Dar El Mizan' is transliterated as ...,,,,
4,,Dar Lbacha دار الباشا برشيد ...,ARA,دار الباشا برشيد ...,90,Dar Lbacha' is transliterated to 'دار الباشا ...,,,,


Total imported rows: 30
Malformed lines: 1


In [None]:
#  Import
#>>Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [None]:
# Determine the expected set of columns from the header
expected_columns = list(rows[0].keys())

bad_rows = []
for i, row in enumerate(rows):
    if set(row.keys()) != set(expected_columns):
        bad_rows.append((i, row))

if bad_rows:
    print("Bad rows:")
    for idx, row in bad_rows:
        print(f"Row {idx}: {row}")
else:
    print("No bad rows found.")

No bad rows found.


In [None]:
# Exclude bad rows from further processing
bad_rows_list = []
good_rows = []
for i, row in enumerate(rows):
    if set(row.keys()) != set(expected_columns):
        bad_rows_list.append((i, row))
    else:
        good_rows.append(row)

if bad_rows_list:
    print("Excluding bad rows:")
    for idx, row in bad_rows_list:
        print(f"Row {idx}: {row}")
else:
    print("No bad rows to exclude.")

rows = good_rows
print(f"Proceeding with {len(rows)} good rows.")

No bad rows to exclude.
Proceeding with 30 good rows.


In [None]:
# Ensure language column is correct as means of verifying the imported data
languages = {row.get("language") for row in rows}
primary_lang = next(iter(languages)) # pick the first
languages
if len(languages) > 1:
    print("Warning: multiple language values found:", languages)
else:
    print(f"Primary language: {primary_lang}")

Primary language: None


In [None]:
# Fix spaces, NBSPs and (optionally) Arabic numerals according to PIPELINE_OPTIONS
import re

# Configuration for space normalization
NPC_AS_SPACE = ["\u00A0"]  # Characters to treat as spaces
NPC_AS_SPACE_CLASS = r'\u00A0'
SEQ_SPACES_NPC = re.compile(r'(?:[ ]|' + NPC_AS_SPACE_CLASS + r')+')
MULTI_SPACES = re.compile(r'[ ]{2,}')
NPC_RUN = re.compile(r'(?:' + NPC_AS_SPACE_CLASS + r'){1,}')

# Arabic numeral mappings
ARABIC_INDIC = '٠١٢٣٤٥٦٧٨٩'
EASTERN_ARABIC = '۰۱۲۳۴۵۶۷۸۹'
_digit_map = {c: d for c, d in zip(ARABIC_INDIC + EASTERN_ARABIC, '0123456789' * 2)}
ARABIC_TRANS = str.maketrans(_digit_map)

def normalize_text(s: str) -> str:
    s = SEQ_SPACES_NPC.sub(' ', s)
    s = NPC_RUN.sub(' ', s)
    s = MULTI_SPACES.sub(' ', s)
    return s

def fix_arabic_numerals(s: str) -> str:
    return s.translate(ARABIC_TRANS)

# Determine columns to process
columns_to_fix = PIPELINE_OPTIONS.get('columns') or ['translation']

changed_count = 0
for row in rows:
    for col in columns_to_fix:
        if col in row and row[col] is not None:
            original = row[col]
            s = original
            if PIPELINE_OPTIONS.get('fix_nbsp', True):
                s = normalize_text(s)
            if PIPELINE_OPTIONS.get('fix_arabic_numerals', False):
                s = fix_arabic_numerals(s)
            if s != original:
                changed_count += 1
                row[col] = s

print(f"Transformed {changed_count} cells in columns: {columns_to_fix}")

Normalized 0 rows in columns: ['translation']


In [None]:
# Preview table (respect PIPELINE_OPTIONS['preview'])
df = pd.DataFrame(rows)
if PIPELINE_OPTIONS.get('preview', True):
    display(df.head(5))
else:
    print(f"Preview disabled (rows={len(rows)}). Enable preview in PIPELINE_OPTIONS to see a sample.")

Unnamed: 0,ID,name,language,translation,confidence,breakdown,Remaining,Unallowed_characters,transliterations,Unmapped_in_transliterions
0,,Achraf Rent Car ...,ARA,تأجير سيارات أشرف ...,85,Rent Car' translates to تأجير سيارات which cl...,,,,
1,,Achraf rohdine ...,ARA,أشرف روهدين ...,90,"Transliterated as a proper noun, maintaining ...",,,,
2,,Achraf Soft ...,ARA,أشرف سوفت ...,95,Transliteration of the brand name 'Soft' to p...,,,,
3,,Dar El Mizan دار الميزان ...,ARA,دار الميزان ...,90,The name 'Dar El Mizan' is transliterated as ...,,,,
4,,Dar Lbacha دار الباشا برشيد ...,ARA,دار الباشا برشيد ...,90,Dar Lbacha' is transliterated to 'دار الباشا ...,,,,
5,,Darb Omar درب عمر ...,ARA,درب عمر ...,95,Darb Omar' translates directly to 'درب عمر'. ...,,,,
6,,Darna - دارنا ...,ARA,دارنا ...,100,Darna - دارنا' retains its form as it transla...,,,,
7,,Debdou دبدو ...,ARA,دبدو ...,100,Debdou' is transliterated as it stands since ...,,,,
8,,Derb Rabat درب الرباط ...,ARA,درب الرباط ...,90,The name 'Derb' translates to 'path' or 'way'...,,,,
9,,Douche Ben Diban دوش رشاشات بنديبان ...,ARA,دوش رشاشات بنديبان ...,85,"Douche' refers to a shower, which can be unde...",,,,


In [None]:
#  Import
#  Preprocess Python
#>>LLM processing
#  Postprocess Python
#  Export

In [None]:
# LLM processing (guarded by PIPELINE_OPTIONS['use_agent'])

def llm_transform(records, prompt=''):
    """Placeholder for LLM transformation that accepts a prompt.
    Replace with actual LLM / agent integration.
    """
    # TODO: replace with actual LLM call
    # Example: return call_agent(records, prompt)
    return records

if PIPELINE_OPTIONS.get('use_agent'):
    prompt = PIPELINE_OPTIONS.get('custom_prompt', '')
    processed_rows = llm_transform(rows, prompt=prompt)
else:
    processed_rows = rows

In [None]:
#  Import
#  Preprocess Python
#  LLM processing
#>>Postprocess Python
#  Export

In [None]:
# Normalize CSV:
# - make cells proper strings
# - make dict cells into json

def normalize_for_csv(row):
    out = {}
    for k, v in row.items():
        if isinstance(v, (dict, list)):
            out[k] = json.dumps(v, ensure_ascii=False)
        else:
            out[k] = "" if v is None else str(v)
    return out

processed_rows = [normalize_for_csv(r) for r in processed_rows]


In [None]:
# Export result
output_suffix = "_out"

# Extract stem (filename without extension) and extension
stem = input_path.stem          # "input"
ext = input_path.suffix         # ".csv"

# Build new filename
output_path = input_path.with_name(f"{stem}{output_suffix}{ext}")
print(f"Output to {output_path}")

with output_path.open("w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=expected_columns, delimiter="\t")
    writer.writeheader()
    writer.writerows(processed_rows)


Output to /Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_mini_out.csv
