In [243]:
#>>Import
#  Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [None]:
# Load INI configuration from lang_pipeline.ini
import configparser
import ast
from pathlib import Path


def find_config(name="lang_pipeline.ini"):
    """Locate the pipeline INI file near the current working directory.

    The working directory and then its ancestors are checked, nearest first
    (at most six directories in total). If none of them contains ``name``,
    the tree below the working directory is searched recursively and the
    first match is used.

    Returns the path that was found, or None when nothing matches.
    """
    start = Path.cwd()
    search_chain = [start, *start.parents][:6]
    for directory in search_chain:
        target = directory / name
        if target.exists():
            return target
    # Nothing in the ancestor chain: fall back to a recursive search below cwd.
    for match in Path.cwd().rglob(name):
        return match
    return None


def _parse_list_option(val):
    if not val:
        return []
    try:
        parsed = ast.literal_eval(val)
        if isinstance(parsed, (list, tuple)):
            return [str(x).strip() for x in parsed]
    except Exception:
        return [c.strip() for c in str(val).split(',') if c.strip()]
    return []


# Locate the INI file and load the [pipeline] section into CONFIG.
# CONFIG stays empty (so every consumer falls back to its default) when the
# file is missing, cannot be parsed, or has no [pipeline] section.
cfg_path = find_config()
CONFIG = {}
if cfg_path:
    cp = configparser.ConfigParser()
    # read() returns the list of files it successfully parsed.
    loaded_files = cp.read(cfg_path)
    if loaded_files and 'pipeline' in cp:
        sec = cp['pipeline']
        CONFIG['source'] = sec.get('source', None)
        CONFIG['source_browser'] = sec.getboolean('source_browser', fallback=True)
        CONFIG['preview'] = sec.getboolean('preview', fallback=True)
        CONFIG['fix_nbsp'] = sec.getboolean('fix_nbsp', fallback=True)
        CONFIG['transliterate'] = sec.getboolean('transliterate', fallback=False)
        CONFIG['normalize'] = sec.getboolean('normalize', fallback=False)
        CONFIG['custom_prompt'] = sec.get('custom_prompt', '')
        CONFIG['chunk_size'] = sec.getint('chunk_size', fallback=10000)
        # 'columns_process' is accepted as an alternative spelling of 'columns_preprocess'.
        CONFIG['columns_preprocess'] = _parse_list_option(sec.get('columns_preprocess', sec.get('columns_process', '')))
        CONFIG['columns_agent'] = _parse_list_option(sec.get('columns_agent', ''))
        CONFIG['columns_postprocess'] = _parse_list_option(sec.get('columns_postprocess', ''))
        print(f"Loaded config from: \n{cfg_path}")
    elif not loaded_files:
        print(f"Config file could not be read; using defaults: \n{cfg_path}")
    else:
        print(f"No [pipeline] section in config; using defaults: \n{cfg_path}")
else:
    print("No lang_pipeline.ini found; using defaults")


# Back-compat variables for convenience
SOURCE_BROWSER = CONFIG.get('source_browser', True)
PREVIEW_ENABLED = CONFIG.get('preview', True)
# `or` rather than a .get() default: when the [pipeline] section exists the loader
# stores [] for a list option that is absent or empty, which would otherwise
# bypass the intended default.
COLUMNS_PREPROCESS = CONFIG.get('columns_preprocess') or ['translation']
COLUMNS_AGENT = CONFIG.get('columns_agent') or []
COLUMNS_POSTPROCESS = CONFIG.get('columns_postprocess') or ['translation']

Loaded config from 
/Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/Notebooks/lang_pipeline.ini


In [245]:
# File selection (interactive or from INI)
import platform, subprocess
from pathlib import Path

# Browse for the input unless the config disables the browser AND names a source
if globals().get('SOURCE_BROWSER', True) or not CONFIG.get('source'):
    # Prefer the project's picker; otherwise define an OS-specific stand-in
    try:
        from tools.ui_native import pick_file
    except Exception:
        def pick_file(filter_str=None):
            """Open a native file dialog and return the chosen path as text.

            ``filter_str`` is accepted for signature compatibility but is not
            used by this fallback. The result is the dialog command's stripped
            standard output.
            """
            os_name = platform.system()
            if os_name == "Darwin":
                dialog_script = '''
                set theFile to choose file
                POSIX path of theFile
                '''
                command = ["osascript", "-e", dialog_script]
            elif os_name == "Windows":
                dialog_script = r'''
                Add-Type -AssemblyName System.Windows.Forms
                $ofd = New-Object System.Windows.Forms.OpenFileDialog
                $ofd.Filter = "All files (*.*)|*.*"
                if ($ofd.ShowDialog() -eq "OK") { Write-Output $ofd.FileName }
                '''
                command = ["powershell", "-NoProfile", "-Command", dialog_script]
            else:
                raise NotImplementedError("No native file dialog for this OS")
            completed = subprocess.run(command, capture_output=True, text=True)
            return completed.stdout.strip()

    pick_path = pick_file("CSV files (*.csv)|*.csv")
    if not pick_path:
        raise FileNotFoundError("No file selected")

    input_path = Path(pick_path)
    print(f"Selected input file: \n{input_path}")
else:
    # Browser disabled and a source configured: take the path straight from the INI
    input_path = Path(CONFIG['source'])
    if not input_path.exists():
        raise FileNotFoundError(f"Configured source not found: {input_path}")
    print("Configured source used:", input_path)

Selected input file: 
/Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered.csv


In [246]:
# Line-by-line import with malformed-line filtering
import csv

rows = []                  # one dict per well-formed record, keyed by header name
bad_rows = []              # (first physical line number, parsed fields) per malformed record
bad_line_numbers = set()   # every physical line that belongs to a malformed record
total_lines = 0            # data records parsed (header excluded)

# newline="" is how the csv module expects its input file to be opened; it does
# its own newline handling, including line breaks inside quoted fields.
with input_path.open("r", encoding="utf-8", errors="replace", newline="") as f:
    reader = csv.reader(f, delimiter="\t", quotechar='"', escapechar='\\')
    header = next(reader, [])  # [] for an empty file
    n_cols = len(header)
    last_line_seen = reader.line_num
    for line in reader:
        # reader.line_num counts physical lines consumed so far, so a record with a
        # quoted line break covers first_line..last_line rather than a single line.
        first_line, last_line = last_line_seen + 1, reader.line_num
        last_line_seen = last_line
        total_lines += 1
        # The n_cols guards keep an empty header from matching anything and
        # from causing a modulo-by-zero below.
        if n_cols and len(line) == n_cols:
            rows.append(dict(zip(header, line)))
        elif n_cols and len(line) > n_cols and len(line) % n_cols == 0:
            # Field count is an exact multiple of the header width:
            # split into header-sized rows.
            for i in range(0, len(line), n_cols):
                rows.append(dict(zip(header, line[i:i + n_cols])))
        else:
            bad_rows.append((first_line, line))
            bad_line_numbers.update(range(first_line, last_line + 1))

# Summary and fallout storage
print(f"Imported {len(rows)} rows ({total_lines} lines read).")
if bad_rows:
    # fallout file has same extension and stem with _fallout appended
    fallout_path = input_path.with_name(f"{input_path.stem}_fallout{input_path.suffix}")

    # Re-read the original file and copy the raw physical lines of every malformed record
    with input_path.open("r", encoding="utf-8", errors="replace") as src, fallout_path.open("w", encoding="utf-8", errors="replace") as out:
        # write header for context if available
        if header:
            out.write('\t'.join(header) + '\n')
        for ln, raw in enumerate(src, start=1):
            if ln in bad_line_numbers:
                out.write(raw)

    print(f"{len(bad_rows)} malformed lines found. \nMalformed lines stored in: \n{fallout_path}")
else:
    print("No malformed lines found.")

Imported 277576 rows (277586 lines read).
10 malformed lines found. 
Malformed lines stored in: 
/Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_fallout.csv


In [247]:
# Preview and basic stats
import pandas as pd

# Columns come from the parsed header when an earlier cell defined one
expected_columns = globals().get('header', [])
df = pd.DataFrame(rows)
print("Header:", expected_columns)
if not globals().get('PREVIEW_ENABLED', True):
    print("Preview disabled by config (PREVIEW_ENABLED=False)")
else:
    print("Preview (first 5 rows):")
    display(df.head(5))
print("Total imported rows:", len(rows))
print("Malformed lines:", len(bad_rows))

# Count rows whose 'translation' cell contains a no-break space (U+00A0)
if 'translation' in expected_columns:
    nbsp_count = sum(
        1
        for text in (record.get('translation') for record in rows)
        if text and '\u00A0' in text
    )
    print("Rows with NBSP in 'translation':", nbsp_count)

Header: ['ID', 'name', 'language', 'translation', 'confidence', 'breakdown', 'Remaining', 'Unallowed_characters', 'transliterations', 'Unmapped_in_transliterions']
Preview (first 5 rows):


Unnamed: 0,ID,name,language,translation,confidence,breakdown,Remaining,Unallowed_characters,transliterations,Unmapped_in_transliterions
0,,Achraf Rent Car,ARA,تأجير سيارات أشرف,85,Rent Car' translates to تأجير سيارات which cla...,,,,
1,,Achraf rohdine,ARA,أشرف روهدين,90,"Transliterated as a proper noun, maintaining t...",,,,
2,,Achraf Soft,ARA,أشرف سوفت,95,Transliteration of the brand name 'Soft' to pr...,,,,
3,,Dar El Mizan دار الميزان,ARA,دار الميزان,90,The name 'Dar El Mizan' is transliterated as '...,,,,
4,,Dar Lbacha دار الباشا برشيد,ARA,دار الباشا برشيد,90,Dar Lbacha' is transliterated to 'دار الباشا ب...,,,,


Total imported rows: 277576
Malformed lines: 10
Rows with NBSP in 'translation': 1256


In [248]:
#  Import
#>>Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [249]:
# Determine the expected set of columns from the first imported row; when no
# rows were imported, fall back to the parsed header (if an earlier cell set one).
expected_columns = list(rows[0].keys()) if rows else list(globals().get('header', []))
expected_key_set = set(expected_columns)  # built once instead of once per row

# Kept under its own name so the import stage's `bad_rows` (malformed lines)
# is not overwritten by this check.
schema_mismatches = [
    (i, row) for i, row in enumerate(rows) if set(row.keys()) != expected_key_set
]

if schema_mismatches:
    print("Bad rows:")
    for idx, row in schema_mismatches:
        print(f"Row {idx}: {row}")
else:
    print("No bad rows found.")

No bad rows found.


In [250]:
# Split the rows by whether their keys match the expected columns, then keep the matches
bad_rows_list = [
    (position, record)
    for position, record in enumerate(rows)
    if set(record.keys()) != set(expected_columns)
]
good_rows = [
    record for record in rows if set(record.keys()) == set(expected_columns)
]

if not bad_rows_list:
    print("No bad rows to exclude.")
else:
    print("Excluding bad rows:")
    for position, record in bad_rows_list:
        print(f"Row {position}: {record}")

rows = good_rows
print(f"Proceeding with {len(rows)} good rows.")

No bad rows to exclude.
Proceeding with 277576 good rows.


In [251]:
# Ensure language column is correct as means of verifying the imported data
from collections import Counter

language_counts = Counter(row.get("language") for row in rows)
languages = set(language_counts)
# Use the most frequent value so the choice is deterministic when several
# languages are present; None when there are no rows at all.
primary_lang = language_counts.most_common(1)[0][0] if language_counts else None
if not languages:
    print("Warning: no rows available; cannot determine the language.")
elif len(languages) > 1:
    print("Warning: multiple language values found:", languages)
else:
    print(f"Primary language: {primary_lang}")



In [252]:
# Fix spaces and NBSPs
import re

# Configuration for space normalization
NPC_AS_SPACE = ["\u00A0"]  # Single characters to treat as spaces (NBSP by default)

# Build the regex patterns from NPC_AS_SPACE so that extending the list takes effect.
_npc_members = ''.join(re.escape(ch) for ch in NPC_AS_SPACE)
# Character class over the configured characters; `(?!)` never matches and
# keeps the patterns valid if the list is emptied.
NPC_AS_SPACE_CLASS = f'[{_npc_members}]' if _npc_members else r'(?!)'
SEQ_SPACES_NPC = re.compile(r'(?:[ ]|' + NPC_AS_SPACE_CLASS + r')+')
# The two patterns below are not needed by normalize_text; they remain defined
# as module-level names.
MULTI_SPACES = re.compile(r'[ ]{2,}')
NPC_RUN = re.compile(r'(?:' + NPC_AS_SPACE_CLASS + r'){1,}')

def normalize_text(s: str) -> str:
    """Collapse runs of ASCII spaces and NPC_AS_SPACE characters into one space.

    Every maximal run made of ASCII spaces and/or the characters listed in
    NPC_AS_SPACE is replaced by a single ASCII space. Leading and trailing
    spaces are collapsed but not removed. None is returned unchanged.
    """
    if s is None:
        return s
    # One pass is sufficient: after it no configured character remains and no
    # two ASCII spaces are adjacent, so further substitutions would be no-ops.
    return SEQ_SPACES_NPC.sub(' ', s)

# Columns to process (driven by config 'columns_preprocess'; defaults to ['translation'])
# COLUMNS_PREPROCESS is loaded from lang_pipeline.ini if present
columns_to_fix = COLUMNS_PREPROCESS if 'COLUMNS_PREPROCESS' in globals() else ['translation']
# Support special string values like ['all'] to indicate 'process all columns'
if isinstance(columns_to_fix, list) and len(columns_to_fix) == 1 and str(columns_to_fix[0]).lower() == 'all':
    columns_to_fix = list(expected_columns) if expected_columns else []
# Fallback safety
if not columns_to_fix:
    columns_to_fix = ['translation']

# Honour the 'fix_nbsp' switch read from the INI (enabled when unset).
fix_nbsp_enabled = globals().get('CONFIG', {}).get('fix_nbsp', True)

changed_count = 0
if fix_nbsp_enabled:
    # Rows are updated in place; only cells whose text actually changed are counted.
    for row in rows:
        for col in columns_to_fix:
            if col in row:
                original = row[col]
                new = normalize_text(original)
                row[col] = new
                if new != original:
                    changed_count += 1
    print(f"Normalized {changed_count} cells in columns: {columns_to_fix}")
else:
    print("Space/NBSP normalization disabled by config (fix_nbsp=False)")

Normalized 1262 cells in columns: ['translation']


In [253]:
# Preview table after preprocessing; honours the same PREVIEW_ENABLED switch
# as the import-stage preview.
df = pd.DataFrame(rows)
if globals().get('PREVIEW_ENABLED', True):
    display(df)
else:
    print("Preview disabled by config (PREVIEW_ENABLED=False)")

Unnamed: 0,ID,name,language,translation,confidence,breakdown,Remaining,Unallowed_characters,transliterations,Unmapped_in_transliterions
0,,Achraf Rent Car,ARA,تأجير سيارات أشرف,85,Rent Car' translates to تأجير سيارات which cla...,,,,
1,,Achraf rohdine,ARA,أشرف روهدين,90,"Transliterated as a proper noun, maintaining t...",,,,
2,,Achraf Soft,ARA,أشرف سوفت,95,Transliteration of the brand name 'Soft' to pr...,,,,
3,,Dar El Mizan دار الميزان,ARA,دار الميزان,90,The name 'Dar El Mizan' is transliterated as '...,,,,
4,,Dar Lbacha دار الباشا برشيد,ARA,دار الباشا برشيد,90,Dar Lbacha' is transliterated to 'دار الباشا ب...,,,,
...,...,...,...,...,...,...,...,...,...,...
277571,,Ziteks,ARA,زيتكس,85,Ziteks' is a brand name and thus transliterate...,,,,
277572,,Zitouni‌​,ARA,زيتونيّ,90,"Zitouni' is likely a proper noun or brand, so ...",,,,
277573,,Zonetech,ARA,زونيتك,90,Brand name transliterated to preserve global i...,,,,
277574,,Zuhr,ARA,الظهر,95,"Common term for 'noon' in Arabic, translated f...",,,,


In [254]:
#  Import
#  Preprocess Python
#>>LLM processing
#  Postprocess Python
#  Export

In [255]:
# LLM processing placeholder
def llm_transform(records):
    """Stand-in for the LLM stage of the pipeline.

    Takes a list of row dicts and is meant to return a list of row dicts with
    the same (or an extended) schema. No model is called yet: the argument is
    handed back untouched, as the very same list object.
    """
    # TODO: replace with actual LLM call
    transformed = records
    return transformed

# Run the LLM stage over the preprocessed rows; the result feeds the postprocess cells.
processed_rows = llm_transform(rows)


In [256]:
#  Import
#  Preprocess Python
#  LLM processing
#>>Postprocess Python
#  Export

In [257]:
# Normalize rows for CSV export:
# - convert every cell to a string (None becomes an empty string)
# - serialize dict and list cells as JSON

import json  # required by normalize_for_csv; imported here so the cell is self-contained


def normalize_for_csv(row):
    """Return a copy of ``row`` whose values are all strings.

    dict and list values are serialized as JSON (non-ASCII characters kept
    as-is), None becomes an empty string, and anything else goes through
    ``str()``. The input dict is not modified.
    """
    out = {}
    for k, v in row.items():
        if isinstance(v, (dict, list)):
            out[k] = json.dumps(v, ensure_ascii=False)
        else:
            out[k] = "" if v is None else str(v)
    return out

# Stringify every cell of every processed row before writing the CSV
processed_rows = list(map(normalize_for_csv, processed_rows))


In [258]:
# Export result
output_suffix = "_out"

# Extract stem (filename without extension) and extension
stem = input_path.stem          # "input"
ext = input_path.suffix         # ".csv"

# Build new filename
output_path = input_path.with_name(f"{stem}{output_suffix}{ext}")
print(f"Output to {output_path}")

# Start from the expected columns and append, in first-seen order, any keys that
# only appear in the processed rows. With its default extrasaction, DictWriter
# raises ValueError on a key that is not listed in fieldnames.
output_columns = list(expected_columns)
known_columns = set(output_columns)
for record in processed_rows:
    for key in record:
        if key not in known_columns:
            known_columns.add(key)
            output_columns.append(key)

with output_path.open("w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=output_columns, delimiter="\t")
    writer.writeheader()
    writer.writerows(processed_rows)


Output to /Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_out.csv
