In [21]:
#>>Import
#  Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [22]:
# File selection (interactive)
import platform, subprocess
from pathlib import Path

# Try existing picker; otherwise fallback to OS-specific implementation
try:
    from tools.ui_native import pick_file
except Exception:
    def pick_file(filter_str=None):
        system = platform.system()
        if system == "Darwin":
            script = '''
            set theFile to choose file
            POSIX path of theFile
            '''
            res = subprocess.run(["osascript", "-e", script], capture_output=True, text=True)
            return res.stdout.strip()
        elif system == "Windows":
            ps_script = r'''
            Add-Type -AssemblyName System.Windows.Forms
            $ofd = New-Object System.Windows.Forms.OpenFileDialog
            $ofd.Filter = "All files (*.*)|*.*"
            if ($ofd.ShowDialog() -eq "OK") { Write-Output $ofd.FileName }
            '''
            res = subprocess.run(["powershell", "-NoProfile", "-Command", ps_script], capture_output=True, text=True)
            return res.stdout.strip()
        else:
            raise NotImplementedError("No native file dialog for this OS")

pick_path = pick_file("CSV files (*.csv)|*.csv")
if not pick_path:
    raise FileNotFoundError("No file selected")

input_path = Path(pick_path)
print("Selected:", input_path)

Selected: /Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_mini.csv


In [23]:
# Line-by-line import with malformed-line filtering
import csv

rows = []
bad_rows = []
total_lines = 0
with input_path.open("r", encoding="utf-8", errors="replace") as f:
    reader = csv.reader(f, delimiter="\t", quotechar='"', escapechar='\\')
    try:
        header = next(reader)
    except StopIteration:
        header = []
    for line_number, line in enumerate(reader, start=2):
        total_lines += 1
        if len(line) == len(header):
            rows.append(dict(zip(header, line)))
        elif len(line) > len(header) and len(line) % len(header) == 0:
            for i in range(0, len(line), len(header)):
                subline = line[i:i+len(header)]
                rows.append(dict(zip(header, subline)))
        else:
            bad_rows.append((line_number, line))
# Summary
print(f"Imported {len(rows)} rows ({total_lines} lines read), {len(bad_rows)} malformed lines.")

Imported 30 rows (31 lines read), 1 malformed lines.


In [24]:
# Preview and basic stats
import pandas as pd

expected_columns = header if 'header' in globals() else []
df = pd.DataFrame(rows)
print("Header:", expected_columns)
print("Preview (first 5 rows):")
display(df.head(5))
print("Total imported rows:", len(rows))
print("Malformed lines:", len(bad_rows))

# NBSP count for 'translation' column if present
if 'translation' in expected_columns:
    nbsp_count = sum(1 for r in rows if r.get('translation') and '\u00A0' in r.get('translation'))
    print("Rows with NBSP in 'translation':", nbsp_count)

Header: ['ID', ' name                                                          ', ' language', ' translation                                       ', ' confidence', ' breakdown                                                                                                                                                                                     ', ' Remaining                      ', ' Unallowed_characters', ' transliterations', ' Unmapped_in_transliterions']
Preview (first 5 rows):


Unnamed: 0,ID,name,language,translation,confidence,breakdown,Remaining,Unallowed_characters,transliterations,Unmapped_in_transliterions
0,,Achraf Rent Car ...,ARA,تأجير سيارات أشرف ...,85,Rent Car' translates to تأجير سيارات which cl...,,,,
1,,Achraf rohdine ...,ARA,أشرف روهدين ...,90,"Transliterated as a proper noun, maintaining ...",,,,
2,,Achraf Soft ...,ARA,أشرف سوفت ...,95,Transliteration of the brand name 'Soft' to p...,,,,
3,,Dar El Mizan دار الميزان ...,ARA,دار الميزان ...,90,The name 'Dar El Mizan' is transliterated as ...,,,,
4,,Dar Lbacha دار الباشا برشيد ...,ARA,دار الباشا برشيد ...,90,Dar Lbacha' is transliterated to 'دار الباشا ...,,,,


Total imported rows: 30
Malformed lines: 1


In [25]:
#  Import
#>>Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [26]:
# Determine the expected set of columns from the header
expected_columns = list(rows[0].keys())

bad_rows = []
for i, row in enumerate(rows):
    if set(row.keys()) != set(expected_columns):
        bad_rows.append((i, row))

if bad_rows:
    print("Bad rows:")
    for idx, row in bad_rows:
        print(f"Row {idx}: {row}")
else:
    print("No bad rows found.")

No bad rows found.


In [27]:
# Exclude bad rows from further processing
bad_rows_list = []
good_rows = []
for i, row in enumerate(rows):
    if set(row.keys()) != set(expected_columns):
        bad_rows_list.append((i, row))
    else:
        good_rows.append(row)

if bad_rows_list:
    print("Excluding bad rows:")
    for idx, row in bad_rows_list:
        print(f"Row {idx}: {row}")
else:
    print("No bad rows to exclude.")

rows = good_rows
print(f"Proceeding with {len(rows)} good rows.")

No bad rows to exclude.
Proceeding with 30 good rows.


In [28]:
# Ensure language column is correct as means of verifying the imported data
languages = {row.get("language") for row in rows}
primary_lang = next(iter(languages)) # pick the first
languages
if len(languages) > 1:
    print("Warning: multiple language values found:", languages)
else:
    print(f"Primary language: {primary_lang}")

Primary language: None


In [29]:
# Fix spaces and NBSPs
import re

# Configuration for space normalization
NPC_AS_SPACE = ["\u00A0"]  # Characters to treat as spaces

# Build regex patterns
NPC_AS_SPACE_CLASS = r'\u00A0'  # Since only one char
SEQ_SPACES_NPC = re.compile(r'(?:[ ]|' + NPC_AS_SPACE_CLASS + r')+')
MULTI_SPACES = re.compile(r'[ ]{2,}')
NPC_RUN = re.compile(r'(?:' + NPC_AS_SPACE_CLASS + r'){1,}')

def normalize_text(s: str) -> str:
    """Normalize spaces and non-printable characters in text."""
    # Replace any sequence of ASCII spaces and NBSP with a single ASCII space
    s = SEQ_SPACES_NPC.sub(' ', s)
    # Replace any remaining runs of NBSP with a single ASCII space
    s = NPC_RUN.sub(' ', s)
    # Collapse 2+ ASCII spaces to one
    s = MULTI_SPACES.sub(' ', s)
    return s

# Columns to process (default: 'translation'; set to list(expected_columns) for all)
columns_to_fix = ['translation']  # Modify this list to add more columns or use list(expected_columns) for all

changed_count = 0
for row in rows:
    for col in columns_to_fix:
        if col in row:
            original = row[col]
            row[col] = normalize_text(row[col])
            if row[col] != original:
                changed_count += 1

print(f"Normalized {changed_count} rows in columns: {columns_to_fix}")

Normalized 0 rows in columns: ['translation']


In [30]:
# Preview table
df = pd.DataFrame(rows)
# df.head()
df

Unnamed: 0,ID,name,language,translation,confidence,breakdown,Remaining,Unallowed_characters,transliterations,Unmapped_in_transliterions
0,,Achraf Rent Car ...,ARA,تأجير سيارات أشرف ...,85,Rent Car' translates to تأجير سيارات which cl...,,,,
1,,Achraf rohdine ...,ARA,أشرف روهدين ...,90,"Transliterated as a proper noun, maintaining ...",,,,
2,,Achraf Soft ...,ARA,أشرف سوفت ...,95,Transliteration of the brand name 'Soft' to p...,,,,
3,,Dar El Mizan دار الميزان ...,ARA,دار الميزان ...,90,The name 'Dar El Mizan' is transliterated as ...,,,,
4,,Dar Lbacha دار الباشا برشيد ...,ARA,دار الباشا برشيد ...,90,Dar Lbacha' is transliterated to 'دار الباشا ...,,,,
5,,Darb Omar درب عمر ...,ARA,درب عمر ...,95,Darb Omar' translates directly to 'درب عمر'. ...,,,,
6,,Darna - دارنا ...,ARA,دارنا ...,100,Darna - دارنا' retains its form as it transla...,,,,
7,,Debdou دبدو ...,ARA,دبدو ...,100,Debdou' is transliterated as it stands since ...,,,,
8,,Derb Rabat درب الرباط ...,ARA,درب الرباط ...,90,The name 'Derb' translates to 'path' or 'way'...,,,,
9,,Douche Ben Diban دوش رشاشات بنديبان ...,ARA,دوش رشاشات بنديبان ...,85,"Douche' refers to a shower, which can be unde...",,,,


In [31]:
#  Import
#  Preprocess Python
#>>LLM processing
#  Postprocess Python
#  Export

In [32]:
# LLM processing placeholder
def llm_transform(records):
    """
    Placeholder for LLM transformation.
    Input: list of dicts
    Output: list of dicts (same schema or extended)
    """
    # TODO: replace with actual LLM call
    return records

processed_rows = llm_transform(rows)


In [33]:
#  Import
#  Preprocess Python
#  LLM processing
#>>Postprocess Python
#  Export

In [34]:
# Normalize CSV:
# - make cells proper strings
# - make dict cells into json

def normalize_for_csv(row):
    out = {}
    for k, v in row.items():
        if isinstance(v, (dict, list)):
            out[k] = json.dumps(v, ensure_ascii=False)
        else:
            out[k] = "" if v is None else str(v)
    return out

processed_rows = [normalize_for_csv(r) for r in processed_rows]


In [35]:
# Export result
output_suffix = "_out"

# Extract stem (filename without extension) and extension
stem = input_path.stem          # "input"
ext = input_path.suffix         # ".csv"

# Build new filename
output_path = input_path.with_name(f"{stem}{output_suffix}{ext}")
print(f"Output to {output_path}")

with output_path.open("w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=expected_columns, delimiter="\t")
    writer.writeheader()
    writer.writerows(processed_rows)


Output to /Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_mini_out.csv
