In [29]:
#> Import
#  Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [30]:
import pandas as pd
import json
from pathlib import Path
import csv

# Optional: configure display
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", None)

# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
input_path = Path("/Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_mini.csv")

# Parse the tab-separated input into a list of dicts, one per data row,
# keyed by the header line.
rows = []
# newline="" hands newline handling to the csv module (as its documentation
# requires), so quoted fields containing line breaks are read back intact.
with input_path.open("r", encoding="utf-8", errors="replace", newline="") as f:
    reader = csv.DictReader(
        f,
        delimiter="\t",
        quotechar='"',
        escapechar="\\",
        strict=False
    )
    # Parse errors are raised while *advancing* the reader, not while appending,
    # so the try/except has to wrap next(); a for-loop would abort on the first
    # csv.Error instead of skipping the offending row.
    while True:
        try:
            row = next(reader)
        except StopIteration:
            break
        except csv.Error as e:
            # reader.line_num is the number of source lines read so far.
            print(f"Skipping malformed row at line {reader.line_num}: {e}")
            continue
        rows.append(row)


In [31]:
#  Import
#> Preprocess Python
#  LLM processing
#  Postprocess Python
#  Export

In [32]:
# Determine the expected set of columns from the first parsed row.
# csv.DictReader keys every row by the header and, by default, stores surplus
# fields under the key None, so a differing key set marks a row with extra fields.
# An empty input yields an empty set instead of raising IndexError on rows[0].
expected_columns = set(rows[0].keys()) if rows else set()

# (index, row) pairs whose keys do not match the expected columns.
bad_rows = [
    (i, row)
    for i, row in enumerate(rows)
    if set(row.keys()) != expected_columns
]

if not rows:
    print("No rows were loaded; nothing to check.")
elif bad_rows:
    print("Bad rows:")
    for idx, row in bad_rows:
        print(f"Row {idx}: {row}")
else:
    print("No bad rows found.")

No bad rows found.


In [33]:
# Distinct values of the "language" column across all rows.
languages = {row.get("language") for row in rows}
# Take the first row's language as the primary one. Picking an element out of
# the set would be arbitrary when several languages are present and would raise
# StopIteration when no rows were loaded; here the result is None in that case.
primary_lang = rows[0].get("language") if rows else None
if len(languages) > 1:
    print("Warning: multiple language values found:", languages)
else:
    print(f"Primary language: {primary_lang}")

Primary language: ARA


In [None]:
# TODO: fix spaces and NBSPs (non-breaking space, \u00A0)


In [34]:
# Load the parsed rows into a DataFrame and preview the first five records.
df = pd.DataFrame(data=rows)
df.head(n=5)

Unnamed: 0,ID,name,language,translation,confidence,breakdown,Remaining,Unallowed_characters,transliterations,Unmapped_in_transliterions
0,,Achraf Rent Car,ARA,تأجير سيارات أشرف,85,Rent Car' translates to تأجير سيارات which cla...,,,,
1,,Achraf rohdine,ARA,أشرف روهدين,90,"Transliterated as a proper noun, maintaining t...",,,,
2,,Achraf Soft,ARA,أشرف سوفت,95,Transliteration of the brand name 'Soft' to pr...,,,,
3,,Dar El Mizan دار الميزان,ARA,دار الميزان,90,The name 'Dar El Mizan' is transliterated as '...,,,,
4,,Dar Lbacha دار الباشا برشيد,ARA,دار الباشا برشيد,90,Dar Lbacha' is transliterated to 'دار الباشا ب...,,,,


In [35]:
#  Import
#  Preprocess Python
#> LLM processing
#  Postprocess Python
#  Export

In [36]:
# LLM processing placeholder
def llm_transform(records):
    """
    Stand-in for the LLM processing stage.

    Takes a list of dicts and currently hands the very same object back
    untouched. The real implementation is expected to return a list of
    dicts with the same schema or an extended one.
    """
    # TODO: replace with actual LLM call
    return records

# Run the LLM stage over the imported rows; the result feeds postprocessing.
processed_rows = llm_transform(rows)


In [37]:
#  Import
#  Preprocess Python
#  LLM processing
#> Postprocess Python
#  Export

In [38]:
#Postprocessing
def normalize_for_csv(row):
    """Return a new dict with the keys of ``row`` and every value as a string.

    dict and list values are serialized as JSON with non-ASCII characters
    kept as-is, None becomes the empty string, and any other value is
    converted with str().
    """
    def _as_text(value):
        if value is None:
            return ""
        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False)
        return str(value)

    return {key: _as_text(value) for key, value in row.items()}

# Turn every processed record into a flat dict of strings ready for CSV export.
processed_rows = list(map(normalize_for_csv, processed_rows))


In [39]:
#  Import
#  Preprocess Python
#  LLM processing
#  Postprocess Python
#> Export

In [40]:
# Export result
output_suffix = "_out"

# Split the input filename into its stem (name without extension) and extension
stem = input_path.stem
ext = input_path.suffix

# Build the output path next to the input file: <stem><output_suffix><ext>
output_path = input_path.with_name(f"{stem}{output_suffix}{ext}")
print(f"Output to {output_path}")

# Column order = keys in order of first appearance across the processed rows.
# expected_columns is a set, so using it directly as fieldnames gave an
# unstable column order between runs, and DictWriter would raise ValueError
# for any column added during processing.
fieldnames = list(dict.fromkeys(key for row in processed_rows for key in row))
# Keep any expected column that no row carries, so the header never shrinks.
fieldnames += sorted(
    (col for col in expected_columns if col not in fieldnames),
    key=str,
)

with output_path.open("w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter="\t")
    writer.writeheader()
    writer.writerows(processed_rows)


Output to /Users/dunevv/WorkLocal/_AI_/HoudiniElf/tools_Didka/test_files/sources/Process_names_LLMoutput_numbered_mini_out.csv
