# In [20]:
# -*- coding: utf-8 -*-
"""
Build a unified table for vehicles, registrations, parts, and K3AG1 components.

- Baseline: Fahrzeuge_OEM1_Typ11 + Fahrzeuge_OEM1_Typ12
- Merge Zulassungen (registrations) by vehicle ID
- Merge parts (Bestandteile_Fahrzeuge_*) by vehicle ID
- Merge Komponente_K3AG1 by transmission ID (ID_Schaltung) coming from parts
- Keep all baseline rows; mark missing linked data with 'has_missing_linked_data'
- Output: case/data/case_komponente.csv
"""

from __future__ import annotations

import os
from typing import List
import pandas as pd
import numpy as np
from pandas.errors import EmptyDataError

# ---------------------------
# Helpers: robust CSV reading
# ---------------------------

def read_csv_auto(path: str) -> pd.DataFrame:
    """Read a CSV file as text, sniffing the delimiter and tidying headers and cells.

    Every column is requested as a string (``dtype=str``) so that nothing is
    converted on read; date parsing is left to the caller.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A DataFrame without ``Unnamed...`` columns, with whitespace and double
        quotes removed from the column names, and with surrounding whitespace
        and double quotes stripped from every string cell. Non-string cells
        (such as missing values) are left untouched.
    """
    # sep=None asks pandas to sniff the delimiter; that requires engine="python".
    frame = pd.read_csv(path, sep=None, engine="python", dtype=str)

    # Drop columns that pandas named "Unnamed: N" (typically a saved index).
    unnamed_mask = frame.columns.str.contains(r"^Unnamed", case=False)
    frame = frame.loc[:, ~unnamed_mask]

    # Strip quotes/spaces from column names.
    frame.columns = frame.columns.str.strip().str.replace('"', '', regex=False)

    def _clean_cell(value):
        """Strip surrounding whitespace and double quotes from string values only."""
        if isinstance(value, str):
            return value.strip().strip('"')
        return value

    # Clean every column by value rather than selecting columns by dtype:
    # all columns were requested as text, and the isinstance check already
    # skips missing values, so the outcome does not depend on which dtype
    # is used to store the text.
    for name in frame.columns:
        frame[name] = frame[name].map(_clean_cell)
    return frame


def ensure_dir_for_file(path: str) -> None:
    """Create the parent directory of *path* if it does not exist yet.

    Args:
        path: File path whose parent directory should exist afterwards.

    A bare file name such as ``"out.csv"`` has no parent component. Nothing is
    created in that case, because ``os.makedirs("")`` would raise
    ``FileNotFoundError``.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def coalesce_first(df: pd.DataFrame, candidates: List[str], new_name: str) -> pd.DataFrame:
    """
    Fill df[new_name] row by row with the first non-null value found among
    the candidate columns, checked in the order given.

    Candidates that are not columns of df are skipped. When no candidate is
    present, df[new_name] is set to NaN. The frame is modified in place and
    also returned.
    """
    present = [name for name in candidates if name in df.columns]
    if not present:
        df[new_name] = np.nan
        return df

    # Start from the first available column, then let later ones fill its gaps.
    merged = df[present[0]]
    for name in present[1:]:
        merged = merged.fillna(df[name])
    df[new_name] = merged
    return df


# --------------------------------------
# 1) Load all required source data files
# --------------------------------------

# Baseline vehicle lists, one file per vehicle type
path_typ11 = "source/Fahrzeug/Fahrzeuge_OEM1_Typ11.csv"
path_typ12 = "source/Fahrzeug/Fahrzeuge_OEM1_Typ12.csv"

# Registrations
path_reg = "source/Zulassungen/Zulassungen_alle_Fahrzeuge.csv"

# Parts per vehicle, one file per vehicle type
path_parts11 = "source/Fahrzeug/Bestandteile_Fahrzeuge_OEM1_Typ11.csv"
path_parts12 = "source/Fahrzeug/Bestandteile_Fahrzeuge_OEM1_Typ12.csv"

# Component (K3AG1)
path_k3ag1 = "source/Komponente/Komponente_K3AG1.csv"

# Read every source as text, in the order the paths are declared above.
df_typ11, df_typ12, df_reg, df_parts11, df_parts12, df_k3ag1 = [
    read_csv_auto(source_path)
    for source_path in (
        path_typ11,
        path_typ12,
        path_reg,
        path_parts11,
        path_parts12,
        path_k3ag1,
    )
]

# ------------------------------------------------
# 2) Harmonize key columns and minimal English map
# ------------------------------------------------

# Vehicles: the join key is expected under the name 'ID_Fahrzeug'
# (read_csv_auto already strips quotes from header names).
# NOTE: no fallback lookup for a differently named key is implemented;
# the vehicle frames are used exactly as loaded.

# Tag vehicle type for provenance (English)
if len(df_typ11):
    df_typ11["vehicle_type"] = "Typ11"
if len(df_typ12):
    df_typ12["vehicle_type"] = "Typ12"

# Baseline vehicles: stack, keep all columns; align by column name
df_vehicles_base = pd.concat([df_typ11, df_typ12], ignore_index=True, sort=False)

# Parts: unify the two parts tables and keep the shared columns
# Expected keys include: ID_Karosserie, ID_Schaltung, ID_Sitze, ID_Motor, ID_Fahrzeug
df_parts_all = pd.concat([df_parts11, df_parts12], ignore_index=True, sort=False)

# Registrations: standardize join key name to 'ID_Fahrzeug' and rename to English labels
if len(df_reg):
    # Some files name the vehicle ID as 'IDNummer'
    if "IDNummer" in df_reg.columns and "ID_Fahrzeug" not in df_reg.columns:
        df_reg = df_reg.rename(columns={"IDNummer": "ID_Fahrzeug"})
    # Rename German -> English for clarity in the output
    df_reg = df_reg.rename(columns={
        "Gemeinden": "registration_municipality",
        "Zulassung": "registration_date",
    })

# K3AG1 component table: each attribute may appear unsuffixed and/or with
# .x/.y suffixes. Coalesce the variants into one column per attribute and
# keep only those standardized columns.
if len(df_k3ag1):
    # Source attribute -> output column. The join key keeps its name.
    k3ag1_field_map = {
        "ID_Schaltung": "ID_Schaltung",
        "Produktionsdatum": "component_production_date",
        "Herstellernummer": "component_manufacturer_no",
        "Werksnummer": "component_plant_no",
        "Fehlerhaft": "component_defective_flag",
        "Fehlerhaft_Datum": "component_defective_date",
        "Fehlerhaft_Fahrleistung": "component_defective_mileage",
    }
    for source_name, target_name in k3ag1_field_map.items():
        df_k3ag1 = coalesce_first(
            df_k3ag1,
            candidates=[source_name, f"{source_name}.x", f"{source_name}.y"],
            new_name=target_name,
        )

    # coalesce_first always creates the target column, so every name exists.
    keep_cols_k3 = list(k3ag1_field_map.values())

    # Rows without a transmission ID cannot be joined. They are removed
    # because pandas merge matches null keys against each other, which would
    # attach their attributes to every vehicle that has no ID_Schaltung.
    df_k3ag1 = (
        df_k3ag1[keep_cols_k3]
        .dropna(subset=["ID_Schaltung"])
        .drop_duplicates()
    )

# ---------------------------------------------------
# 3) Build the unified table with left joins on keys
# ---------------------------------------------------

# All joins are left joins so that every baseline vehicle row is kept.
# validate="m:1" makes pandas raise if a join key is duplicated on the right side.

# Baseline + registrations (by vehicle)
df_merged = pd.merge(
    df_vehicles_base,
    df_reg,
    on="ID_Fahrzeug",
    how="left",
    validate="m:1",
)

# + parts (by vehicle); overlapping column names from parts get the '_parts' suffix
df_merged = pd.merge(
    df_merged,
    df_parts_all,
    on="ID_Fahrzeug",
    how="left",
    suffixes=("", "_parts"),
    validate="m:1",
)

# + K3AG1 component (by the transmission ID that came in with the parts);
# skipped when the component table is empty or the key is absent.
has_component_rows = len(df_k3ag1) > 0
has_transmission_key = "ID_Schaltung" in df_merged.columns
if has_component_rows and has_transmission_key:
    df_merged = pd.merge(
        df_merged,
        df_k3ag1,
        on="ID_Schaltung",
        how="left",
        validate="m:1",
    )

# ---------------------------------------------------------
# 4) Mark rows where linked data is missing (English flag)
# ---------------------------------------------------------

# Columns contributed by each linked source.
registration_columns = ["registration_municipality", "registration_date"]
parts_columns = ["ID_Karosserie", "ID_Schaltung", "ID_Sitze", "ID_Motor"]
component_columns = [
    "component_production_date",
    "component_manufacturer_no",
    "component_plant_no",
    "component_defective_flag",
    "component_defective_date",
    "component_defective_mileage",
]

# Only columns that actually exist after the merges can be checked.
linked_columns_candidates = [
    c
    for c in registration_columns + parts_columns + component_columns
    if c in df_merged.columns
]

# True if *any* of the linked data columns is missing in the row.
# If no linked columns exist (e.g., all sources empty), every row is flagged
# conservatively. Computed column-wise instead of with a row-wise apply.
if linked_columns_candidates:
    df_merged["has_missing_linked_data"] = (
        df_merged[linked_columns_candidates].isna().any(axis=1)
    )
else:
    df_merged["has_missing_linked_data"] = True

# ------------------------------------
# 5) Final tidy-up and write the CSV
# ------------------------------------

# Helper/ID columns that are not part of the final output.
columns_to_drop = [
    "X1",
    "ID_Karosserie",
    "ID_Schaltung",
    "ID_Sitze",
    "ID_Motor",
    "vehicle_type",
]
# errors="ignore" skips names that are not present in the merged frame.
df_merged = df_merged.drop(columns=columns_to_drop, errors="ignore")

# German vehicle attributes -> English output names.
rename_map = {
    "ID_Fahrzeug": "vehicle_id",
    "Produktionsdatum": "vehicle_production_date",
    "Herstellernummer": "vehicle_manufacturer_no",
    "Werksnummer": "vehicle_plant_no",
    "Fehlerhaft": "vehicle_defective_flag",
    "Fehlerhaft_Datum": "vehicle_defective_date",
    "Fehlerhaft_Fahrleistung": "vehicle_defective_mileage",
}

df_merged = df_merged.rename(columns=rename_map)

# Optional: normalize obvious date columns to ISO-8601 strings.
# The vehicle columns are listed under their post-rename names, because the
# German originals no longer exist at this point.
date_like_cols = [c for c in [
    "vehicle_production_date",
    "registration_date",
    "component_production_date",
    "vehicle_defective_date",
    "component_defective_date",
] if c in df_merged.columns]

for c in date_like_cols:
    raw_values = df_merged[c]
    try:
        parsed = pd.to_datetime(raw_values, errors="coerce")
        iso_values = parsed.dt.strftime("%Y-%m-%d")
    except (ValueError, TypeError, AttributeError, OverflowError):
        # Leave the column as-is if it cannot be handled as dates at all.
        continue
    # errors="coerce" turns unparseable values into NaT; keep the original
    # string for those so that no source value is lost.
    df_merged[c] = iso_values.where(parsed.notna(), raw_values)

# Ensure output directory exists
output_path = "case/data/case_komponente.csv"
ensure_dir_for_file(output_path)

# Write CSV (UTF-8, comma delimiter, no index column)
df_merged.to_csv(output_path, index=False, encoding="utf-8")
