In [None]:
# -*- coding: utf-8 -*-
# Purpose: Count how many T16 parts ended up in vehicles registered in Adelshofen.
# Assumptions:
# - CSVs use semicolon as delimiter and double quotes as quotechar (as shown in samples).
# - Vehicle registration "IDNummer" matches "ID_Fahrzeug" in vehicle composition CSVs.
# - For each component ID like "K1BE1-101-1011-90", there is a file:
#     source/Komponente/Bestandteile_Komponente_<COMPONENT>.csv
# - A component file contributes T16 parts iff it contains column "ID_T16".
# - The total T16 count equals the sum of non-null rows in column "ID_T16" of each component file,
#   multiplied by how many times that component is used across the filtered vehicles.

from pathlib import Path
import pandas as pd

# -----------------------------
# Helper configuration & utils
# -----------------------------
# Input locations; every path is built from BASE_DIR, which is the relative path ".".
BASE_DIR = Path(".")
REGISTRATIONS_CSV = BASE_DIR.joinpath("source", "Zulassungen", "Zulassungen_alle_Fahrzeuge.csv")
VEHICLE_DIR = BASE_DIR.joinpath("source", "Fahrzeug")
COMPONENT_DIR = BASE_DIR.joinpath("source", "Komponente")

def read_semicolon_csv(csv_path: Path) -> pd.DataFrame:
    """Load a semicolon-separated, double-quoted CSV with every column read as text.

    Args:
        csv_path: Location of the CSV file to read (decoded as UTF-8).

    Returns:
        The parsed frame without index-like columns, i.e. columns whose header
        is empty or starts with "unnamed" (case-insensitive).
    """
    frame = pd.read_csv(
        csv_path,
        sep=";",
        quotechar='"',
        dtype=str,  # keep IDs as text so they are not reinterpreted as numbers
        encoding="utf-8",
        engine="python",
    )

    # Headers that are blank or auto-named "Unnamed: N" are treated as an
    # exported row index rather than real data.
    index_like = []
    for name in frame.columns:
        if name == "" or name.lower().startswith("unnamed"):
            index_like.append(name)

    return frame.drop(columns=index_like) if index_like else frame

def safe_strip_upper(series: pd.Series) -> pd.Series:
    """Return the values of *series* as text, trimmed and upper-cased.

    Every value is passed through ``astype(str)`` first, so non-string entries
    do not raise.
    NOTE(review): this also stringifies missing values instead of keeping them
    missing (on object-dtype data NaN presumably comes back as "NAN") - confirm
    that callers only compare the result against real values.
    """
    as_text = series.astype(str)
    trimmed = as_text.str.strip()
    return trimmed.str.upper()

# -----------------------------
# 1) Load registrations and filter to Adelshofen
# -----------------------------
registrations_df = read_semicolon_csv(REGISTRATIONS_CSV)

# Header cells may carry stray whitespace; normalize before looking columns up by name.
registrations_df.columns = [c.strip() for c in registrations_df.columns]

# Fail early with a clear message if the expected columns are absent.
required_cols_regs = {"IDNummer", "Gemeinden"}
missing_regs = required_cols_regs - set(registrations_df.columns)
if missing_regs:
    raise ValueError(f"Registrations CSV missing required columns: {missing_regs}")

# Match the municipality case-insensitively and ignoring surrounding whitespace.
is_adelshofen = safe_strip_upper(registrations_df["Gemeinden"]) == "ADELSHOFEN"
adelshofen_vehicle_ids = set(
    registrations_df.loc[is_adelshofen, "IDNummer"].dropna().astype(str).str.strip()
)

# Final result. It stays 0 when no vehicle is registered in Adelshofen, when none
# of those vehicles appears in the composition files, or when they reference no
# components; in the latter cases the loop below simply has nothing to iterate.
total_t16_parts_in_adelshofen = 0

if adelshofen_vehicle_ids:
    # -----------------------------
    # 2) Load all vehicle composition CSVs and keep only rows for Adelshofen vehicles
    # -----------------------------
    vehicle_csv_paths = sorted(VEHICLE_DIR.glob("Bestandteile_Fahrzeuge_*.csv"))
    if not vehicle_csv_paths:
        raise FileNotFoundError(f"No vehicle composition CSVs found in {VEHICLE_DIR}")

    vehicle_frames = [read_semicolon_csv(p) for p in vehicle_csv_paths]
    vehicles_df = pd.concat(vehicle_frames, ignore_index=True)

    vehicles_df.columns = [c.strip() for c in vehicles_df.columns]
    if "ID_Fahrzeug" not in vehicles_df.columns:
        raise ValueError("Vehicle composition CSVs must contain 'ID_Fahrzeug' column.")

    vehicles_df["ID_Fahrzeug"] = vehicles_df["ID_Fahrzeug"].astype(str).str.strip()
    vehicles_df = vehicles_df[vehicles_df["ID_Fahrzeug"].isin(adelshofen_vehicle_ids)]

    # -----------------------------
    # 3) Collect component IDs used by these vehicles
    #    Component columns = all 'ID_*' columns except 'ID_Fahrzeug'
    # -----------------------------
    component_columns = [
        c for c in vehicles_df.columns if c.startswith("ID_") and c != "ID_Fahrzeug"
    ]

    # Flatten all component cells into one Series. This goes through NumPy rather
    # than DataFrame.stack(dropna=True) because the `dropna` argument is tied to
    # the legacy stack implementation that pandas is phasing out. Missing cells
    # are dropped BEFORE converting to text so they never become the string "nan".
    component_ids_series = (
        pd.Series(vehicles_df[component_columns].to_numpy().ravel(), dtype=object)
        .dropna()
        .astype(str)
        .str.strip()
    )
    component_ids_series = component_ids_series[component_ids_series != ""]

    # How many times each component ID is used across the filtered vehicles.
    component_frequency = component_ids_series.value_counts()

    # -----------------------------
    # 4) For each unique component, read its Komponente file and count T16 rows
    # -----------------------------
    # value_counts() yields every component ID exactly once, so each file is read
    # at most once; the dict is kept as a record of component_id -> T16 row count.
    t16_count_cache = {}

    for component_id, freq_in_vehicles in component_frequency.items():
        component_csv = COMPONENT_DIR / f"Bestandteile_Komponente_{component_id}.csv"

        # A missing component file, or one without an "ID_T16" column,
        # deliberately contributes zero T16 parts.
        t16_rows_in_component = 0
        if component_csv.exists():
            comp_df = read_semicolon_csv(component_csv)
            comp_df.columns = [c.strip() for c in comp_df.columns]
            if "ID_T16" in comp_df.columns:
                # Count only real T16 entries: drop missing values first (calling
                # astype(str) on them would produce the text "nan" and make every
                # row count), then ignore cells that are blank after trimming.
                t16_values = comp_df["ID_T16"].dropna().astype(str).str.strip()
                t16_rows_in_component = int((t16_values != "").sum())
        t16_count_cache[component_id] = t16_rows_in_component

        # Each use of the component in a vehicle brings all of its T16 rows along.
        total_t16_parts_in_adelshofen += int(freq_in_vehicles) * t16_rows_in_component

# The final answer:
# total_t16_parts_in_adelshofen  -> integer count of T16 parts in vehicles registered in Adelshofen
print(total_t16_parts_in_adelshofen)
