## Task 6

In [3]:
# -*- coding: utf-8 -*-
"""
Goal:
Find the registration place (municipality) valid on the accident date
2010-08-11 for the vehicle that contains the body part number
"K5-112-1122-79".

Data inputs (semicolon-separated CSVs with quotes):
- source/Zulassungen/Zulassungen_alle_Fahrzeuge.csv
- source/Fahrzeug/Bestandteile_Fahrzeuge_OEM1_Typ11.csv
- source/Fahrzeug/Bestandteile_Fahrzeuge_OEM1_Typ12.csv
- source/Fahrzeug/Bestandteile_Fahrzeuge_OEM2_Typ21.csv
- source/Fahrzeug/Bestandteile_Fahrzeuge_OEM2_Typ22.csv

Assumptions:
- In each "Bestandteile_*.csv", columns are (no header): 
  [row_id, K5_part, K3_part, K2_part, K1_part, IDNummer]
  where the last column (IDNummer like "12-1-12-1") links to Zulassungen.IDNummer.
- Zulassungen CSV has columns: ["(possibly empty)","IDNummer","Gemeinden","Zulassung"].
  We'll drop any unnamed/empty first column.
- A vehicle may have multiple registration rows over time; we select the latest
  registration date that is <= accident date (2010-08-11).
- File encoding is UTF-8; adjust 'encoding' if needed (e.g., "latin-1") for your files.

Output:
- Prints the municipality and supporting details. Raises informative errors if not found.
"""

from pathlib import Path
from datetime import datetime
import pandas as pd

# ----------------------------
# Configuration / constants
# ----------------------------
# K5 (body) part number whose vehicle we are looking for.
TARGET_K5_PART_NUMBER = "K5-112-1122-79"

# Reference date for the registration lookup, written as DD.MM.YYYY and
# parsed once into a datetime.date.
ACCIDENT_DATE_STR = "11.08.2010"
ACCIDENT_DATE = datetime.strptime(ACCIDENT_DATE_STR, "%d.%m.%Y").date()

# Input locations, all relative to the 'source' directory.
BASE_DIR = Path("source")
ZULASSUNGEN_CSV = BASE_DIR / "Zulassungen" / "Zulassungen_alle_Fahrzeuge.csv"

# One parts list per OEM / vehicle type, all stored in the same folder.
BESTANDTEILE_FILES = [
    BASE_DIR / "Fahrzeug" / file_name
    for file_name in (
        "Bestandteile_Fahrzeuge_OEM1_Typ11.csv",
        "Bestandteile_Fahrzeuge_OEM1_Typ12.csv",
        "Bestandteile_Fahrzeuge_OEM2_Typ21.csv",
        "Bestandteile_Fahrzeuge_OEM2_Typ22.csv",
    )
]

# Keyword arguments shared by every pd.read_csv call: semicolon-separated,
# double-quoted fields, UTF-8 text, parsed with the pure-Python engine.
CSV_KW = {
    "sep": ";",
    "quotechar": '"',
    "encoding": "utf-8",
    "engine": "python",
}


# ----------------------------
# Helper functions
# ----------------------------
def load_bestandteile_concat(files):
    """
    Load and vertically concatenate the 'Bestandteile_Fahrzeuge_*.csv' files.

    Every file is read without a header row, reduced to its first six columns
    and relabelled. The part-number and ID columns are converted to
    whitespace-stripped strings; cells that are missing in the file stay
    missing (NaN) instead of becoming the literal text "nan", so callers can
    still filter them with ``dropna()`` / ``isna()``.

    Parameters
    ----------
    files : iterable of pathlib.Path
        Parts files to load, in the order they should be stacked.

    Returns
    -------
    pd.DataFrame
        Columns standardized to:
        ['row_id', 'K5_part', 'K3_part', 'K2_part', 'K1_part', 'IDNummer']

    Raises
    ------
    FileNotFoundError
        If one of the given paths does not exist.
    ValueError
        If a file yields fewer than six columns.
    RuntimeError
        If `files` is empty, so nothing was loaded.
    """
    column_names = ["row_id", "K5_part", "K3_part", "K2_part", "K1_part", "IDNummer"]
    text_columns = column_names[1:]

    frames = []
    for file_path in files:
        if not file_path.exists():
            raise FileNotFoundError(f"Parts file not found: {file_path}")

        # The files are treated as header-less; column names are enforced below.
        # NOTE(review): if a file does carry a header line, header=None ingests
        # it as an ordinary data row -- confirm against the raw files.
        df = pd.read_csv(file_path, header=None, **CSV_KW)

        # Validate/reshape: expect at least 6 columns (the first 6 are relevant)
        if df.shape[1] < 6:
            raise ValueError(
                f"Unexpected column count in {file_path}. "
                f"Expected >= 6, got {df.shape[1]}"
            )

        # Copy the slice so the assignments below write to an independent
        # frame rather than to a view of the raw read.
        df = df.iloc[:, :6].copy()
        df.columns = column_names

        # Trim surrounding whitespace (quotes are already removed by the CSV
        # parser). astype(str) alone would turn NaN into the string "nan",
        # so the originally missing cells are masked back to NaN afterwards.
        for col in text_columns:
            was_missing = df[col].isna()
            df[col] = df[col].astype(str).str.strip().mask(was_missing)

        frames.append(df)

    if not frames:
        raise RuntimeError("No parts data loaded.")
    return pd.concat(frames, ignore_index=True)


def load_zulassungen_table(csv_path):
    """
    Read the registrations CSV and return it in a normalized three-column form.

    Columns whose label is blank or starts with "unnamed" (case-insensitive)
    are discarded, 'IDNummer' and 'Gemeinden' are turned into stripped
    strings, and 'Zulassung' is parsed as a YYYY-MM-DD date.

    Returns
    -------
    pd.DataFrame
        Columns: ['IDNummer', 'Gemeinden', 'Zulassung_date']
        'Zulassung_date' holds datetime.date values.

    Raises
    ------
    FileNotFoundError
        If `csv_path` does not exist.
    ValueError
        If a required column is absent or any 'Zulassung' value fails to parse.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"Zulassungen file not found: {csv_path}")

    table = pd.read_csv(csv_path, header=0, **CSV_KW)

    def _is_placeholder(label):
        # Blank label or an auto-generated "Unnamed: N" style label.
        text = str(label).strip()
        return text == "" or text.lower().startswith("unnamed")

    placeholder_cols = [label for label in table.columns if _is_placeholder(label)]
    if placeholder_cols:
        table = table.drop(columns=placeholder_cols)

    # All three source columns must be present before any cleaning starts.
    missing = {"IDNummer", "Gemeinden", "Zulassung"} - set(table.columns)
    if missing:
        raise ValueError(
            f"Missing expected columns in Zulassungen CSV: {missing}. "
            f"Found columns: {list(table.columns)}"
        )

    # Text columns: force to string and trim surrounding whitespace.
    for key in ("IDNummer", "Gemeinden"):
        table[key] = table[key].astype(str).str.strip()

    # Strict ISO parsing; anything that does not fit becomes NaT and is
    # reported below instead of being silently kept.
    parsed = pd.to_datetime(table["Zulassung"], format="%Y-%m-%d", errors="coerce")
    table["Zulassung_date"] = parsed.dt.date

    unparsed = table["Zulassung_date"].isna()
    if unparsed.any():
        bad_rows = table[unparsed]
        raise ValueError(
            "Encountered unparsable 'Zulassung' dates in Zulassungen CSV. "
            f"Examples:\n{bad_rows.head(5)}"
        )

    return table[["IDNummer", "Gemeinden", "Zulassung_date"]]


def get_idnummers_for_k5(parts_df, target_k5):
    """
    Collect the IDNummer of every row whose 'K5_part' equals `target_k5`.

    Missing IDNummer values are skipped and the remaining ones are converted
    to stripped strings before duplicates are removed.

    Returns
    -------
    list[str]
        Distinct IDNummer values in order of first appearance; empty if no
        row matches.
    """
    hits = parts_df.loc[parts_df["K5_part"].eq(target_k5), "IDNummer"]
    cleaned = hits.dropna().astype(str).str.strip()
    # drop_duplicates keeps the first occurrence, preserving row order.
    return cleaned.drop_duplicates().tolist()


def select_registration_on_or_before(zulassungen_df, idnummer, reference_date):
    """
    For a given IDNummer, select the registration row whose 'Zulassung_date' is
    the latest date that is <= reference_date.

    Parameters
    ----------
    zulassungen_df : pd.DataFrame
        Registrations with at least the columns 'IDNummer' and 'Zulassung_date'.
    idnummer : str
        Vehicle ID to look up (exact match on 'IDNummer').
    reference_date : datetime.date
        Upper bound (inclusive) for the registration date.

    Returns
    -------
    pd.Series or None
        The selected row, or None if the vehicle has no registration on or
        before `reference_date`. If several rows share the latest date, the
        first one in frame order is returned.
    """
    subset = zulassungen_df[zulassungen_df["IDNummer"] == idnummer]
    if subset.empty:
        return None

    subset = subset[subset["Zulassung_date"] <= reference_date]
    if subset.empty:
        return None

    # 'Zulassung_date' holds plain datetime.date objects (object dtype).
    # max() plus a positional pick is used instead of idxmax()/.loc because
    # idxmax() is not supported for object dtype on every pandas version, and
    # a label lookup would return several rows if index labels repeat.
    latest = subset["Zulassung_date"].max()
    return subset[subset["Zulassung_date"] == latest].iloc[0]


def main():
    """
    Run the lookup end to end and print a report.

    Loads the parts lists, finds the IDNummer(s) of vehicles carrying
    TARGET_K5_PART_NUMBER, resolves for each of them the latest registration
    on or before ACCIDENT_DATE, and prints the resulting municipality together
    with the matching parts rows.

    Raises
    ------
    LookupError
        If no vehicle carries the part, or none of the matched vehicles has a
        registration on or before the accident date.
    """
    # Parts lists -> candidate vehicle IDs for the target K5 part.
    parts_df = load_bestandteile_concat(BESTANDTEILE_FILES)
    idnummer_candidates = get_idnummers_for_k5(parts_df, TARGET_K5_PART_NUMBER)
    if not idnummer_candidates:
        raise LookupError(
            f"No vehicle found containing K5 part number '{TARGET_K5_PART_NUMBER}'. "
            "Verify the part number or the files."
        )

    # Registrations -> one row per candidate, where a qualifying one exists.
    zulassungen_df = load_zulassungen_table(ZULASSUNGEN_CSV)
    selections = (
        select_registration_on_or_before(zulassungen_df, idn, ACCIDENT_DATE)
        for idn in idnummer_candidates
    )
    valid_hits = [sel for sel in selections if sel is not None]
    if not valid_hits:
        # The part was found in some vehicle(s), but none of them was
        # registered on or before the accident date; include the IDs and the
        # date so the mismatch can be investigated.
        raise LookupError(
            "No registration found on or before the accident date for the matched vehicle(s).\n"
            f"Matched IDNummer(s): {idnummer_candidates}\n"
            f"Accident date: {ACCIDENT_DATE.isoformat()}"
        )

    # Several vehicles may match; identical rows are reported only once.
    result_df = pd.DataFrame(valid_hits).drop_duplicates()

    # Report header.
    print(
        "=== Hit-and-Run Investigation Result ===",
        f"Accident date: {ACCIDENT_DATE.isoformat()}  (DD.MM.YYYY input: {ACCIDENT_DATE_STR})",
        f"Target K5 body part number: {TARGET_K5_PART_NUMBER}",
        "",
        "Matched registration(s) valid on the accident date:\n",
        sep="\n",
    )

    # One entry per matched registration.
    for hit in result_df.itertuples(index=False):
        print(f"- IDNummer: {hit.IDNummer}")
        print(f"  Municipality (Gemeinden): {hit.Gemeinden}")
        print(f"  Registration effective date (Zulassung): {hit.Zulassung_date.isoformat()}")
        print()

    # Traceability: the parts rows (at most ten) that produced the vehicle IDs.
    print("Evidence chain for IDNummer discovery (first few matching rows):")
    cols_to_show = ["row_id", "K5_part", "K3_part", "K2_part", "K1_part", "IDNummer"]
    is_target = parts_df["K5_part"] == TARGET_K5_PART_NUMBER
    evidence = parts_df.loc[is_target, cols_to_show]
    print(evidence.head(10).to_string(index=False))


# Run the analysis only when this code is executed directly (script or
# notebook cell), not when it is imported as a module by other code.
if __name__ == "__main__":
    main()


=== Hit-and-Run Investigation Result ===
Accident date: 2010-08-11  (DD.MM.YYYY input: 11.08.2010)
Target K5 body part number: K5-112-1122-79

Matched registration(s) valid on the accident date:

- IDNummer: 12-1-12-82
  Municipality (Gemeinden): ASCHERSLEBEN
  Registration effective date (Zulassung): 2009-01-02

Evidence chain for IDNummer discovery (first few matching rows):
row_id        K5_part            K3_part            K2_part            K1_part   IDNummer
  82.0 K5-112-1122-79 K3SG1-105-1051-129 K2ST1-109-1092-519 K1BE1-104-1041-409 12-1-12-82
