In [None]:
# 01 — Data Preprocessing Demo (Phase 1, MIT)

This notebook demonstrates:
- loading the sample BAAC-like CSV,
- ID normalization,
- timezone-aware timestamp parsing,
- aligned multi-value “data explosion” (public variant),
- basic temporal features, age derivation, and a quick quality report,
- optional one-hot encoding of the exploded values.

_Advanced components will arrive in Phase 2 (EUPL/GPL)._


In [None]:
import pandas as pd
from preprocessing import (
    normalize_id_column, parse_datetime_column, add_accident_time_parts,
    derive_age_from_year_of_birth, basic_quality_report, detect_multivalue_columns,
    explode_aligned_columns, one_hot_multivalue_columns
)

# Relative path to the sample extract that ships with the repository.
CSV_PATH = "../data/accidents-corporels-de-la-circulation-millesime_eng_columns_selected_data_translated_sample.csv"

# Load the raw sample; later cells build on `df`.
df = pd.read_csv(CSV_PATH)

# Quick sanity check: row count followed by the column names.
df.shape[0], df.columns.to_list()


In [None]:
# Three preprocessing steps applied as one pipeline:
#   1. normalize the accident ID column,
#   2. parse `Date_and_hour` into the `dt` column (UTC),
#   3. add the accident time-part columns based on `dt`.
df = (
    df
    .pipe(normalize_id_column, id_col="ID_accident")
    .pipe(parse_datetime_column, source_col="Date_and_hour", target_col="dt", utc=True)
    .pipe(add_accident_time_parts, dt_col="dt")
)

# Preview the first rows of the result.
df.head(3)


In [None]:
# Compute `Age` from the birth year, using the accident year as the
# reference year (safe clipping).
df = derive_age_from_year_of_birth(
    df,
    yob_col="Year_of_birth",
    reference_year_col="Accident_Year",
    target_col="Age",
)

# Show the two inputs next to the derived column.
df.loc[:, ["Year_of_birth", "Accident_Year", "Age"]].head(5)


In [None]:
# Build the basic quality report for the preprocessed frame
# and show its first ten entries.
qr = basic_quality_report(df)
qr.head(n=10)


In [None]:
## Aligned Multi-Value Explosion (Public Variant)
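We explode `Security_measures` and `User_of_security_measures` jointly. Each cell holds a comma-separated list, and the two lists are treated as aligned: the i-th entry of one belongs with the i-th entry of the other. When the lists differ in length, the shorter one is padded with `None` (`strict_equal_lengths=False`).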


In [None]:
# The two multi-value columns that are exploded together.
mv_cols = ["Security_measures", "User_of_security_measures"]

# Fail fast, naming the first expected column that is absent.
missing_cols = [name for name in mv_cols if name not in df.columns]
assert not missing_cols, f"Missing expected column: {missing_cols[0]}"

# strict_equal_lengths=False: pad short lists with None
df_exploded = explode_aligned_columns(df, columns=mv_cols, sep=",", strict_equal_lengths=False)

# Preview only the exploded columns.
df_exploded.loc[:, mv_cols].head(10)


In [None]:
# Optional step: simple one-hot encoding of Security_measures on the exploded
# frame (public-friendly threshold via min_count).
onehot_prefix = "Security_measures__"
df_oh = one_hot_multivalue_columns(
    df_exploded,
    columns=["Security_measures"],
    sep=",",
    min_count=10,
)

# List up to ten of the generated indicator columns.
[col for col in df_oh.columns if col.startswith(onehot_prefix)][:10]
