In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import os
import re
from typing import Optional, List, Dict, Any, Tuple

def _ensure_dir(path: str):
    os.makedirs(path, exist_ok=True)

def _sanitize(s: str) -> str:
    """Remove punctuation except spaces/hyphens, collapse spaces, trim."""
    s = re.sub(r"[^a-zA-Z0-9\s\-]", "", str(s))
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _apply_candidate_filters(
    df: pd.DataFrame,
    *,
    current_year: int,
    min_annual_gallons: Optional[float] = None
) -> pd.DataFrame:
    """Filter candidate vehicles. If min_annual_gallons is given, estimate annual gallons
    as (age*gallons)/age (skip age<=0) and keep only rows >= threshold."""
    if df.empty:
        return df
    out = df.copy()
    if min_annual_gallons is not None:
        age = (current_year - pd.to_numeric(out["Model Year"], errors="coerce")).astype(float)
        with np.errstate(divide="ignore", invalid="ignore"):
            annual = pd.to_numeric(out["age * gallons"], errors="coerce") / age
        out["__annual_gallons__"] = annual
        out = out[(out["__annual_gallons__"].notna()) & (out["__annual_gallons__"] >= float(min_annual_gallons))].copy()
        out.drop(columns="__annual_gallons__", inplace=True, errors="ignore")
    return out

def generate_vehicle_cascades(
    data: pd.DataFrame,
    department_name: str,
    vehicle_model: str,
    initial_age_diff: int = 3,
    subsequent_age_diff: int = 5,
    max_vehicle_age: int = 20,
    current_year: int = 2025,
    output_folder: str = '/Users/rohandatta/Documents/Fleet Electrification/All_Electric_Cascades',
    # NEW (optional) flags — keep defaults to preserve old behavior
    keep_singletons: bool = False,
    min_annual_gallons: Optional[float] = None
):
    """
    Generate cascades for a single Department × Vehicle Model (original behavior),
    with fixes:
      - optional singletons
      - optional candidate filters
      - filename sanitizing
      - structured CSV logging per Dept–Model
    """

    _ensure_dir(output_folder)

    # Clean numeric columns (as before)
    data = data.copy()
    data['Model Year'] = pd.to_numeric(data['Model Year'], errors='coerce')
    data['age * gallons'] = pd.to_numeric(data['age * gallons'], errors='coerce')

    # Filter by department and model (strip-safe)
    dept_col = 'Department'
    model_col = 'Vehicle Model'
    filtered_data = data[
        (data[dept_col].fillna('').str.strip() == department_name) &
        (data[model_col].fillna('').str.strip() == vehicle_model)
    ]

    log_rows: List[Dict[str, Any]] = []

    if filtered_data.empty:
        msg = f"No vehicles found for department '{department_name}' and model '{vehicle_model}'."
        print(msg)
        log_rows.append({"event": "no_data", "department": department_name, "model": vehicle_model, "msg": msg})
        return pd.DataFrame()

    all_cascades: List[List[Tuple[str, float, float]]] = []
    global_used_vehicles = set()

    # ---- Seed cascades (same logic; now logs & filters) ----
    for _, starter_row in filtered_data.iterrows():
        starter_vehicle = starter_row['VIN']
        starter_year = starter_row['Model Year']
        starter_value = starter_row['age * gallons']

        if pd.isna(starter_vehicle) or pd.isna(starter_year):
            log_rows.append({"event": "skip_starter_invalid", "vin": starter_vehicle, "year": starter_year, "reason": "missing VIN/year"})
            continue

        if starter_vehicle in global_used_vehicles:
            log_rows.append({"event": "skip_starter_used", "vin": starter_vehicle, "year": starter_year})
            continue

        cascade = [(starter_vehicle, starter_year, starter_value)]
        # Do NOT mark starter as used yet unless we actually keep this cascade

        # build available pool and initial candidates (older by >= initial_age_diff)
        available_vehicles = data[~data['VIN'].isin(global_used_vehicles)].copy()
        candidates = available_vehicles[
            ((starter_year - available_vehicles['Model Year']) >= initial_age_diff) &
            (available_vehicles[model_col].fillna('').str.strip() == vehicle_model)
        ].copy()

        pre_cnt = len(candidates)
        candidates = _apply_candidate_filters(
            candidates,
            current_year=current_year,
            min_annual_gallons=min_annual_gallons
        )
        post_cnt = len(candidates)

        if candidates.empty:
            # if desired, keep this as a single-step cascade
            if keep_singletons:
                all_cascades.append(cascade)
                global_used_vehicles.add(starter_vehicle)
                log_rows.append({
                    "event": "keep_singleton",
                    "starter_vin": starter_vehicle, "starter_year": starter_year,
                    "note": f"no initial candidates (pre={pre_cnt}, post_filter={post_cnt}); kept singleton"
                })
            else:
                log_rows.append({
                    "event": "no_initial_candidate",
                    "starter_vin": starter_vehicle, "starter_year": starter_year,
                    "reason": f"no candidates (pre={pre_cnt}, post_filter={post_cnt})"
                })
            continue

        # choose worst offender by 'age * gallons'
        next_vehicle_row = candidates.loc[candidates['age * gallons'].idxmax()]
        next_vehicle = next_vehicle_row['VIN']
        next_year = next_vehicle_row['Model Year']
        next_value = next_vehicle_row['age * gallons']

        cascade.append((next_vehicle, next_year, next_value))
        all_cascades.append(cascade)
        global_used_vehicles.add(starter_vehicle)
        global_used_vehicles.add(next_vehicle)

        log_rows.append({
            "event": "seed_cascade",
            "starter_vin": starter_vehicle, "starter_year": starter_year,
            "chosen_vin": next_vehicle, "chosen_year": next_year,
            "candidates_considered": pre_cnt, "candidates_after_filter": post_cnt
        })

    # ---- Extend cascades iteratively (same logic; now with filters & logs) ----
    while True:
        cascade_extended = False
        for cascade in all_cascades:
            last_vin, last_year, _ = cascade[-1]
            if current_year - last_year >= max_vehicle_age:
                log_rows.append({
                    "event": "stop_extend_max_age",
                    "last_vin": last_vin, "last_year": last_year,
                    "reason": f"age={current_year - last_year} >= {max_vehicle_age}"
                })
                continue

            available_vehicles = data[~data['VIN'].isin(global_used_vehicles)].copy()
            candidates = available_vehicles[
                ((last_year - available_vehicles['Model Year']) >= subsequent_age_diff) &
                (available_vehicles[model_col].fillna('').str.strip() == vehicle_model)
            ].copy()

            pre_cnt = len(candidates)
            candidates = _apply_candidate_filters(
                candidates,
                current_year=current_year,
                min_annual_gallons=min_annual_gallons
            )
            post_cnt = len(candidates)

            if candidates.empty:
                log_rows.append({
                    "event": "no_extend_candidate",
                    "from_vin": last_vin, "from_year": last_year,
                    "reason": f"no candidates (pre={pre_cnt}, post_filter={post_cnt})"
                })
                continue

            next_vehicle_row = candidates.loc[candidates['age * gallons'].idxmax()]
            next_vehicle = next_vehicle_row['VIN']
            next_year = next_vehicle_row['Model Year']
            next_value = next_vehicle_row['age * gallons']
            cascade.append((next_vehicle, next_year, next_value))
            global_used_vehicles.add(next_vehicle)
            cascade_extended = True

            log_rows.append({
                "event": "extend_cascade",
                "from_vin": last_vin, "from_year": last_year,
                "chosen_vin": next_vehicle, "chosen_year": next_year,
                "candidates_considered": pre_cnt, "candidates_after_filter": post_cnt
            })

        if not cascade_extended:
            break

    # ---- Emit results (same shape as original) ----
    if not all_cascades:
        msg = f"⚠️ No valid cascades for {department_name} - {vehicle_model}. Skipping file."
        print(msg)
        log_rows.append({"event": "no_cascades", "department": department_name, "model": vehicle_model, "msg": msg})
        return

    cascade_rows = []
    for idx, cascade in enumerate(all_cascades, 1):
        row = {
            'Cascade ID': idx,
            'Department': department_name,
            'Model': vehicle_model
        }
        for step_idx, (vehicle_id, year, value) in enumerate(cascade, 1):
            row[f'Step {step_idx} Vehicle'] = vehicle_id
            row[f'Step {step_idx} Year'] = year
            row[f'Step {step_idx} Age*Gallons'] = value
        cascade_rows.append(row)

    cascades_df = pd.DataFrame(cascade_rows)

    # Safe filename (fix)
    safe_vehicle_model = _sanitize(vehicle_model)
    safe_department_name = _sanitize(department_name)

    _ensure_dir(output_folder)
    output_file = f'{output_folder}/{safe_department_name}_{safe_vehicle_model}.csv'
    cascades_df.to_csv(output_file, index=False)
    print(f"✅ Saved cascades for {department_name} - {vehicle_model} to {output_file}")

    # Write a structured log per Dept–Model (fix)
    log_df = pd.DataFrame(log_rows)
    log_file = f'{output_folder}/{safe_department_name}_{safe_vehicle_model}__log.csv'
    log_df.to_csv(log_file, index=False)
    print(f"📝 Wrote log to {log_file}")

    return cascades_df  # (kept original return)

def filter_out_departments(df, departments_to_exclude):
    if departments_to_exclude is None:
        departments_to_exclude = []
    return df[~df['Department'].fillna('').str.strip().isin(departments_to_exclude)].copy()

if __name__ == "__main__":
    # --- ORIGINAL-STYLE MAIN (unchanged behavior, plus new optional knobs) ---
    # You can switch 'data_file' to your cleaned/filtered fleet file.
    data_file = 'filtered_fleet_noGTPD.csv'
    output_folder = 'All_Electric_Cascades'

    df = pd.read_csv(data_file)
    df = filter_out_departments(df, None)

    dept_col = 'Department'
    model_col = 'Vehicle Model'
    all_depts = df[dept_col].fillna('').str.strip().unique()

    # ---- New knobs (adjust as needed) ----
    KEEP_SINGLETONS = False            # set True if you want to keep 1-step cascades
    MIN_ANNUAL_GALLONS = None          # e.g., 250.0 to require min annual gallons for candidates
    CURRENT_YEAR = 2025

    for department_name in all_depts:
        dept_vehicles = df[df[dept_col].fillna('').str.strip() == department_name]
        all_dept_models = dept_vehicles[model_col].fillna('').str.strip().unique()
        for vehicle_model in all_dept_models:
            generate_vehicle_cascades(
                df,
                department_name,
                vehicle_model,
                initial_age_diff=3,
                subsequent_age_diff=5,
                max_vehicle_age=20,
                current_year=CURRENT_YEAR,
                output_folder=output_folder,
                keep_singletons=KEEP_SINGLETONS,
                min_annual_gallons=MIN_ANNUAL_GALLONS
            )


⚠️ No valid cascades for Mechanical Engineering - Chevrolet Blazer. Skipping file.
⚠️ No valid cascades for Mechanical Engineering - Chrysler Town & Country. Skipping file.
✅ Saved cascades for Mechanical Engineering - Ford F-150 to All_Electric_Cascades/Mechanical Engineering_Ford F-150.csv
📝 Wrote log to All_Electric_Cascades/Mechanical Engineering_Ford F-150__log.csv
⚠️ No valid cascades for I&S (Staging) - Chevrolet Express. Skipping file.
✅ Saved cascades for I&S (Staging) - Ford Econoline to All_Electric_Cascades/IS Staging_Ford Econoline.csv
📝 Wrote log to All_Electric_Cascades/IS Staging_Ford Econoline__log.csv
⚠️ No valid cascades for I&S (Zero Waste) - Chevrolet Silverado 1500. Skipping file.
⚠️ No valid cascades for I&S (Zero Waste) - Isuzu Npr Hd. Skipping file.
⚠️ No valid cascades for I&S (Zero Waste) - John Deere Gator Xuv865R. Skipping file.
✅ Saved cascades for Ce - Chevrolet Silverado 1500 to All_Electric_Cascades/Ce_Chevrolet Silverado 1500.csv
📝 Wrote log to All_Ele

⚠️ No valid cascades for I&S (Materials Management) - Isuzu Npr Hd. Skipping file.
