In [13]:
# LoRA Fine Tuning DINOv2-B 

#!/usr/bin/env python3
"""
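Cross-city regression pipeline: clean the merged per-city CSVs, align their
feature columns, train several regressors, and evaluate them per city,
across cities (train on one city, test on the other), and on the combined data.

Drop-in script. Edit the file paths in the CONFIG section below and run:
    python full_pipeline_final.py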
"""

import os
import time
import json
import joblib
import traceback
from typing import Tuple, List, Dict

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Optional: XGBoost
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False

# ----------------------------
# CONFIG - edit paths / settings
# ----------------------------
# Input CSVs, one per city (presumably Las Vegas and Shanghai, judging by the file names).
VEG_CSV = "../data/output/vegas_merged_full.csv"
SH_CSV = "../data/output/shanghai_merged_full.csv"

# Directory that receives saved models, prediction CSVs, preprocessing objects and JSON summaries.
# NOTE: the directory is created as a module-level side effect, i.e. as soon as this file is imported/run.
OUTPUT_DIR = "../data/output/cross_city_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

TARGET_COL = "lights_sum"        # change if your target column name differs
# Only consulted by run_combined_benchmark when it falls back from an out-of-range test_frac.
MIN_TEST_FRAC = 0.05
TEST_FRAC = 0.20                 # used for combined-run (80/20)
VAL_FRAC = 0.10                  # optional validation split fraction when training on one city and testing on another
RANDOM_STATE = 42                # seed shared by the train/test splits and the seeded models

# Columns which if present should be dropped (IDs, paths, coordinates, leakage)
# Entries are matched as SUBSTRINGS of the column name by autodrop_cols, so e.g. "tile"
# removes every column whose name contains "tile", not just a column called "tile".
COMMON_DROP_SUBSTR = [
    "tile_file", "tile_crs", "geojson_file", "image_path", "tile_num",
    "minx", "miny", "maxx", "maxy", "tile", "tile_id", "city"
]
# Explicit leakage proxies to drop (if present)
# Unlike COMMON_DROP_SUBSTR these are matched by exact column name.
LEAKAGE_COLS = ["lights_mean", "lights_nonzero", "target"]  # lights_mean is a near-duplicate of lights_sum
# ----------------------------

def read_df(path: str) -> pd.DataFrame:
    """Load a CSV into a DataFrame, echoing the path and the resulting shape.

    ``low_memory=False`` makes pandas read the file in a single pass instead
    of in chunks, so each column gets one consistently inferred dtype.
    """
    print("Loading:", path)
    frame = pd.read_csv(path, low_memory=False)
    n_rows_cols = frame.shape
    print(" -> shape:", n_rows_cols)
    return frame

def autodrop_cols(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    """Drop leakage, ID/path/coordinate, text and constant columns.

    Columns are removed in four passes:

    1. names listed exactly in ``LEAKAGE_COLS``;
    2. names containing any substring from ``COMMON_DROP_SUBSTR``;
    3. remaining columns of ``object`` / ``string`` dtype;
    4. remaining columns with at most one distinct value (NaN counts as a value).

    ``TARGET_COL`` is exempt from every pass. Previously only passes 3 and 4
    protected it, so a target named e.g. ``"target"`` (which is listed in
    ``LEAKAGE_COLS``) or one containing ``"tile"`` / ``"city"`` was silently
    removed and the caller failed later with a ``KeyError``.

    The input frame is not modified.

    Returns:
        A new DataFrame without the dropped columns (original column order
        preserved) and the sorted list of dropped column names.
    """
    # Passes 1 + 2: decide by name only, so one scan over the header is enough
    # and the frame is copied once instead of once per dropped column.
    named_drops = {
        c for c in df.columns
        if c != TARGET_COL
        and (c in LEAKAGE_COLS or any(s in c for s in COMMON_DROP_SUBSTR))
    }
    # DataFrame.drop returns a new frame (even for an empty list), so the
    # caller's frame is never mutated.
    df2 = df.drop(columns=list(named_drops))

    # Pass 3: non-numeric text columns that survived the name-based passes.
    text_cols = [
        c for c in df2.select_dtypes(include=["object", "string"]).columns
        if c != TARGET_COL
    ]
    df2 = df2.drop(columns=text_cols)

    # Pass 4: zero-variance columns carry no signal for any of the models.
    const_cols = [
        c for c in df2.columns
        if c != TARGET_COL and df2[c].nunique(dropna=False) <= 1
    ]
    df2 = df2.drop(columns=const_cols)

    return df2, sorted(named_drops | set(text_cols) | set(const_cols))

def ensure_target(df: pd.DataFrame) -> pd.DataFrame:
    """Return a frame that is guaranteed to carry the TARGET_COL column.

    If TARGET_COL is already present the frame is returned untouched.
    Otherwise the first of a few known alternative names found in the frame
    is renamed to TARGET_COL (on a renamed copy). Raises ValueError when none
    of them is present.
    """
    if TARGET_COL in df.columns:
        return df

    # Known alternative spellings, tried in this order.
    candidates = ("target", "lights_sum", "sum_lights", "nl_sum")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        raise ValueError(f"Target column '{TARGET_COL}' not found in dataframe columns: {df.columns.tolist()}")
    return df.rename(columns={found: TARGET_COL})

def prepare_xy(df: pd.DataFrame, feat_list: List[str]=None) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """Turn a raw frame into a float32 feature matrix and target vector.

    The frame is copied, given a TARGET_COL column (see ensure_target),
    stripped of leakage/ID/text/constant columns (see autodrop_cols) and
    reduced to numeric columns plus the target.

    If ``feat_list`` is given, only the names from it that survive cleaning
    are used, in ``feat_list`` order; names missing from the frame are
    skipped silently. Otherwise every surviving non-target column is a feature.

    Returns ``(X, y, feature_names)`` with X and y cast to float32.
    """
    frame, _ = autodrop_cols(ensure_target(df.copy()))

    # Numeric columns only; the target is kept regardless of its dtype.
    keep = [
        col for col in frame.columns
        if (np.issubdtype(frame[col].dtype, np.number) or col == TARGET_COL)
    ]
    frame = frame[keep]

    if feat_list is None:
        feature_cols = [col for col in frame.columns if col != TARGET_COL]
    else:
        # Follow the caller's reference ordering, restricted to what is available.
        feature_cols = [col for col in feat_list if col in frame.columns]

    features = frame[feature_cols].values.astype(np.float32)
    target = frame[TARGET_COL].values.astype(np.float32)
    return features, target, feature_cols

def align_feature_sets(df_train: pd.DataFrame, df_test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
    """Restrict two frames to the numeric feature columns they share.

    Both frames are copied, given a TARGET_COL column and cleaned with
    autodrop_cols. The numeric (non-target) column names of each are then
    intersected.

    Returns ``(train, test, common_feats)`` where both frames contain exactly
    ``common_feats`` (alphabetically sorted) followed by TARGET_COL.
    Raises ValueError if the two frames share no numeric feature.
    """
    train_with_target = ensure_target(df_train.copy())
    test_with_target = ensure_target(df_test.copy())

    train_clean, _ = autodrop_cols(train_with_target)
    test_clean, _ = autodrop_cols(test_with_target)

    def numeric_feature_names(frame):
        # Numeric columns (the target is let through, then removed again).
        kept = {
            col for col in frame.columns
            if (np.issubdtype(frame[col].dtype, np.number) or col == TARGET_COL)
        }
        return kept - {TARGET_COL}

    shared = numeric_feature_names(train_clean) & numeric_feature_names(test_clean)
    common_feats = sorted(shared)
    if not common_feats:
        raise ValueError("No aligned numeric features between train/test CSVs. Check column names (f1..fN) and cleaning.")

    # Same column order in both frames: shared features first, target last.
    ordered = common_feats + [TARGET_COL]
    return train_clean[ordered].copy(), test_clean[ordered].copy(), common_feats

def fit_impute_scale(X_train: np.ndarray) -> Tuple[SimpleImputer, StandardScaler, np.ndarray]:
    """Fit a median imputer followed by a standard scaler on the training matrix.

    Returns the fitted imputer, the fitted scaler and the imputed-then-scaled
    training matrix, in that order.
    """
    median_imputer = SimpleImputer(strategy="median")
    standardizer = StandardScaler()
    # Scaling statistics are computed on the already-imputed values.
    X_ready = standardizer.fit_transform(median_imputer.fit_transform(X_train))
    return median_imputer, standardizer, X_ready

def transform_with_imputer_scaler(X: np.ndarray, imputer: SimpleImputer, scaler: StandardScaler) -> np.ndarray:
    """Apply an already-fitted imputer and then an already-fitted scaler to X.

    Neither object is refit; only their ``transform`` methods are called.
    """
    return scaler.transform(imputer.transform(X))

def train_and_eval_models(Xtr: np.ndarray, ytr: np.ndarray, Xte: np.ndarray, yte: np.ndarray,
                          outprefix: str) -> Dict[str, Dict]:
    """Fit each regressor on (Xtr, ytr) and score it on (Xte, yte).

    The models are a linear regression, a random forest and an MLP, plus an
    XGBoost regressor when the optional import succeeded. For every model
    that trains successfully, the fitted estimator and a CSV of true vs.
    predicted values are written to OUTPUT_DIR under names starting with
    ``outprefix``.

    A model that raises is reported (message + traceback) and left out of the
    result; the remaining models still run.

    Returns a dict keyed by model name with ``r2``, ``mae``, ``time_s``
    (fit + predict + scoring wall time), ``model_file`` and ``preds_csv``.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    estimators = {}
    estimators["linear"] = LinearRegression()
    estimators["random_forest"] = RandomForestRegressor(n_estimators=250, n_jobs=4, random_state=RANDOM_STATE)
    estimators["mlp"] = MLPRegressor(hidden_layer_sizes=(512,256), max_iter=200, random_state=RANDOM_STATE)
    if XGBOOST_AVAILABLE:
        estimators["xgboost"] = xgb.XGBRegressor(n_estimators=200, tree_method="hist", n_jobs=4, random_state=RANDOM_STATE)

    summary = {}
    for label, estimator in estimators.items():
        try:
            started = time.time()
            estimator.fit(Xtr, ytr)
            y_hat = estimator.predict(Xte)
            r2 = r2_score(yte, y_hat)
            mae = mean_absolute_error(yte, y_hat)
            elapsed = time.time() - started

            # Persist the fitted estimator.
            model_fname = os.path.join(OUTPUT_DIR, f"{outprefix}__{label}_model.joblib")
            joblib.dump(estimator, model_fname)

            # Persist per-row predictions next to the ground truth.
            preds_csv = os.path.join(OUTPUT_DIR, f"{outprefix}__{label}_preds.csv")
            pd.DataFrame({"y_true": yte, "y_pred": y_hat}).to_csv(preds_csv, index=False)

            summary[label] = {
                "r2": float(r2),
                "mae": float(mae),
                "time_s": elapsed,
                "model_file": model_fname,
                "preds_csv": preds_csv
            }
            print(f"  {label:<12} R² = {r2:.6f}  MAE = {mae:.3f}  time={elapsed:.1f}s")
        except Exception as e:
            # One failing model must not abort the rest of the benchmark.
            print(f"Error training {label}: {repr(e)}")
            traceback.print_exc()
    return summary

def run_city_pair(train_csv: str, test_csv: str, prefix: str, val_frac: float=VAL_FRAC):
    """Train on one CSV, evaluate on another, and persist all artifacts.

    Steps:
      - load both CSVs and align them to their shared numeric features
      - when ``0 < val_frac < 0.5``, hold out that fraction of the training
        rows; the held-out rows are NOT used afterwards, so the models see
        only the remaining training rows
      - fit the imputer + scaler on the training rows and apply them to the test rows
      - train/evaluate all models and write ``<prefix>__prep.joblib`` and
        ``<prefix>__summary.json`` into OUTPUT_DIR

    Returns the per-model results dict from train_and_eval_models.
    """
    print("\n=== Cross eval: TRAIN", train_csv, "-> TEST", test_csv, "===")
    train_raw = read_df(train_csv)
    test_raw = read_df(test_csv)

    train_aligned, test_aligned, feat_list = align_feature_sets(train_raw, test_raw)
    print("Aligned feature count =", len(feat_list))

    Xtr, ytr, _ = prepare_xy(train_aligned)
    if 0.0 < val_frac < 0.5:
        # The validation part of the split is intentionally discarded.
        Xtr, _, ytr, _ = train_test_split(Xtr, ytr, test_size=val_frac, random_state=RANDOM_STATE)

    Xte, yte, _ = prepare_xy(test_aligned)

    # Preprocessing statistics come from the training rows only.
    imputer, scaler, Xtr_scaled = fit_impute_scale(Xtr)
    Xte_scaled = transform_with_imputer_scaler(Xte, imputer, scaler)

    prep_path = os.path.join(OUTPUT_DIR, f"{prefix}__prep.joblib")
    prep_bundle = {"imputer": imputer, "scaler": scaler, "features": feat_list}
    joblib.dump(prep_bundle, prep_path)

    results = train_and_eval_models(Xtr_scaled, ytr, Xte_scaled, yte, outprefix=prefix)

    summary_path = os.path.join(OUTPUT_DIR, f"{prefix}__summary.json")
    with open(summary_path, "w") as fh:
        json.dump({"features": feat_list, "results": results}, fh, indent=2)
    return results

def run_combined_benchmark(csv_list: List[str], outprefix: str, test_frac: float=TEST_FRAC):
    """Pool one or more CSVs and benchmark the models on a random train/test split.

    With a single CSV this is a plain within-city benchmark. With several
    CSVs the frames are first reduced to the numeric features they all share
    (via align_feature_sets) and only then concatenated.

    Fixes over the previous version:
      * The aligned frame built in the loop was discarded and the raw,
        unaligned CSVs were concatenated instead, so columns present in only
        one CSV entered the model half-NaN and were median-imputed. The
        aligned frame is now the one that is actually used.
      * The ``features`` list stored in ``<outprefix>__prep.joblib`` included
        the target column; it now lists feature columns only, in the same
        order as the columns of the matrix the imputer/scaler were fitted on.
      * ``<outprefix>__summary.json`` now also records ``features``, matching
        the summary written by run_city_pair.

    Args:
        csv_list: paths of the CSVs to pool (at least one).
        outprefix: filename prefix for everything written to OUTPUT_DIR.
        test_frac: fraction of pooled rows held out for testing; a value
            outside (0, 1) falls back to ``max(MIN_TEST_FRAC, TEST_FRAC)``.

    Returns:
        The per-model results dict from train_and_eval_models.

    Raises:
        IndexError: if ``csv_list`` is empty.
        ValueError: if a CSV has no usable target column or the CSVs share
            no numeric feature.
    """
    print("\n=== Combined training on", csv_list, "===")
    dfs = [read_df(p) for p in csv_list]

    # Fold the CSVs together one at a time, keeping only the features common
    # to everything pooled so far and the next CSV.
    df_comb = dfs[0]
    for other in dfs[1:]:
        df_comb, other_aligned, _ = align_feature_sets(df_comb, other)
        df_comb = pd.concat([df_comb, other_aligned], ignore_index=True)

    # Clean the pooled frame. Needed for the single-CSV case (which never went
    # through align_feature_sets) and to drop columns that only become
    # droppable after pooling.
    df_comb = ensure_target(df_comb)
    df_comb_clean, _ = autodrop_cols(df_comb)
    numeric_cols = [c for c in df_comb_clean.columns if (np.issubdtype(df_comb_clean[c].dtype, np.number) or c == TARGET_COL)]
    df_comb_clean = df_comb_clean[numeric_cols]
    feature_cols = [c for c in df_comb_clean.columns if c != TARGET_COL]

    # split 80/20 (or whatever test_frac says, if it is a valid fraction)
    if not (0.0 < test_frac < 1.0):
        test_frac = max(MIN_TEST_FRAC, TEST_FRAC)
    X_all = df_comb_clean[feature_cols].values.astype(np.float32)
    y_all = df_comb_clean[TARGET_COL].values.astype(np.float32)
    Xtr, Xte, ytr, yte = train_test_split(X_all, y_all, test_size=test_frac, random_state=RANDOM_STATE)

    # Imputer + scaler are fitted on the training rows only to avoid test leakage.
    imputer, scaler, Xtr_scaled = fit_impute_scale(Xtr)
    Xte_scaled = transform_with_imputer_scaler(Xte, imputer, scaler)
    joblib.dump(
        {"imputer": imputer, "scaler": scaler, "features": feature_cols},
        os.path.join(OUTPUT_DIR, f"{outprefix}__prep.joblib"),
    )

    results = train_and_eval_models(Xtr_scaled, ytr, Xte_scaled, yte, outprefix=outprefix)
    with open(os.path.join(OUTPUT_DIR, f"{outprefix}__summary.json"), "w") as fh:
        json.dump({"features": feature_cols, "results": results}, fh, indent=2)
    return results

# ----------------------------
# Main runner
# ----------------------------
def main():
    """Run the five benchmark stages in order and report the total wall time.

    Stages: Shanghai-only, Vegas-only, Shanghai -> Vegas, Vegas -> Shanghai,
    and combined training. A stage that raises is reported (message +
    traceback) and the remaining stages still run. Raises FileNotFoundError
    up front if either input CSV is missing.
    """
    tstart = time.time()
    print("Full pipeline start. OUTPUT_DIR=", OUTPUT_DIR)

    # Basic validations: fail fast on the first missing input file.
    missing = next((p for p in (VEG_CSV, SH_CSV) if not os.path.exists(p)), None)
    if missing is not None:
        raise FileNotFoundError(f"CSV not found: {missing}. Edit paths at top of script and try again.")

    # (banner or None, failure label, callable, positional args, keyword args)
    stages = [
        # 1) Shanghai-only
        ("\n-> Running Shanghai-only training", "Shanghai-only failed:",
         run_combined_benchmark, ([SH_CSV],), {"outprefix": "shanghai_only"}),
        # 2) Vegas-only
        ("\n-> Running Vegas-only training", "Vegas-only failed:",
         run_combined_benchmark, ([VEG_CSV],), {"outprefix": "vegas_only"}),
        # 3) Cross-city: Shanghai -> Vegas
        (None, "Cross sh->vg failed:",
         run_city_pair, (SH_CSV, VEG_CSV), {"prefix": "shanghai_TO_vegas"}),
        # 4) Cross-city: Vegas -> Shanghai
        (None, "Cross vg->sh failed:",
         run_city_pair, (VEG_CSV, SH_CSV), {"prefix": "vegas_TO_shanghai"}),
        # 5) Combined training (80/20)
        (None, "Combined training failed:",
         run_combined_benchmark, ([SH_CSV, VEG_CSV],), {"outprefix": "combined_sh_vg"}),
    ]

    for banner, failure_label, stage_fn, args, kwargs in stages:
        try:
            if banner is not None:
                print(banner)
            stage_fn(*args, **kwargs)
        except Exception as e:
            # Stages are independent: report the failure and carry on.
            print(failure_label, repr(e))
            traceback.print_exc()

    print("All done. Total time: %.1f s" % (time.time() - tstart))

if __name__ == "__main__":
    main()

Full pipeline start. OUTPUT_DIR= ../data/output/cross_city_results

-> Running Shanghai-only training

=== Combined training on ['../data/output/shanghai_merged_full.csv'] ===
Loading: ../data/output/shanghai_merged_full.csv
 -> shape: (4582, 782)
  linear       R² = 0.268750  MAE = 13.926  time=0.1s
  random_forest R² = 0.476686  MAE = 10.196  time=144.1s
  mlp          R² = 0.469725  MAE = 10.608  time=7.8s
  xgboost      R² = 0.408690  MAE = 11.022  time=3.7s

-> Running Vegas-only training

=== Combined training on ['../data/output/vegas_merged_full.csv'] ===
Loading: ../data/output/vegas_merged_full.csv
 -> shape: (3847, 777)
  linear       R² = 0.137553  MAE = 53.344  time=0.1s
  random_forest R² = 0.422090  MAE = 31.094  time=132.3s
  mlp          R² = 0.352351  MAE = 36.852  time=3.0s
  xgboost      R² = 0.383343  MAE = 31.217  time=4.6s

=== Cross eval: TRAIN ../data/output/shanghai_merged_full.csv -> TEST ../data/output/vegas_merged_full.csv ===
Loading: ../data/output/shangh