In [11]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib import rc
from IPython.display import display
from scipy.stats import pearsonr
import seaborn as sns
from scipy.optimize import nnls  # for non-negative least squares
from scipy.stats import linregress
import matplotlib.patches as mpatches

rc('mathtext', default='regular')
from fun_LR_hydro_memory import *  # uses get_gauge_id()

# === Paths ===
# Project root. Defaults to the original author's local checkout; set the
# GW_ANALYSIS_ROOT environment variable to run the notebook on another machine
# without editing this cell (an empty value falls back to the default).
root = os.environ.get('GW_ANALYSIS_ROOT') or '/Users/cag/Dropbox/0_Research/A_Fondecyts/2024_CAG_Ini_11240924/fdcyt_cag_GW_analysis/Analysis_cag_mid'
data_dir = os.path.join(root, 'data')

# Station -> basin join tables (only consumed by the commented-out get_gauge_id() call)
path_join_camels = os.path.join(data_dir, 'gwl_dga_inner_join_with_CAMELScl_basins.csv')
path_join_bna    = os.path.join(data_dir, 'gwl_dga_inner_join_with_BNA_basins.csv')

# === Load data ===
# Streamflow (monthly): one column per station plus an 'Index' column parsed as dates
q_all      = pd.read_csv(os.path.join(data_dir, 'q_mm_dga_mon_ts_1960_2025.csv'), parse_dates=['Index'])

# Predictors (monthly). Note the date column differs between tables:
# precipitation uses 'Index', evapotranspiration uses 'date'.
pr_camels  = pd.read_csv(os.path.join(data_dir, 'cr2met_v2p5R1_pr_mon_CAMELScl_ts_1960_2025.csv'), parse_dates=['Index'])
pr_bna     = pd.read_csv(os.path.join(data_dir, 'cr2met_v2p5R1_pr_mon_BNA_ts_1960_2025.csv'),      parse_dates=['Index'])
et_camels  = pd.read_csv(os.path.join(data_dir, 'et_wb_mm_mon_cr2met_v2.5_cr2luc_beta_camels_v2025_ts.csv'), parse_dates=['date'])

# Every non-date column of the streamflow table is treated as a station code
stations = [col for col in q_all.columns if col != 'Index']

# === Global parameters ===
start = "1960-01-01"   # first date kept (inclusive label slice)
end   = "2025-12-31"   # last date kept (inclusive label slice)
min_obs = 100          # min monthly obs per station to be kept

# === Pre-allocate collectors ===
# Empty frames that the station loop fills with one column per kept station
basin_pr_all = pd.DataFrame()
basin_et_all = pd.DataFrame()
basin_q_all  = pd.DataFrame()

# === Build aligned station-by-station tables ===
# Predictor tables keyed by source name: (precipitation table, evapotranspiration table).
# This cell loads no BNA evapotranspiration table, so `et_bna` is looked up
# defensively; while it is missing, "bna" stations are skipped with a warning
# instead of failing with a NameError.
predictor_tables = {
    "camels": (pr_camels, et_camels),
    "bna":    (pr_bna, globals().get("et_bna")),
}

# Collect one Series per kept station and concatenate once after the loop;
# joining onto a growing DataFrame on every iteration is quadratic in the
# number of stations.
q_cols, pr_cols, et_cols = [], [], []

for cod in stations:
    # Station streamflow, restricted to the analysis window
    q_ser = pd.Series(q_all[cod].values, index=q_all['Index'], name=cod)
    q_ser = q_ser.sort_index().loc[start:end]

    # Count observations inside the window so that months outside
    # [start, end] cannot make a station pass the filter
    if q_ser.notna().sum() < min_obs:
        continue  # Skip station with too few months

    # Map station -> predictor basin id (CAMELS/BNA)
    # gauge_id, predictor_source = get_gauge_id(cod, path_join_camels, path_join_bna)
    gauge_id = cod
    predictor_source = 'camels'

    # Unknown source, predictor table not loaded, or basin column absent
    # from either table: warn and move on rather than raising KeyError
    pr_tab, et_tab = predictor_tables.get(predictor_source, (None, None))
    if (
        pr_tab is None
        or et_tab is None
        or gauge_id not in pr_tab.columns
        or gauge_id not in et_tab.columns
    ):
        print(f"⚠️ Predictors not available for station {cod}. Skipping.")
        continue

    # Precipitation tables carry their dates in 'Index', ET tables in 'date'
    pr_ser = pd.Series(pr_tab[gauge_id].values, index=pr_tab['Index'], name=cod)
    et_ser = pd.Series(et_tab[gauge_id].values, index=et_tab['date'],  name=cod)

    # Align time span to the fixed global window
    pr_ser = pr_ser.sort_index().loc[start:end]
    et_ser = et_ser.sort_index().loc[start:end]

    q_cols.append(q_ser)
    pr_cols.append(pr_ser)
    et_cols.append(et_ser)

# One column per kept station, outer-joined over the union of dates.
# pd.concat rejects an empty list, hence the empty-frame fallback.
basin_q_all  = pd.concat(q_cols,  axis=1) if q_cols  else pd.DataFrame()
basin_pr_all = pd.concat(pr_cols, axis=1) if pr_cols else pd.DataFrame()
basin_et_all = pd.concat(et_cols, axis=1) if et_cols else pd.DataFrame()

# === Compute monthly anomalies (deseasonalize by calendar-month mean) ===
def _calendar_month_anomaly(frame):
    """Return `frame` minus, per column, the mean of all rows sharing each row's calendar month."""
    monthly_mean = frame.groupby(frame.index.month).transform('mean')
    return frame - monthly_mean


if not basin_q_all.empty:
    q_an  = _calendar_month_anomaly(basin_q_all)
    pr_an = _calendar_month_anomaly(basin_pr_all)
    et_an = _calendar_month_anomaly(basin_et_all)

    # === Save results ===
    out_dir = os.path.join(root, "data/camels_q_p_et_combined")
    os.makedirs(out_dir, exist_ok=True)

    # File name -> table, written in this order:
    # raw (aligned) series first, then the monthly anomalies
    tables_to_save = {
        "camels_q.csv":     basin_q_all,
        "camels_pr.csv":    basin_pr_all,
        "camels_et.csv":    basin_et_all,
        "camels_q_an.csv":  q_an,
        "camels_pr_an.csv": pr_an,
        "camels_et_an.csv": et_an,
    }
    for out_name, table in tables_to_save.items():
        # Sort by date and label the index column "date" in the CSV header
        ordered = table.sort_index().rename_axis("date")
        ordered.to_csv(os.path.join(out_dir, out_name))
else:
    print("⚠️ No stations passed filtering — nothing to save.")


In [7]:
# Debug inspection: display the station code left in `cod`, the loop variable
# of the station loop in the main cell.
# NOTE(review): this cell's execution count (7) is lower than the main cell's
# (11), so the output shown may be stale — re-run or remove before sharing.
cod

'12930001'

In [8]:
# Debug inspection: the earlier of the two last index dates of `q_ser` and
# `pr_ser`, as left behind by the station loop in the main cell.
# NOTE(review): executed (count 8) before the latest run of the main cell
# (count 11), so the output shown may be stale.
min(q_ser.index.max(), pr_ser.index.max())

Timestamp('2020-12-01 00:00:00')

In [9]:
# Debug inspection: last index date of `q_ser`, as left behind by the station
# loop in the main cell.
# NOTE(review): executed (count 9) before the latest run of the main cell
# (count 11), so the output shown may be stale.
q_ser.index.max()

Timestamp('2020-12-01 00:00:00')