# In [1]:
import pandas as pd
import numpy as np
import os
from functions.functions_outliers_cr2sub import *

# === Load data ===
tag = "cr2sub"
version = "v1.1"

# Read the input table and order its rows chronologically in one chained step.
gw_all = (
    pd.read_csv(f"../{tag}/{tag}_{version}_gwl_mon.csv", parse_dates=['date'])
    .sort_values('date')
)

# Shared date axis: every per-well series below is indexed on it.
date_index = pd.DatetimeIndex(gw_all['date'])

# Every column other than 'date' is treated as one well.
well_codes = [name for name in gw_all.columns if name != 'date']

# === Global parameters ===
# Tuning values for the per-well cleaning loop below. Only min_obs is used
# directly in this script; the others are forwarded unchanged to helpers
# imported from functions.functions_outliers_cr2sub, so their exact meaning
# is defined there.
min_obs    = 100  # wells with fewer non-NaN values than this are skipped (written as an all-NaN column)
win_months = 6    # forwarded to run_pass and final_tail_filters; presumably a window length in months -- TODO confirm
perc_thr   = 0.99 # P99 for jump/residual thresholds
tail_p     = 0.01 # NOTE(review): earlier comment said "0.5% / 99.5% for extreme tails"; confirm whether 0.01 is split across both tails
skew_thr   = 0.3  # forwarded to final_tail_filters; presumably a skewness threshold -- TODO confirm
min_n      = 50   # forwarded to final_tail_filters; presumably a minimum sample size -- TODO confirm
skew_tail_p = 0.01  # forwarded to final_tail_filters
min_nan_gap = 6   # forwarded to remove_isolated_islands; e.g., need ≥6-month NaN deserts
max_island_len = 3  # forwarded to remove_isolated_islands; presumably the longest run treated as an isolated island -- TODO confirm

# Cleaned series per well, keyed by well code. The wide DataFrame is assembled
# from this dict in a single step after the loop, which avoids fragmentation.
cols = {}

for cod in well_codes:
    gw_raw = pd.Series(gw_all[cod].values, index=date_index)

    # Too few valid observations: keep the well as an all-NaN column and move on.
    if gw_raw.count() < min_obs:
        cols[cod] = pd.Series(np.nan, index=date_index)
        continue

    # ---------- Raw time series, ordered by date ----------
    s0 = gw_raw.sort_index()

    # ---------- Pass 1: run_pass on the raw series ----------
    s1, rule0_1, rule1_1, smooth1 = run_pass(s0, win_months, perc_thr)

    # ---------- Pass 2: same call, applied to the output of pass 1 ----------
    s2, rule0_2, rule1_2, smooth2 = run_pass(s1, win_months, perc_thr)

    # ---------- Tail filters, applied to the output of pass 2 ----------
    s3, tail_idx1, tail_idx2 = final_tail_filters(
        s2,
        win_months=win_months,
        perc_thr=perc_thr,
        tail_p=tail_p,
        skew_thr=skew_thr,
        min_n=min_n,
        skew_tail_p=skew_tail_p,
    )

    # ---------- Isolated-island removal, applied to the tail-filtered series ----------
    gw_final, islands_idx = remove_isolated_islands(
        s3,
        min_nan_gap=min_nan_gap,
        max_island_len=max_island_len,
    )

    # ---------- Store the result aligned to the shared date axis ----------
    cols[cod] = gw_final.reindex(date_index)

         

# Build the wide DataFrame once at the end to avoid fragmentation
# (one column per entry in `cols`, rows on the shared date axis).
gw_filtered_wide = pd.DataFrame(cols, index=date_index)

# =============== SAVE csv =======
# Nothing is written when the frame is empty (no rows or no well columns).
if not gw_filtered_wide.empty:
    gw_filtered_wide = gw_filtered_wide.sort_index()
    gw_filtered_wide.index.name = 'date'
    # Write into the same ../<tag>/ directory the input was read from.
    # The directory is derived from `tag` only: the previous form joined a
    # hard-coded '../cr2sub' with another '../<tag>/...' segment, which gave
    # a redundant '../cr2sub/../<tag>/...' path and would fail whenever
    # '../cr2sub' does not exist (e.g. after changing `tag`).
    out_path = os.path.join('..', tag, f"{tag}_{version}_gwl_mon_clean.csv")
    gw_filtered_wide.to_csv(out_path, na_rep='NA')
    