In [8]:
# Import libraries
import xarray as xr
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
from tqdm import tqdm
import matplotlib.pyplot as plt
import glob
import os
from collections import defaultdict
from causal_ccm import ccm
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import mutual_info_regression
from concurrent.futures import ProcessPoolExecutor, as_completed
import pickle

In [2]:
df_E_temp = pd.DataFrame({
    'NAME_1': [
        'Azuay', 'Bolivar', 'Carchi', 'Cañar', 'Chimborazo', 'Cotopaxi', 'El Oro',
        'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios', 'Manabi',
        'Morona Santiago', 'Napo', 'Orellana', 'Pastaza', 'Pichincha', 'Santa Elena',
        'Santo Domingo de los Tsachilas', 'Sucumbios', 'Tungurahua', 'Zamora Chinchipe'
    ],
    'E_star': [
        5, 5, 6, 5, 7, 7, 5, 5, 5, 5, 5, 6, 6, 5, 6, 4, 5, 6, 4, 7, 5, 4, 5
    ],
    'rho_at_E_star': [
        0.798060, 0.741944, 0.799989, 0.792925, 0.738950, 0.800925, 0.866587,
        0.825330, 0.882790, 0.808089, 0.806972, 0.770027, 0.854267, 0.696767,
        0.671783, 0.668285, 0.671537, 0.784627, 0.894755, 0.791073, 0.707628,
        0.707098, 0.754584
    ]
})

In [3]:
df_tau_mi = pd.DataFrame({
    'NAME_1': [
        'Azuay', 'Bolivar', 'Carchi', 'Cañar', 'Chimborazo', 'Cotopaxi', 'El Oro',
        'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios', 'Manabi',
        'Morona Santiago', 'Napo', 'Orellana', 'Pastaza', 'Pichincha', 'Santa Elena',
        'Santo Domingo de los Tsachilas', 'Sucumbios', 'Tungurahua', 'Zamora Chinchipe'
    ],
    'tau_mi': [
        28, 28, 23, 23, 26, 30, 22, 26, 27, 28, 26, 28, 30, 26, 23, 18, 16, 23, 30, 30, 15, 23, 22
    ],
    'fnn_ratio': [
        0.007994, 0.004899, 0.000000, 0.007943, 0.000000, 0.000000, 0.007165,
        0.006173, 0.006438, 0.006962, 0.003344, 0.000519, 0.001042, 0.005401,
        0.001031, 0.052965, 0.003555, 0.000000, 0.040534, 0.000000, 0.005326,
        0.056801, 0.009468
    ]
})

In [4]:
df_E_pr = pd.DataFrame({
    'NAME_1': [
        'Azuay', 'Bolivar', 'Carchi', 'Cañar', 'Chimborazo', 'Cotopaxi', 'El Oro',
        'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios', 'Manabi',
        'Morona Santiago', 'Napo', 'Orellana', 'Pastaza', 'Pichincha', 'Santa Elena',
        'Santo Domingo de los Tsachilas', 'Sucumbios', 'Tungurahua', 'Zamora Chinchipe'
    ],
    'E_star': [
        4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5
    ],
    'rho_at_E_star': [
        0.890573, 0.892491, 0.780876, 0.874656, 0.898843, 0.845710, 0.860979,
        0.791408, 0.892283, 0.770685, 0.847795, 0.876009, 0.833260, 0.817477,
        0.845026, 0.784604, 0.796851, 0.780267, 0.867449, 0.788252, 0.802600,
        0.819318, 0.835008
    ]
})

In [5]:
df_tau_prcp = pd.DataFrame({
    'NAME_1': [
        'Azuay', 'Bolivar', 'Carchi', 'Cañar', 'Chimborazo', 'Cotopaxi', 'El Oro',
        'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios', 'Manabi',
        'Morona Santiago', 'Napo', 'Orellana', 'Pastaza', 'Pichincha', 'Santa Elena',
        'Santo Domingo de los Tsachilas', 'Sucumbios', 'Tungurahua', 'Zamora Chinchipe'
    ],
    'tau_mi_prcp': [
        9, 14, 7, 12, 14, 15, 10, 13, 15, 7, 8, 14, 14, 8, 4, 8, 8, 15, 14, 8, 5, 5, 7
    ],
    'fnn_ratio_prcp': [
        0.097187, 0.036981, 0.022596, 0.022233, 0.045593, 0.036013, 0.038306,
        0.037946, 0.066954, 0.037158, 0.029663, 0.029889, 0.054205, 0.040724,
        0.059780, 0.007557, 0.021619, 0.049455, 0.060790, 0.023379, 0.026046,
        0.039319, 0.045443
    ]
})

In [9]:
with open('df_ccm_por_prov.pkl', 'rb') as f:
    df_ccm_por_prov = pickle.load(f)

In [25]:
# Parameters for Pichincha embedding (as example)
prov = 'Azuay'
E_t   = int(df_E_pr.loc[df_E_pr.NAME_1 == prov, 'E_star'].iloc[0])
tau_t = int(df_tau_prcp.loc[df_tau_prcp.NAME_1 == prov, 'tau_mi_prcp'].iloc[0])

# Data for Pichincha
df_p = df_ccm_por_prov[prov].reset_index(drop=True)
dates = pd.to_datetime(df_p['fecha'])

# Sliding window size:
window_size = 180

# Containers for results
sliding_results = []

# Range of lead times to test
lead_range = range(0, 61) 

# Loop over days allowing a full window
for idx in tqdm(range(window_size - 1, len(df_p)), desc="Sliding CCM 180d"):
    sub = df_p.iloc[idx - window_size + 1 : idx + 1]
    Y = sub['prcp_z'].values
    X = sub['sst_z'].values

    best_rho = -np.inf
    best_lead = None
    best_pval = None

    # Test each lead time within the window
    for lt in lead_range:
        X_shift = np.roll(X, lt)
        c = ccm(X_shift, Y, tau_t, E_t, len(Y))
        rho, _ = c.causality()
        if rho > best_rho:
            best_rho = rho
            best_lead = lt

    sliding_results.append({
        'fecha':    dates.iloc[idx],
        'tau_lead': best_lead,
        'rho':      best_rho,
        'pval':      best_pval
    })

Sliding CCM 180d: 100%|███████████████████████████████████| 3839/3839 [26:52<00:00,  2.38it/s]


In [26]:
df_azuay = pd.DataFrame(sliding_results)
df_azuay

Unnamed: 0,fecha,tau_lead,rho,pval
0,1990-06-29,46,0.034737,
1,1990-06-30,44,0.066676,
2,1990-07-01,52,0.078859,
3,1990-07-02,51,0.088963,
4,1990-07-03,50,0.090534,
...,...,...,...,...
3834,2000-12-27,19,0.431462,
3835,2000-12-28,19,0.409273,
3836,2000-12-29,17,0.406076,
3837,2000-12-30,19,0.429624,


In [27]:
# 1. Selecciona y renombra las columnas de anomalías del primer DataFrame
df1 = (
    df_ccm_por_prov['Azuay']
    [['fecha', 'sst_z', 'prcp_z']]
    .rename(columns={
        'prcp_z': 'sst_anomaly',
        't2m_z': 'pr_anomaly'
    })
)

# 2. Selecciona y renombra las columnas relevantes del segundo DataFrame
df2 = (
    df_azuay
    [['fecha', 'tau_lead', 'rho']]
    .rename(columns={
        'tau_lead': 'time_lead'
    })
)

# 3. Haz el merge por fecha
df_final = pd.merge(df1, df2, on='fecha')

# 4. Guarda el resultado en un archivo pickle
df_final.to_pickle('ccm_azuay_pr.pkl')

# (Opcional) Ver un vistazo de las primeras filas
print(df_final.head())

       fecha     sst_z  sst_anomaly  time_lead       rho
0 1990-06-29  0.245944    -0.489747         46  0.034737
1 1990-06-30  0.234067    -0.510327         44  0.066676
2 1990-07-01  0.255393    -0.491277         52  0.078859
3 1990-07-02  0.264070    -0.443371         51  0.088963
4 1990-07-03  0.213622     0.069194         50  0.090534


In [28]:
# Parameters for Pichincha embedding (as example)
prov = 'Chimborazo'
E_t   = int(df_E_pr.loc[df_E_pr.NAME_1 == prov, 'E_star'].iloc[0])
tau_t = int(df_tau_prcp.loc[df_tau_prcp.NAME_1 == prov, 'tau_mi_prcp'].iloc[0])

# Data for Pichincha
df_p = df_ccm_por_prov[prov].reset_index(drop=True)
dates = pd.to_datetime(df_p['fecha'])

# Sliding window size:
window_size = 180

# Containers for results
sliding_results = []

# Range of lead times to test
lead_range = range(0, 61) 

# Loop over days allowing a full window
for idx in tqdm(range(window_size - 1, len(df_p)), desc="Sliding CCM 180d"):
    sub = df_p.iloc[idx - window_size + 1 : idx + 1]
    Y = sub['prcp_z'].values
    X = sub['sst_z'].values

    best_rho = -np.inf
    best_lead = None
    best_pval = None

    # Test each lead time within the window
    for lt in lead_range:
        X_shift = np.roll(X, lt)
        c = ccm(X_shift, Y, tau_t, E_t, len(Y))
        rho, _ = c.causality()
        if rho > best_rho:
            best_rho = rho
            best_lead = lt

    sliding_results.append({
        'fecha':    dates.iloc[idx],
        'tau_lead': best_lead,
        'rho':      best_rho,
        'pval':      best_pval
    })

Sliding CCM 180d: 100%|███████████████████████████████████| 3839/3839 [19:02<00:00,  3.36it/s]


In [29]:
df_chimborazo = pd.DataFrame(sliding_results)
df_chimborazo

Unnamed: 0,fecha,tau_lead,rho,pval
0,1990-06-29,34,0.233302,
1,1990-06-30,34,0.232279,
2,1990-07-01,34,0.259646,
3,1990-07-02,49,0.251663,
4,1990-07-03,34,0.221707,
...,...,...,...,...
3834,2000-12-27,0,0.320048,
3835,2000-12-28,0,0.325001,
3836,2000-12-29,0,0.326990,
3837,2000-12-30,60,0.407192,


In [30]:
# 1. Selecciona y renombra las columnas de anomalías del primer DataFrame
df1 = (
    df_ccm_por_prov['Chimborazo']
    [['fecha', 'sst_z', 'prcp_z']]
    .rename(columns={
        'prcp_z': 'sst_anomaly',
        't2m_z': 'pr_anomaly'
    })
)

# 2. Selecciona y renombra las columnas relevantes del segundo DataFrame
df2 = (
    df_chimborazo
    [['fecha', 'tau_lead', 'rho']]
    .rename(columns={
        'tau_lead': 'time_lead'
    })
)

# 3. Haz el merge por fecha
df_final = pd.merge(df1, df2, on='fecha')

# 4. Guarda el resultado en un archivo pickle
df_final.to_pickle('ccm_chimborazo_pr.pkl')

# (Opcional) Ver un vistazo de las primeras filas
print(df_final.head())

       fecha     sst_z  sst_anomaly  time_lead       rho
0 1990-06-29  0.245944    -0.696139         34  0.233302
1 1990-06-30  0.234067    -0.725074         34  0.232279
2 1990-07-01  0.255393    -0.686408         34  0.259646
3 1990-07-02  0.264070    -0.601599         49  0.251663
4 1990-07-03  0.213622     0.245609         34  0.221707


In [31]:
# Parameters for Pichincha embedding (as example)
prov = 'Guayas'
E_t   = int(df_E_pr.loc[df_E_pr.NAME_1 == prov, 'E_star'].iloc[0])
tau_t = int(df_tau_prcp.loc[df_tau_prcp.NAME_1 == prov, 'tau_mi_prcp'].iloc[0])

# Data for Pichincha
df_p = df_ccm_por_prov[prov].reset_index(drop=True)
dates = pd.to_datetime(df_p['fecha'])

# Sliding window size:
window_size = 180

# Containers for results
sliding_results = []

# Range of lead times to test
lead_range = range(0, 61) 

# Loop over days allowing a full window
for idx in tqdm(range(window_size - 1, len(df_p)), desc="Sliding CCM 180d"):
    sub = df_p.iloc[idx - window_size + 1 : idx + 1]
    Y = sub['prcp_z'].values
    X = sub['sst_z'].values

    best_rho = -np.inf
    best_lead = None
    best_pval = None

    # Test each lead time within the window
    for lt in lead_range:
        X_shift = np.roll(X, lt)
        c = ccm(X_shift, Y, tau_t, E_t, len(Y))
        rho, _ = c.causality()
        if rho > best_rho:
            best_rho = rho
            best_lead = lt

    sliding_results.append({
        'fecha':    dates.iloc[idx],
        'tau_lead': best_lead,
        'rho':      best_rho,
        'pval':      best_pval
    })

Sliding CCM 180d: 100%|███████████████████████████████████| 3839/3839 [20:31<00:00,  3.12it/s]


In [32]:
df_guayas = pd.DataFrame(sliding_results)
df_guayas

Unnamed: 0,fecha,tau_lead,rho,pval
0,1990-06-29,44,0.075368,
1,1990-06-30,41,0.127993,
2,1990-07-01,41,0.117719,
3,1990-07-02,41,0.118292,
4,1990-07-03,41,0.129317,
...,...,...,...,...
3834,2000-12-27,39,0.427819,
3835,2000-12-28,39,0.406260,
3836,2000-12-29,39,0.347640,
3837,2000-12-30,48,0.391438,


In [33]:
# 1. Selecciona y renombra las columnas de anomalías del primer DataFrame
df1 = (
    df_ccm_por_prov['Guayas']
    [['fecha', 'sst_z', 'prcp_z']]
    .rename(columns={
        'prcp_z': 'sst_anomaly',
        't2m_z': 'pr_anomaly'
    })
)

# 2. Selecciona y renombra las columnas relevantes del segundo DataFrame
df2 = (
    df_guayas
    [['fecha', 'tau_lead', 'rho']]
    .rename(columns={
        'tau_lead': 'time_lead'
    })
)

# 3. Haz el merge por fecha
df_final = pd.merge(df1, df2, on='fecha')

# 4. Guarda el resultado en un archivo pickle
df_final.to_pickle('ccm_guayas_pr.pkl')

# (Opcional) Ver un vistazo de las primeras filas
print(df_final.head())

       fecha     sst_z  sst_anomaly  time_lead       rho
0 1990-06-29  0.245944    -0.521689         44  0.075368
1 1990-06-30  0.234067    -0.532888         41  0.127993
2 1990-07-01  0.255393    -0.570063         41  0.117719
3 1990-07-02  0.264070    -0.531407         41  0.118292
4 1990-07-03  0.213622    -0.065842         41  0.129317
