In [1]:
# Import libraries
import xarray as xr
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
from tqdm import tqdm
import matplotlib.pyplot as plt
import glob
import os
from collections import defaultdict
from causal_ccm import ccm
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import mutual_info_regression
from concurrent.futures import ProcessPoolExecutor, as_completed
import pickle

In [2]:
df_E_temp = pd.DataFrame({
    'NAME_1': [
        'Azuay', 'Bolivar', 'Carchi', 'Cañar', 'Chimborazo', 'Cotopaxi', 'El Oro',
        'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios', 'Manabi',
        'Morona Santiago', 'Napo', 'Orellana', 'Pastaza', 'Pichincha', 'Santa Elena',
        'Santo Domingo de los Tsachilas', 'Sucumbios', 'Tungurahua', 'Zamora Chinchipe'
    ],
    'E_star': [
        5, 5, 6, 5, 7, 7, 5, 5, 5, 5, 5, 6, 6, 5, 6, 4, 5, 6, 4, 7, 5, 4, 5
    ],
    'rho_at_E_star': [
        0.798060, 0.741944, 0.799989, 0.792925, 0.738950, 0.800925, 0.866587,
        0.825330, 0.882790, 0.808089, 0.806972, 0.770027, 0.854267, 0.696767,
        0.671783, 0.668285, 0.671537, 0.784627, 0.894755, 0.791073, 0.707628,
        0.707098, 0.754584
    ]
})

In [3]:
df_tau_mi = pd.DataFrame({
    'NAME_1': [
        'Azuay', 'Bolivar', 'Carchi', 'Cañar', 'Chimborazo', 'Cotopaxi', 'El Oro',
        'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios', 'Manabi',
        'Morona Santiago', 'Napo', 'Orellana', 'Pastaza', 'Pichincha', 'Santa Elena',
        'Santo Domingo de los Tsachilas', 'Sucumbios', 'Tungurahua', 'Zamora Chinchipe'
    ],
    'tau_mi': [
        28, 28, 23, 23, 26, 30, 22, 26, 27, 28, 26, 28, 30, 26, 23, 18, 16, 23, 30, 30, 15, 23, 22
    ],
    'fnn_ratio': [
        0.007994, 0.004899, 0.000000, 0.007943, 0.000000, 0.000000, 0.007165,
        0.006173, 0.006438, 0.006962, 0.003344, 0.000519, 0.001042, 0.005401,
        0.001031, 0.052965, 0.003555, 0.000000, 0.040534, 0.000000, 0.005326,
        0.056801, 0.009468
    ]
})

In [4]:
df_E_pr = pd.DataFrame({
    'NAME_1': [
        'Azuay', 'Bolivar', 'Carchi', 'Cañar', 'Chimborazo', 'Cotopaxi', 'El Oro',
        'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios', 'Manabi',
        'Morona Santiago', 'Napo', 'Orellana', 'Pastaza', 'Pichincha', 'Santa Elena',
        'Santo Domingo de los Tsachilas', 'Sucumbios', 'Tungurahua', 'Zamora Chinchipe'
    ],
    'E_star': [
        4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5
    ],
    'rho_at_E_star': [
        0.890573, 0.892491, 0.780876, 0.874656, 0.898843, 0.845710, 0.860979,
        0.791408, 0.892283, 0.770685, 0.847795, 0.876009, 0.833260, 0.817477,
        0.845026, 0.784604, 0.796851, 0.780267, 0.867449, 0.788252, 0.802600,
        0.819318, 0.835008
    ]
})

In [5]:
df_tau_prcp = pd.DataFrame({
    'NAME_1': [
        'Azuay', 'Bolivar', 'Carchi', 'Cañar', 'Chimborazo', 'Cotopaxi', 'El Oro',
        'Esmeraldas', 'Guayas', 'Imbabura', 'Loja', 'Los Rios', 'Manabi',
        'Morona Santiago', 'Napo', 'Orellana', 'Pastaza', 'Pichincha', 'Santa Elena',
        'Santo Domingo de los Tsachilas', 'Sucumbios', 'Tungurahua', 'Zamora Chinchipe'
    ],
    'tau_mi_prcp': [
        9, 14, 7, 12, 14, 15, 10, 13, 15, 7, 8, 14, 14, 8, 4, 8, 8, 15, 14, 8, 5, 5, 7
    ],
    'fnn_ratio_prcp': [
        0.097187, 0.036981, 0.022596, 0.022233, 0.045593, 0.036013, 0.038306,
        0.037946, 0.066954, 0.037158, 0.029663, 0.029889, 0.054205, 0.040724,
        0.059780, 0.007557, 0.021619, 0.049455, 0.060790, 0.023379, 0.026046,
        0.039319, 0.045443
    ]
})

In [6]:
with open('df_ccm_por_prov.pkl', 'rb') as f:
    df_ccm_por_prov = pickle.load(f)

In [16]:
# Parameters for Pichincha embedding (as example)
prov = 'Cañar'
E_t   = int(df_E_pr.loc[df_E_pr.NAME_1 == prov, 'E_star'].iloc[0])
tau_t = int(df_tau_prcp.loc[df_tau_prcp.NAME_1 == prov, 'tau_mi_prcp'].iloc[0])

# Data for Pichincha
df_p = df_ccm_por_prov[prov].reset_index(drop=True)
dates = pd.to_datetime(df_p['fecha'])

# Sliding window size:
window_size = 180

# Containers for results
sliding_results = []

# Range of lead times to test
lead_range = range(0, 61) 

# Loop over days allowing a full window
for idx in tqdm(range(window_size - 1, len(df_p)), desc="Sliding CCM 180d"):
    sub = df_p.iloc[idx - window_size + 1 : idx + 1]
    Y = sub['prcp_z'].values
    X = sub['sst_z'].values

    best_rho = -np.inf
    best_lead = None
    best_pval = None

    # Test each lead time within the window
    for lt in lead_range:
        X_shift = np.roll(X, lt)
        c = ccm(X_shift, Y, tau_t, E_t, len(Y))
        rho, _ = c.causality()
        if rho > best_rho:
            best_rho = rho
            best_lead = lt

    sliding_results.append({
        'fecha':    dates.iloc[idx],
        'tau_lead': best_lead,
        'rho':      best_rho,
        'pval':      best_pval
    })

Sliding CCM 180d: 100%|███████████████████████████████████| 3839/3839 [23:30<00:00,  2.72it/s]


In [17]:
df_canar = pd.DataFrame(sliding_results)
df_canar

Unnamed: 0,fecha,tau_lead,rho,pval
0,1990-06-29,12,0.315722,
1,1990-06-30,23,0.309737,
2,1990-07-01,29,0.319131,
3,1990-07-02,12,0.291420,
4,1990-07-03,12,0.295137,
...,...,...,...,...
3834,2000-12-27,12,0.401222,
3835,2000-12-28,12,0.420157,
3836,2000-12-29,12,0.419481,
3837,2000-12-30,12,0.368096,


In [18]:
# 1. Selecciona y renombra las columnas de anomalías del primer DataFrame
df1 = (
    df_ccm_por_prov['Cañar']
    [['fecha', 'sst_z', 'prcp_z']]
    .rename(columns={
        'prcp_z': 'sst_anomaly',
        't2m_z': 'pr_anomaly'
    })
)

# 2. Selecciona y renombra las columnas relevantes del segundo DataFrame
df2 = (
    df_canar
    [['fecha', 'tau_lead', 'rho']]
    .rename(columns={
        'tau_lead': 'time_lead'
    })
)

# 3. Haz el merge por fecha
df_final = pd.merge(df1, df2, on='fecha')

# 4. Guarda el resultado en un archivo pickle
df_final.to_pickle('ccm_canar_pr.pkl')

# (Opcional) Ver un vistazo de las primeras filas
print(df_final.head())

       fecha     sst_z  sst_anomaly  time_lead       rho
0 1990-06-29  0.245944    -0.651993         12  0.315722
1 1990-06-30  0.234067    -0.702140         23  0.309737
2 1990-07-01  0.255393    -0.678350         29  0.319131
3 1990-07-02  0.264070    -0.542537         12  0.291420
4 1990-07-03  0.213622    -0.068056         12  0.295137


In [19]:
# Parameters for Pichincha embedding (as example)
prov = 'Esmeraldas'
E_t   = int(df_E_temp.loc[df_E_temp.NAME_1 == prov, 'E_star'].iloc[0])
tau_t = int(df_tau_mi.loc[df_tau_mi.NAME_1 == prov, 'tau_mi'].iloc[0])

# Data for Pichincha
df_p = df_ccm_por_prov[prov].reset_index(drop=True)
dates = pd.to_datetime(df_p['fecha'])

# Sliding window size:
window_size = 180

# Containers for results
sliding_results = []

# Range of lead times to test
lead_range = range(0, 61) 

# Loop over days allowing a full window
for idx in tqdm(range(window_size - 1, len(df_p)), desc="Sliding CCM 180d"):
    sub = df_p.iloc[idx - window_size + 1 : idx + 1]
    Y = sub['prcp_z'].values
    X = sub['sst_z'].values

    best_rho = -np.inf
    best_lead = None
    best_pval = None

    # Test each lead time within the window
    for lt in lead_range:
        X_shift = np.roll(X, lt)
        c = ccm(X_shift, Y, tau_t, E_t, len(Y))
        rho, _ = c.causality()
        if rho > best_rho:
            best_rho = rho
            best_lead = lt

    sliding_results.append({
        'fecha':    dates.iloc[idx],
        'tau_lead': best_lead,
        'rho':      best_rho,
        'pval':      best_pval
    })

Sliding CCM 180d: 100%|███████████████████████████████████| 3839/3839 [12:44<00:00,  5.02it/s]


In [20]:
df_esmeraldas = pd.DataFrame(sliding_results)
df_esmeraldas

Unnamed: 0,fecha,tau_lead,rho,pval
0,1990-06-29,48,-0.048031,
1,1990-06-30,47,-0.034248,
2,1990-07-01,47,-0.043919,
3,1990-07-02,47,-0.073930,
4,1990-07-03,47,-0.050557,
...,...,...,...,...
3834,2000-12-27,40,0.404985,
3835,2000-12-28,40,0.405781,
3836,2000-12-29,40,0.379830,
3837,2000-12-30,40,0.356503,


In [21]:
# 1. Selecciona y renombra las columnas de anomalías del primer DataFrame
df1 = (
    df_ccm_por_prov['Esmeraldas']
    [['fecha', 'sst_z', 'prcp_z']]
    .rename(columns={
        'prcp_z': 'sst_anomaly',
        't2m_z': 'pr_anomaly'
    })
)

# 2. Selecciona y renombra las columnas relevantes del segundo DataFrame
df2 = (
    df_esmeraldas
    [['fecha', 'tau_lead', 'rho']]
    .rename(columns={
        'tau_lead': 'time_lead'
    })
)

# 3. Haz el merge por fecha
df_final = pd.merge(df1, df2, on='fecha')

# 4. Guarda el resultado en un archivo pickle
df_final.to_pickle('ccm_esmeraldas_pr.pkl')

# (Opcional) Ver un vistazo de las primeras filas
print(df_final.head())

       fecha     sst_z  sst_anomaly  time_lead       rho
0 1990-06-29  0.245944     0.173124         48 -0.048031
1 1990-06-30  0.234067    -0.583100         47 -0.034248
2 1990-07-01  0.255393    -0.659802         47 -0.043919
3 1990-07-02  0.264070    -0.376608         47 -0.073930
4 1990-07-03  0.213622    -0.597366         47 -0.050557


In [22]:
# Parameters for Pichincha embedding (as example)
prov = 'Santa Elena'
E_t   = int(df_E_temp.loc[df_E_temp.NAME_1 == prov, 'E_star'].iloc[0])
tau_t = int(df_tau_mi.loc[df_tau_mi.NAME_1 == prov, 'tau_mi'].iloc[0])

# Data for Pichincha
df_p = df_ccm_por_prov[prov].reset_index(drop=True)
dates = pd.to_datetime(df_p['fecha'])

# Sliding window size:
window_size = 180

# Containers for results
sliding_results = []

# Range of lead times to test
lead_range = range(0, 61) 

# Loop over days allowing a full window
for idx in tqdm(range(window_size - 1, len(df_p)), desc="Sliding CCM 180d"):
    sub = df_p.iloc[idx - window_size + 1 : idx + 1]
    Y = sub['prcp_z'].values
    X = sub['sst_z'].values

    best_rho = -np.inf
    best_lead = None
    best_pval = None

    # Test each lead time within the window
    for lt in lead_range:
        X_shift = np.roll(X, lt)
        c = ccm(X_shift, Y, tau_t, E_t, len(Y))
        rho, _ = c.causality()
        if rho > best_rho:
            best_rho = rho
            best_lead = lt

    sliding_results.append({
        'fecha':    dates.iloc[idx],
        'tau_lead': best_lead,
        'rho':      best_rho,
        'pval':      best_pval
    })

Sliding CCM 180d: 100%|███████████████████████████████████| 3839/3839 [15:59<00:00,  4.00it/s]


In [23]:
df_elena = pd.DataFrame(sliding_results)
df_elena

Unnamed: 0,fecha,tau_lead,rho,pval
0,1990-06-29,51,0.467362,
1,1990-06-30,59,0.449860,
2,1990-07-01,22,0.385305,
3,1990-07-02,22,0.381940,
4,1990-07-03,22,0.385337,
...,...,...,...,...
3834,2000-12-27,44,0.324905,
3835,2000-12-28,31,0.324886,
3836,2000-12-29,31,0.374105,
3837,2000-12-30,31,0.365313,


In [24]:
# 1. Selecciona y renombra las columnas de anomalías del primer DataFrame
df1 = (
    df_ccm_por_prov['Santa Elena']
    [['fecha', 'sst_z', 'prcp_z']]
    .rename(columns={
        'prcp_z': 'sst_anomaly',
        't2m_z': 'pr_anomaly'
    })
)

# 2. Selecciona y renombra las columnas relevantes del segundo DataFrame
df2 = (
    df_elena
    [['fecha', 'tau_lead', 'rho']]
    .rename(columns={
        'tau_lead': 'time_lead'
    })
)

# 3. Haz el merge por fecha
df_final = pd.merge(df1, df2, on='fecha')

# 4. Guarda el resultado en un archivo pickle
df_final.to_pickle('ccm_elena_pr.pkl')

# (Opcional) Ver un vistazo de las primeras filas
print(df_final.head())

       fecha     sst_z  sst_anomaly  time_lead       rho
0 1990-06-29  0.245944    -0.344151         51  0.467362
1 1990-06-30  0.234067    -0.431346         59  0.449860
2 1990-07-01  0.255393    -0.681156         22  0.385305
3 1990-07-02  0.264070    -0.429119         22  0.381940
4 1990-07-03  0.213622    -0.387956         22  0.385337
