In [None]:
import pandas as pd
import os
import glob
from collections import defaultdict

# Define the path to the 'results' folder
results_path = 'results'

# Find folders with 'REP' in the name.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the grouping (and any later "first group" selection) reproducible.
folders_with_rep = sorted(
    f for f in glob.glob(os.path.join(results_path, '*REP*')) if os.path.isdir(f)
)

# Group by base experiment name (removing the last '__number' suffix)
grouped_exps = defaultdict(list)
for path in folders_with_rep:
    folder_name = os.path.basename(path)
    if '__' in folder_name:
        base = '__'.join(folder_name.split('__')[:-1])  # remove the last part
        grouped_exps[base].append(path)
    else:
        grouped_exps[folder_name].append(path)  # fallback if no '__' present

# Summarize how many replicate folders each experiment group has
for base_name, group_paths in grouped_exps.items():
    print(f"\nGroup: {base_name}:\t{len(group_paths)} cases")

In [None]:
key_exps = list(grouped_exps.keys())
if not key_exps:
    # Fail with an actionable message instead of an IndexError on key_exps[0].
    raise FileNotFoundError("No experiment groups were found; nothing to load.")

# Experiment group analysed in the rest of the notebook
tgt_key = key_exps[0]

all_dfs = []

for folder in grouped_exps[tgt_key]:
    # Extract exp_id (the replicate suffix after the last '__') from folder name
    exp_id = folder.split('__')[-1]

    # Define path to the parquet file
    parquet_path = os.path.join(folder, 'Baseline_proto.parquet')

    # Check if the file exists
    if os.path.exists(parquet_path):
        # Load the data
        df = pd.read_parquet(parquet_path)

        # Add the exp_id as a column so replicates stay distinguishable
        df['exp_id'] = exp_id

        # Append to list
        all_dfs.append(df)
    else:
        print(f"Warning: {parquet_path} does not exist.")

if not all_dfs:
    # pd.concat([]) would raise an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        f"No 'Baseline_proto.parquet' file was found for experiment group {tgt_key!r}."
    )

# Concatenate all DataFrames
combined_df = pd.concat(all_dfs, ignore_index=True)

# Display or save the result
combined_df.head()

In [None]:
import altair as alt
import pandas as pd

alt.data_transformers.enable("vegafusion")

df_plot = combined_df.copy()

# Strip the '__proto' suffix so client names read cleanly in the legend
for c in ['client_1', 'client_2']:
    df_plot[c] = df_plot[c].str.replace('__proto', '')
# Add pair column
df_plot['pair'] = df_plot['client_1'] + ' vs ' + df_plot['client_2']

df_plot['epoch'] = df_plot['epoch'].astype(int)
df_plot['label'] = df_plot['label'].astype(int)

# Widgets: a slider over epochs and a dropdown over the first client of each pair
epoch_slider = alt.binding_range(min=df_plot['epoch'].min(), max=df_plot['epoch'].max(), step=1, name="Epoch")
client_dropdown = alt.binding_select(options=sorted(df_plot['client_1'].unique()), name="Client")

epoch_select = alt.selection_point(fields=['epoch'], bind=epoch_slider, value=0)
client_select = alt.selection_point(fields=['client_1'], bind=client_dropdown, value = 'District_A')

# Clicking a legend entry highlights that client pair
selection = alt.selection_point(fields=['pair'], bind='legend')

# Mean cosine per label for the selected epoch/client
line = alt.Chart(df_plot).mark_line().encode(
    x='label',
    y='mean(cosine)',
    color=alt.Color('pair:N', title='Client Pair'),
    opacity=alt.when(selection).then(alt.value(1)).otherwise(alt.value(0.3))
).add_params(
    epoch_select, client_select, selection
).transform_filter(
    epoch_select & client_select
)

# Confidence band of the same quantity. The axis title previously read
# 'Miles/Gallon' (left over from an Altair example); it now names the field.
band = alt.Chart(df_plot).mark_errorband(extent='ci').encode(
    x='label',
    y=alt.Y('cosine').title('Cosine'),
    color=alt.Color('pair:N', title='Client Pair'),
    opacity=alt.when(selection).then(alt.value(.45)).otherwise(alt.value(0.1))
).add_params(
    epoch_select, client_select, selection
).transform_filter(
    epoch_select & client_select
)

final_plot = band + line

final_plot.properties(
        width=500,
        height=300,
        title=f'Experiment:{tgt_key}'
    ).interactive()

In [None]:
from sklearn.preprocessing import MinMaxScaler

def normalize_cosine_with_sklearn(df):
    """Add a min-max normalized copy of ``cosine`` computed per (epoch, label, exp_id).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``epoch``, ``label``, ``exp_id`` and ``cosine``.
        Rows are written back by index label, so the index is assumed to be
        unique (true for frames built with ``ignore_index=True``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra float column ``cosine_norm``. Rows that
        belong to no group (NaN in a grouping key) keep NaN.
    """
    df = df.copy()

    # Start from a float NaN column; initialising with None produced an
    # object-dtype column even after the numeric values were filled in.
    df['cosine_norm'] = float('nan')

    # Group by (epoch, label, exp_id)
    for _, group in df.groupby(['epoch', 'label', 'exp_id']):
        scaler = MinMaxScaler()
        # Double brackets keep a 2D frame, which the scaler expects
        normalized = scaler.fit_transform(group[['cosine']])
        # Assign back to the correct indices
        df.loc[group.index, 'cosine_norm'] = normalized.ravel()

    return df

norm_df = normalize_cosine_with_sklearn(combined_df)
norm_df.head(5)

In [None]:
df_plot = norm_df.copy()

# Strip the '__proto' suffix so client names read cleanly in the legend
for c in ['client_1', 'client_2']:
    df_plot[c] = df_plot[c].str.replace('__proto', '')
# Add pair column
df_plot['pair'] = df_plot['client_1'] + ' vs ' + df_plot['client_2']

df_plot['epoch'] = df_plot['epoch'].astype(int)
df_plot['label'] = df_plot['label'].astype(int)

# Widgets: a slider over epochs and a dropdown over the first client of each pair
epoch_slider = alt.binding_range(min=df_plot['epoch'].min(), max=df_plot['epoch'].max(), step=1, name="Epoch")
client_dropdown = alt.binding_select(options=sorted(df_plot['client_1'].unique()), name="Client")

epoch_select = alt.selection_point(fields=['epoch'], bind=epoch_slider, value=0)
client_select = alt.selection_point(fields=['client_1'], bind=client_dropdown, value = 'District_A')

# Clicking a legend entry highlights that client pair
selection = alt.selection_point(fields=['pair'], bind='legend')

# Mean normalized cosine per label for the selected epoch/client
line = alt.Chart(df_plot).mark_line().encode(
    x='label',
    y='mean(cosine_norm)',
    color=alt.Color('pair:N', title='Client Pair'),
    opacity=alt.when(selection).then(alt.value(1)).otherwise(alt.value(0.3))
).add_params(
    epoch_select, client_select, selection
).transform_filter(
    epoch_select & client_select
)

# Confidence band of the same quantity. The axis title previously read
# 'Miles/Gallon' (left over from an Altair example); it now names the field.
band = alt.Chart(df_plot).mark_errorband(extent='ci').encode(
    x='label',
    y=alt.Y('cosine_norm').title('Cosine (min-max normalized)'),
    color=alt.Color('pair:N', title='Client Pair'),
    opacity=alt.when(selection).then(alt.value(.45)).otherwise(alt.value(0.1))
).add_params(
    epoch_select, client_select, selection
).transform_filter(
    epoch_select & client_select
)

final_plot = band + line

final_plot.properties(
        width=500,
        height=300,
        title=f'Experiment:{tgt_key}'
    ).interactive()

In [None]:
import seaborn as sns

# Keep only epoch '14'. The comparison is against a string, which assumes
# 'epoch' is still stored as text in combined_df — TODO confirm.
# Copying AFTER filtering gives an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
qwe = combined_df[combined_df['epoch'] == '14'].copy()

for c in ['client_1', 'client_2']:
    qwe[c] = qwe[c].str.replace('__proto', '')
# Add pair column
qwe['pair'] = qwe['client_1'] + ' vs ' + qwe['client_2']
qwe = qwe[qwe['client_1'] == 'District_D']

sns.lineplot(data = qwe, x = 'label', y = 'cosine', hue = 'pair')

In [None]:
import warnings
from argparse import ArgumentParser
import pandas as pd
import pickle
from tqdm import tqdm

import subprocess
import sys
import os

from datasets import Priv_NAMES as DATASET_NAMES
from datasets import get_private_dataset
from models import get_all_models, get_model
from utils.Server import train, local_evaluate
from utils.Toolbox_analysis import create_latent_df, process_latent_df
from utils.Toolbox_postprocessing import proto_analysis, distributions_analysis
from utils.Toolbox_visualization import load_and_scale_data, combine_latents, plot_latent_heatmap, plot_time_series_and_latents

from utils.Toolbox_visualization import plot_proto_similar, plot_distribution_similar

warnings.simplefilter(action='ignore', category=FutureWarning)



def parse_args():
    """Build the experiment argument parser and return the parsed namespace.

    Unknown command-line arguments are ignored (``parse_known_args``), which
    lets this run inside a notebook kernel whose ``sys.argv`` carries
    kernel-specific options.

    Boolean options accept ``true/false``, ``yes/no``, ``1/0`` (case-insensitive).
    ``--domains`` accepts a JSON object mapping domain name to participant
    count, e.g. ``'{"Graeme": 5}'``.

    Returns
    -------
    argparse.Namespace
        Parsed arguments. When ``parti_num`` is not given it is set to the sum
        of the participant counts in ``domains``.
    """
    import json
    from argparse import ArgumentTypeError

    def _str2bool(value):
        # argparse's type=bool calls bool() on the string, so "False" became True.
        if isinstance(value, bool):
            return value
        lowered = str(value).strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise ArgumentTypeError(f"expected a boolean value, got {value!r}")

    def _parse_domains(text):
        # type=dict cannot turn a command-line string into a mapping; parse JSON instead.
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError as exc:
            raise ArgumentTypeError(f"expected a JSON object such as '{{\"Graeme\": 5}}': {exc}")
        if not isinstance(parsed, dict) or not all(
            isinstance(v, int) and not isinstance(v, bool) for v in parsed.values()
        ):
            raise ArgumentTypeError("expected a JSON object mapping domain names to integer counts")
        return parsed

    parser = ArgumentParser(description='You Only Need Me', allow_abbrev=False)
    parser.add_argument('--device_id', type=int, default=0, help='The Device Id for Experiment')
    parser.add_argument('--experiment_id', type=str, default='Pipeline_Full_medium_E', help='Experiment identifier')
    parser.add_argument('--extra_coments', type=str, default='proto_month', help='Aditional info')
    parser.add_argument('--run_simulation', type=_str2bool, default=False, help='Whether to run the simulation')
    parser.add_argument('--detect_anomalies', type=_str2bool, default=False)
    parser.add_argument('--generate_viz', type=_str2bool, default=True, help='Creates and saves interactive visualizations')


    # Communication - epochs
    parser.add_argument('--communication_epoch', type=int, default=2,
                        help='The Communication Epoch in Federated Learning')
    parser.add_argument('--local_epoch', type=int, default=1, help='The Local Epoch for each Participant')

    # Participants info
    parser.add_argument('--parti_num', type=int, default=None, help='The Number for Participants. If "None" will be setted as the sum of values described in --domain')
    parser.add_argument('--online_ratio', type=float, default=1, help='The Ratio for Online Clients')
    parser.add_argument('--tgt_district', type=str, default='District_E', help='Target district name.')

    # Data parameter
    parser.add_argument('--dataset', type=str, default='fl_leaks', choices=DATASET_NAMES, help='Which scenario to perform experiments on.')
    parser.add_argument('--domains', type=_parse_domains, default={
                                                        'Graeme': 5,
                                                        # 'Balerma': 3,
                                                        },
                        help='Domains and respective number of participants.')

    ## Time series preprocessing
    parser.add_argument('--interval_agg', type=int, default=2 * 60 ** 2,
                        help='Agregation interval (seconds) of time series')
    parser.add_argument('--window_size', type=int, default=84, help='Rolling window length')

    # Model (AER) parameters
    parser.add_argument('--input_size', type=int, default=5, help='Number of sensors')  # TODO: adapt
    parser.add_argument('--output_size', type=int, default=5, help='Shape output - dense layer')
    parser.add_argument('--lstm_units', type=int, default=30,
                        help='Number of LSTM units (the latent space will have dimension 2 times bigger')


    # Federated parameters
    parser.add_argument('--model', type=str, default='fpl', help='Federated Model name.', choices=get_all_models()) #fedavg

    parser.add_argument('--structure', type=str, default='homogeneity')

    parser.add_argument('--pri_aug', type=str, default='weak',  # weak strong
                        help='Augmentation for Private Data')
    parser.add_argument('--learning_decay', type=_str2bool, default=False, help='The Option for Learning Rate Decay')
    parser.add_argument('--averaging', type=str, default='weight', help='The Option for averaging strategy')

    parser.add_argument('--infoNCET', type=float, default=0.02, help='The InfoNCE temperature')
    parser.add_argument('--T', type=float, default=0.05, help='The Knowledge distillation temperature')
    parser.add_argument('--weight', type=int, default=1, help='The Weigth for the distillation loss')

    # Tolerate arguments that belong to the host process (e.g. the Jupyter kernel)
    args, unknown = parser.parse_known_args()

    if args.parti_num is None:
        args.parti_num = sum(args.domains.values())

    return args

In [None]:
from utils.Server import local_evaluate
from utils.Toolbox_analysis import process_latent_df

In [None]:
import glob
import os

# Experiment whose outputs are inspected below
results_id = "D_2_LL_LM_3_2_2_84_proto_NCET0.2_LSTM20"  # replace with your actual results_id
results_dir = f"results/{results_id}"

# Collect every .parquet file below results_dir, at any depth
parquet_files = glob.glob(
    os.path.join(results_dir, '**', '*.parquet'),
    recursive=True,
)

# Show what was found, one path per line
for file in parquet_files:
    print(file)

In [None]:
# Prototype comparison table written by the experiment run
df_exp_proto = pd.read_parquet(f'{results_dir}/Baseline_proto.parquet')

# Cast the identifier columns to integers in one pass
int_cols = ['epoch', 'label']
df_exp_proto = df_exp_proto.astype({c: int for c in int_cols})

plot_proto_similar(df_exp_proto, results_dir)

In [None]:
# data_latent = pd.read_parquet(parquet_files[1])
# data_latent = data_latent[~data_latent['client_id'].str.contains('proto')].copy()

# id_cols = ['client_id', 'label', 'epoch']
# feat_cols = [col for col in data_latent.columns if 'x_' in col]
# aux_agg = data_latent[id_cols + feat_cols]
# aux_agg = aux_agg.groupby(id_cols).mean().reset_index()
# aux_agg = aux_agg.sort_values(by=['epoch', 'label', 'client_id']).reset_index(drop=True)
# aux_agg['client_id'] += '__proto'

# int_cols = ['epoch', 'label']
# for c in int_cols:
#     aux_agg[c] = aux_agg[c].astype(int)
    
# df_exp_proto = proto_analysis(data_latent=aux_agg, normalize=True)

# aux_df = df_exp_proto.copy()
# aux_df.columns = ['epoch', 'label', 'client_2', 'client_1', 'cosine', 'manhattan', 'wavelet', 'dft', 'autocorr']

# df_exp_proto = pd.concat([df_exp_proto, aux_df[df_exp_proto.columns.tolist()]])
# df_exp_proto = df_exp_proto.sort_values(by=['epoch', 'label', 'client_1', 'client_2']).reset_index(drop=True)
# df_exp_proto.head()

In [None]:
# Distribution comparison table written by the experiment run
df_exp_latent = pd.read_parquet(f'{results_dir}/Baseline_distribution.parquet')

# Cast the identifier columns to integers in one pass
int_cols = ['epoch', 'label']
df_exp_latent = df_exp_latent.astype({c: int for c in int_cols})

df_exp_latent

In [None]:
# Plot the distribution comparison table; results_dir is passed along,
# presumably as the output location — TODO confirm against the helper.
plot_distribution_similar(df_exp_latent, results_dir)

In [None]:
# NOTE(review): `long_df` is not assigned in any visible cell of this notebook,
# so this raises NameError on a fresh kernel unless it is defined elsewhere — confirm or remove.
long_df

In [None]:
# latent_path = 'results/D_2_LL_LM_2_1_2_84_proto_NCET0.2_LSTM20/.pkl'
# with open(latent_path, 'rb') as f:
#     latent_dfs = pickle.load(f)

# NOTE(review): the file is picked by position in the glob result, whose order
# is not guaranteed — confirm index 3 is the intended table.
Baseline_latent_space = pd.read_parquet(parquet_files[3])

# Relabelled copy with the two client columns named in swapped order
# (assumes the table has exactly these eight columns — TODO confirm).
qwe = Baseline_latent_space.copy()
qwe.columns = ['client_2', 'client_1', 'cosine', 'manhattan', 'wavelet', 'dft', 'autocorr', 'id']

# Stack the original and the relabelled rows, then order them for reading
original_order = Baseline_latent_space.columns.tolist()
new_qwe = pd.concat([Baseline_latent_space, qwe[original_order]]).sort_values(
    by=['id', 'client_1', 'client_2']
)
new_qwe

In [None]:
# Load the scaled data for one network / experiment / district via the project helper
# and preview the first rows.
scaled_df = load_and_scale_data(id_network = 'Graeme', id_experiment = 'D_2_LL_LM', tgt_district = 'District_D')
scaled_df.head()

In [None]:
def combine_latents(results_dir):
    """Combine every PCA/UMAP parquet file under ``results_dir`` into one DataFrame.

    Note: this definition rebinds the name ``combine_latents`` that the
    notebook also imports from ``utils.Toolbox_visualization``.

    Parameters
    ----------
    results_dir : str
        Directory searched recursively for ``*.parquet`` files whose file name
        contains ``pca`` or ``umap`` (case-insensitive).

    Returns
    -------
    pandas.DataFrame
        Concatenated rows with ``timestamp`` parsed to datetime, ``label``
        renamed to ``month``, ``client_id`` renamed to ``label``, integer
        ``epoch``/``month`` columns, plus ``hour`` and ``hour_filter``
        (the hour folded onto 0-11).

    Raises
    ------
    ValueError
        If no matching parquet file is found.
    """
    pattern = os.path.join(results_dir, '**', '*.parquet')
    # Sorted so the row order of the result does not depend on filesystem order.
    pca_umap_files = sorted(
        f for f in glob.glob(pattern, recursive=True)
        if any(tag in os.path.basename(f).lower() for tag in ('pca', 'umap'))
    )

    if not pca_umap_files:
        # Same exception type pd.concat([]) would raise, with an actionable message.
        raise ValueError(f"No PCA/UMAP parquet files found under {results_dir!r}")

    df_combined = pd.concat(
        [pd.read_parquet(f) for f in pca_umap_files],
        ignore_index=True,
    )

    df_combined['timestamp'] = pd.to_datetime(df_combined['timestamp'])
    df_combined = df_combined.rename(columns={'label': 'month', 'client_id': 'label'})

    for c in ('epoch', 'month'):
        df_combined[c] = df_combined[c].astype(int)

    df_combined['hour'] = df_combined['timestamp'].dt.hour
    # Fold 12-23 onto 0-11; vectorized equivalent of "x - 12 if x >= 12 else x".
    df_combined['hour_filter'] = df_combined['hour'] % 12

    return df_combined

In [None]:
# NOTE: rebinds results_id / results_dir to a different experiment than the
# cells above; later cells see these new values.
results_id = "D_2_LL_LM_2_1_2_84_proto_NCET0.2_LSTM20"  # replace with your actual results_id
results_dir = f"results/{results_id}"

# Gather the PCA/UMAP tables of this experiment and preview them
df_combined = combine_latents(results_dir)
df_combined.head()

In [None]:
args = parse_args()

# Tag used by the debugging run whose artifacts are loaded below
args.extra_coments = 'proto_month_DEBUGANDO'

# Aggregation interval expressed in whole hours
agg_int = int(args.interval_agg / 3600)

# Identifier shared by the three pickle artifacts of a run
results_id = '_'.join(
    str(part)
    for part in (
        args.experiment_id,
        args.communication_epoch,
        args.local_epoch,
        agg_int,
        args.window_size,
        args.extra_coments,
    )
)

logs_path, results_path, latent_path = (
    f"results/{kind}_{results_id}.pkl" for kind in ('logs', 'results', 'latent')
)
results_path

In [None]:
# Load the three pickled artifacts of the run.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load artifacts produced by a trusted run of this project.
with open(results_path, 'rb') as f:
    results_debug = pickle.load(f)

with open(latent_path, 'rb') as f:
    latent_dfs = pickle.load(f)

with open(logs_path, 'rb') as f:
    logs = pickle.load(f)

In [None]:
# Replace the freshly parsed args with the ones stored in the logs of the Baseline scenario
args = logs['Baseline']['args']

# Display names assigned to clients by position in later cells
label_clients = [
    'District_A', 'District_B', 'District_C', 'District_D', 'District_E',
    'District_2A', 'District_2B', 'District_2C'
]

# Rebuild the dataset object from the stored args
priv_dataset = get_private_dataset(args)

# One backbone network per participant; n_series is taken from args.input_size
backbones_list = priv_dataset.get_backbone(
    parti_num=args.parti_num,
    names_list=None,
    n_series=args.input_size
)

In [None]:
# Inspect which entries the Baseline model log provides
logs['Baseline']['model'].keys()

In [None]:
# Recompute latent spaces for every stored global-model snapshot and store
# the raw latents, per-group prototypes and their reduced versions per epoch.
train_DL = priv_dataset.get_data_loaders()
# The index of the first loader is reused for every client
# (assumes all clients share the same time index — TODO confirm).
base_index = train_DL[0]['X_index']
# NOTE(review): latent_dfs_local is never used below; results are written into
# the previously loaded `latent_dfs` instead — confirm which one is intended.
latent_dfs_local = {}

scenarios = ['Baseline']


for scenario in scenarios:
    global_model_history = logs[scenario]['model']['global_weights_history']
    for epoch in range(args.communication_epoch):
        aux_latents = []
        # Load this epoch's global weights into every backbone
        state_dict = global_model_history[epoch]
        for net in backbones_list:
            net.load_state_dict(state_dict)

        # One result per client is iterated below; the meaning of the two
        # positional False flags is defined by local_evaluate — TODO confirm.
        latent_spaces = local_evaluate(backbones_list, train_DL, priv_dataset, False, False)
        for i, client in enumerate(latent_spaces):
            # The label temporarily packs "<client name>__<epoch>"; it is split again below
            client_lat = create_latent_df(
                X_index=base_index,
                x_lat=client,
                label=f"{label_clients[i]}__{epoch}",
                is_unix=True
            )
            aux_latents.append(client_lat)

        data_latent = pd.concat(aux_latents)
        # Unpack the packed label; 'epoch' is a string column after the split
        data_latent[['client_id', 'epoch']] = data_latent['label'].str.split('__', expand = True)
        # From here on 'label' holds the calendar month of the timestamp
        data_latent['label'] = data_latent['timestamp'].dt.month

        id_cols = ['client_id', 'label', 'epoch']
        feat_cols = [col for col in data_latent.columns if 'x_' in col]
        # Prototypes: mean feature vector per (client, month, epoch), tagged with '__proto'
        aux_agg = data_latent[id_cols + feat_cols]
        aux_agg = aux_agg.groupby(id_cols).mean().reset_index()
        aux_agg['client_id'] += '__proto'

        # Append the prototype rows; aux_agg has no 'timestamp' column, so
        # those rows get a missing timestamp after the concat
        data_latent = pd.concat([data_latent[['timestamp'] + id_cols + feat_cols], aux_agg])
        data_latent.reset_index(drop = True, inplace = True)

        df_latent, df_pca_scaled, df_umap_scaled = process_latent_df(
                    df_latent = data_latent,
                    umap_neighbors=15,
                    umap_min_dist=0.50,
                    reduce_raw = False,
                    id_cols =id_cols,
                    return_scaled = False
                )

        # Overwrites the entry loaded from the latent pickle for this scenario/epoch
        latent_dfs[scenario][epoch] = {
            'latent_space': df_latent,
            'pca_scl': df_pca_scaled,
            'umap_scl': df_umap_scaled
        }


# zxc = []
# for asd in latent_dfs['Baseline'].values():
#     qwe = asd['latent_space'].copy()
#     qwe.drop(columns = ['hour', 'month'], inplace = True)
#     zxc.append(qwe)

# final_qwe = pd.concat(zxc)
# data_latent.equals(final_qwe)

# latent_dfs[scenario][epoch]['umap_scl'].tail()

In [None]:
# Raw latent rows (prototype rows excluded) of the last scenario/epoch
# processed above; `scenario` and `epoch` are the leftover loop variables.
df_proto = latent_dfs[scenario][epoch]['latent_space']
# Copy AFTER filtering so the assignment below works on an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
df_proto = df_proto[~df_proto['client_id'].str.contains('proto')].copy()
# Literal replacement; a no-op for the rows kept by the filter above.
df_proto['client_id'] = df_proto['client_id'].str.replace('__proto', '', regex=False)
df_proto.head()

In [None]:
from scipy.spatial import procrustes
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import wasserstein_distance, energy_distance
from scipy.linalg import subspace_angles
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.stats import entropy



def spectral_procrustes(sim_target, sim_other):
    """Return the Procrustes disparity between two matrices, or NaN on failure.

    Parameters
    ----------
    sim_target, sim_other : array-like
        Inputs forwarded to ``scipy.spatial.procrustes``.

    Returns
    -------
    float
        The disparity reported by ``procrustes``; ``np.nan`` when the
        comparison raises (for example on mismatched shapes).
    """
    try:
        _, _, disparity = procrustes(sim_target, sim_other)
    except Exception:
        # Best effort: report "not comparable" but let KeyboardInterrupt and
        # SystemExit propagate, which a bare `except:` would have swallowed.
        return np.nan
    return disparity
        
def compute_kl_divergence(Xa, Xb, bins=30):
    """Mean per-feature KL divergence KL(a || b) between two samples.

    Each feature column is histogrammed on bin edges shared by both samples;
    with separate edges the bins are not comparable and, for instance, a
    shifted copy of the same data would score a divergence of 0.

    Parameters
    ----------
    Xa, Xb : numpy.ndarray
        2D arrays (rows are samples) with the same number of columns.
    bins : int or sequence, default 30
        Forwarded to ``numpy.histogram_bin_edges``.

    Returns
    -------
    float
        Average of the per-column KL divergences.
    """
    kl_scores = []
    for i in range(Xa.shape[1]):
        col_a, col_b = Xa[:, i], Xb[:, i]
        # Common support for both histograms
        edges = np.histogram_bin_edges(np.concatenate([col_a, col_b]), bins=bins)
        hist_a, _ = np.histogram(col_a, bins=edges, density=True)
        hist_b, _ = np.histogram(col_b, bins=edges, density=True)
        hist_a = hist_a + 1e-8  # smooth to avoid log(0)
        hist_b = hist_b + 1e-8
        # scipy's entropy normalizes both inputs before computing KL
        kl_scores.append(entropy(hist_a, hist_b))
    return np.mean(kl_scores)

def compute_js_divergence(Xa, Xb, bins=30):
    """Mean per-feature Jensen-Shannon divergence between two samples.

    Each feature column is histogrammed on bin edges shared by both samples;
    with separate edges the bins are not comparable and, for instance, a
    shifted copy of the same data would score a divergence of 0.

    Parameters
    ----------
    Xa, Xb : numpy.ndarray
        2D arrays (rows are samples) with the same number of columns.
    bins : int or sequence, default 30
        Forwarded to ``numpy.histogram_bin_edges``.

    Returns
    -------
    float
        Average of the per-column JS divergences (natural logarithm).
    """
    js_scores = []
    for i in range(Xa.shape[1]):
        col_a, col_b = Xa[:, i], Xb[:, i]
        # Common support for both histograms
        edges = np.histogram_bin_edges(np.concatenate([col_a, col_b]), bins=bins)
        hist_a, _ = np.histogram(col_a, bins=edges, density=True)
        hist_b, _ = np.histogram(col_b, bins=edges, density=True)
        hist_a = hist_a + 1e-8  # smooth to avoid log(0)
        hist_b = hist_b + 1e-8
        m = 0.5 * (hist_a + hist_b)
        js = 0.5 * entropy(hist_a, m) + 0.5 * entropy(hist_b, m)
        js_scores.append(js)
    return np.mean(js_scores)


# --- Drift Metrics Functions

def compute_mmd(X, Y, gamma=1.0):
    """Combine mean RBF-kernel values into an MMD-style discrepancy.

    Returns mean k(X, X) + mean k(Y, Y) - 2 * mean k(X, Y), with every kernel
    evaluated by ``rbf_kernel`` using the given ``gamma``.
    """
    within_x = rbf_kernel(X, X, gamma=gamma).mean()
    within_y = rbf_kernel(Y, Y, gamma=gamma).mean()
    across = rbf_kernel(X, Y, gamma=gamma).mean()
    return within_x + within_y - 2 * across

def subspace_alignment(X1, X2, n_components=10):
    """Sum of cosines of the principal angles between two PCA subspaces.

    A PCA with ``n_components`` components is fitted to each input
    independently; the angles between the two component bases come from
    ``scipy.linalg.subspace_angles``.
    """
    basis_1, basis_2 = (
        PCA(n_components=n_components).fit(X).components_.T for X in (X1, X2)
    )
    principal_angles = subspace_angles(basis_1, basis_2)
    return np.sum(np.cos(principal_angles))

def dtw_client_trajectory(df, client):
    """Return one mean feature vector per label for a single client.

    Rows of ``client`` are ordered by ``label`` and averaged per label over
    the columns listed in the module-level ``features``; the result is a
    2D array with one row per label.
    """
    client_rows = df[df.client_id == client].sort_values(by="label")
    return client_rows.groupby("label")[features].mean().values

def run_dbscan(df_sub):
    """Cluster the ``features`` columns of ``df_sub`` with DBSCAN and return the labels."""
    clusterer = DBSCAN(eps=0.5, min_samples=5)
    clusterer.fit(df_sub[features])
    return clusterer.labels_

def spectral_cluster_latent(df_sub, n_components=2):
    """Spectral embedding of the rows of ``df_sub``.

    The pairwise cosine similarity of the ``features`` columns is handed to
    ``SpectralEmbedding`` as a precomputed affinity matrix; the embedded
    coordinates are returned.
    """
    affinity = cosine_similarity(df_sub[features])
    embedder = SpectralEmbedding(n_components=n_components, affinity='precomputed')
    return embedder.fit_transform(affinity)

def compute_mi(df, client_a, client_b, month):
    """Average mutual information between the discretized features of two clients.

    Rows are restricted to ``month`` (column ``label``). Each client's
    ``features`` are binned into 10 uniform ordinal bins; every binned feature
    of ``client_b`` is then used as the target for ``mutual_info_classif``
    against the binned features of ``client_a``. Returns ``np.nan`` when
    either client has no rows for that month.
    """
    in_month = df.label == month
    Xa = df[in_month & (df.client_id == client_a)][features].values
    Xb = df[in_month & (df.client_id == client_b)][features].values
    if len(Xa) == 0 or len(Xb) == 0:
        return np.nan

    est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
    # The discretizer is refitted for each client, so bin edges are per client
    Xa_d = est.fit_transform(Xa)
    Xb_d = est.fit_transform(Xb)

    mi_scores = []
    for i in range(Xb_d.shape[1]):
        per_feature = mutual_info_classif(Xa_d, Xb_d[:, i], discrete_features=True)
        mi_scores.append(per_feature.mean())
    return np.mean(mi_scores)

def compare_clients_distribution(Xa, Xb):
    """Return (MMD, mean per-feature Wasserstein, energy distance) for two samples.

    The MMD uses ``gamma=0.5``; the Wasserstein distance is averaged over
    columns; the energy distance is computed on the flattened arrays.
    """
    mmd_val = compute_mmd(Xa, Xb, gamma=0.5)
    per_feature_w = [
        wasserstein_distance(Xa[:, col], Xb[:, col]) for col in range(Xa.shape[1])
    ]
    w_dist = np.mean(per_feature_w)
    e_dist = energy_distance(Xa.flatten(), Xb.flatten())
    return mmd_val, w_dist, e_dist

In [None]:
# --- Setup

# Module-level inputs read by the metric helpers and the loops below.
df = df_proto.copy()
# Latent feature columns are the ones prefixed with "x_"
features = [col for col in df.columns if col.startswith("x_")]
clients = df["client_id"].unique()
# 'label' holds the values iterated as months by the loops below
months = sorted(df["label"].unique())
# Client every other client is compared against
target_client = "District_E"

In [None]:
import time

results = []

for month in tqdm(months):
    df_month = df[df.label == month]
    # History up to and including this month. It is the same for every client
    # pair, so slice once per month instead of twice per pair.
    df_history = df[df.label <= month]
    X_target = df_month[df_month.client_id == target_client][features].values

    # NOTE(review): spectral_target is not read anywhere in the visible cells;
    # kept in case another cell relies on it — confirm before removing.
    try:
        spectral_target = spectral_cluster_latent(df_month[df_month.client_id == target_client])
    except Exception:
        spectral_target = None

    for other_client in clients:
        if other_client == target_client:
            continue

        X_other = df_month[df_month.client_id == other_client][features].values
        if X_target.size == 0 or X_other.size == 0:
            continue

        row = {
            "month": month,
            "target_client": target_client,
            "other_client": other_client,
        }

        # Durations use time.perf_counter(): a monotonic, high-resolution clock
        # meant for measuring intervals, unlike the wall clock time.time().

        # 1. Distribution metrics
        start = time.perf_counter()
        mmd_val, w_dist, e_dist = compare_clients_distribution(X_target, X_other)
        row.update({
            "MMD": mmd_val,
            "Wasserstein": w_dist,
            "Energy": e_dist,
            "Time_Dist": time.perf_counter() - start,
        })

        # 2. Subspace alignment
        start = time.perf_counter()
        sa = subspace_alignment(X_target, X_other)
        row.update({
            "SubspaceAlignment": sa,
            "Time_Subspace": time.perf_counter() - start,
        })

        # 3. DTW over the per-month mean trajectories seen so far
        start = time.perf_counter()
        traj_target = dtw_client_trajectory(df_history, target_client)
        traj_other = dtw_client_trajectory(df_history, other_client)
        min_len = min(len(traj_target), len(traj_other))
        dtw_val, _ = fastdtw(traj_target[:min_len], traj_other[:min_len], dist=euclidean)
        row.update({
            "DTW": dtw_val,
            "Time_DTW": time.perf_counter() - start,
        })

        # 4. Mutual Information
        start = time.perf_counter()
        mi = compute_mi(df, target_client, other_client, month)
        row.update({
            "MutualInfo": mi,
            "Time_MI": time.perf_counter() - start,
        })

        # 5. KL Divergence
        start = time.perf_counter()
        kl = compute_kl_divergence(X_target, X_other)
        row.update({
            "KL": kl,
            "Time_KL": time.perf_counter() - start,
        })

        # 6. JS Divergence
        start = time.perf_counter()
        jsd = compute_js_divergence(X_target, X_other)
        row.update({
            "JSD": jsd,
            "Time_JSD": time.perf_counter() - start,
        })

        results.append(row)

df_results = pd.DataFrame(results)


In [None]:
# Display the metric table built by the loop above (one row per month and compared client)
df_results

In [None]:
# Same metrics as the timed loop, without the per-metric durations.
results = []

for month in tqdm(months):
    df_month = df[df.label == month]
    df_history = df[df.label <= month]  # rows up to and including this month
    target_rows = df_month[df_month.client_id == target_client]
    X_target = target_rows[features].values

    try:
        spectral_target = spectral_cluster_latent(target_rows)
    except Exception:
        spectral_target = None  # fallback

    for other_client in clients:
        if other_client == target_client:
            continue

        X_other = df_month[df_month.client_id == other_client][features].values
        if X_target.size == 0 or X_other.size == 0:
            continue

        # 1. Distribution metrics
        mmd_val, w_dist, e_dist = compare_clients_distribution(X_target, X_other)

        # 2. Subspace alignment
        sa = subspace_alignment(X_target, X_other)

        # 3. DTW over historical latent trajectory, truncated to the shared length
        traj_target = dtw_client_trajectory(df_history, target_client)
        traj_other = dtw_client_trajectory(df_history, other_client)
        min_len = min(len(traj_target), len(traj_other))
        dtw_val, _ = fastdtw(traj_target[:min_len], traj_other[:min_len], dist=euclidean)

        # 4. Mutual Information
        mi = compute_mi(df, target_client, other_client, month)

        # 5. KL & JSD
        kl = compute_kl_divergence(X_target, X_other)
        jsd = compute_js_divergence(X_target, X_other)

        results.append(dict(
            month=month,
            target_client=target_client,
            other_client=other_client,
            MMD=mmd_val,
            Wasserstein=w_dist,
            Energy=e_dist,
            SubspaceAlignment=sa,
            DTW=dtw_val,
            MutualInfo=mi,
            KL=kl,
            JSD=jsd,
        ))

df_results = pd.DataFrame(results)

In [None]:
def drift_id(tgt_district, seed_node, income_density_mapping, drift_income, drift_density):
    """Build an experiment id of the form ``<district>_<seed>_<density>_<drift>``.

    Parameters
    ----------
    tgt_district : str
        District name; the text after the last ``_`` is the district code
        (e.g. ``"District_D"`` -> ``"D"``).
    seed_node : str
        Inserted verbatim as the second part of the id.
    income_density_mapping : str
        ``_``-separated codes, one per district in alphabetical order
        (``A`` is the first entry, ``B`` the second, ...).
    drift_income, drift_density : str
        Only their first characters are used, upper-cased
        (e.g. ``"low"``, ``"medium"`` -> ``"LM"``).

    Returns
    -------
    str
        The id. The density part is ``"??"`` when the district code is not a
        single letter covered by the mapping.

    Raises
    ------
    IndexError
        If ``drift_income`` or ``drift_density`` is empty.
    """
    # 1. District code: everything after the last underscore
    district_code = tgt_district.split('_')[-1]

    # 2. Density code: the mapping entry at the letter's alphabet position.
    #    Multi-character codes (e.g. "2A") and positions outside the mapping,
    #    including negative ones, fall back to "??" instead of raising or
    #    silently indexing from the end.
    mapping_values = income_density_mapping.split('_')
    density_code = "??"
    letter = district_code.upper()
    if len(letter) == 1:
        idx = ord(letter) - ord('A')  # A=0, B=1, ..., D=3
        if 0 <= idx < len(mapping_values):
            density_code = mapping_values[idx]

    # 3. Drift code from the initials, e.g. "LM", "LH"
    drift_code = drift_income[0].upper() + drift_density[0].upper()

    # Combine parts into final ID string
    exp_id = f"{district_code}_{seed_node}_{density_code}_{drift_code}"
    return exp_id

# Example inputs: print one experiment id per (income, density) drift pair.
tgt_district = 'District_D'
income_density_mapping = 'ML_LM_LH_LL_LL'
drift_income = ['low', 'low']
drift_density = ['medium', 'high']
seed_node = '2'
# NOTE: rebinds the notebook-level name exp_id on every iteration
for DI, DD in zip(drift_income, drift_density):
    exp_id = drift_id(tgt_district, seed_node, income_density_mapping, DI, DD)
    print(exp_id)

In [None]:
# Reload the stored latent artifact; this replaces any `latent_dfs` modified in earlier cells.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load artifacts produced by a trusted run of this project.
with open('results/latent_Pipeline_Full_medium_E_2_1_2_84_proto_month_DEBUGANDO.pkl', 'rb') as f:
    latent_dfs = pickle.load(f)

latent_dfs['Baseline']

In [None]:
metrics = ["MMD", "Wasserstein", "Energy", "SubspaceAlignment", "DTW", "MutualInfo", "KL", "JSD"]

# One figure per metric: value over months, one line per compared client
for metric in metrics:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(data=df_results, x="month", y=metric, hue="other_client", marker="o", ax=ax)
    ax.set_title(f"{metric} Drift vs {target_client}")
    ax.set_xlabel("Month")
    ax.set_ylabel(metric)
    # Legend outside the axes so it does not cover the lines
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()

In [None]:
from sklearn.metrics.pairwise import cosine_distances
import numpy as np
import pywt

def cosine_distance_proto(p1, p2):
    """Cosine distance between two vectors, each reshaped to a single row."""
    row_1 = p1.reshape(1, -1)
    row_2 = p2.reshape(1, -1)
    return cosine_distances(row_1, row_2)[0, 0]


def manhattan_distance_proto(p1, p2):
    """Manhattan (L1) distance: the sum of absolute element-wise differences."""
    delta = p1 - p2
    return np.sum(np.absolute(delta))


def wavelet_distance(protos_a, protos_b, wavelet='db1'):
    """Sum over columns of the distance between single-level DWT approximations.

    For each column, both inputs are decomposed with ``pywt.dwt`` and the
    Euclidean norm of the difference of the first returned coefficient array
    (the approximation coefficients) is accumulated.
    """
    dist = 0
    for col in range(protos_a.shape[1]):
        approx_a = pywt.dwt(protos_a[:, col], wavelet)[0]
        approx_b = pywt.dwt(protos_b[:, col], wavelet)[0]
        dist += np.linalg.norm(np.array(approx_a) - np.array(approx_b))
    return dist

def dft_similarity(protos_a, protos_b):
    """Norm of the difference between the column-wise DFTs of two arrays.

    The FFT runs along axis 0; the result is the norm of the magnitudes of the
    element-wise spectrum difference (0 for identical inputs).
    """
    spectrum_gap = np.fft.fft(protos_a, axis=0) - np.fft.fft(protos_b, axis=0)
    magnitudes = np.abs(spectrum_gap)
    return np.linalg.norm(magnitudes)

def autocorr_similarity(protos_a, protos_b, lag=1):
    """Sum over columns of the absolute difference in lag-``lag`` autocorrelation.

    Parameters
    ----------
    protos_a, protos_b : numpy.ndarray
        2D arrays with the same number of columns; each column is treated as
        a series along axis 0.
    lag : int, default 1
        Non-negative shift between the two slices being correlated.

    Returns
    -------
    float
        Accumulated ``|autocorr_a - autocorr_b|`` over all columns. A column
        with constant values yields NaN from ``np.corrcoef``, which
        propagates to the result.
    """
    def autocorr(x, lag):
        # Slice by explicit length: x[:-lag] is empty when lag == 0, which
        # made the lag-0 case raise instead of returning a correlation of 1.
        head = x[:len(x) - lag]
        return np.corrcoef(head, x[lag:])[0, 1]

    acc = 0
    for i in range(protos_a.shape[1]):
        a_corr = autocorr(protos_a[:, i], lag)
        b_corr = autocorr(protos_b[:, i], lag)
        acc += abs(a_corr - b_corr)
    return acc


In [None]:
# Prototype rows only (client ids containing 'proto') of the scenario/epoch
# currently bound to the notebook-level names `scenario` and `epoch`.
df_proto = latent_dfs[scenario][epoch]['latent_space']
# Copy AFTER filtering so the assignment below works on an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
df_proto = df_proto[df_proto['client_id'].str.contains('proto')].copy()
# Drop the '__proto' suffix (literal match) to recover the plain client name
df_proto['client_id'] = df_proto['client_id'].str.replace('__proto', '', regex=False)
df_proto.head()

In [None]:
# Latent feature columns and the clients present in the prototype table
features = [name for name in df_proto.columns if name.startswith('x_')]
clients = df_proto['client_id'].unique()

# One matrix per client: rows ordered by label, columns are the features
client_protos = {
    client: (
        df_proto.loc[df_proto['client_id'] == client]
        .sort_values('label')
        .loc[:, features]
        .values
    )
    for client in clients
}


In [None]:
from itertools import combinations
results = []

# Compare every unordered pair of clients once
for c1, c2 in combinations(clients, 2):
    # Truncate both matrices to the number of rows they share
    min_len = min(client_protos[c1].shape[0], client_protos[c2].shape[0])
    p1 = client_protos[c1][:min_len]
    p2 = client_protos[c2][:min_len]

    # The point-wise metrics work on the mean prototype of each client
    mean_1 = p1.mean(axis=0)
    mean_2 = p2.mean(axis=0)

    results.append({
        "client_1": c1,
        "client_2": c2,
        "cosine": cosine_distance_proto(mean_1, mean_2),
        "manhattan": manhattan_distance_proto(mean_1, mean_2),
        "wavelet": wavelet_distance(p1, p2),
        "dft": dft_similarity(p1, p2),
        "autocorr": autocorr_similarity(p1, p2),
    })

df_experiment = pd.DataFrame(results)
df_experiment

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every metric column to [0, 1] (overwrites the columns in place)
metric_cols = ['cosine', 'manhattan', 'wavelet', 'dft', 'autocorr']
scaler = MinMaxScaler()
df_experiment[metric_cols] = scaler.fit_transform(df_experiment[metric_cols])

# The title is derived from the plotted column so the two cannot disagree
# (it previously read "DFT Similarity" while 'manhattan' was plotted).
# NOTE(review): confirm 'manhattan' — not 'dft' — is the metric intended here.
plotted_metric = 'manhattan'
pivot = df_experiment.pivot(index='client_1', columns='client_2', values=plotted_metric)
sns.heatmap(pivot, annot=True, cmap="viridis")
plt.title(f"Min-max scaled '{plotted_metric}' metric between clients")
plt.show()