In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from gtda.diagrams import PersistenceEntropy, Amplitude
from gtda.homology import VietorisRipsPersistence
from gtda.time_series import SlidingWindow, TakensEmbedding
from tqdm import tqdm

## Opening data

In [None]:
# Load dengue.csv (path is relative to the notebook's working directory),
# then preview the first rows and list the column names.
file_path = "../data/raw/data_sprint_2025/dengue.csv"
data_casos = pd.read_csv(file_path)
display(data_casos.head())
print(f"Columns: {data_casos.columns}")

In [None]:
# Load climate.csv, then preview the first rows and list the column names.
# NOTE: `file_path` is reused by every loading cell and is overwritten here.
file_path = "../data/raw/data_sprint_2025/climate.csv"
data_climate = pd.read_csv(file_path)
display(data_climate.head())
print(f"Columns: {data_climate.columns}")

In [None]:
# Load environ_vars.csv, then preview the first rows and list the column names.
file_path = "../data/raw/data_sprint_2025/environ_vars.csv"
data_environ = pd.read_csv(file_path)
display(data_environ.head())
print(f"Columns: {data_environ.columns}")

In [None]:
# Load forecasting_climate.csv, then preview the first rows and list the column names.
file_path = "../data/raw/data_sprint_2025/forecasting_climate.csv"
data_forecast_climate = pd.read_csv(file_path)
display(data_forecast_climate.head())
print(f"Columns: {data_forecast_climate.columns}")

In [None]:
# Load ocean_climate_oscillations.csv, then preview the first rows and list the column names.
file_path = "../data/raw/data_sprint_2025/ocean_climate_oscillations.csv"
data_ocean = pd.read_csv(file_path)
display(data_ocean.head())
print(f"Columns: {data_ocean.columns}")

In [None]:
# Load datasus_population_2001_2024.csv, then preview the first rows and list the column names.
file_path = "../data/raw/data_sprint_2025/datasus_population_2001_2024.csv"
data_pop = pd.read_csv(file_path)
display(data_pop.head())
print(f"Columns: {data_pop.columns}")

In [None]:
# Load map_regional_health.csv, then preview the first rows and list the column names.
file_path = "../data/raw/data_sprint_2025/map_regional_health.csv"
data_reg_health = pd.read_csv(file_path)
display(data_reg_health.head())
print(f"Columns: {data_reg_health.columns}")

In [None]:
# Load dados_episcanner.csv, then preview the first rows and list the column names.
file_path = "../data/raw/data_sprint_2025/dados_episcanner.csv"
data_episcanner = pd.read_csv(file_path)
display(data_episcanner.head())
print(f"Columns: {data_episcanner.columns}")

## Pre-processing

In [None]:
def calculate_rolling_tda(df, target_col='casos', group_col='geocode', window_size=53, stride=1,
                          time_col='time_idx'):
    """
    Compute rolling topological (TDA) features per group and attach them to ``df``.

    For each distinct value of ``group_col`` the rows are ordered by ``time_col`` and
    ``target_col`` is cut into sliding windows of ``window_size`` observations. Every
    window goes through Takens embedding (dimension 3, delay 1), Vietoris-Rips
    persistence (homology dimensions 0 and 1), and is summarised with persistence
    entropy and Wasserstein amplitude. The features of a window are written on the
    row that holds the window's LAST observation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_col``, ``time_col`` and ``target_col``. It is not modified.
        Its index may hold any labels (duplicates included); alignment is positional.
    target_col : str
        Column holding the series to analyse.
    group_col : str
        Column identifying each independent series.
    window_size : int
        Number of consecutive observations per window. Groups with fewer rows are skipped.
    stride : int
        Step between consecutive windows.
    time_col : str
        Column used to order rows inside each group.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df`` plus the columns
        ``tda_entropy_H0``, ``tda_entropy_H1``, ``tda_amplitude_H0`` and
        ``tda_amplitude_H1``. Rows without a complete window (and NaN outputs) hold 0.
        Pre-existing columns with those four names are replaced by the new values.
        The four columns are always present, even when no group could be processed.
    """
    print(f"BEGGINING TDA EXTRACTION (Window={window_size}, Column='{target_col}')...")

    tda_cols = ['tda_entropy_H0', 'tda_entropy_H1', 'tda_amplitude_H0', 'tda_amplitude_H1']

    TE = TakensEmbedding(dimension=3, time_delay=1)
    VR = VietorisRipsPersistence(homology_dimensions=[0, 1])
    PE = PersistenceEntropy()
    AMP = Amplitude(metric='wasserstein')
    # The windowing transformer does not depend on the city, so build it once.
    SW = SlidingWindow(size=window_size, stride=stride)

    # Work on a positional (0..n-1) index so results can be written back by row
    # position; this keeps the function correct for any index on `df`.
    needed_cols = list(dict.fromkeys([group_col, time_col, target_col]))
    work = df[needed_cols].reset_index(drop=True)

    features = np.full((len(work), len(tda_cols)), np.nan)
    groups_done = 0

    # One pass over the groups instead of one full-frame boolean filter per city.
    # sort=False keeps first-appearance order, like the former `unique()` loop.
    for geo, city_data in tqdm(work.groupby(group_col, sort=False), desc="Processando Cidades"):
        city_data = city_data.sort_values(time_col)
        series = city_data[target_col].values

        if len(series) < window_size:
            continue

        windows = SW.fit_transform(series)

        try:
            point_clouds = TE.fit_transform(windows)

            diagrams = VR.fit_transform(point_clouds)

            entropy = PE.fit_transform(diagrams)
            amplitude = AMP.fit_transform(diagrams)

        except Exception as e:
            # Best effort: a failing city is reported and left with zero features.
            print(f"Erro na cidade {geo}: {e}")
            continue

        # giotto-tda's SlidingWindow anchors windows at the END of the series: the last
        # window always ends on the last observation, and leading samples are discarded
        # when (n - size) is not a multiple of the stride. Window k therefore ends at
        # position n - 1 - stride * (n_windows - 1 - k). For stride == 1 this is the
        # former `index[window_size - 1:]`.
        n_windows = len(entropy)
        end_positions = len(series) - 1 - stride * np.arange(n_windows - 1, -1, -1)
        rows = city_data.index.to_numpy()[end_positions]

        features[rows, 0] = entropy[:, 0]
        features[rows, 1] = entropy[:, 1]
        features[rows, 2] = amplitude[:, 0]
        features[rows, 3] = amplitude[:, 1]

        groups_done += 1

    if groups_done == 0:
        print("No results found.")
    else:
        print("Merging on main df")

    # Drop stale TDA columns (re-run on an already processed frame) so the fresh values
    # are the ones kept; `drop` returns a new frame, so the caller's `df` is untouched.
    df_merged = df.drop(columns=tda_cols, errors='ignore')

    # Rows never covered by a window, and NaN feature values, become 0.
    filled = np.where(np.isnan(features), 0.0, features)
    for position, col in enumerate(tda_cols):
        df_merged[col] = filled[:, position]

    if groups_done:
        print("Finished")
    return df_merged

In [None]:
def preprocess_for_tft(
    df_casos,
    df_climate,
    df_environ,
    df_forecast_climate,
    df_ocean,
    df_pop,
    df_reg_health,
    df_episcanner
):
    """
    Build the modelling table for TFT training from the raw input frames.

    Steps, in order: calendar/time features on the case table; left-joins of climate
    (geocode + date), ocean indices (date), monthly climate forecasts
    (geocode + year + month), population (geocode + year), environmental variables and
    health regions (geocode) and Episcanner targets (geocode + year); removal of rows
    without ``R0``; incidence per 100,000 inhabitants; rolling TDA features on incidence.

    Every forward fill is done inside each ``geocode`` so that values never flow from
    one city into the next. None of the input frames is modified.

    Parameters
    ----------
    df_casos : DataFrame with at least ``geocode``, ``date``, ``casos`` (optional ``epiweek``).
    df_climate : DataFrame keyed by ``geocode`` and ``date``.
    df_environ : DataFrame keyed by ``geocode``.
    df_forecast_climate : DataFrame with ``geocode``, ``reference_month``,
        ``forecast_months_ahead``, ``temp_med``, ``umid_med``, ``precip_tot``.
    df_ocean : DataFrame with ``date``, ``enso``, ``iod``, ``pdo``.
    df_pop : DataFrame with ``geocode``, ``year``, ``population``.
    df_reg_health : DataFrame keyed by ``geocode``.
    df_episcanner : DataFrame with ``geocode``, ``year``, ``R0``, ``peak_week``,
        ``total_cases``, ``alpha``, ``beta``.

    Returns
    -------
    tuple
        ``(df, known_reals, unknown_reals, static_cats, targets)``: the processed frame
        and four lists of column names describing the role of each feature.
    """

    print("Beggining Pre-processing")

    df = df_casos.copy()

    df['date'] = pd.to_datetime(df['date'])
    # `assign` returns new frames, so the caller's climate/ocean tables keep their
    # original `date` column.
    df_climate = df_climate.assign(date=pd.to_datetime(df_climate['date']))
    df_ocean = df_ocean.assign(date=pd.to_datetime(df_ocean['date']))

    # Sorted by city then time: every per-geocode forward fill below relies on this.
    df = df.sort_values(['geocode', 'date']).reset_index(drop=True)

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    min_date = df['date'].min()
    # Whole weeks elapsed since the earliest date in the data.
    df['time_idx'] = (df['date'] - min_date).dt.days // 7

    if 'epiweek' in df.columns:
        # Keeps the last two characters of `epiweek` as the week number.
        df['week_of_year'] = df['epiweek'].astype(str).str[-2:].astype(int)
    else:
        df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)

    # Re-base the week counter so that week 41 becomes 1 and week 40 becomes 52.
    # NOTE(review): week 53 maps to 13, the same value as week 1 - confirm this is acceptable.
    week = df['week_of_year']
    df['week_cycle'] = np.where(week >= 41, week - 40, week + 12)

    df['sin_week_cycle'] = np.sin(2 * np.pi * df['week_cycle'] / 52)
    df['cos_week_cycle'] = np.cos(2 * np.pi * df['week_cycle'] / 52)

    print("Integrating Climate")

    cols_clima = [c for c in df_climate.columns if c not in ['epiweek']]
    df = pd.merge(df, df_climate[cols_clima], on=['geocode', 'date'], how='left')

    present_climate_cols = [c for c in cols_clima if c in df.columns and c not in ['geocode', 'date']]
    if present_climate_cols:
        df[present_climate_cols] = df.groupby('geocode')[present_climate_cols].ffill()

    print("Integrating Ocean Climate")

    df = pd.merge(df, df_ocean, on='date', how='left')
    # Fill per city: a frame-wide ffill would copy the previous city's LAST (latest)
    # values into the first rows of the next city.
    ocean_cols = ['enso', 'iod', 'pdo']
    df[ocean_cols] = df.groupby('geocode')[ocean_cols].ffill()

    print("Integrating Forecasting")

    df_fc = df_forecast_climate.copy()

    reference_month = pd.to_datetime(df_fc['reference_month'])
    df_fc['reference_month'] = reference_month

    # Target year/month of each forecast = reference month + lead time in months,
    # computed on a running month count instead of a row-wise DateOffset.
    months_ahead = df_fc['forecast_months_ahead'].astype(int)
    month_count = reference_month.dt.year * 12 + (reference_month.dt.month - 1) + months_ahead
    df_fc['year'] = month_count // 12
    df_fc['month'] = month_count % 12 + 1

    rename_dict = {
        'temp_med': 'forecast_temp_med',
        'umid_med': 'forecast_umid_med',
        'precip_tot': 'forecast_precip_tot'
    }
    df_fc = df_fc.rename(columns=rename_dict)

    # Several forecasts can target the same month; average them.
    cols_to_use = ['geocode', 'year', 'month', 'forecast_temp_med', 'forecast_umid_med', 'forecast_precip_tot']
    df_fc_clean = df_fc[cols_to_use].groupby(['geocode', 'year', 'month']).mean().reset_index()

    df = pd.merge(df, df_fc_clean, on=['geocode', 'year', 'month'], how='left')

    forecast_cols = list(rename_dict.values())
    # Per-city fill, for the same reason as the ocean indices.
    df[forecast_cols] = df.groupby('geocode')[forecast_cols].ffill()

    print("Integrating Population")
    df = pd.merge(df, df_pop, on=['geocode', 'year'], how='left')

    # Carry the last known population forward inside each city BEFORE deriving anything
    # from it, so `log_pop` and `incidence` agree for years without a population row.
    df['population'] = df.groupby('geocode')['population'].ffill()
    df['log_pop'] = np.log1p(df['population'])

    print("Integrating Environmental Variables")

    if 'uf_code' in df.columns and 'uf_code' in df_environ.columns:
        df = df.drop(columns=['uf_code'])

    df = pd.merge(df, df_environ, on='geocode', how='left')

    cols_reg = ['geocode', 'macroregion_name', 'regional_name']
    cols_reg_exist = [c for c in cols_reg if c in df_reg_health.columns]
    df = pd.merge(df, df_reg_health[cols_reg_exist], on='geocode', how='left')

    print("Integrating Episcanner Data")

    target_cols = ['geocode', 'year', 'R0', 'peak_week', 'total_cases', 'alpha', 'beta']
    df_epi_targets = df_episcanner[target_cols].copy()

    df_epi_targets['log_total_cases'] = np.log1p(df_epi_targets['total_cases'])

    # NOTE(review): assumes one Episcanner row per (geocode, year); duplicates would
    # multiply the weekly rows - verify against the source file.
    df = pd.merge(df, df_epi_targets, on=['geocode', 'year'], how='left')

    # The first figure is the row count BEFORE dropping rows without R0.
    print(f"Null: {len(df)}")
    df = df.dropna(subset=['R0'])
    print(f"Complete: {len(df)}")

    df['casos'] = df['casos'].fillna(0)
    # Cases per 100,000 inhabitants; a zero or unknown population yields 0, not inf/NaN.
    incidence = (df['casos'] / df['population']) * 100000
    df['incidence'] = incidence.replace([np.inf, -np.inf], np.nan).fillna(0)

    print("Calculating TDA Features")
    df = calculate_rolling_tda(
        df,
        target_col='incidence',
        group_col='geocode'
    )

    known_reals = [
        "time_idx",
        "week_cycle",
        "sin_week_cycle",
        "cos_week_cycle",
        "log_pop",
        "forecast_temp_med",
        "forecast_umid_med",
        "forecast_precip_tot"
    ]

    tda_features = ['tda_entropy_H0', 'tda_entropy_H1', 'tda_amplitude_H0', 'tda_amplitude_H1']

    unknown_reals = [
        "casos",
        "incidence",
        "temp_med",
        "precip_med",
        "rel_humid_med",
        "enso",
        "iod"
    ] + tda_features

    static_cats = ["koppen", "biome", "macroregion_name"]

    targets = ["R0", "peak_week", "log_total_cases", "alpha", "beta"]

    print("Finished!")
    return df, known_reals, unknown_reals, static_cats, targets

In [None]:
# Run the full pre-processing pipeline on the eight frames loaded above. It returns
# the processed frame plus the lists of known reals, unknown reals, static
# categoricals and targets.
df_final, known, unknown, statics, targets = preprocess_for_tft(
    data_casos, data_climate, data_environ, data_forecast_climate,
    data_ocean, data_pop, data_reg_health, data_episcanner
)

## Saving

In [None]:
def save_optimized_dataset(df, filepath):
    """
    Write a memory-optimised copy of ``df`` to a zstd-compressed parquet file.

    The optimisation is applied to a copy, so the caller's frame keeps its dtypes:

    * ``float64`` columns are stored as ``float32``;
    * integer columns are stored in the smallest signed integer type that holds
      their values;
    * the known label columns (``uf``, ``biome``, ``koppen``, ``macroregion_name``,
      ``regional_name``, ``month``, ``geocode_name``) become ``category`` when fewer
      than half of their values are distinct.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save. May be empty.
    filepath : str or path-like
        Destination file. Missing parent directories are created.

    Returns
    -------
    None
    """
    # NOTE(review): float32 represents integers exactly only up to 2**24; confirm that
    # no float64 column stores larger identifiers or counts.
    numeric_dtypes = {col: 'float32' for col in df.select_dtypes(include=['float64']).columns}

    for col in df.select_dtypes(include=[np.integer]).columns:
        # `downcast='integer'` selects the smallest signed type that fits, boundary
        # values included, and leaves the column as is when nothing smaller fits.
        numeric_dtypes[col] = pd.to_numeric(df[col], downcast='integer').dtype

    # `astype` returns a new frame; every change below is made on that copy.
    optimized = df.astype(numeric_dtypes)

    categorical_candidates = [
        'uf', 'biome', 'koppen', 'macroregion_name', 'regional_name',
        'month', 'geocode_name'
    ]

    n_rows = len(optimized)
    for col in categorical_candidates:
        # `n_rows` guard: an empty frame would otherwise divide by zero.
        if col in optimized.columns and n_rows:
            if optimized[col].nunique() / n_rows < 0.5:
                optimized[col] = optimized[col].astype('category')

    # Create the output directory if it does not exist yet.
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)

    print("Saving compressed parquet")
    optimized.to_parquet(
        filepath,
        compression='zstd',
        index=False,
        engine='pyarrow'
    )
    print(f"Saved to: {filepath}")

In [None]:
def save_tft_config(known, unknown, statics, targets, filepath="../data/processed/tft_config.json",
                    static_reals=None):
    """
    Save a JSON config file with the dataset's metadata for TFT.

    Parameters
    ----------
    known : sequence of str
        Names stored under ``time_varying_known_reals``.
    unknown : sequence of str
        Names stored under ``time_varying_unknown_reals``.
    statics : sequence of str
        Names stored under ``static_categoricals``.
    targets : sequence of str
        Names stored under ``targets``.
    filepath : str or path-like
        Destination JSON file. Missing parent directories are created.
    static_reals : sequence of str, optional
        Names stored under ``static_reals``. Defaults to ``["num_neighbors"]``.

    Returns
    -------
    None
    """
    def _as_list(names):
        # json cannot serialise pandas Index / numpy arrays; a lone string stays one name.
        return [names] if isinstance(names, str) else list(names)

    config = {
        "time_varying_known_reals": _as_list(known),
        "time_varying_unknown_reals": _as_list(unknown),
        "static_categoricals": _as_list(statics),
        "targets": _as_list(targets),
        "static_reals": ["num_neighbors"] if static_reals is None else _as_list(static_reals)
    }

    out_path = Path(filepath)
    # Create the output directory if it does not exist yet.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=4)

    print(f"Saved to: {filepath}")

In [None]:
# Write the processed frame to a zstd-compressed parquet file under ../data/processed.
save_optimized_dataset(df_final, "../data/processed/dataset_tft_completo.parquet")

In [None]:
# Store the feature-role lists returned by the pre-processing step next to the dataset.
save_tft_config(known, unknown, statics, targets, "../data/processed/tft_config.json")

## Graph Embedding

In [1]:
from src.graph_embedding import generate_graph_embeddings

In [2]:
# In your Jupyter notebook: generate the graph embeddings from the adjacency edge
# list; the log below shows the result being saved to `output_path`.
df_graph, cols = generate_graph_embeddings(
    edges_path="../data/processed/adjacencia_edges.csv",
    output_path="../data/processed/graph_embeddings.csv"
)

Generating graph embeddings
Building graph
Processing 2729 cities
Calculating embeddings




Saved to: ../data/processed/graph_embeddings.csv
