In [1]:
import numpy as np
import pandas as pd
from gtda.time_series import SlidingWindow, TakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude
from tqdm import tqdm
import json

## Opening data

In [11]:
# Read the dengue case table into `data_casos`, then preview its first rows
# and list its column names. `data_casos` is the frame later passed to
# preprocess_for_tft as `df_casos`.
file_path = "../data/raw/data_sprint_2025/dengue.csv"
data_casos = pd.read_csv(file_path)
display(data_casos.head())
print(f"Columns: {data_casos.columns}")

Unnamed: 0,date,epiweek,geocode,casos,uf,macroregional_geocode,regional_geocode,train_1,target_1,train_2,target_2,train_3,target_3
0,2010-01-03,201001,1100015,3,RO,1101,11005,True,False,True,False,True,False
1,2010-01-03,201001,1100023,12,RO,1102,11001,True,False,True,False,True,False
2,2010-01-03,201001,1100031,2,RO,1101,11006,True,False,True,False,True,False
3,2010-01-03,201001,1100049,159,RO,1101,11002,True,False,True,False,True,False
4,2010-01-03,201001,1100056,9,RO,1101,11006,True,False,True,False,True,False


Columns: Index(['date', 'epiweek', 'geocode', 'casos', 'uf', 'macroregional_geocode',
       'regional_geocode', 'train_1', 'target_1', 'train_2', 'target_2',
       'train_3', 'target_3'],
      dtype='object')


In [3]:
# Read the climate table into `data_climate`, then preview its first rows and
# list its column names. It is later joined to the cases on geocode and date.
file_path = "../data/raw/data_sprint_2025/climate.csv"
data_climate = pd.read_csv(file_path)
display(data_climate.head())
print(f"Columns: {data_climate.columns}")

Unnamed: 0,date,epiweek,geocode,temp_min,temp_med,temp_max,precip_min,precip_med,precip_max,pressure_min,pressure_med,pressure_max,rel_humid_min,rel_humid_med,rel_humid_max,thermal_range,rainy_days
0,2009-12-27,200952,2700102,22.2044,25.44435,29.91435,2.0215,6.4583,14.3808,0.95505,0.9577,0.95945,53.97185,75.81715,93.8575,7.70995,2
1,2010-01-03,201001,2700102,22.168286,26.178429,31.207286,1.4378,3.7951,12.3307,0.956043,0.957986,0.959571,42.221671,67.825757,91.631429,9.039,7
2,2010-01-10,201002,2700102,22.620286,26.522171,31.730257,0.033,4.42,20.3611,0.956871,0.958829,0.9607,44.396814,67.694114,88.871786,9.109971,7
3,2010-01-17,201003,2700102,22.141086,26.435771,32.14,0.0424,2.6354,11.3664,0.956643,0.958886,0.960614,41.461671,66.964243,89.9223,9.998914,7
4,2010-01-24,201004,2700102,22.709029,27.368457,32.941043,0.2825,1.8231,4.4909,0.954871,0.957071,0.958943,37.207171,62.199757,87.383871,10.232014,7


Columns: Index(['date', 'epiweek', 'geocode', 'temp_min', 'temp_med', 'temp_max',
       'precip_min', 'precip_med', 'precip_max', 'pressure_min',
       'pressure_med', 'pressure_max', 'rel_humid_min', 'rel_humid_med',
       'rel_humid_max', 'thermal_range', 'rainy_days'],
      dtype='object')


In [5]:
# Read the per-geocode environmental attributes into `data_environ`, then
# preview its first rows and list its column names.
file_path = "../data/raw/data_sprint_2025/environ_vars.csv"
data_environ = pd.read_csv(file_path)
display(data_environ.head())
print(f"Columns: {data_environ.columns}")

Unnamed: 0,geocode,uf_code,koppen,biome
0,1100015,11,Am,Amazônia
1,1100023,11,Am,Amazônia
2,1100031,11,Am,Amazônia
3,1100049,11,Am,Amazônia
4,1100056,11,Am,Amazônia


Columns: Index(['geocode', 'uf_code', 'koppen', 'biome'], dtype='object')


In [6]:
# Read the climate forecast table into `data_forecast_climate`, then preview
# its first rows and list its column names. Each row carries a reference month
# and a `forecast_months_ahead` lead.
file_path = "../data/raw/data_sprint_2025/forecasting_climate.csv"
data_forecast_climate = pd.read_csv(file_path)
display(data_forecast_climate.head())
print(f"Columns: {data_forecast_climate.columns}")

Unnamed: 0,geocode,reference_month,forecast_months_ahead,temp_med,umid_med,precip_tot
0,1100015,2010-01-01,1,25.452503,87.700993,0.000117
1,1100015,2010-01-01,2,25.591567,87.56588,0.000112
2,1100015,2010-01-01,3,25.499011,87.66027,0.000101
3,1100015,2010-01-01,4,25.057325,86.359595,5.8e-05
4,1100015,2010-01-01,5,24.504658,81.563863,2.3e-05


Columns: Index(['geocode', 'reference_month', 'forecast_months_ahead', 'temp_med',
       'umid_med', 'precip_tot'],
      dtype='object')


In [7]:
# Read the ocean climate oscillation indices into `data_ocean`, then preview
# its first rows and list its column names. The table has no geocode column;
# it is later joined to the cases by date only.
file_path = "../data/raw/data_sprint_2025/ocean_climate_oscillations.csv"
data_ocean = pd.read_csv(file_path)
display(data_ocean.head())
print(f"Columns: {data_ocean.columns}")

Unnamed: 0,date,enso,iod,pdo
0,1993-01-04,1.12037,-0.339384,0.816528
1,1993-01-11,1.212844,0.036703,1.069586
2,1993-01-17,1.147867,0.044273,0.671869
3,1993-01-24,1.001203,-0.160355,0.543553
4,1993-01-31,1.051898,-0.420325,1.431639


Columns: Index(['date', 'enso', 'iod', 'pdo'], dtype='object')


In [8]:
# Read the population table into `data_pop`, then preview its first rows and
# list its column names. It is later joined to the cases on geocode and year.
file_path = "../data/raw/data_sprint_2025/datasus_population_2001_2024.csv"
data_pop = pd.read_csv(file_path)
display(data_pop.head())
print(f"Columns: {data_pop.columns}")

Unnamed: 0,geocode,year,population
0,1100015,2001,26553
1,1100023,2001,75735
2,1100031,2001,7446
3,1100049,2001,73283
4,1100056,2001,18150


Columns: Index(['geocode', 'year', 'population'], dtype='object')


In [9]:
# Read the health-region mapping into `data_reg_health`, then preview its
# first rows and list its column names. Only the region name columns are
# joined to the cases later, keyed by geocode.
file_path = "../data/raw/data_sprint_2025/map_regional_health.csv"
data_reg_health = pd.read_csv(file_path)
display(data_reg_health.head())
print(f"Columns: {data_reg_health.columns}")

Unnamed: 0,macroregion_code,macroregion_name,uf_code,uf,uf_name,macroregional_geocode,macroregional_name,regional_geocode,regional_name,geocode,geocode_name
0,1,Norte,12,AC,Acre,1201,MACRO UNICA - AC,12002,BAIXO ACRE E PURUS,1200013,AC - ACRELANDIA
1,1,Norte,12,AC,Acre,1201,MACRO UNICA - AC,12001,ALTO ACRE,1200054,AC - ASSIS BRASIL
2,1,Norte,12,AC,Acre,1201,MACRO UNICA - AC,12001,ALTO ACRE,1200104,AC - BRASILEIA
3,1,Norte,12,AC,Acre,1201,MACRO UNICA - AC,12002,BAIXO ACRE E PURUS,1200138,AC - BUJARI
4,1,Norte,12,AC,Acre,1201,MACRO UNICA - AC,12002,BAIXO ACRE E PURUS,1200179,AC - CAPIXABA


Columns: Index(['macroregion_code', 'macroregion_name', 'uf_code', 'uf', 'uf_name',
       'macroregional_geocode', 'macroregional_name', 'regional_geocode',
       'regional_name', 'geocode', 'geocode_name'],
      dtype='object')


In [10]:
# Read the Episcanner table into `data_episcanner`, then preview its first
# rows and list its column names. Its R0, peak_week, total_cases, alpha and
# beta columns are used as the model targets by preprocess_for_tft.
file_path = "../data/raw/data_sprint_2025/dados_episcanner.csv"
data_episcanner = pd.read_csv(file_path)
display(data_episcanner.head())
print(f"Columns: {data_episcanner.columns}")

Unnamed: 0,disease,CID10,year,geocode,muni_name,peak_week,beta,gamma,R0,total_cases,alpha,sum_res,ep_ini,ep_end,ep_dur
0,dengue,A90,2011,1200302,Feijó,18.809435,1.013568,0.301084,3.366395,225.737239,0.702946,0.69111,201107,201117,10
1,dengue,A90,2011,1200203,Cruzeiro do Sul,15.225387,0.507522,0.328168,1.546532,141.976093,0.353392,1.494362,201046,201123,29
2,dengue,A90,2011,1200807,Porto Acre,10.092337,0.582864,0.300476,1.939806,76.503274,0.484484,1.447265,201046,201114,20
3,dengue,A90,2011,1200104,Brasiléia,17.145896,0.444267,0.300553,1.478166,405.657193,0.323486,1.210796,201046,201128,34
4,dengue,A90,2011,1200450,Senador Guiomard,11.477129,0.512876,0.30004,1.709358,803.882522,0.414985,1.290211,201046,201118,24


Columns: Index(['disease', 'CID10', 'year', 'geocode', 'muni_name', 'peak_week', 'beta',
       'gamma', 'R0', 'total_cases', 'alpha', 'sum_res', 'ep_ini', 'ep_end',
       'ep_dur'],
      dtype='object')


## Pre-processing

In [13]:
def calculate_rolling_tda(df, target_col='casos', group_col='geocode', window_size=53, stride=1,
                          time_col='time_idx'):
    """
    Compute topological features using a sliding window.

    For every group in ``group_col`` the ``target_col`` series is ordered by
    ``time_col``, cut into sliding windows, embedded with a Takens embedding
    (dimension 3, delay 1) and summarised by the persistence entropy and the
    Wasserstein amplitude of its Vietoris-Rips persistence diagrams in
    homology dimensions 0 and 1. The features of a window are written on the
    row where that window ends, so a row only sees its own past.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target_col``, ``group_col`` and ``time_col``. Its index
        is used to align the features back onto the rows, so it should be
        unique.
    target_col : str
        Column holding the series to analyse.
    group_col : str
        Column identifying each independent series.
    window_size : int
        Number of consecutive samples per window. Groups with fewer samples
        are skipped.
    stride : int
        Step between consecutive windows.
    time_col : str
        Column used to order the samples inside each group.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` plus ``tda_entropy_H0``,
        ``tda_entropy_H1``, ``tda_amplitude_H0`` and ``tda_amplitude_H1``.
        Rows on which no window ends (including every row of a skipped or
        failed group) hold 0. Any pre-existing columns with those names are
        recomputed.
    """
    print(f"BEGGINING TDA EXTRACTION (Window={window_size}, Column='{target_col}')...")

    tda_cols = ['tda_entropy_H0', 'tda_entropy_H1', 'tda_amplitude_H0', 'tda_amplitude_H1']

    # The transformers hold no per-city configuration, so build them once.
    TE = TakensEmbedding(dimension=3, time_delay=1)
    VR = VietorisRipsPersistence(homology_dimensions=[0, 1])
    PE = PersistenceEntropy()
    AMP = Amplitude(metric='wasserstein')
    SW = SlidingWindow(size=window_size, stride=stride)

    tda_results = []

    # One groupby pass instead of one boolean mask over the whole frame per city.
    # sort=False keeps the groups in order of first appearance.
    for geo, city_data in tqdm(df.groupby(group_col, sort=False), desc="Processando Cidades"):
        city_data = city_data.sort_values(time_col)
        series = city_data[target_col].values
        n_samples = len(series)

        if n_samples < window_size:
            continue

        windows = SW.fit_transform(series)

        try:
            point_clouds = TE.fit_transform(windows)

            diagrams = VR.fit_transform(point_clouds)

            entropy = PE.fit_transform(diagrams)
            amplitude = AMP.fit_transform(diagrams)

        except Exception as e:
            # Best effort: a city whose diagrams cannot be computed is reported
            # and left with the zero fill applied below.
            print(f"Erro na cidade {geo}: {e}")
            continue

        # giotto-tda's SlidingWindow documents that the last window always ends
        # on the last sample (leading samples are dropped when the stride does
        # not divide evenly). Window ends are therefore spaced `stride` apart
        # counting back from the final position. For stride=1 this is exactly
        # positions window_size-1 .. n_samples-1.
        n_windows = len(entropy)
        end_positions = (n_samples - 1) - stride * np.arange(n_windows - 1, -1, -1)
        valid_indices = city_data.index[end_positions]

        tda_results.append(
            pd.DataFrame(
                {
                    'tda_entropy_H0': entropy[:, 0],
                    'tda_entropy_H1': entropy[:, 1],
                    'tda_amplitude_H0': amplitude[:, 0],
                    'tda_amplitude_H1': amplitude[:, 1],
                },
                index=valid_indices,
            )
        )

    # Drop stale feature columns so a second run recomputes them instead of
    # colliding with (and silently keeping) the old values.
    base = df.drop(columns=[c for c in tda_cols if c in df.columns])

    if not tda_results:
        print("No results found.")
        # Keep the output schema stable: callers always get the TDA columns.
        return base.assign(**{col: 0.0 for col in tda_cols})

    df_tda_final = pd.concat(tda_results)

    print("Merging on main df")
    # Index-on-index left join; only the four feature columns are brought in,
    # so no suffix clean-up is needed.
    df_merged = base.join(df_tda_final, how='left')
    df_merged[tda_cols] = df_merged[tda_cols].fillna(0)

    print("Finished")
    return df_merged

In [14]:
def preprocess_for_tft(
    df_casos,
    df_climate,
    df_environ,
    df_forecast_climate,
    df_ocean,
    df_pop,
    df_reg_health,
    df_episcanner
):
    """
    Process all data for TFT training.

    Builds one weekly row per geocode and date from the case table and joins
    onto it, in order: observed climate, ocean indices, climate forecasts,
    population, static environmental and regional attributes, and the
    Episcanner targets. Rows without an ``R0`` target are dropped, incidence
    per 100,000 inhabitants is derived, and rolling TDA features of the
    incidence are added through ``calculate_rolling_tda``.

    None of the input frames is modified.

    Parameters
    ----------
    df_casos : pandas.DataFrame
        Needs ``date``, ``geocode`` and ``casos``; ``epiweek`` is used for the
        week of year when present.
    df_climate : pandas.DataFrame
        Needs ``date`` and ``geocode`` plus the climate variables.
    df_environ : pandas.DataFrame
        Static attributes keyed by ``geocode``.
    df_forecast_climate : pandas.DataFrame
        Needs ``geocode``, ``reference_month``, ``forecast_months_ahead``,
        ``temp_med``, ``umid_med`` and ``precip_tot``.
    df_ocean : pandas.DataFrame
        Needs ``date``, ``enso``, ``iod`` and ``pdo``.
    df_pop : pandas.DataFrame
        Needs ``geocode``, ``year`` and ``population``.
    df_reg_health : pandas.DataFrame
        Keyed by ``geocode``; ``macroregion_name`` and ``regional_name`` are
        used when present.
    df_episcanner : pandas.DataFrame
        Needs ``geocode``, ``year``, ``R0``, ``peak_week``, ``total_cases``,
        ``alpha`` and ``beta``.

    Returns
    -------
    tuple
        ``(df, known_reals, unknown_reals, static_cats, targets)``: the
        processed frame and four lists of column names describing its role
        in the model.
    """

    print("Beggining Pre-processing")

    df = df_casos.copy()
    df['date'] = pd.to_datetime(df['date'])

    # Parse dates on private copies: assigning into the arguments would change
    # the caller's frames as a side effect of preprocessing.
    df_climate = df_climate.assign(date=pd.to_datetime(df_climate['date']))
    df_ocean = df_ocean.assign(date=pd.to_datetime(df_ocean['date']))

    df = df.sort_values(['geocode', 'date']).reset_index(drop=True)

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    min_date = df['date'].min()
    # Whole weeks elapsed since the first date in the case table.
    df['time_idx'] = ((df['date'] - min_date).dt.days / 7).astype(int)

    if 'epiweek' in df.columns:
        # epiweek is YYYYWW; the last two digits are the week number.
        df['week_of_year'] = df['epiweek'].astype(str).str[-2:].astype(int)
    else:
        df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)

    # Shift the cycle so it starts at week 41: weeks 41..53 map to 1..13 and
    # weeks 1..40 map to 13..52.
    week = df['week_of_year']
    df['week_cycle'] = np.where(week >= 41, week - 40, week + 12)

    df['sin_week_cycle'] = np.sin(2 * np.pi * df['week_cycle'] / 52)
    df['cos_week_cycle'] = np.cos(2 * np.pi * df['week_cycle'] / 52)

    print("Integrating Climate")

    cols_clima = [c for c in df_climate.columns if c not in ['epiweek']]
    df = pd.merge(df, df_climate[cols_clima], on=['geocode', 'date'], how='left')

    present_climate_cols = [c for c in df_climate.columns if c in df.columns and c not in ['geocode', 'date']]
    df[present_climate_cols] = df.groupby('geocode')[present_climate_cols].ffill()

    print("Integrating Ocean Climate")

    # The ocean indices are the same for every geocode on a given date, so
    # they are resolved once per date with an as-of join (latest observation
    # on or before the date). Forward-filling the merged frame instead would
    # walk across geocode boundaries, because rows are ordered by geocode
    # first, and would copy one city's last value onto the next city's first
    # weeks.
    week_dates = df[['date']].dropna().drop_duplicates().sort_values('date')
    ocean_by_date = df_ocean.dropna(subset=['date']).sort_values('date').ffill()
    # merge_asof needs identical key dtypes on both sides.
    ocean_by_date['date'] = ocean_by_date['date'].astype(week_dates['date'].dtype)
    ocean_asof = pd.merge_asof(week_dates, ocean_by_date, on='date', direction='backward')
    df = pd.merge(df, ocean_asof, on='date', how='left')

    print("Integrating Forecasting")

    df_fc = df_forecast_climate.dropna(subset=['reference_month', 'forecast_months_ahead']).copy()

    # Target month = reference month + lead, done as integer month arithmetic
    # (months since year 0) instead of a row-by-row DateOffset.
    reference = pd.to_datetime(df_fc['reference_month'])
    month_number = (
        reference.dt.year * 12
        + (reference.dt.month - 1)
        + df_fc['forecast_months_ahead'].astype(int)
    )
    df_fc['year'] = month_number // 12
    df_fc['month'] = month_number % 12 + 1

    rename_dict = {
        'temp_med': 'forecast_temp_med',
        'umid_med': 'forecast_umid_med',
        'precip_tot': 'forecast_precip_tot'
    }
    df_fc = df_fc.rename(columns=rename_dict)

    # Several leads can target the same month; they are averaged.
    cols_to_use = ['geocode', 'year', 'month', 'forecast_temp_med', 'forecast_umid_med', 'forecast_precip_tot']
    df_fc_clean = df_fc[cols_to_use].groupby(['geocode', 'year', 'month']).mean().reset_index()

    df = pd.merge(df, df_fc_clean, on=['geocode', 'year', 'month'], how='left')

    # Fill gaps inside each geocode only; a frame-wide ffill would leak the
    # previous geocode's forecast into this one.
    forecast_cols = list(rename_dict.values())
    df[forecast_cols] = df.groupby('geocode')[forecast_cols].ffill()

    print("Integrating Population")
    df = pd.merge(df, df_pop, on=['geocode', 'year'], how='left')

    # Carry the last known population forward inside each geocode before
    # deriving anything from it, so log_pop and incidence agree for years
    # that have no population row.
    df['population'] = df.groupby('geocode')['population'].ffill()
    df['log_pop'] = np.log1p(df['population'])

    print("Integrating Environmental Variables")

    if 'uf_code' in df.columns and 'uf_code' in df_environ.columns:
        df = df.drop(columns=['uf_code'])

    df = pd.merge(df, df_environ, on='geocode', how='left')

    cols_reg = ['geocode', 'macroregion_name', 'regional_name']
    cols_reg_exist = [c for c in cols_reg if c in df_reg_health.columns]
    df = pd.merge(df, df_reg_health[cols_reg_exist], on='geocode', how='left')

    print("Integrating Episcanner Data")

    target_cols = ['geocode', 'year', 'R0', 'peak_week', 'total_cases', 'alpha', 'beta']
    df_epi_targets = df_episcanner[target_cols].copy()

    df_epi_targets['log_total_cases'] = np.log1p(df_epi_targets['total_cases'])

    # NOTE(review): this assumes at most one Episcanner row per geocode and
    # year. More than one (e.g. several diseases in the table) would
    # duplicate the weekly rows - confirm against the data.
    df = pd.merge(df, df_epi_targets, on=['geocode', 'year'], how='left')

    print(f"Null: {len(df)}")
    df = df.dropna(subset=['R0'])
    print(f"Complete: {len(df)}")

    df['casos'] = df['casos'].fillna(0)
    df['incidence'] = (df['casos'] / df['population']) * 100000
    # Unknown or zero population yields NaN/inf; both are stored as 0.
    df['incidence'] = df['incidence'].replace([np.inf, -np.inf], np.nan).fillna(0)

    print("Calculating TDA Features")
    df = calculate_rolling_tda(
        df,
        target_col='incidence',
        group_col='geocode'
    )

    known_reals = [
        "time_idx",
        "week_cycle",
        "sin_week_cycle",
        "cos_week_cycle",
        "log_pop",
        "forecast_temp_med",
        "forecast_umid_med",
        "forecast_precip_tot"
    ]

    tda_features = ['tda_entropy_H0', 'tda_entropy_H1', 'tda_amplitude_H0', 'tda_amplitude_H1']

    unknown_reals = [
        "casos",
        "incidence",
        "temp_med",
        "precip_med",
        "rel_humid_med",
        "enso",
        "iod"
    ] + tda_features

    static_cats = ["koppen", "biome", "macroregion_name"]

    targets = ["R0", "peak_week", "log_total_cases", "alpha", "beta"]

    print("Finished!")
    return df, known_reals, unknown_reals, static_cats, targets

In [15]:
# Run the full preprocessing pipeline on the raw tables loaded above.
# Arguments are passed by keyword so each table is visibly bound to its role.
df_final, known, unknown, statics, targets = preprocess_for_tft(
    df_casos=data_casos,
    df_climate=data_climate,
    df_environ=data_environ,
    df_forecast_climate=data_forecast_climate,
    df_ocean=data_ocean,
    df_pop=data_pop,
    df_reg_health=data_reg_health,
    df_episcanner=data_episcanner,
)

Iniciando Pré-processamento para TFT (Nowcast Paramétrico)
Integrando Clima Histórico
Integrando Dados Oceânicos
Integrando Previsão Climática (Forecast)
Integrando População
Integrando Dados Estáticos
Integrando Targets do Episcanner (Target anual repetido semanalmente)
Linhas antes da limpeza de targets nulos: 4471809
Linhas após limpeza (apenas anos com dados do Episcanner): 1322423
BEGGINING TDA EXTRACTION (Window=53, Column='incidence')...


Processando Cidades: 100%|██████████| 4662/4662 [25:22<00:00,  3.06it/s]  


Merging on main df
Finished
Pré-processamento Concluído!


## Saving

In [2]:
def save_optimized_dataset(df, filepath):
    """
    Optimize dataset memory usage and save to parquet.

    The optimisation is applied to a converted copy; the caller's ``df`` keeps
    its dtypes.

    * ``float64`` columns are stored as ``float32``.
    * ``int64`` columns are stored in the smallest of ``int8``/``int16``/
      ``int32`` that holds their minimum and maximum (bounds inclusive);
      columns that need the full 64 bits are left as they are.
    * Known low-cardinality columns become ``category`` when fewer than half
      of their values are distinct.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to write.
    filepath : str or path-like
        Destination passed to ``DataFrame.to_parquet``.

    Returns
    -------
    None
    """
    dtype_map = {}

    for col in df.select_dtypes(include=['float64']).columns:
        dtype_map[col] = 'float32'

    for col in df.select_dtypes(include=['int64', 'int']).columns:
        c_min = df[col].min()
        c_max = df[col].max()

        # On an empty column min/max are NaN, every comparison is False and
        # the column keeps its dtype.
        for candidate in (np.int8, np.int16, np.int32):
            bounds = np.iinfo(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                dtype_map[col] = candidate
                break

    # astype returns a new frame, so the caller's data is never downcast.
    optimized = df.astype(dtype_map)

    categorical_candidates = [
        'uf', 'biome', 'koppen', 'macroregion_name', 'regional_name',
        'month', 'geocode_name'
    ]

    n_rows = len(optimized)
    for col in categorical_candidates:
        # n_rows guard: an empty frame has no ratio to compute.
        if col in optimized.columns and n_rows:
            if optimized[col].nunique() / n_rows < 0.5:
                optimized[col] = optimized[col].astype('category')

    print("Saving compressed parquet")
    optimized.to_parquet(
        filepath,
        compression='zstd',
        index=False,
        engine='pyarrow'
    )
    print(f"Saved to: {filepath}")

In [18]:
def save_tft_config(known, unknown, statics, targets, filepath="../data/processed/tft_config.json",
                    static_reals=None):
    """
    Save a JSON config file with the dataset's metadata for TFT.

    Parameters
    ----------
    known : list of str
        Written under ``time_varying_known_reals``.
    unknown : list of str
        Written under ``time_varying_unknown_reals``.
    statics : list of str
        Written under ``static_categoricals``.
    targets : list of str
        Written under ``targets``.
    filepath : str or path-like
        Destination JSON file. Missing parent directories are created.
    static_reals : list of str, optional
        Written under ``static_reals``. Defaults to ``["num_neighbors"]``.

    Returns
    -------
    None
    """
    # Local import: `os` is not among the notebook's top-level imports.
    import os

    if static_reals is None:
        static_reals = ["num_neighbors"]

    config = {
        "time_varying_known_reals": known,
        "time_varying_unknown_reals": unknown,
        "static_categoricals": statics,
        "targets": targets,
        "static_reals": static_reals
    }

    # Create the output directory up front so a fresh checkout does not fail
    # on open().
    if isinstance(filepath, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filepath))
        if parent:
            os.makedirs(parent, exist_ok=True)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=4)

    print(f"Saved to: {filepath}")

In [None]:
# Write the processed modelling table to a compressed parquet file.
save_optimized_dataset(
    df=df_final,
    filepath="../data/processed/dataset_tft_completo.parquet",
)

In [20]:
# Write the column-role lists returned by preprocess_for_tft next to the dataset.
save_tft_config(
    known=known,
    unknown=unknown,
    statics=statics,
    targets=targets,
    filepath="../data/processed/tft_config.json",
)

Saved to: ../data/processed/tft_config.json
