# DATA CLEANING
Preparando os dados e gerando um .csv limpo

In [16]:
import numpy as np
import pandas as pd
import os

def transform_binary(df, column):
    """Recode a yes/no style column of ``df`` in place to nullable integers.

    The value 2 is rewritten to 0, the value 9 becomes missing (``pd.NA``),
    every other value is left as it is, and the column is then cast to the
    nullable ``Int64`` dtype.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Name of the column to recode.

    Returns:
        The same ``df`` object, to allow chaining.
    """
    recode_table = {2: 0, 9: pd.NA}
    recoded = df[column].replace(recode_table)
    # "Int64" (capital I) is the nullable integer dtype, so missing values
    # do not force the column to float.
    df[column] = recoded.astype("Int64")
    return df

def transform_evolution(df):
    """Recode the ``EVOLUCAO`` column of ``df`` in place to a nullable 0/1 flag.

    Mapping applied: 2 -> 1, 1 -> 0, 3 -> 0 and 9 -> missing (``pd.NA``).
    All pairs are applied in one ``replace`` call, so a value rewritten by
    one pair is not rewritten again by another. The column is then cast to
    the nullable ``Int64`` dtype.

    Args:
        df: DataFrame containing an ``EVOLUCAO`` column; modified in place.

    Returns:
        The same ``df`` object, to allow chaining.
    """
    outcome_codes = {2: 1, 1: 0, 3: 0, 9: pd.NA}
    recoded = df["EVOLUCAO"].replace(outcome_codes)
    df["EVOLUCAO"] = recoded.astype("Int64")
    return df

def to_datetime(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Parse a ``YYYY-MM-DD`` text column of ``df`` in place into datetimes.

    Values that do not match the format (or are missing) become ``NaT``
    instead of raising, because ``errors="coerce"`` is used.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Name of the column to convert.

    Returns:
        The same ``df`` object, matching the other ``transform_*`` helpers
        so the call can be chained.
    """
    df[column] = pd.to_datetime(df[column], format="%Y-%m-%d", errors="coerce")
    return df

# Columns removed by drop_columns() after the header names are normalized.
_UNUSED_COLUMNS = (
    "NU_NOTIFIC", "DT_SIN_PRI", "SEM_NOT", "SEM_PRI", "ID_REGIONA",
    "CO_REGIONA", "ID_MUNICIP", "CO_MUN_NOT", "NU_IDADE_N", "TP_IDADE",
    "COD_IDADE", "ID_PAIS", "CO_PAIS", "SG_UF", "ID_RG_RESI", "CO_RG_RESI",
    "ID_MN_RESI", "CO_MUN_RES", "CS_SEXO", "CS_RACA", "CS_GESTANT",
    "CS_ESCOL_N", "CS_ZONA", "NOSOCOMIAL", "AVE_SUINO", "FEBRE", "TOSSE",
    "GARGANTA", "DISPNEIA", "DESC_RESP", "HISTO_VGM", "SATURACAO",
    "DIARREIA", "VOMITO", "OUTRO_SIN", "ANTIVIRAL", "TRAT_COV", "HOSPITAL",
    "SG_UF_INTE", "ID_RG_INTE", "CO_RG_INTE", "CO_MU_INTE", "SURTO_SG",
    "ID_MN_INTE", "NM_UN_INTE", "RAIOX_RES", "SUPORT_VEN", "AMOSTRA",
    "TP_AMOSTRA", "DT_COLETA", "PCR_RESUL", "DT_PCR", "PCR_VSR",
    "PCR_PARA1", "PCR_PARA2", "PCR_PARA3", "PCR_PARA4", "PCR_ADENO",
    "PCR_METAP", "PCR_BOCA", "PCR_RINO", "PCR_OUTRO", "CLASSI_FIN",
    "CRITERIO", "DT_ENCERRA", "DT_DIGITA", "PCR_SARS2", "DOR_ABD", "FADIGA",
    "PERD_OLFT", "PERD_PALA", "TOMO_RES", "RES_AN", "AN_SARS2", "AN_VSR",
    "AN_PARA1", "AN_PARA2", "AN_PARA3", "AN_ADENO", "AN_OUTRO", "POV_CT",
    "TEM_CPF", "ESTRANG", "FNT_IN_COV", "CO_DETEC", "REINF",
)


def drop_columns(df: pd.DataFrame, null_threshold: float = 0.5) -> pd.DataFrame:
    """Return a copy of ``df`` without sparse or unused columns.

    Steps, in order:

    1. Drop every column whose fraction of missing values is strictly
       greater than ``null_threshold``.
    2. Normalize the remaining column names: remove a BOM character
       (``\\ufeff``), strip surrounding whitespace and upper-case them.
    3. Drop the columns listed in ``_UNUSED_COLUMNS``; names that are not
       present are ignored.

    Args:
        df: Input DataFrame; it is not modified.
        null_threshold: Maximum tolerated fraction of missing values per
            column, between 0 and 1. Defaults to 0.5.

    Returns:
        A new DataFrame with the reduced, renamed set of columns.

    Raises:
        ValueError: If ``null_threshold`` is outside the range [0, 1].
    """
    if not 0 <= null_threshold <= 1:
        raise ValueError(
            f"null_threshold must be between 0 and 1, got {null_threshold!r}"
        )

    null_fraction = df.isna().mean()
    sparse_columns = null_fraction[null_fraction > null_threshold].index
    df = df.drop(columns=sparse_columns)

    # The BOM must be removed before strip(): it is not whitespace, so a
    # leading "\ufeff" would otherwise survive and break name matching.
    df.columns = (
        df.columns
        .str.replace("\ufeff", "", regex=False)
        .str.strip()
        .str.upper()
    )

    # Pass a list: pandas may interpret a tuple as a single (MultiIndex) label.
    return df.drop(columns=list(_UNUSED_COLUMNS), errors="ignore")

def save_csv(df: pd.DataFrame, path: str) -> None:
    """Write ``df`` to ``path`` as CSV without the index column.

    Missing parent directories are created first. A confirmation message
    with the target path is printed once the file has been written.

    Args:
        df: DataFrame to export.
        path: Destination file path; may be a bare file name, in which case
            the file is written to the current working directory.
    """
    parent_dir = os.path.dirname(path)
    # dirname() returns "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(path, index=False)
    print(f"CSV saved in '{path}'")


def main():
    """Download the raw CSV, clean it and save the result.

    The source file is read from a fixed URL (``;`` separated, latin1
    encoded), reduced with ``drop_columns``, recoded and date-parsed with
    the helper functions, renamed to lower-case descriptive column names
    and written to ``data/processed/srag_cleaned.csv``.
    """
    source_url = "https://s3.sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/2025/INFLUD25-22-12-2025.csv"
    raw = pd.read_csv(source_url, sep=';', encoding='latin1')

    df = drop_columns(raw)
    print(df.columns)

    # The helpers below modify ``df`` in place, so return values are unused.
    transform_evolution(df)
    for flag_column in ("UTI", "VACINA", "VACINA_COV"):
        transform_binary(df, flag_column)
    for date_column in ("DT_NOTIFIC", "DT_INTERNA", "DT_NASC", "DT_EVOLUCA"):
        to_datetime(df, date_column)

    friendly_names = {
        "DT_NOTIFIC": "data_notificacao",
        "DT_EVOLUCA": "data_evolucao",
        "DT_INTERNA": "data_internacao",
        "DT_NASC": "data_nascimento",
        "VACINA_COV": "vacina_covid",
        "UTI": "uti",
        "VACINA": "vacina",
        "EVOLUCAO": "evolucao",
        "SG_UF_NOT": "uf_notificacao",
    }
    df = df.rename(columns=friendly_names)

    save_csv(df, "data/processed/srag_cleaned.csv")

# Run the cleaning pipeline only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

  df = pd.read_csv(url, sep=';', encoding='latin1')


Index(['DT_NOTIFIC', 'SG_UF_NOT', 'DT_NASC', 'VACINA', 'DT_INTERNA', 'UTI',
       'EVOLUCAO', 'DT_EVOLUCA', 'VACINA_COV'],
      dtype='object')
CSV saved in 'data/processed/srag_cleaned.csv'


In [17]:

# Reload the CSV written by save_csv() to inspect the cleaned result.
# NOTE(review): the absolute "/content/..." prefix presumably assumes the
# notebook's working directory is /content — confirm, since main() saves to
# the relative path "data/processed/srag_cleaned.csv".
df1 = pd.read_csv("/content/data/processed/srag_cleaned.csv")


In [19]:
df1.columns

Index(['data_notificacao', 'uf_notificacao', 'data_nascimento', 'vacina',
       'data_internacao', 'uti', 'evolucao', 'data_evolucao', 'vacina_covid'],
      dtype='object')

In [20]:
df1

Unnamed: 0,data_notificacao,uf_notificacao,data_nascimento,vacina,data_internacao,uti,evolucao,data_evolucao,vacina_covid
0,2024-12-29,SP,1961-09-19,0.0,,0.0,0.0,2025-01-01,1.0
1,2024-12-29,SP,2024-11-08,0.0,,,0.0,2025-01-06,0.0
2,2024-12-29,SP,1936-07-25,0.0,2024-12-29,1.0,1.0,2025-01-21,1.0
3,2024-12-30,AC,2024-09-06,0.0,2024-12-29,1.0,0.0,,1.0
4,2024-12-30,PB,1958-02-17,1.0,,0.0,0.0,2025-01-07,1.0
...,...,...,...,...,...,...,...,...,...
323472,2025-12-21,SP,1964-06-09,1.0,2025-12-21,0.0,,,1.0
323473,2025-12-21,SP,1966-01-25,1.0,2025-12-21,0.0,,,1.0
323474,2025-12-21,SP,2025-05-02,1.0,,,,,0.0
323475,2025-12-21,CE,1936-08-17,1.0,,,,,1.0
