# ETL DataSUS APAC Quimioterapia

ELT com PySUS

Author: Rodrigo Barreiro

# Testing if PySUS is working

In [1]:
import pandas as pd
from collections import Counter
from pathlib import Path
import re
from pysus import SIH

In [3]:
sih = SIH().load() # Loads the files from DATASUS

In [4]:
sih

SIH - Sistema de Informações Hospitalares

In [5]:
sih.metadata

{'long_name': 'Sistema de Informações Hospitalares',
 'source': ('https://datasus.saude.gov.br/acesso-a-informacao/morbidade-hospitalar-do-sus-sih-sus/',
  'https://datasus.saude.gov.br/acesso-a-informacao/producao-hospitalar-sih-sus/'),
 'description': 'A finalidade do AIH (Sistema SIHSUS) é a de transcrever todos os atendimentos que provenientes de internações hospitalares que foram financiadas pelo SUS, e após o processamento, gerarem relatórios para os gestores que lhes possibilitem fazer os pagamentos dos estabelecimentos de saúde. Além disso, o nível Federal recebe mensalmente uma base de dados de todas as internações autorizadas (aprovadas ou não para pagamento) para que possam ser repassados às Secretarias de Saúde os valores de Produção de Média e Alta complexidade além dos valores de CNRAC, FAEC e de Hospitais Universitários – em suas variadas formas de contrato de gestão.'}

In [6]:
sih.groups

{'RD': 'AIH Reduzida',
 'RJ': 'AIH Rejeitada',
 'ER': 'AIH Rejeitada com erro',
 'SP': 'Serviços Profissionais',
 'CH': 'Cadastro Hospitalar',
 'CM': ''}

> **AIH Reduzida**
> Esta base contém as AIH aprovadas e também os valores efetivamente pagos por mês de
> competência. Ela inclui os procedimentos processados e validados pelo Ministério da Saúde
> entre os apresentados por todos os estabelecimentos prestadores de serviços para o SUS.

In [8]:
print(f'All files for AIH Reduzida: {len(sia.get_files(["RD"]))}')

All files for AIH Reduzida: 10727


## Calculating Files and Size

In [9]:
sih.get_files("RD", uf="SP", year=2022)

[RDSP2201.dbc,
 RDSP2202.dbc,
 RDSP2203.dbc,
 RDSP2204.dbc,
 RDSP2205.dbc,
 RDSP2206.dbc,
 RDSP2207.dbc,
 RDSP2208.dbc,
 RDSP2209.dbc,
 RDSP2210.dbc,
 RDSP2211.dbc,
 RDSP2212.dbc]

In [11]:
all_AIH_RD = len(sih.get_files("RD", year=2022))
print(f'Number of files for RD in 2022: {all_AIH_RD}')

Number of files for RD in 2022: 324


In [12]:
rd_files_2022 = sih.get_files("RD", year=2022)
Counter([ str(x)[2:4] for x in rd_files_2022 ])

Counter({'AC': 12,
         'AL': 12,
         'AM': 12,
         'AP': 12,
         'BA': 12,
         'CE': 12,
         'DF': 12,
         'ES': 12,
         'GO': 12,
         'MA': 12,
         'MG': 12,
         'MS': 12,
         'MT': 12,
         'PA': 12,
         'PB': 12,
         'PE': 12,
         'PI': 12,
         'PR': 12,
         'RJ': 12,
         'RN': 12,
         'RO': 12,
         'RR': 12,
         'RS': 12,
         'SC': 12,
         'SE': 12,
         'SP': 12,
         'TO': 12})

In [14]:
# Countig file size
files = sih.get_files("RD", year=2022)
sizes = [ sih.describe(my_file)['size'] for my_file in files ]


# Example list
sizes = ['37.9 kB', '1.2 MB', '550 B', '2.1 MB', '900 kB']

# Conversion to bytes
unit_multipliers = {
    'B': 1,
    'kB': 1024,
    'MB': 1024 ** 2,
    'GB': 1024 ** 3
}

total_bytes = 0

for s in sizes:
    match = re.match(r'([\d.]+)\s*(B|kB|MB|GB)', s)
    if match:
        number, unit = match.groups()
        bytes_size = float(number) * unit_multipliers[unit]
        total_bytes += bytes_size
    else:
        print(f"Unrecognized size format: {s}")

# Optional: convert back to human-readable
def human_readable(size_bytes):
    for unit in ['B', 'kB', 'MB', 'GB']:
        if size_bytes < 1024 or unit == 'GB':
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024

print(f"Total: {human_readable(total_bytes)}")
print("NOTE: This is the size of dbc file (compressed).")

Total: 4.22 MB
NOTE: This is the size of dbc file (compressed).


## Extract
### Downlaod all APAC Quimioterapy from 2022 of all BR States

In [None]:
# # Download all files of 2022
# sih.download(files, local_dir='../data/raw/aih-rd/')

RDTO2212.parquet: 100%|██████████| 27.3k/27.3k [00:00<00:00, 35.3kB/s]


[../data/raw/aih-rd/RDAC2201.parquet,
 ../data/raw/aih-rd/RDAC2202.parquet,
 ../data/raw/aih-rd/RDAC2203.parquet,
 ../data/raw/aih-rd/RDAC2204.parquet,
 ../data/raw/aih-rd/RDAC2205.parquet,
 ../data/raw/aih-rd/RDAC2206.parquet,
 ../data/raw/aih-rd/RDAC2207.parquet,
 ../data/raw/aih-rd/RDAC2208.parquet,
 ../data/raw/aih-rd/RDAC2209.parquet,
 ../data/raw/aih-rd/RDAC2210.parquet,
 ../data/raw/aih-rd/RDAC2211.parquet,
 ../data/raw/aih-rd/RDAC2212.parquet,
 ../data/raw/aih-rd/RDAL2201.parquet,
 ../data/raw/aih-rd/RDAL2202.parquet,
 ../data/raw/aih-rd/RDAL2203.parquet,
 ../data/raw/aih-rd/RDAL2204.parquet,
 ../data/raw/aih-rd/RDAL2205.parquet,
 ../data/raw/aih-rd/RDAL2206.parquet,
 ../data/raw/aih-rd/RDAL2207.parquet,
 ../data/raw/aih-rd/RDAL2208.parquet,
 ../data/raw/aih-rd/RDAL2209.parquet,
 ../data/raw/aih-rd/RDAL2210.parquet,
 ../data/raw/aih-rd/RDAL2211.parquet,
 ../data/raw/aih-rd/RDAL2212.parquet,
 ../data/raw/aih-rd/RDAM2201.parquet,
 ../data/raw/aih-rd/RDAM2202.parquet,
 ../data/raw

Check if all files were downloaded

In [16]:
download_dir = Path('../data/raw/aih-rd')
all_downloaded_files = [f.name for f in download_dir.iterdir() if f.suffix == '.parquet']
print(f'All downloaded files: {len(all_downloaded_files)} of {all_AIH_RD} of SIA')

All downloaded files: 324 of 324 of SIA


## Transform
### Convert to dataframe and merge all files into one

In [18]:
all_data_df_list = [ x.to_dataframe() for x in sia.download(files, local_dir='../data/raw/aih-rd/') ] 


585122it [00:00, 3946254.38it/s]   
RDGO2204.dbf:   0%|          | 0.00/1.00 [00:19<?, ?B/s]

In [19]:
print("Check type:")
print([type(x) for x in all_data_df_list[:3]])
print('\n')
print("Check dataframe n:")
print(len(all_data_df_list))


Check type:
[<class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.frame.DataFrame'>]


Check dataframe n:
324


In [22]:
# Concatenate to a single dataframe
combined_df = pd.concat(all_data_df_list, ignore_index=True)
print(combined_df.info())
print(f"Rows (Atendimentos) {combined_df.shape[0]:,.0f} \nColumns (Campos) {combined_df.shape[1]:,.0f} ")

del all_data_df_list

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12520914 entries, 0 to 12520913
Columns: 113 entries, UF_ZI to TPDISEC9
dtypes: Int64(1), string(112)
memory usage: 10.6 GB
None
Rows (Atendimentos) 12,520,914 
Columns (Campos) 113 


In [23]:
print(combined_df.columns.tolist())

['UF_ZI', 'ANO_CMPT', 'MES_CMPT', 'ESPEC', 'CGC_HOSP', 'N_AIH', 'IDENT', 'CEP', 'MUNIC_RES', 'NASC', 'SEXO', 'UTI_MES_IN', 'UTI_MES_AN', 'UTI_MES_AL', 'UTI_MES_TO', 'MARCA_UTI', 'UTI_INT_IN', 'UTI_INT_AN', 'UTI_INT_AL', 'UTI_INT_TO', 'DIAR_ACOM', 'QT_DIARIAS', 'PROC_SOLIC', 'PROC_REA', 'VAL_SH', 'VAL_SP', 'VAL_SADT', 'VAL_RN', 'VAL_ACOMP', 'VAL_ORTP', 'VAL_SANGUE', 'VAL_SADTSR', 'VAL_TRANSP', 'VAL_OBSANG', 'VAL_PED1AC', 'VAL_TOT', 'VAL_UTI', 'US_TOT', 'DT_INTER', 'DT_SAIDA', 'DIAG_PRINC', 'DIAG_SECUN', 'COBRANCA', 'NATUREZA', 'NAT_JUR', 'GESTAO', 'RUBRICA', 'IND_VDRL', 'MUNIC_MOV', 'COD_IDADE', 'IDADE', 'DIAS_PERM', 'MORTE', 'NACIONAL', 'NUM_PROC', 'CAR_INT', 'TOT_PT_SP', 'CPF_AUT', 'HOMONIMO', 'NUM_FILHOS', 'INSTRU', 'CID_NOTIF', 'CONTRACEP1', 'CONTRACEP2', 'GESTRISCO', 'INSC_PN', 'SEQ_AIH5', 'CBOR', 'CNAER', 'VINCPREV', 'GESTOR_COD', 'GESTOR_TP', 'GESTOR_CPF', 'GESTOR_DT', 'CNES', 'CNPJ_MANT', 'INFEHOSP', 'CID_ASSO', 'CID_MORTE', 'COMPLEX', 'FINANC', 'FAEC_TP', 'REGCT', 'RACA_COR', '

In [24]:
combined_df.head(10)

Unnamed: 0,UF_ZI,ANO_CMPT,MES_CMPT,ESPEC,CGC_HOSP,N_AIH,IDENT,CEP,MUNIC_RES,NASC,...,DIAGSEC9,TPDISEC1,TPDISEC2,TPDISEC3,TPDISEC4,TPDISEC5,TPDISEC6,TPDISEC7,TPDISEC8,TPDISEC9
0,120000,2022,1,3,63602940000170,1222100025960,1,69907501,120040,19500723,...,,0,0,0,0,0,0,0,0,0
1,120000,2022,1,3,63602940000170,1222100025982,1,69907566,120040,19831215,...,,1,0,0,0,0,0,0,0,0
2,120000,2022,1,5,4034526000577,1222100006886,1,69911114,120040,19771118,...,,0,0,0,0,0,0,0,0,0
3,120000,2022,1,5,4034526000577,1222100006919,1,69918256,120040,19820507,...,,0,0,0,0,0,0,0,0,0
4,120000,2022,1,5,4034526000577,1222100008250,1,69907840,120040,20010608,...,,0,0,0,0,0,0,0,0,0
5,120000,2022,1,5,4034526000577,1222100008680,1,69923899,120040,19820506,...,,0,0,0,0,0,0,0,0,0
6,120000,2022,1,5,4034526000577,1222100009229,1,69915290,120040,19771228,...,,0,0,0,0,0,0,0,0,0
7,120000,2022,1,7,4034526000496,1221100457687,1,69921000,120080,20081114,...,,0,0,0,0,0,0,0,0,0
8,120000,2022,1,7,4034526000496,1221100459678,1,69940000,120050,20101207,...,,0,0,0,0,0,0,0,0,0
9,120000,2022,1,3,4034526001468,1221100598828,1,69940000,120050,19811128,...,,0,0,0,0,0,0,0,0,0


### Filter bad columns
In a [detailed analysis](https://repositorio.ipea.gov.br/bitstream/11058/9409/1/Uma_analise_da_base_de_dados_do_sistema_de_informacao_hospitalar.pdf) done by IPEA some columns could be removed due to high missing values, no data variability or dicrepancies between data (eg. incorrect filled columns)


In [26]:
columns_to_remove = pd.read_csv('../data/external/aih_block_listed_columns.csv')['aih_block_listed_columns'].tolist()

combined_df = combined_df.drop(columns=columns_to_remove)

In [27]:
def slugify(column_name):
    column_name = column_name.lower()
    column_name = re.sub(r'[^\w\s-]', '', column_name)
    column_name = re.sub(r'[\s]+', '-', column_name)
    return column_name

combined_df.columns = [slugify(col) for col in combined_df.columns]


### Add CID

In [28]:
cid10_data = pd.read_csv('../data/external/CID-10-CATEGORIAS.CSV.utf8',sep = ';')
cid10_data = cid10_data[['CAT','DESCRICAO']]

cid10_data.rename(columns={"CAT": 'CID10', "DESCRICAO":"cid10_main_descricao"}, inplace=True)
cid10_data

Unnamed: 0,CID10,cid10_main_descricao
0,A00,Cólera
1,A01,Febres tifóide e paratifóide
2,A02,Outras infecções por Salmonella
3,A03,Shiguelose
4,A04,Outras infecções intestinais bacterianas
...,...,...
2040,U80,Agente resistente à penicilina e antibióticos ...
2041,U81,Agente resistente à vancomicina e antibióticos...
2042,U88,Agente resistente a múltiplos antibióticos
2043,U89,Agente resistente a outros antibióticos e a an...


In [30]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12520914 entries, 0 to 12520913
Data columns (total 82 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   uf_zi       string
 1   ano_cmpt    string
 2   mes_cmpt    string
 3   espec       string
 4   n_aih       string
 5   ident       string
 6   nasc        string
 7   sexo        Int64 
 8   uti_mes_in  string
 9   uti_mes_an  string
 10  uti_mes_al  string
 11  uti_mes_to  string
 12  marca_uti   string
 13  uti_int_in  string
 14  uti_int_an  string
 15  uti_int_al  string
 16  uti_int_to  string
 17  diar_acom   string
 18  qt_diarias  string
 19  proc_rea    string
 20  val_sh      string
 21  val_sp      string
 22  val_sadt    string
 23  val_rn      string
 24  val_acomp   string
 25  val_ortp    string
 26  val_sangue  string
 27  val_sadtsr  string
 28  val_transp  string
 29  val_obsang  string
 30  val_ped1ac  string
 31  val_tot     string
 32  val_uti     string
 33  us_tot      string
 34  dt_inter    stri

In [31]:
combined_df['cid_principal_main_categ'] = combined_df['diag_princ'].str[:3]
combined_df['cid_principal_main_categ']

0           J18
1           T88
2           F06
3           F20
4           F29
           ... 
12520909    K40
12520910    K40
12520911    K42
12520912    K40
12520913    K40
Name: cid_principal_main_categ, Length: 12520914, dtype: string

In [32]:
combined_df = pd.merge(combined_df, cid10_data, how = 'left', left_on = 'cid_principal_main_categ', right_on = 'CID10')

### Load

Our load will be just export to `.csv`. Nice :)

In [33]:
combined_df.to_csv('../data/processed/aih-rd-2022.csv', index = False)

In [34]:
file_path = Path('../data/processed/aih-rd-2022.csv')
size_bytes = file_path.stat().st_size
print('../data/processed/aih-rd-2022.csv')
print(f"Size in GB: {size_bytes / (1024 ** 3):.2f}")

../data/processed/aih-rd-2022.csv
Size in GB: 6.18


In [35]:
toy_data = combined_df.sample(frac = 0.1, axis = 0)
toy_data.to_csv('../data/processed/aih-rd-2022-toy.csv', index = False)

file_path = Path('../data/processed/aih-rd-2022-toy.csv')
size_bytes = file_path.stat().st_size
print('../data/processed/aih-rd-2022-toy.csv')
print(f"Size in GB: {size_bytes / (1024 ** 3):.2f}")

../data/processed/aih-rd-2022-toy.csv
Size in GB: 0.62
