# CNES 2025-03 â€” Sanity Checks

This notebook performs sanity checks on the CNES CSV extract for 2025-03 located in `local_storage/csv/cnes_extract_202503`.

It will:
- Detect delimiter and encoding per file
- Load the dataframes safely
- Report row/column counts, memory usage, dtypes
- Compute basic missingness stats
- Save a summary report under `exploration/output/`



In [None]:
from pathlib import Path
import time
import pandas as pd

# Paths
PROJECT_ROOT = Path("/Users/caio.maximiano/pessoal/cnes-project-analysis")
if not PROJECT_ROOT.exists():
    PROJECT_ROOT = Path.cwd().resolve()

DATA_DIR = PROJECT_ROOT / "local_storage/csv/cnes_extract_202503"
OUTPUT_DIR = PROJECT_ROOT / "exploration" / "output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

DATA_DIR


PosixPath('/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/csv/cnes_extract_202503')

In [None]:
# List CSV files
csv_files = sorted([p for p in DATA_DIR.glob("*.csv")])
len(csv_files), csv_files[:5]


(108,
 [PosixPath('/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/csv/cnes_extract_202503/rlAdmGerenciaCnes202503.csv'),
  PosixPath('/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/csv/cnes_extract_202503/rlAtividadeObrigatoria202503.csv'),
  PosixPath('/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/csv/cnes_extract_202503/rlCooperativa202503.csv'),
  PosixPath('/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/csv/cnes_extract_202503/rlEquipeAldeia202503.csv'),
  PosixPath('/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/csv/cnes_extract_202503/rlEquipeNasfEsf202503.csv')])

In [None]:
# Robust CSV loader that tries common separators and encodings
from typing import Optional, Tuple, Dict, Any

COMMON_SEPARATORS = [",", ";", "|", "\t"]
COMMON_ENCODINGS = ["utf-8", "latin-1", "cp1252"]


def try_read_csv(path: Path, nrows: Optional[int] = None) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    last_error = None
    for enc in COMMON_ENCODINGS:
        for sep in COMMON_SEPARATORS:
            try:
                start = time.time()
                df = pd.read_csv(path, sep=sep, encoding=enc, nrows=nrows, low_memory=False)
                elapsed = time.time() - start
                return df, {"encoding": enc, "sep": sep, "load_time_sec": round(elapsed, 4)}
            except Exception as e:
                last_error = e
                continue
    raise RuntimeError(f"Failed to read {path.name}. Last error: {last_error}")


def dataframe_memory_mb(df: pd.DataFrame) -> float:
    return round(df.memory_usage(deep=True).sum() / (1024 ** 2), 3)



In [9]:
# Iterate files, compute sanity metrics, and save a summary
from collections import OrderedDict

summary_rows = []

for csv_path in csv_files:
    print(f"Loading {csv_path.name} ...")
    df, meta = try_read_csv(csv_path)

    num_rows, num_cols = df.shape
    mem_mb = dataframe_memory_mb(df)
    dtypes = df.dtypes.astype(str).to_dict()
    missing_pct = (df.isna().mean() * 100).round(2).sort_values(ascending=False)

    row = OrderedDict(
        file_name=csv_path.name,
        sep=meta["sep"],
        encoding=meta["encoding"],
        load_time_sec=meta["load_time_sec"],
        num_rows=num_rows,
        num_columns=num_cols,
        memory_mb=mem_mb,
        columns=", ".join(df.columns.tolist()),
    )
    summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows)
summary_path = OUTPUT_DIR / "cnes_202503_sanity_summary.csv"
summary_df.to_csv(summary_path, index=False)
summary_df.head(10)


Loading rlAdmGerenciaCnes202503.csv ...
Loading rlAtividadeObrigatoria202503.csv ...
Loading rlCooperativa202503.csv ...
Loading rlEquipeAldeia202503.csv ...
Loading rlEquipeNasfEsf202503.csv ...
Loading rlEstabAtenPsico202503.csv ...
Loading rlEstabAtendPrestConv202503.csv ...
Loading rlEstabAvaliacao202503.csv ...
Loading rlEstabCentralReg202503.csv ...
Loading rlEstabColetaSelRejeito202503.csv ...
Loading rlEstabComissaoOutro202503.csv ...
Loading rlEstabComplementar202503.csv ...
Loading rlEstabEndCompl202503.csv ...
Loading rlEstabEqpEmbarcacao202503.csv ...
Loading rlEstabEqpUnidApoio202503.csv ...
Loading rlEstabEquipamento202503.csv ...
Loading rlEstabEquipeMun202503.csv ...
Loading rlEstabEquipeProf202503.csv ...
Loading rlEstabInstFisiAssist202503.csv ...
Loading rlEstabOrgParc202503.csv ...
Loading rlEstabPoloAldeia202503.csv ...
Loading rlEstabProfComissao202503.csv ...
Loading rlEstabProgFundo202503.csv ...
Loading rlEstabRegimeRes202503.csv ...
Loading rlEstabRepresentant

Unnamed: 0,file_name,sep,encoding,load_time_sec,num_rows,num_columns,memory_mb,columns
0,rlAdmGerenciaCnes202503.csv,",",latin-1,0.0121,4594,5,0.712,"NU_CNPJ_ADM;""CO_UNIDADE"";""TO_CHAR(DT_VIGENCIA_..."
1,rlAtividadeObrigatoria202503.csv,",",utf-8,0.0007,14,1,0.001,"CO_TIPO_ESTABELECIMENTO;""CO_ATIVIDADE_OBRIGATO..."
2,rlCooperativa202503.csv,",",utf-8,0.0029,2308,3,0.27,"CO_UNIDADE;""CO_COOPERATIVA"";""CO_CBO"";""CO_USUAR..."
3,rlEquipeAldeia202503.csv,",",utf-8,0.0007,300,3,0.039,"CO_MUNICIPIO;""CO_AREA"";""CO_SEQ_EQUIPE"";""CO_ALD..."
4,rlEquipeNasfEsf202503.csv,;,latin-1,0.0909,86467,17,34.202,"CO_MUNICIPIO, CO_AREA, SEQ_EQUIPE, CO_MUNICIPI..."
5,rlEstabAtenPsico202503.csv,",",latin-1,0.0013,1373,3,0.229,"CO_UNIDADE;""TP_ESTRUTURA"";""ST_PARCERIA_ONG"";""N..."
6,rlEstabAtendPrestConv202503.csv,",",latin-1,0.5452,1004536,3,108.831,"CO_UNIDADE;""CO_ATENDIMENTO_PRESTADO"";""CO_CONVE..."
7,rlEstabAvaliacao202503.csv,",",latin-1,0.0014,1737,4,0.23,"CO_UNIDADE;""CO_AVALIACAO"";""CO_CLASSIFICACAO"";""..."
8,rlEstabCentralReg202503.csv,",",latin-1,0.0049,3210,3,0.943,"CO_UNIDADE;""CO_SEQ_CENTRAL"";""NO_CENTRAL"";""CO_S..."
9,rlEstabColetaSelRejeito202503.csv,",",latin-1,0.464,851518,3,88.224,"CO_UNIDADE;""CO_COLETA_REJEITO"";""TO_CHAR(DT_ATU..."


In [None]:
summary_df.


AttributeError: 'DataFrame' object has no attribute 'orderBy'

In [11]:
# Save per-column missingness for each file

miss_rows = []
for csv_path in csv_files:
    df, meta = try_read_csv(csv_path)
    miss_pct = (df.isna().mean() * 100).round(2)
    for col, pct in miss_pct.items():
        miss_rows.append({
            "file_name": csv_path.name,
            "column_name": col,
            "missing_pct": pct,
            "sep": meta["sep"],
            "encoding": meta["encoding"],
        })

miss_df = pd.DataFrame(miss_rows)
miss_path = OUTPUT_DIR / "cnes_202503_missingness_by_column.csv"
miss_df.to_csv(miss_path, index=False)
miss_df.head(20)


Unnamed: 0,file_name,column_name,missing_pct,sep,encoding
0,rlAdmGerenciaCnes202503.csv,"NU_CNPJ_ADM;""CO_UNIDADE"";""TO_CHAR(DT_VIGENCIA_...",0.0,",",latin-1
1,rlAdmGerenciaCnes202503.csv,"'DD/MM/YYYY')"";""TO_CHAR(DT_VIGENCIA_FINAL",100.0,",",latin-1
2,rlAdmGerenciaCnes202503.csv,"'DD/MM/YYYY')"";""TO_CHAR(DT_ATUALIZACAO",100.0,",",latin-1
3,rlAdmGerenciaCnes202503.csv,"'DD/MM/YYYY')"";""CO_USUARIO"";""TO_CHAR(DT_ATUALI...",100.0,",",latin-1
4,rlAdmGerenciaCnes202503.csv,"'DD/MM/YYYY')""",100.0,",",latin-1
5,rlAtividadeObrigatoria202503.csv,"CO_TIPO_ESTABELECIMENTO;""CO_ATIVIDADE_OBRIGATO...",0.0,",",utf-8
6,rlCooperativa202503.csv,"CO_UNIDADE;""CO_COOPERATIVA"";""CO_CBO"";""CO_USUAR...",0.0,",",utf-8
7,rlCooperativa202503.csv,"'DD/MM/YYYY')"";""TO_CHAR(DT_ATUALIZACAO_ORIGEM",100.0,",",utf-8
8,rlCooperativa202503.csv,"'DD/MM/YYYY')""",100.0,",",utf-8
9,rlEquipeAldeia202503.csv,"CO_MUNICIPIO;""CO_AREA"";""CO_SEQ_EQUIPE"";""CO_ALD...",0.0,",",utf-8
