In [1]:
import os
import zipfile
import pandas as pd
import basedosdados as bd

# Input and output folders live next to wherever the notebook is launched from.
INPUT, OUTPUT = (
    os.path.join(os.getcwd(), folder_name) for folder_name in ("input", "output")
)

# Create both folders up front; exist_ok makes re-running this cell harmless.
for _directory in (INPUT, OUTPUT):
    os.makedirs(_directory, exist_ok=True)

In [2]:
def read_sheet(sheet_name: str, skiprows: int = 9) -> pd.DataFrame:
    """Read one worksheet of the source workbook, keeping every cell as text.

    Args:
        sheet_name: Name of the worksheet to load.
        skiprows: Number of rows at the top of the sheet to skip before the
            header row. Defaults to 9 (presumably the title block above the
            table; confirm against the workbook).

    Returns:
        A DataFrame whose columns are all read with ``dtype=str``.
    """
    workbook_path = os.path.join(
        INPUT,
        "Demanda_23546-049990_2024_06_DOC_EDU_ESPECIAL_BAS__2012_A_2023.xlsx",
    )
    return pd.read_excel(
        workbook_path,
        sheet_name=sheet_name,
        skiprows=skiprows,
        dtype=str,
    )

In [3]:
# Open the workbook only long enough to list its sheets. The context manager
# closes the file handle that a bare pd.ExcelFile(...) would leave open for the
# lifetime of the kernel.
with pd.ExcelFile(
    os.path.join(
        INPUT,
        "Demanda_23546-049990_2024_06_DOC_EDU_ESPECIAL_BAS__2012_A_2023.xlsx",
    )
) as excel_data:
    # Get the sheet names
    sheet_names = excel_data.sheet_names

In [4]:
# Load every worksheet into a {sheet name: raw DataFrame} mapping.
dfs = dict(zip(sheet_names, map(read_sheet, sheet_names)))

In [5]:
# Inspect the raw sheets. Note that the last rows of the sheet hold the
# source/notes footer rather than data (they show up with NaN codes).
dfs

{'Planilha1':                                             NU_ANO_CENSO CO_REGIAO NO_REGIAO  \
 0                                                   2012       NaN    Brasil   
 1                                                   2012         1     Norte   
 2                                                   2012         1     Norte   
 3                                                   2012         1     Norte   
 4                                                   2012         1     Norte   
 ...                                                  ...       ...       ...   
 57988                                                NaN       NaN       NaN   
 57989     Fonte: INEP – Censo Escolar da Educação Básica       NaN       NaN   
 57990  Notas:  1 - Os docentes referem-se aos indivíd...       NaN       NaN   
 57991                  2 - Os docentes são contados u...       NaN       NaN   
 57992                  3 - Não inclui auxiliares da E...       NaN       NaN   
 
       CO_UF 

In [6]:
# Show the raw column names of each sheet, with a blank line after each one.
for sheet_name, df in dfs.items():
    print(f"Sheet: {sheet_name}", df.columns, "", sep="\n")

Sheet: Planilha1
Index(['NU_ANO_CENSO', 'CO_REGIAO', 'NO_REGIAO', 'CO_UF', 'SG_UF', 'NO_UF',
       'CO_MUNICIPIO', 'NO_MUNICIPIO', 'DOCEE', 'DOCFED', 'DOCEST', 'DOCMUNI',
       'DOCPRIV'],
      dtype='object')



In [7]:
# Source column name -> output column name.
RENAME_COLUMNS = {
    # Identifier columns.
    "NU_ANO_CENSO": "ano",
    "SG_UF": "sigla_uf",
    "CO_MUNICIPIO": "id_municipio",
    # Count columns; the new names later become the values of the `rede` column.
    "DOCFED": "Federal",
    "DOCEST": "Estadual",
    "DOCMUNI": "Municipal",
    "DOCPRIV": "Privada",
}

In [8]:
def drop_unused_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns this pipeline does not use.

    A column is removed when its name starts with ``NO_``, ``CO_`` or
    ``DOCEE``. Any such column that has to be kept must be renamed before
    this function is called.

    Args:
        df: Frame whose column labels are strings.

    Returns:
        A new DataFrame with the matching columns dropped; ``df`` itself is
        left untouched.
    """
    unused_prefixes = ("NO_", "CO_", "DOCEE")
    unused = [label for label in df.columns if label.startswith(unused_prefixes)]
    return df.drop(columns=unused)

# Rename first (errors="raise" fails loudly if an expected source column is
# missing), then drop the leftover NO_/CO_/DOCEE columns.
# NOTE(review): this rebinds `dfs`, so running the cell a second time raises
# because the source column names are already gone.
dfs = {
    name: df.rename(columns=RENAME_COLUMNS, errors="raise").pipe(drop_unused_columns)
    for name, df in dfs.items()
}

In [9]:
# Confirm which columns remain in each sheet after the rename/drop step.
for sheet_name, df in dfs.items():
    print(f"Sheet: {sheet_name}", df.columns, "", sep="\n")

Sheet: Planilha1
Index(['ano', 'sigla_uf', 'id_municipio', 'Federal', 'Estadual', 'Municipal',
       'Privada'],
      dtype='object')



In [10]:
# Reshape every sheet from wide (one column per school network) to long
# (one row per year / state / municipality / network) and stack the results.
#
# Two fixes over the previous version:
#   * iterate over dfs.values() instead of reading a loop variable left behind
#     by an earlier cell, so every sheet is included and the cell does not
#     depend on what ran before it;
#   * name the network columns explicitly. They used to be derived by
#     excluding two columns that do not exist, which put the id columns into
#     value_vars as well.
# The network columns are listed alphabetically to keep the row order (and
# therefore the index) of the melted frame unchanged.
melted_dataframe = pd.concat(
    [
        sheet_df.loc[
            # Keep only municipality-level rows: national/regional/state
            # aggregates and the footer notes have no municipality code.
            sheet_df["id_municipio"].notna() & (sheet_df["id_municipio"] != " ")
        ].melt(
            id_vars=["ano", "sigla_uf", "id_municipio"],
            value_vars=["Estadual", "Federal", "Municipal", "Privada"],
            var_name="rede",
            value_name="quantidade_docente_formacao_continuada",
        )
        for sheet_df in dfs.values()
    ]
)

In [11]:
# Order rows by year, state, municipality and network (all ascending, which is
# the sort_values default).
sort_keys = ["ano", "sigla_uf", "id_municipio", "rede"]
melted_dataframe = melted_dataframe.sort_values(by=sort_keys)

In [12]:
# Preview the long-format table: one row per year, state, municipality and network.
melted_dataframe

Unnamed: 0,ano,sigla_uf,id_municipio,rede,quantidade_docente_formacao_continuada
45,2012,AC,1200013,Estadual,2
57638,2012,AC,1200013,Federal,0
115231,2012,AC,1200013,Municipal,12
172824,2012,AC,1200013,Privada,0
46,2012,AC,1200104,Estadual,3
...,...,...,...,...,...
225613,2023,TO,1721307,Privada,0
52835,2023,TO,1722107,Estadual,0
110428,2023,TO,1722107,Federal,0
168021,2023,TO,1722107,Municipal,1


In [28]:
# Folder that receives the final table.
path = os.path.join(OUTPUT, "educacao_especial_formacao_docente")
os.makedirs(path, exist_ok=True)

# NOTE(review): astype(str) turns missing values into the literal text "nan"
# in the CSV; confirm that this is what the consumer of the file expects.
(
    melted_dataframe
    .astype(str)
    .to_csv(os.path.join(path, "data.csv"), index=False)
)