In [2]:
import os

import basedosdados as bd
import pandas as pd

INPUT = os.path.join("models", "br_inep_educacao_especial", "data")
OUTPUT = os.path.join("models", "br_inep_educacao_especial", "output")

os.makedirs(INPUT, exist_ok=True)
os.makedirs(OUTPUT, exist_ok=True)

In [6]:
def read_sheet(
    df: pd.ExcelFile, sheet_name: str, skiprows: int
) -> pd.DataFrame:
    return pd.read_excel(
        df,
        skiprows=skiprows,
        sheet_name=sheet_name,
    )

In [None]:
# Load the Excel file into a pandas ExcelFile object
excel_data = pd.ExcelFile(os.path.join(INPUT, "txa-21-22-23.xlsx"))

# Get the sheet names
print(excel_data.sheet_names)

['TXA_ANO_2020_21_22_23_educacao_']


In [81]:
df = excel_data.parse()

In [82]:
df

Unnamed: 0,NU_ANO_CENSO,TP_TIPO_CLASSE,CO_REGIAO,NO_REGIAO,CO_UF,SG_UF,TP_DEPENDENCIA,NO_DEPENDENCIA,NO_CATEGORIA,1_CAT_FUN,...,3_CAT_FUN_06,3_CAT_FUN_07,3_CAT_FUN_08,3_CAT_FUN_09,3_CAT_MED,3_CAT_MED_01,3_CAT_MED_02,3_CAT_MED_03,3_CAT_MED_04,3_CAT_MED_NS
0,2020,0 - Todas as turmas,0,Brasil,,,0,Total,Total,98.0,...,1.3,1.2,1.1,1.1,2.3,2.7,2.1,1.9,8.0,2.7
1,2020,0 - Todas as turmas,0,Brasil,,,0,Total,Modalidade: educação especial,93.4,...,1.9,1.5,1.4,1.4,2.6,3.3,2.3,1.7,4.9,1.9
2,2020,0 - Todas as turmas,0,Brasil,,,1,Federal,Total,99.2,...,0.1,0.0,0.0,0.1,1.8,2.3,1.3,1.4,2.6,2.3
3,2020,0 - Todas as turmas,0,Brasil,,,1,Federal,Modalidade: educação especial,99.4,...,0.0,0.0,0.0,0.0,2.7,3.2,2.5,1.7,4.3,2.7
4,2020,0 - Todas as turmas,0,Brasil,,,2,Estadual,Total,97.7,...,1.3,1.2,1.1,1.2,2.6,3.0,2.4,2.2,10.0,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3147,2023,1 - Classe comum,5,Centro-Oeste,53.0,DF,2,Estadual,Modalidade: educação especial,88.6,...,1.8,1.6,2.2,3.0,3.2,3.6,2.7,3.1,,
3148,2023,1 - Classe comum,5,Centro-Oeste,53.0,DF,4,Privada,Total,99.1,...,0.0,0.0,0.0,0.0,0.5,0.0,0.1,0.3,,12.7
3149,2023,1 - Classe comum,5,Centro-Oeste,53.0,DF,4,Privada,Modalidade: educação especial,99.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
3150,2023,1 - Classe comum,5,Centro-Oeste,53.0,DF,5,Pública,Total,93.9,...,1.0,1.4,1.2,1.8,3.8,4.0,4.5,3.0,,


In [83]:
# Print the column names of the DataFrame to see what was read from the Excel sheet
print(df.columns)

Index(['NU_ANO_CENSO', 'TP_TIPO_CLASSE', 'CO_REGIAO', 'NO_REGIAO', 'CO_UF',
       'SG_UF', 'TP_DEPENDENCIA', 'NO_DEPENDENCIA', 'NO_CATEGORIA',
       '1_CAT_FUN', '1_CAT_FUN_AI', '1_CAT_FUN_AF', '1_CAT_FUN_01',
       '1_CAT_FUN_02', '1_CAT_FUN_03', '1_CAT_FUN_04', '1_CAT_FUN_05',
       '1_CAT_FUN_06', '1_CAT_FUN_07', '1_CAT_FUN_08', '1_CAT_FUN_09',
       '1_CAT_MED', '1_CAT_MED_01', '1_CAT_MED_02', '1_CAT_MED_03',
       '1_CAT_MED_04', '1_CAT_MED_NS', '2_CAT_FUN', '2_CAT_FUN_AI',
       '2_CAT_FUN_AF', '2_CAT_FUN_01', '2_CAT_FUN_02', '2_CAT_FUN_03',
       '2_CAT_FUN_04', '2_CAT_FUN_05', '2_CAT_FUN_06', '2_CAT_FUN_07',
       '2_CAT_FUN_08', '2_CAT_FUN_09', '2_CAT_MED', '2_CAT_MED_01',
       '2_CAT_MED_02', '2_CAT_MED_03', '2_CAT_MED_04', '2_CAT_MED_NS',
       '3_CAT_FUN', '3_CAT_FUN_AI', '3_CAT_FUN_AF', '3_CAT_FUN_01',
       '3_CAT_FUN_02', '3_CAT_FUN_03', '3_CAT_FUN_04', '3_CAT_FUN_05',
       '3_CAT_FUN_06', '3_CAT_FUN_07', '3_CAT_FUN_08', '3_CAT_FUN_09',
       '3_CAT_MED',

In [84]:
# -----------------------------
# Rename and filter columns
# -----------------------------
# This block renames the DataFrame columns according to the RENAME_COLUMNS dictionary
# and keeps only the renamed columns. It overwrites the original df variable, so
# df will contain only the columns specified in RENAME_COLUMNS.

RENAME_COLUMNS = {
    "NU_ANO_CENSO": "ano",
    "NO_REGIAO": "regiao",
    "1_CAT_FUN_AI": "taxaaprovacao_anosiniciais",
    "1_CAT_FUN_AF": "taxaaprovacao_anosfinais",
    "1_CAT_MED": "taxaaprovacao_ensinomedio",
    "2_CAT_FUN_AI": "taxareprovacao_anosiniciais",
    "2_CAT_FUN_AF": "taxareprovacao_anosfinais",
    "2_CAT_MED": "taxareprovacao_ensinomedio",
    "3_CAT_FUN_AI": "taxaabandono_anosiniciais",
    "3_CAT_FUN_AF": "taxaabandono_anosfinais",
    "3_CAT_MED": "taxaabandono_ensinomedio",
}


def keep_only_renamed(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns=RENAME_COLUMNS)

    cols_keep = list(RENAME_COLUMNS.values())

    cols_existentes = [col for col in cols_keep if col in df.columns]

    return df[cols_existentes]


df = keep_only_renamed(df)
print(df.columns)

Index(['ano', 'regiao', 'taxaaprovacao_anosiniciais',
       'taxaaprovacao_anosfinais', 'taxaaprovacao_ensinomedio',
       'taxareprovacao_anosiniciais', 'taxareprovacao_anosfinais',
       'taxareprovacao_ensinomedio', 'taxaabandono_anosiniciais',
       'taxaabandono_anosfinais', 'taxaabandono_ensinomedio'],
      dtype='object')


In [85]:
# Filters only years equal to or greater than 2022
df = df[df["ano"] >= 2022]
df

Unnamed: 0,ano,regiao,taxaaprovacao_anosiniciais,taxaaprovacao_anosfinais,taxaaprovacao_ensinomedio,taxareprovacao_anosiniciais,taxareprovacao_anosfinais,taxareprovacao_ensinomedio,taxaabandono_anosiniciais,taxaabandono_anosfinais,taxaabandono_ensinomedio
1576,2022,Brasil,95.7,92.1,86.6,3.8,6.0,7.7,0.5,1.9,5.7
1577,2022,Brasil,85.8,88.8,86.2,12.8,8.0,7.4,1.4,3.2,6.4
1578,2022,Brasil,96.9,95.3,88.3,2.8,4.5,9.5,0.3,0.2,2.2
1579,2022,Brasil,87.2,91.2,82.1,9.8,6.1,14.4,3.0,2.7,3.5
1580,2022,Brasil,96.8,92.7,85.0,2.7,5.7,8.4,0.5,1.6,6.6
...,...,...,...,...,...,...,...,...,...,...,...
3147,2023,Centro-Oeste,85.6,92.6,87.5,14.1,5.3,9.3,0.3,2.1,3.2
3148,2023,Centro-Oeste,99.7,98.2,96.4,0.3,1.8,3.1,0.0,0.0,0.5
3149,2023,Centro-Oeste,99.4,98.8,98.1,0.6,1.2,1.9,0.0,0.0,0.0
3150,2023,Centro-Oeste,95.1,92.5,84.6,4.7,6.2,11.6,0.2,1.3,3.8


In [87]:
# Filters the DataFrame to keep only rows where 'regiao' is "Brasil"
# and melts the DataFrame from wide to long format (one row per metric)
# Each row will have: 'ano', 'regiao', 'metrica' (original metric name), and 'valor' (corresponding value)
melted_dataframe = pd.concat(
    [
        df.pipe(lambda d: d.loc[(d["regiao"] == "Brasil")]).pipe(
            lambda d: pd.melt(
                d,
                id_vars=["ano", "regiao"],
                value_vars=d.columns.difference(
                    ["ano", "regiao"]
                ).tolist(),  # Convert to list
                var_name="metrica",
                value_name="valor",
            )
        )
    ]
)

In [88]:
melted_dataframe

Unnamed: 0,ano,regiao,metrica,valor
0,2022,Brasil,taxaabandono_anosfinais,1.9
1,2022,Brasil,taxaabandono_anosfinais,3.2
2,2022,Brasil,taxaabandono_anosfinais,0.2
3,2022,Brasil,taxaabandono_anosfinais,2.7
4,2022,Brasil,taxaabandono_anosfinais,1.6
...,...,...,...,...
427,2023,Brasil,taxareprovacao_ensinomedio,4.6
428,2023,Brasil,taxareprovacao_ensinomedio,2.3
429,2023,Brasil,taxareprovacao_ensinomedio,2.4
430,2023,Brasil,taxareprovacao_ensinomedio,5.7


In [89]:
melted_dataframe["etapa_ensino"] = melted_dataframe["metrica"].apply(
    lambda v: v.split("_")[-1]
)  # Extracts 'anosiniciais', 'anosfinais', or 'ensinomedio'
melted_dataframe["tipo_metrica"] = melted_dataframe["metrica"].apply(
    lambda v: v.split("_")[0]
)  # Extracts 'taxaaprovacao', 'taxareprovacao', 'taxaabandono'
melted_dataframe["valor"] = pd.to_numeric(
    melted_dataframe["valor"], errors="coerce"
)

# Pivoting the melted DataFrame to get desired columns
df_final = melted_dataframe.pivot_table(
    index=["ano", "regiao", "etapa_ensino"],
    columns="tipo_metrica",
    values="valor",
).reset_index()

In [90]:
melted_dataframe

Unnamed: 0,ano,regiao,metrica,valor,etapa_ensino,tipo_metrica
0,2022,Brasil,taxaabandono_anosfinais,1.9,anosfinais,taxaabandono
1,2022,Brasil,taxaabandono_anosfinais,3.2,anosfinais,taxaabandono
2,2022,Brasil,taxaabandono_anosfinais,0.2,anosfinais,taxaabandono
3,2022,Brasil,taxaabandono_anosfinais,2.7,anosfinais,taxaabandono
4,2022,Brasil,taxaabandono_anosfinais,1.6,anosfinais,taxaabandono
...,...,...,...,...,...,...
427,2023,Brasil,taxareprovacao_ensinomedio,4.6,ensinomedio,taxareprovacao
428,2023,Brasil,taxareprovacao_ensinomedio,2.3,ensinomedio,taxareprovacao
429,2023,Brasil,taxareprovacao_ensinomedio,2.4,ensinomedio,taxareprovacao
430,2023,Brasil,taxareprovacao_ensinomedio,5.7,ensinomedio,taxareprovacao


In [None]:
# Dictionary used to rename columns in the melted DataFrame
# to a more standardized format.
RENAME_COLUMNS_MELTED = {
    "taxaabandono": "taxa_abandono",
    "taxaaprovacao": "taxa_aprovacao",
    "taxareprovacao": "taxa_reprovacao",
}
# Dictionary mapping shorthand stage names of education
# to their full descriptive names.
etapa_ensino = {
    "anosiniciais": "Ensino Fundamental – Anos Iniciais",
    "anosfinais": "Ensino Fundamental – Anos Finais",
    "ensinomedio": "Ensino Médio Regular",
}

In [92]:
df_final

tipo_metrica,ano,regiao,etapa_ensino,taxaabandono,taxaaprovacao,taxareprovacao
0,2022,Brasil,anosfinais,1.979167,91.870833,6.15
1,2022,Brasil,anosiniciais,0.85,92.704167,6.445833
2,2022,Brasil,ensinomedio,4.616667,88.275,7.108333
3,2023,Brasil,anosfinais,1.204167,94.0375,4.758333
4,2023,Brasil,anosiniciais,0.466667,94.8375,4.695833
5,2023,Brasil,ensinomedio,2.679167,91.7375,5.583333


In [None]:
# Rename columns in df_final using the mapping defined in RENAME_COLUMNS_MELTED
df_final = df_final.rename(columns=RENAME_COLUMNS_MELTED)
# Replace shorthand values in the 'etapa_ensino' column
# with their full descriptive names using the etapa_ensino dictionary
df_final["etapa_ensino"] = df_final["etapa_ensino"].replace(etapa_ensino)

In [None]:
# Rename the 'sigla' column to 'regiao' and drop the 'nome' column
df_final = df_final.drop(columns=["regiao"])

In [None]:
# Select and keep only the specified columns from df_final
# This ensures the DataFrame contains only the relevant variables for analysis
df_final = df_final[
    [
        "ano",
        "etapa_ensino",
        "taxa_aprovacao",
        "taxa_reprovacao",
        "taxa_abandono",
    ]
]

In [97]:
df_final

tipo_metrica,ano,etapa_ensino,taxa_aprovacao,taxa_reprovacao,taxa_abandono
0,2022,Ensino Fundamental – Anos Finais,91.870833,6.15,1.979167
1,2022,Ensino Fundamental – Anos Iniciais,92.704167,6.445833,0.85
2,2022,Ensino Médio Regular,88.275,7.108333,4.616667
3,2023,Ensino Fundamental – Anos Finais,94.0375,4.758333,1.204167
4,2023,Ensino Fundamental – Anos Iniciais,94.8375,4.695833,0.466667
5,2023,Ensino Médio Regular,91.7375,5.583333,2.679167


In [None]:
# Define the output file path by joining the OUTPUT directory with a subfolder name
path = os.path.join(OUTPUT, "educacao_especial_brasil_taxa_rendimento")
# Create the directory if it does not already exist
os.makedirs(path, exist_ok=True)
# Convert all values in df_final to string (astype(str)),
# then save it as a CSV file inside the specified folder.
df_final.astype(str).to_csv(os.path.join(path, "2022-2023.csv"), index=False)

In [None]:
# Read a table directly from BigQuery into a pandas DataFrame using the basedosdados library.
# The SQL query selects all columns from the table:
#   basedosdados.br_inep_educacao_especial.uf_taxa_rendimento
# The parameter billing_project_id specifies which GCP project will be billed for the query.
df_bq = bd.read_sql(
    "select * from basedosdados.br_inep_educacao_especial.brasil_taxa_rendimento",
    billing_project_id="basedosdados-dev",
)

Downloading: 100%|[32m██████████[0m|


In [100]:
df_bq

Unnamed: 0,ano,etapa_ensino,taxa_aprovacao,taxa_reprovacao,taxa_abandono
0,2010,Ensino Médio Regular,79.4,13.2,7.4
1,2016,Ensino Médio Regular,82.5,11.8,5.7
2,2014,Ensino Médio Regular,82.0,11.9,6.1
3,2008,Ensino Médio Regular,79.3,12.9,7.8
4,2019,Ensino Médio Regular,86.1,9.4,4.5
5,2017,Ensino Médio Regular,82.8,11.2,6.0
6,2013,Ensino Médio Regular,81.7,11.5,6.8
7,2012,Ensino Médio Regular,80.1,12.8,7.1
8,2021,Ensino Médio Regular,91.7,3.8,4.5
9,2007,Ensino Médio Regular,79.6,13.0,7.4


In [None]:
# Concatenate two DataFrames.
df_updated = pd.concat([df_bq, df_final])

In [None]:
# Convert all values in df_updated to strings and save as a CSV file.
df_updated.astype(str).to_csv(
    os.path.join(path, "brasil_taxa_rendimento.csv"), index=False
)

NameError: name 'os' is not defined

In [None]:
# Create a Table object representing a BigQuery table in the specified dataset.
tb_brasil = bd.Table(
    dataset_id="br_inep_educacao_especial", table_id="brasil_taxa_rendimento"
)
# Upload the local CSV file to the BigQuery table.
# Parameters:
# - if_storage_data_exists='replace': replace the data in storage if it already exists
# - if_table_exists='replace': replace the table if it already exists
# - source_format='csv': specify that the source file is a CSV
tb_brasil.create(
    os.path.join(path, "brasil_taxa_rendimento.csv"),
    if_storage_data_exists="replace",
    if_table_exists="replace",
    source_format="csv",
)

Uploading files:   0%|          | 0/1 [00:00<?, ?it/s][32m2025-08-26 15:14:42.122[0m | [32m[1mSUCCESS [0m | [36mbasedosdados.upload.storage[0m:[36mupload[0m:[36m233[0m - [32m[1m File brasil_taxa_rendimento.csv_staging was uploaded![0m
Uploading files: 100%|██████████| 1/1 [00:02<00:00,  2.24s/it]
[32m2025-08-26 15:14:46.152[0m | [1mINFO    [0m | [36mbasedosdados.upload.table[0m:[36mdelete[0m:[36m809[0m - [1m Table brasil_taxa_rendimento_staging was deleted![0m
[32m2025-08-26 15:14:46.469[0m | [32m[1mSUCCESS [0m | [36mbasedosdados.upload.table[0m:[36mcreate[0m:[36m690[0m - [32m[1mTable brasil_taxa_rendimento was created in staging![0m
