In [1]:
import os

import basedosdados as bd
import pandas as pd

# Input folder (source workbook) and output folder (generated CSVs) for this
# dataset, both under the same dataset directory.
DATASET_DIR = os.path.join("models", "br_inep_educacao_especial")
INPUT = os.path.join(DATASET_DIR, "data")
OUTPUT = os.path.join(DATASET_DIR, "output")

# Make sure both folders exist before anything is read or written.
for folder in (INPUT, OUTPUT):
    os.makedirs(folder, exist_ok=True)



In [21]:
def read_sheet(
    df: pd.ExcelFile, sheet_name: str, skiprows: int
) -> pd.DataFrame:
    """Read one worksheet of a workbook into a DataFrame.

    Args:
        df: Workbook handle, forwarded to ``pd.read_excel`` as its first
            argument (despite the name, this is not a DataFrame).
        sheet_name: Sheet to read, forwarded as ``sheet_name``.
        skiprows: Forwarded as ``skiprows`` (rows to skip at the top).

    Returns:
        Whatever ``pd.read_excel`` returns for that sheet.
    """
    read_options = {"sheet_name": sheet_name, "skiprows": skiprows}
    return pd.read_excel(df, **read_options)

In [None]:
# Open the source workbook (data obtained through an LAI request).
workbook_path = os.path.join(INPUT, "txa-21-22-23.xlsx")
excel_data = pd.ExcelFile(workbook_path)

# Show which sheets the workbook contains.
print(excel_data.sheet_names)

['TXA_ANO_2020_21_22_23_educacao_']


In [None]:
# Read the first sheet of the workbook (position 0, which is also what
# parse() uses when no sheet is given) into a DataFrame.
df = excel_data.parse(sheet_name=0)
df

In [44]:
# List the raw column names read from the workbook. The RENAME_COLUMNS mapping
# defined further down is keyed on these names (e.g. NU_ANO_CENSO, SG_UF).
print(df.columns)

Index(['NU_ANO_CENSO', 'TP_TIPO_CLASSE', 'CO_REGIAO', 'NO_REGIAO', 'CO_UF',
       'SG_UF', 'TP_DEPENDENCIA', 'NO_DEPENDENCIA', 'NO_CATEGORIA',
       '1_CAT_FUN', '1_CAT_FUN_AI', '1_CAT_FUN_AF', '1_CAT_FUN_01',
       '1_CAT_FUN_02', '1_CAT_FUN_03', '1_CAT_FUN_04', '1_CAT_FUN_05',
       '1_CAT_FUN_06', '1_CAT_FUN_07', '1_CAT_FUN_08', '1_CAT_FUN_09',
       '1_CAT_MED', '1_CAT_MED_01', '1_CAT_MED_02', '1_CAT_MED_03',
       '1_CAT_MED_04', '1_CAT_MED_NS', '2_CAT_FUN', '2_CAT_FUN_AI',
       '2_CAT_FUN_AF', '2_CAT_FUN_01', '2_CAT_FUN_02', '2_CAT_FUN_03',
       '2_CAT_FUN_04', '2_CAT_FUN_05', '2_CAT_FUN_06', '2_CAT_FUN_07',
       '2_CAT_FUN_08', '2_CAT_FUN_09', '2_CAT_MED', '2_CAT_MED_01',
       '2_CAT_MED_02', '2_CAT_MED_03', '2_CAT_MED_04', '2_CAT_MED_NS',
       '3_CAT_FUN', '3_CAT_FUN_AI', '3_CAT_FUN_AF', '3_CAT_FUN_01',
       '3_CAT_FUN_02', '3_CAT_FUN_03', '3_CAT_FUN_04', '3_CAT_FUN_05',
       '3_CAT_FUN_06', '3_CAT_FUN_07', '3_CAT_FUN_08', '3_CAT_FUN_09',
       '3_CAT_MED',

In [None]:
# -----------------------------
# Rename and filter columns
# -----------------------------
# This block renames the DataFrame columns according to the RENAME_COLUMNS dictionary
# and keeps only the renamed columns. It overwrites the original df variable, so
# df will contain only the columns specified in RENAME_COLUMNS.

# Source column -> output column. Only the columns listed here survive
# keep_only_renamed(). Judging by the target names, the key prefixes select the
# rate (1_ -> aprovacao, 2_ -> reprovacao, 3_ -> abandono) and the suffixes the
# stage (FUN_AI -> anosiniciais, FUN_AF -> anosfinais, MED -> ensinomedio).
RENAME_COLUMNS = {
    "NU_ANO_CENSO": "ano",
    "SG_UF": "sigla_uf",
    "1_CAT_FUN_AI": "taxaaprovacao_anosiniciais",
    "1_CAT_FUN_AF": "taxaaprovacao_anosfinais",
    "1_CAT_MED": "taxaaprovacao_ensinomedio",
    "2_CAT_FUN_AI": "taxareprovacao_anosiniciais",
    "2_CAT_FUN_AF": "taxareprovacao_anosfinais",
    "2_CAT_MED": "taxareprovacao_ensinomedio",
    "3_CAT_FUN_AI": "taxaabandono_anosiniciais",
    "3_CAT_FUN_AF": "taxaabandono_anosfinais",
    "3_CAT_MED": "taxaabandono_ensinomedio",
}


def keep_only_renamed(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the source columns and drop every column not in RENAME_COLUMNS.

    Args:
        df: Frame whose columns use the source names (keys of RENAME_COLUMNS).

    Returns:
        A new frame holding only the renamed columns, in the order they appear
        in RENAME_COLUMNS. Mapped columns that are absent from ``df`` are
        skipped instead of raising. The input frame is not modified.
    """
    renamed = df.rename(columns=RENAME_COLUMNS)
    present = [
        target
        for target in RENAME_COLUMNS.values()
        if target in renamed.columns
    ]
    return renamed[present]


# Apply the mapping and confirm which columns remain.
df = df.pipe(keep_only_renamed)
print(df.columns)

Index(['ano', 'sigla_uf', 'taxaaprovacao_anosiniciais',
       'taxaaprovacao_anosfinais', 'taxaaprovacao_ensinomedio',
       'taxareprovacao_anosiniciais', 'taxareprovacao_anosfinais',
       'taxareprovacao_ensinomedio', 'taxaabandono_anosiniciais',
       'taxaabandono_anosfinais', 'taxaabandono_ensinomedio'],
      dtype='object')


In [46]:
# Keep only the rows whose 'ano' is 2022 or later.
df = df.loc[df["ano"].ge(2022)]
df

Unnamed: 0,ano,sigla_uf,taxaaprovacao_anosiniciais,taxaaprovacao_anosfinais,taxaaprovacao_ensinomedio,taxareprovacao_anosiniciais,taxareprovacao_anosfinais,taxareprovacao_ensinomedio,taxaabandono_anosiniciais,taxaabandono_anosfinais,taxaabandono_ensinomedio
1576,2022,,95.7,92.1,86.6,3.8,6.0,7.7,0.5,1.9,5.7
1577,2022,,85.8,88.8,86.2,12.8,8.0,7.4,1.4,3.2,6.4
1578,2022,,96.9,95.3,88.3,2.8,4.5,9.5,0.3,0.2,2.2
1579,2022,,87.2,91.2,82.1,9.8,6.1,14.4,3.0,2.7,3.5
1580,2022,,96.8,92.7,85.0,2.7,5.7,8.4,0.5,1.6,6.6
...,...,...,...,...,...,...,...,...,...,...,...
3147,2023,DF,85.6,92.6,87.5,14.1,5.3,9.3,0.3,2.1,3.2
3148,2023,DF,99.7,98.2,96.4,0.3,1.8,3.1,0.0,0.0,0.5
3149,2023,DF,99.4,98.8,98.1,0.6,1.2,1.9,0.0,0.0,0.0
3150,2023,DF,95.1,92.5,84.6,4.7,6.2,11.6,0.2,1.3,3.8


In [None]:
# Keep only the rows that carry a UF code: rows whose 'sigla_uf' is missing or
# blank are dropped (presumably national/regional aggregates — confirm against
# the source file). Then reshape from wide to long, so that each row holds
# 'ano', 'sigla_uf', 'metrica' (the original rate column name) and 'valor'
# (the corresponding value).
melted_dataframe = df.loc[
    # Whitespace-only strings of any length count as blank, not just " ".
    lambda d: d["sigla_uf"].notna()
    & d["sigla_uf"].astype(str).str.strip().ne("")
].pipe(
    lambda d: d.melt(
        id_vars=["ano", "sigla_uf"],
        # Every remaining column is melted; difference() returns them sorted.
        value_vars=d.columns.difference(["ano", "sigla_uf"]).tolist(),
        var_name="metrica",
        value_name="valor",
    )
)

In [49]:
# Inspect the long-format frame (columns: ano, sigla_uf, metrica, valor).
melted_dataframe

Unnamed: 0,ano,sigla_uf,metrica,valor
0,2022,RO,taxaabandono_anosfinais,1.9
1,2022,RO,taxaabandono_anosfinais,2.6
2,2022,RO,taxaabandono_anosfinais,
3,2022,RO,taxaabandono_anosfinais,
4,2022,RO,taxaabandono_anosfinais,2.1
...,...,...,...,...
11587,2023,DF,taxareprovacao_ensinomedio,9.3
11588,2023,DF,taxareprovacao_ensinomedio,3.1
11589,2023,DF,taxareprovacao_ensinomedio,1.9
11590,2023,DF,taxareprovacao_ensinomedio,11.6


In [50]:
# Each metric name has the form '<tipo_metrica>_<etapa_ensino>' (for example
# 'taxaaprovacao_anosiniciais'), so split it once and keep both ends.
metric_parts = melted_dataframe["metrica"].str.split("_")
# 'anosiniciais', 'anosfinais' or 'ensinomedio'
melted_dataframe["etapa_ensino"] = metric_parts.str[-1]
# 'taxaaprovacao', 'taxareprovacao' or 'taxaabandono'
melted_dataframe["tipo_metrica"] = metric_parts.str[0]
# Anything that cannot be parsed as a number becomes NaN.
melted_dataframe["valor"] = pd.to_numeric(
    melted_dataframe["valor"], errors="coerce"
)

# Back to wide format: one row per (ano, sigla_uf, etapa_ensino) and one
# column per rate type.
# NOTE(review): several input rows share the same key, and pivot_table combines
# them with the mean (its default aggregation, spelled out here) — confirm that
# an unweighted mean across those rows is the intended figure.
df_final = melted_dataframe.pivot_table(
    values="valor",
    index=["ano", "sigla_uf", "etapa_ensino"],
    columns="tipo_metrica",
    aggfunc="mean",
).reset_index()

In [51]:
# Check the derived 'etapa_ensino' and 'tipo_metrica' columns.
melted_dataframe

Unnamed: 0,ano,sigla_uf,metrica,valor,etapa_ensino,tipo_metrica
0,2022,RO,taxaabandono_anosfinais,1.9,anosfinais,taxaabandono
1,2022,RO,taxaabandono_anosfinais,2.6,anosfinais,taxaabandono
2,2022,RO,taxaabandono_anosfinais,,anosfinais,taxaabandono
3,2022,RO,taxaabandono_anosfinais,,anosfinais,taxaabandono
4,2022,RO,taxaabandono_anosfinais,2.1,anosfinais,taxaabandono
...,...,...,...,...,...,...
11587,2023,DF,taxareprovacao_ensinomedio,9.3,ensinomedio,taxareprovacao
11588,2023,DF,taxareprovacao_ensinomedio,3.1,ensinomedio,taxareprovacao
11589,2023,DF,taxareprovacao_ensinomedio,1.9,ensinomedio,taxareprovacao
11590,2023,DF,taxareprovacao_ensinomedio,11.6,ensinomedio,taxareprovacao


In [None]:
# Keep only the rows that have a value in 'valor'.
# NOTE(review): df_final is built from melted_dataframe in an earlier cell, so
# this filter does not feed into it.
has_value = melted_dataframe["valor"].notna()
melted_dataframe = melted_dataframe[has_value]

In [60]:
# Confirm that the rows with a missing 'valor' are gone.
melted_dataframe

Unnamed: 0,ano,sigla_uf,metrica,valor,etapa_ensino,tipo_metrica
0,2022,RO,taxaabandono_anosfinais,1.9,anosfinais,taxaabandono
1,2022,RO,taxaabandono_anosfinais,2.6,anosfinais,taxaabandono
4,2022,RO,taxaabandono_anosfinais,2.1,anosfinais,taxaabandono
5,2022,RO,taxaabandono_anosfinais,2.8,anosfinais,taxaabandono
6,2022,RO,taxaabandono_anosfinais,1.2,anosfinais,taxaabandono
...,...,...,...,...,...,...
11587,2023,DF,taxareprovacao_ensinomedio,9.3,ensinomedio,taxareprovacao
11588,2023,DF,taxareprovacao_ensinomedio,3.1,ensinomedio,taxareprovacao
11589,2023,DF,taxareprovacao_ensinomedio,1.9,ensinomedio,taxareprovacao
11590,2023,DF,taxareprovacao_ensinomedio,11.6,ensinomedio,taxareprovacao


In [None]:
# Pivoted rate columns -> standardized snake_case names.
RENAME_COLUMNS_MELTED = dict(
    taxaabandono="taxa_abandono",
    taxaaprovacao="taxa_aprovacao",
    taxareprovacao="taxa_reprovacao",
)

# Short stage codes (the last part of each metric name) -> full labels.
etapa_ensino = dict(
    anosiniciais="Ensino Fundamental – Anos Iniciais",
    anosfinais="Ensino Fundamental – Anos Finais",
    ensinomedio="Ensino Médio Regular",
)

In [62]:
# NOTE(review): the saved output of this cell already shows the renamed rate
# columns and the full stage labels, which are only applied by a later cell —
# the notebook appears to have been run out of order. Re-run it top to bottom
# before relying on this display.
df_final

tipo_metrica,ano,sigla_uf,etapa_ensino,taxa_aprovacao,taxa_reprovacao,taxa_abandono
0,2022,AC,Ensino Fundamental – Anos Finais,92.866667,4.175000,2.958333
1,2022,AC,Ensino Fundamental – Anos Iniciais,95.233333,3.591667,1.175000
2,2022,AC,Ensino Médio Regular,87.690000,6.650000,5.660000
3,2022,AL,Ensino Fundamental – Anos Finais,88.990000,7.850000,3.160000
4,2022,AL,Ensino Fundamental – Anos Iniciais,94.830000,4.090000,1.080000
...,...,...,...,...,...,...
157,2023,SP,Ensino Fundamental – Anos Iniciais,94.816667,4.845833,0.337500
158,2023,SP,Ensino Médio Regular,94.387500,3.650000,1.962500
159,2023,TO,Ensino Fundamental – Anos Finais,94.685000,4.665000,0.650000
160,2023,TO,Ensino Fundamental – Anos Iniciais,92.570000,7.175000,0.255000


In [None]:
# Standardize the rate column names (RENAME_COLUMNS_MELTED) and swap the short
# stage codes for their full labels (etapa_ensino dictionary) in one chain.
df_final = df_final.rename(columns=RENAME_COLUMNS_MELTED).assign(
    etapa_ensino=lambda frame: frame["etapa_ensino"].replace(etapa_ensino)
)

In [None]:
# Sanity check: list the distinct UF codes present in the final table
# (the saved output shows 27 of them).
df_final["sigla_uf"].unique()

array(['AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG',
       'MS', 'MT', 'PA', 'PB', 'PE', 'PI', 'PR', 'RJ', 'RN', 'RO', 'RR',
       'RS', 'SC', 'SE', 'SP', 'TO'], dtype=object)

In [None]:
# Fix the final column set and order for export. Selecting by name raises a
# KeyError if any of these columns is missing.
df_final = df_final.loc[
    :,
    [
        "ano",
        "sigla_uf",
        "etapa_ensino",
        "taxa_aprovacao",
        "taxa_reprovacao",
        "taxa_abandono",
    ],
]

In [66]:
# Final look at the table before it is exported.
df_final

tipo_metrica,ano,sigla_uf,etapa_ensino,taxa_aprovacao,taxa_reprovacao,taxa_abandono
0,2022,AC,Ensino Fundamental – Anos Finais,92.866667,4.175000,2.958333
1,2022,AC,Ensino Fundamental – Anos Iniciais,95.233333,3.591667,1.175000
2,2022,AC,Ensino Médio Regular,87.690000,6.650000,5.660000
3,2022,AL,Ensino Fundamental – Anos Finais,88.990000,7.850000,3.160000
4,2022,AL,Ensino Fundamental – Anos Iniciais,94.830000,4.090000,1.080000
...,...,...,...,...,...,...
157,2023,SP,Ensino Fundamental – Anos Iniciais,94.816667,4.845833,0.337500
158,2023,SP,Ensino Médio Regular,94.387500,3.650000,1.962500
159,2023,TO,Ensino Fundamental – Anos Finais,94.685000,4.665000,0.650000
160,2023,TO,Ensino Fundamental – Anos Iniciais,92.570000,7.175000,0.255000


In [None]:
# Folder that receives the CSV files produced by this notebook; create it if
# it does not exist yet.
path = os.path.join(OUTPUT, "educacao_especial_brasil_taxa_rendimento")
os.makedirs(path, exist_ok=True)

# Save the new rows. The frame is written directly instead of going through
# astype(str): that cast turns missing values into the literal text "nan",
# whereas to_csv writes them as empty fields (na_rep="").
df_final.to_csv(
    os.path.join(path, "2022-2023_uf_txa.csv"),
    index=False,
    na_rep="",
)

In [None]:
# Load the existing table
#   basedosdados.br_inep_educacao_especial.uf_taxa_rendimento
# from BigQuery into a DataFrame through the basedosdados library.
# billing_project_id names the GCP project that is billed for the query.
uf_taxa_rendimento_query = (
    "select * from basedosdados.br_inep_educacao_especial.uf_taxa_rendimento"
)
df_bq = bd.read_sql(
    uf_taxa_rendimento_query,
    billing_project_id="basedosdados-dev",
)
df_bq

Downloading: 100%|[32m██████████[0m|


In [None]:
# Append the new rows (df_final) to the table read from BigQuery (df_bq).
# Rows of df_bq whose 'ano' also appears in df_final are dropped first, so
# running the notebook again once those years are already in the queried table
# does not duplicate them. On a first run nothing is dropped.
# NOTE(review): assumes any df_bq rows for those years can be replaced by the
# ones computed here — confirm.
new_years = pd.to_numeric(df_final["ano"], errors="coerce").dropna().unique()
already_loaded = pd.to_numeric(df_bq["ano"], errors="coerce").isin(new_years)
df_updated = pd.concat(
    [df_bq.loc[~already_loaded], df_final], ignore_index=True
)

In [None]:
# Save the combined table. The frame is written directly rather than through
# astype(str), which would turn missing values into literal text ("nan",
# "None" or "<NA>", depending on the column dtype); to_csv writes them as
# empty fields instead (na_rep="").
df_updated.to_csv(
    os.path.join(path, "uf_taxa_rendimento.csv"),
    index=False,
    na_rep="",
)

In [None]:
# Handle for the BigQuery table br_inep_educacao_especial.uf_taxa_rendimento.
tb_brasil = bd.Table(
    table_id="uf_taxa_rendimento",
    dataset_id="br_inep_educacao_especial",
)

# Upload the combined CSV to that table.
# - if_storage_data_exists="replace": replace the data in storage if present
# - if_table_exists="replace": replace the table if it already exists
# - source_format="csv": the source file is a CSV
upload_options = {
    "if_storage_data_exists": "replace",
    "if_table_exists": "replace",
    "source_format": "csv",
}
tb_brasil.create(
    os.path.join(path, "uf_taxa_rendimento.csv"),
    **upload_options,
)

Uploading files:   0%|          | 0/1 [00:00<?, ?it/s][32m2025-08-26 15:55:05.285[0m | [32m[1mSUCCESS [0m | [36mbasedosdados.upload.storage[0m:[36mupload[0m:[36m233[0m - [32m[1m File uf_taxa_rendimento.csv_staging was uploaded![0m
Uploading files: 100%|██████████| 1/1 [00:02<00:00,  2.24s/it]
[32m2025-08-26 15:55:09.642[0m | [1mINFO    [0m | [36mbasedosdados.upload.table[0m:[36mdelete[0m:[36m809[0m - [1m Table uf_taxa_rendimento_staging was deleted![0m
[32m2025-08-26 15:55:10.068[0m | [32m[1mSUCCESS [0m | [36mbasedosdados.upload.table[0m:[36mcreate[0m:[36m690[0m - [32m[1mTable uf_taxa_rendimento was created in staging![0m
