In [20]:
import os

import basedosdados as bd
import pandas as pd

INPUT = os.path.join("models", "br_inep_educacao_especial", "data")
OUTPUT = os.path.join("models", "br_inep_educacao_especial", "output")

os.makedirs(INPUT, exist_ok=True)
os.makedirs(OUTPUT, exist_ok=True)

In [22]:
def read_sheet(sheet_name: str, skiprows: int = 3) -> pd.DataFrame:
    return pd.read_excel(
        os.path.join(INPUT, "TDI_ANO_2020_21_22_23_24.xlsx"),
        skiprows=skiprows,
        sheet_name=sheet_name,
    )

In [23]:
# Load the Excel file into a pandas ExcelFile object
excel_data = pd.ExcelFile(os.path.join(INPUT, "TDI_ANO_2020_21_22_23_24.xlsx"))

# Get the sheet names
print(excel_data.sheet_names)

['TDI_ANO_2020_21_22_23_24_educac']


In [None]:
# Parse the Excel file into a DataFrame.
# If no sheet name is specified, it loads the first sheet by default.
df = excel_data.parse()

In [25]:
# Print the column names of the DataFrame to see what was read from the Excel sheet
print(df.columns)

Index(['NU_ANO_CENSO', 'TP_TIPO_CLASSE', 'CO_REGIAO', 'NO_REGIAO', 'CO_UF',
       'SG_UF', 'TP_DEPENDENCIA', 'NO_DEPENDENCIA', 'NO_CATEGORIA',
       'FUN_CAT_0', 'FUN_AI_CAT_0', 'FUN_AF_CAT_0', 'FUN_01_CAT_0',
       'FUN_02_CAT_0', 'FUN_03_CAT_0', 'FUN_04_CAT_0', 'FUN_05_CAT_0',
       'FUN_06_CAT_0', 'FUN_07_CAT_0', 'FUN_08_CAT_0', 'FUN_09_CAT_0',
       'MED_CAT_0', 'MED_01_CAT_0', 'MED_02_CAT_0', 'MED_03_CAT_0',
       'MED_04_CAT_0'],
      dtype='object')


In [26]:
# -----------------------------
# Rename and filter columns
# -----------------------------
# This block renames the DataFrame columns according to the RENAME_COLUMNS dictionary
# and keeps only the renamed columns. It overwrites the original df variable, so
# df will contain only the columns specified in RENAME_COLUMNS.

RENAME_COLUMNS = {
    "NU_ANO_CENSO": "ano",
    "NO_CATEGORIA": "categoria",
    "NO_REGIAO": "regiao",
    "TP_TIPO_CLASSE": "classe",
    "NO_DEPENDENCIA": "dependencia",
    "FUN_AI_CAT_0": "Ensino Fundamental – Anos Iniciais",
    "FUN_AF_CAT_0": "Ensino Fundamental – Anos Finais",
    "MED_CAT_0": "Ensino Médio Regular",
}


def keep_only_renamed(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns=RENAME_COLUMNS)

    cols_keep = list(RENAME_COLUMNS.values())

    cols_existentes = [col for col in cols_keep if col in df.columns]

    return df[cols_existentes]


df = keep_only_renamed(df)
print(df.columns)

Index(['ano', 'categoria', 'regiao', 'classe', 'dependencia',
       'Ensino Fundamental – Anos Iniciais',
       'Ensino Fundamental – Anos Finais', 'Ensino Médio Regular'],
      dtype='object')


In [None]:
# Filter the DataFrame 'df' to keep only rows that meet all of the following conditions:
df = df[
    (df["ano"] >= 2022)  # Year is 2022 or later
    & (
        df["classe"] != "0 - Todas as turmas"
    )  # Exclude rows where 'classe' equals "0 - Todas as turmas"
    & (
        df["categoria"] == "Modalidade: educação especial"
    )  # Keep only rows for special education
    & (
        df["dependencia"] == "Total"
    )  # Include only rows where 'dependencia' is "Total"
    & (df["regiao"] == "Brasil")  # Include only rows for the whole country
]
df

Unnamed: 0,ano,categoria,regiao,classe,dependencia,Ensino Fundamental – Anos Iniciais,Ensino Fundamental – Anos Finais,Ensino Médio Regular
1971,2022,Modalidade: educação especial,Brasil,1 - Classe comum,Total,21.2,44.6,45.1
2759,2023,Modalidade: educação especial,Brasil,1 - Classe comum,Total,18.6,39.4,41.8
3547,2024,Modalidade: educação especial,Brasil,1 - Classe comum,Total,15.2,34.6,38.6


In [29]:
# Filters the DataFrame to keep only rows where 'regiao' is "Brasil"
# and melts the DataFrame from wide to long format (one row per metric)
# Each row will have: 'ano', 'regiao', 'metrica' (original metric name), and 'valor' (corresponding value)
melted_dataframe = pd.concat(
    [
        df.pipe(lambda d: d.loc[(d["regiao"] == "Brasil")]).pipe(
            lambda d: pd.melt(
                d,
                id_vars=["ano", "regiao"],
                value_vars=d.columns.difference(
                    ["ano", "regiao"]
                ).tolist(),  # Convert to list
                var_name="metrica",
                value_name="tdi",
            )
        )
    ]
)

In [30]:
melted_dataframe

Unnamed: 0,ano,regiao,metrica,tdi
0,2022,Brasil,Ensino Fundamental – Anos Finais,44.6
1,2023,Brasil,Ensino Fundamental – Anos Finais,39.4
2,2024,Brasil,Ensino Fundamental – Anos Finais,34.6
3,2022,Brasil,Ensino Fundamental – Anos Iniciais,21.2
4,2023,Brasil,Ensino Fundamental – Anos Iniciais,18.6
5,2024,Brasil,Ensino Fundamental – Anos Iniciais,15.2
6,2022,Brasil,Ensino Médio Regular,45.1
7,2023,Brasil,Ensino Médio Regular,41.8
8,2024,Brasil,Ensino Médio Regular,38.6
9,2022,Brasil,categoria,Modalidade: educação especial


In [31]:
melted_dataframe["etapa_ensino"] = melted_dataframe["metrica"].apply(
    lambda v: v.split("_")[-1]
)  # Extracts 'anosiniciais', 'anosfinais', or 'ensinomedio'
melted_dataframe["tipo_metrica"] = melted_dataframe["metrica"].apply(
    lambda v: v.split("_")[0]
)  # Extracts 'tdi'
melted_dataframe["tdi"] = pd.to_numeric(
    melted_dataframe["tdi"], errors="coerce"
)

# Pivoting the melted DataFrame to get desired columns
df_final = melted_dataframe.pivot_table(
    index=["ano", "regiao", "etapa_ensino"],
    columns="tipo_metrica",
    values="tdi",
).reset_index()

In [None]:
# Remove all rows where the column 'valor' has missing (NaN) values.
melted_dataframe = melted_dataframe.dropna(subset=["tdi"])

In [None]:
# Dictionary used to rename columns in the melted DataFrame
# to a more standardized format.
RENAME_COLUMNS_MELTED = {"tdi": "tdi", "metrica": "etapa_ensino"}

In [None]:
# Select only the relevant columns for analysis
melted_dataframe = melted_dataframe[["ano", "etapa_ensino", "tdi"]]

In [None]:
# Define the output path by joining the OUTPUT directory with a subfolder
path = os.path.join(OUTPUT, "educacao_especial_brasil_distorcao_idade_serie")

# Create the directory if it doesn't exist (exist_ok=True avoids errors if it already exists)
os.makedirs(path, exist_ok=True)

# Convert all values in melted_dataframe to strings and save as a CSV file
# The file is named "brasil_tdi_2022_2024.csv" and will not include the DataFrame index
melted_dataframe.astype(str).to_csv(
    os.path.join(path, "brasil_tdi_2022_2024.csv"), index=False
)

In [None]:
# Read a table directly from BigQuery into a pandas DataFrame using the basedosdados library.
# The SQL query selects all columns from the table:
#   basedosdados.br_inep_educacao_especial.uf_taxa_rendimento
# The parameter billing_project_id specifies which GCP project will be billed for the query.
df_bq = bd.read_sql(
    "select * from basedosdados.br_inep_educacao_especial.brasil_distorcao_idade_serie",
    billing_project_id="basedosdados-dev",
)

Downloading: 100%|[32m██████████[0m|


In [42]:
df_bq

Unnamed: 0,ano,etapa_ensino,tdi
0,2009,Ensino Médio Regular,56.0
1,2008,Ensino Médio Regular,57.3
2,2011,Ensino Médio Regular,56.1
3,2021,Ensino Médio Regular,48.3
4,2007,Ensino Médio Regular,65.3
5,2013,Ensino Médio Regular,55.4
6,2015,Ensino Médio Regular,53.7
7,2018,Ensino Médio Regular,53.2
8,2010,Ensino Médio Regular,56.4
9,2016,Ensino Médio Regular,53.9


In [None]:
# Concatenate two DataFrames:
df_updated = pd.concat([df_bq, melted_dataframe])

In [44]:
df_updated

Unnamed: 0,ano,etapa_ensino,tdi
0,2009,Ensino Médio Regular,56.0
1,2008,Ensino Médio Regular,57.3
2,2011,Ensino Médio Regular,56.1
3,2021,Ensino Médio Regular,48.3
4,2007,Ensino Médio Regular,65.3
5,2013,Ensino Médio Regular,55.4
6,2015,Ensino Médio Regular,53.7
7,2018,Ensino Médio Regular,53.2
8,2010,Ensino Médio Regular,56.4
9,2016,Ensino Médio Regular,53.9


In [None]:
# Convert all values in df_updated to strings and save as a CSV file.
df_updated.astype(str).to_csv(
    os.path.join(path, "brasil_distorcao_idade_serie.csv"), index=False
)

In [None]:
# Create a Table object representing a BigQuery table in the specified dataset.
tb_brasil = bd.Table(
    dataset_id="br_inep_educacao_especial",
    table_id="brasil_distorcao_idade_serie",
)
# Upload the local CSV file to the BigQuery table.
# Parameters:
# - if_storage_data_exists='replace': replace the data in storage if it already exists
# - if_table_exists='replace': replace the table if it already exists
# - source_format='csv': specify that the source file is a CSV
tb_brasil.create(
    os.path.join(path, "brasil_distorcao_idade_serie.csv"),
    if_storage_data_exists="replace",
    if_table_exists="replace",
    source_format="csv",
)

Uploading files:   0%|          | 0/1 [00:00<?, ?it/s][32m2025-08-26 20:31:22.064[0m | [32m[1mSUCCESS [0m | [36mbasedosdados.upload.storage[0m:[36mupload[0m:[36m233[0m - [32m[1m File brasil_distorcao_idade_serie.csv_staging was uploaded![0m
Uploading files: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
[32m2025-08-26 20:31:27.680[0m | [1mINFO    [0m | [36mbasedosdados.upload.table[0m:[36mdelete[0m:[36m809[0m - [1m Table brasil_distorcao_idade_serie_staging was deleted![0m
[32m2025-08-26 20:31:28.086[0m | [32m[1mSUCCESS [0m | [36mbasedosdados.upload.table[0m:[36mcreate[0m:[36m690[0m - [32m[1mTable brasil_distorcao_idade_serie was created in staging![0m
