In [1]:
import urllib
import requests
from pathlib import Path
from bs4 import BeautifulSoup
import os

In [2]:
def obtem_dados_censo(url: str) -> Path:
    """
    Realiza o download dos dados do censo escolar

    :param url: endereço do site do inep com os dados do censo
    :return: caminho para os dados de saída
    """
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, features="lxml")

    links = {
        tag["href"].split("_")[-1]: tag["href"]
        for tag in soup.find_all("a", {"class": "external-link"})
    }

    caminho_saida = Path("../dados/censo_escolar")
    caminho_saida.mkdir(parents=True, exist_ok=True)

    for dado in links:
        r = requests.get(links[dado], stream=True)
        with open(caminho_saida / dado, "wb") as arq:
            arq.write(r.content)


In [3]:
os.chdir("..")

In [4]:
%load_ext autoreload
%autoreload 2
import src.aquisicao.inep.base_inep as etl_base_inep

In [5]:
inep_etl = etl_base_inep.BaseINEPETL("dados/censo_escolar", "saida/aquisicao", "censo-escolar")

In [6]:
inep_etl.le_pagina_inep()

{'2022.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2022.zip',
 '2021.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2021.zip',
 '2020.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2020.zip',
 '2019.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2019.zip',
 '2018.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2018.zip',
 '2017.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2017.zip',
 '2016.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2016.zip',
 '2015.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2015.zip',
 '2014.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2014.zip',
 '2013.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2013.zip',
 '2012.zip': 'https://download.inep.gov.br/dados_abertos/microdados_ce

In [7]:
inep_etl.dicionario_para_baixar()

{}

In [8]:
inep_etl.download_conteudo()

In [9]:
os.listdir()

['.git',
 '.github',
 '.gitignore',
 '.ipynb_checkpoints',
 'dados',
 'LICENSE',
 'notebooks',
 'README.md',
 'requirements.txt',
 'run.py',
 'saida',
 'src']

In [10]:
import zipfile
import pandas as pd

In [11]:
dados = pd.read_csv("dados/censo_escolar/microdados_ed_basica_2022.csv", encoding="latin-1", sep=";")

  dados = pd.read_csv("dados/censo_escolar/microdados_ed_basica_2022.csv", encoding="latin-1", sep=";")


In [12]:
dados.head()

Unnamed: 0,NU_ANO_CENSO,NO_REGIAO,CO_REGIAO,NO_UF,SG_UF,CO_UF,NO_MUNICIPIO,CO_MUNICIPIO,NO_MESORREGIAO,CO_MESORREGIAO,...,QT_TUR_FUND_AF,QT_TUR_MED,QT_TUR_PROF,QT_TUR_PROF_TEC,QT_TUR_EJA,QT_TUR_EJA_FUND,QT_TUR_EJA_MED,QT_TUR_ESP,QT_TUR_ESP_CC,QT_TUR_ESP_CE
0,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,0.0,0.0,0.0,0.0,10.0,6.0,4.0,3.0,3.0,0.0
2,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0


In [13]:
dados.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224649 entries, 0 to 224648
Columns: 385 entries, NU_ANO_CENSO to QT_TUR_ESP_CE
dtypes: float64(358), int64(11), object(16)
memory usage: 659.9+ MB


In [14]:
arq_zip = zipfile.ZipFile("dados/censo_escolar/2022.zip")

In [15]:
arq_zip.namelist()

['Microdados do Censo Escolar da Educaç╞o Básica 2022/',
 'Microdados do Censo Escolar da Educaç╞o Básica 2022/Anexos/',
 'Microdados do Censo Escolar da Educaç╞o Básica 2022/Anexos/ANEXO I - Dicionário de Dados/',
 'Microdados do Censo Escolar da Educaç╞o Básica 2022/Anexos/ANEXO I - Dicionário de Dados/dicionário_dados_educaç╞o_básica.xlsx',
 'Microdados do Censo Escolar da Educaç╞o Básica 2022/Anexos/ANEXO I - Dicionário de Dados/~$Dicionário de Dados da Educaç╞o Básica.xlsx',
 'Microdados do Censo Escolar da Educaç╞o Básica 2022/Anexos/ANEXO I - Dicionário de Dados/~$dicionário_dados_educaç╞o_básica.xlsx',
 'Microdados do Censo Escolar da Educaç╞o Básica 2022/Anexos/ANEXO II -  Questionários do Censo da Educaç╞o Basica/',
 'Microdados do Censo Escolar da Educaç╞o Básica 2022/Anexos/ANEXO II -  Questionários do Censo da Educaç╞o Basica/Aluno.pdf',
 'Microdados do Censo Escolar da Educaç╞o Básica 2022/Anexos/ANEXO II -  Questionários do Censo da Educaç╞o Basica/Escola.pdf',
 'Microda

In [16]:
buffer = arq_zip.open("Microdados do Censo Escolar da Educaç╞o Básica 2022/dados/microdados_ed_basica_2022.csv")

In [17]:
buffer

<zipfile.ZipExtFile name='Microdados do Censo Escolar da Educaç╞o Básica 2022/dados/microdados_ed_basica_2022.csv' mode='r' compress_type=deflate>

In [18]:
dados = pd.read_csv(buffer, encoding="latin-1", sep=";")

  dados = pd.read_csv(buffer, encoding="latin-1", sep=";")


In [19]:
dados.head()

Unnamed: 0,NU_ANO_CENSO,NO_REGIAO,CO_REGIAO,NO_UF,SG_UF,CO_UF,NO_MUNICIPIO,CO_MUNICIPIO,NO_MESORREGIAO,CO_MESORREGIAO,...,QT_TUR_FUND_AF,QT_TUR_MED,QT_TUR_PROF,QT_TUR_PROF_TEC,QT_TUR_EJA,QT_TUR_EJA_FUND,QT_TUR_EJA_MED,QT_TUR_ESP,QT_TUR_ESP_CC,QT_TUR_ESP_CE
0,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,0.0,0.0,0.0,0.0,10.0,6.0,4.0,3.0,3.0,0.0
2,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,2022,Norte,1,Rondônia,RO,11,Alta Floresta D'Oeste,1100015,Leste Rondoniense,2,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0


In [22]:
[arq for arq in arq_zip.namelist() if ".csv" in arq][0]

'Microdados do Censo Escolar da Educaç╞o Básica 2022/dados/microdados_ed_basica_2022.csv'