# 1. Backend em Python
Raspa dados (*web-scrap*) para visualização de repositório da [Fundação Sistema Estadual de Análise de Dados Estatísticos](https://www.seade.gov.br/)

## 1.1. Instala dependências Python na vm

In [None]:
pip install urllib3 plotly plotly_express kaleido && pip freeze | grep -e urllib3 -e plotly -e plotly_express -e kaleido

## 1.2. Realiza raspagem de dados

In [4]:
from bs4 import BeautifulSoup
import datetime
import json
import os
import pytz
import sys
import urllib3

urllib3.disable_warnings()

# Desativar a verificação do certificado SSL
urllib3_pool_manager = urllib3.PoolManager(cert_reqs='CERT_NONE')

base_url = "https://repositorio.seade.gov.br/"

supported_formats = ["csv", "CSV"]


def get_page_html_by_url(target_url):
  http_response = urllib3_pool_manager.request("GET", target_url)

  return BeautifulSoup(http_response.data, 'html.parser')

def get_products_dict_from_products_page(target_html):
  category_items = target_html.find_all('li', class_='media-item')

  result = {}
  for item in category_items:
      title = item.find('h2', class_='media-heading').text.strip()
      link = item.find('a', class_='media-view')['href']
      result[title] = link

  return result

def get_product_page_main_title(target_page):
  parent_element = target_page.find('form', id='group-datasets-search-form')
  target_element = parent_element.find_next('h1')

  return f"{target_element.text.strip()}"


def get_dataset_list_from_datasets_page(target_html):
  dataset_ul_element = target_html.find("ul", class_="dataset-list")
  dataset_li_elements = dataset_ul_element.find_all("li", class_="dataset-item")

  datasets = []
  for dataset_li_element in dataset_li_elements:
    li_div_element = dataset_li_element.find_next("div", class_="dataset-content")
    url = li_div_element.a["href"]
    name = li_div_element.a.text
    description = li_div_element.find_next("div").text
    resources = dataset_li_element.find_next('ul', class_="dataset-resources")

    formats = []
    if resources is not None:
        for format in resources.find_all('a'):
          formats.append(format['data-format'])
    else:
        continue

    dataset = {
          'url': url,
          'name': name,
          'description': description,
          'formats': formats
      }
    datasets.append(dataset)

  return datasets


def filter_dataset_list_by_format(dataset_list):
  result = []
  for dataset_item in dataset_list:
    for format in dataset_item['formats']:
      if format in supported_formats:
        result.append(dataset_item)

  return result


def get_dataset_resources_detail_from_dataset_page(target_html):
  resource_ul_element = target_html.find("ul", class_="resource-list")
  resource_li_elements = resource_ul_element.find_all("li", class_="resource-item")

  resource_files_list = []
  for resource_li_element in resource_li_elements:
    try:
      resource_name = resource_li_element.a["title"]
      resource_format = resource_li_element.a.find("span", class_="format-label").text
      resource_description = resource_li_element.p.text.strip()
      resource_url = resource_li_element.div.ul.find("a", class_="resource-url-analytics")["href"]

      if resource_format not in supported_formats:
        continue

      last_dot_index = resource_url.rfind('.')
      if resource_url[last_dot_index + 1:] not in supported_formats:
        continue

      try:
        response = urllib3_pool_manager.request("GET", resource_url)
        is_url_available = response.status == 200
      except Exception as e:
        print(f"ERROR3 falha ao acessar recurso:: {resource_url}")
        print(f"\nContinuando...")
        is_url_available = False

      if is_url_available != True:
        continue

      resource_file_info = {
        "resource_title": resource_name,
        "format": resource_format,
        "download_url": resource_url,
        "desctiption": resource_description
      }
      resource_files_list.append(resource_file_info)
    except Exception as e:
      print(f"\nERROR4 falha ao indexar:: {resource_name}, {resource_format}, {resource_url}")
      print(f"\nContinuando...")

  return resource_files_list


def get_dataset_resources_detail_list_from_dataset_list(dataset_filtered_list):
  result = []
  for dataset_item in dataset_filtered_list:
    try:
      target_html = get_page_html_by_url(f"{base_url}{dataset_item['url']}")
      dataset_resources_detail = get_dataset_resources_detail_from_dataset_page(target_html)

      if len(dataset_resources_detail) > 0:
        dataset = {
            "name": dataset_item["name"],
            "entity": "",
            "description": dataset_item["description"],
            "resources": dataset_resources_detail,
            "last_update": ""
        }
        result.append(dataset)

    except Exception as e:
      print(f"ERROR2: {base_url}{product_item[1]}")
      continue

  return result


def generate_json_file(products_dict):
  products = []

  datetime_now = datetime.datetime.now(pytz.timezone('Etc/GMT+3'))
  print(f"Raspagem de dados p/ pré-processamento iniciado em: {datetime_now}")
  print(f"\n\nEstimativa de finalização: {datetime_now + datetime.timedelta(minutes=15)}\n\n")
  total_items = len(products_dict.items())
  for idx, product_item in enumerate(products_dict.items()):

    # Calcula o percentual de conclusão
    progress_percent = (idx + 1) / total_items * 100
    sys.stdout.write(f"\rCarregando... {progress_percent:.2f}%")
    sys.stdout.flush()

    dataset_list = []
    try:
      product_page_html = get_page_html_by_url(f"{base_url}{product_item[1]}")
      product_page_main_title = get_product_page_main_title(product_page_html)
      dataset_complete_list = get_dataset_list_from_datasets_page(product_page_html)
      dataset_filtered_list = filter_dataset_list_by_format(dataset_complete_list)

      if len(dataset_filtered_list) < 1:
        continue
    except Exception as e:
      print(f"ERROR1: {base_url}{product_item[1]}")
      continue

    dataset_list = get_dataset_resources_detail_list_from_dataset_list(dataset_filtered_list)
    content_category = product_item[0]
    content_title = product_page_main_title

    if len(dataset_list) < 1:
        continue

    category = {
        "category": content_category,
        "title": content_title,
        "detail": f"{len(dataset_list)} compatível(is) com web-scrap (formato .csv)",
        "datasets": dataset_list
    }

    products.append(category)

    result = {
        "categories": products
    }

  json_file_path = "seade-repositorio.json"
  if os.path.exists(json_file_path):
      os.remove(json_file_path)

  with open(json_file_path, "w", encoding="utf-8") as outfile:
        json.dump(result, outfile, indent=4, ensure_ascii=False)

  return json_file_path

###

products_page_html = get_page_html_by_url(f"{base_url}{'/group'}")
products_page_html

products_dict = get_products_dict_from_products_page(products_page_html)
products_dict

product_page_html = get_page_html_by_url(f"{base_url}{products_dict['Seade Municípios']}")
product_page_html

product_page_main_title = get_product_page_main_title(product_page_html)
product_page_main_title

products_dict = get_products_dict_from_products_page(products_page_html)
print(f"Concluído com sucesso! {generate_json_file(products_dict)}")

Raspagem de dados p/ pré-processamento iniciado em: 2024-04-21 19:52:29.284664-03:00


Estimativa de finalização: 2024-04-21 20:07:29.284664-03:00


Carregando... 66.67%
ERROR4 falha ao indexar:: Dicionário de dados, CSV, https://repositorio.seade.gov.br/dataset/0cbb90f4-7ff8-4824-9e42-2bfb4738458c/resource/186b9c52-44ea-4b49-85f4-809ab3dcbddb/download/educacao_ideb_mun.csv

Continuando...
Carregando... 100.00%
ERROR4 falha ao indexar:: Página do produto no site do Seade, HTML, https://repositorio.seade.gov.br/dataset/6427da21-0c71-4a30-9042-d5608dd6b13f/resource/9f1a730d-ba19-4d2e-a47c-c532dcdf614e/download/leia-me_pesquisa_trajetorias_ocupacionais.pdf

Continuando...
Concluído com sucesso! seade-repositorio.json


## 1.3 Cria extensão para o front "py_extension.py"


In [24]:
%%writefile py_extension_b.py

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sweetviz
import ssl
import numpy as np
import plotly.express as px

# Seu código para gerar o relatório Sweetviz aqui

ssl._create_default_https_context = ssl._create_unverified_context

def describe_data(arquivo):
  df = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
  return df.describe()

def view_data(arquivo):
  dados = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
  eda = sweetviz.analyze(dados)
  eda.show_html()
  return "/content/SWEETVIZ_REPORT.html"

def pearson_data(arquivo):
  dados = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
  df = dados.corr(method='pearson', min_periods=1, numeric_only=True)
  return df

def heatmap_pearson_data(arquivo):
  dados = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
  plt.figure(figsize=(20,20))
  matriz_correlacao = dados.corr(method='pearson', min_periods=1, numeric_only=True)
  mask = np.triu(np.ones_like(matriz_correlacao, dtype=np.bool_))
  sns.heatmap(dados.corr(method='pearson', min_periods=1, numeric_only=True), mask=mask, square = True, annot=True, vmin=-1, vmax=1)
  fig = "heatmap.png"
  plt.savefig(fig)
  return f"/content/{fig}"

def scatter_data(arquivo, title):
  img_path = "scatter_plot.png"
  try:
    dados = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
    fig = px.scatter_matrix(dados)
    fig.update_layout(title=f'{title}')
    fig.show()
    fig.write_image(img_path)
  finally:
    return f"/content/{img_path}"

def histogram_data(arquivo):
  img_path = "histogram_plot.png"
  try:
    dados = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
    numerical_column = dados.select_dtypes(include=[np.number]).columns[1]
    fig = px.histogram(dados, x=numerical_column, nbins=10, title=f'Histograma de {numerical_column}')
    fig.show()
    fig.write_image(img_path)
  finally:
    return f"/content/{img_path}"

Writing py_extension_b.py


# 2. Frontend em R
Utiliza dados raspados anteriormente para facilitar visualização e início da etapa de pré-processamento

## 2.1. Instala dependências R na vm

In [6]:
!sudo apt-get install r-base
!sudo su - -c "R -e \"install.packages(c('shiny', 'rlist', 'reticulate', 'shinycssloaders'), repos='https://cran.rstudio.com/')\""
!sudo apt-get install gdebi-core
!wget https://download3.rstudio.org/ubuntu-18.04/x86_64/shiny-server-1.5.21.1012-amd64.deb
!sudo gdebi -n shiny-server-1.5.21.1012-amd64.deb
!pip install pandas sweetviz
!npm install localtunnel

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
r-base is already the newest version (4.3.3-1.2204.0).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.

R version 4.3.3 (2024-02-29) -- "Angel Food Cake"
Copyright (C) 2024 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> install.packages(c('shiny', 'rlist', 'reticulate', 'shinycssloaders'), repos='https://cran.

## 2.2. Cria servidor R Shiny (app.r)

In [31]:
%%writefile app.r
library(jsonlite)
library(rlist)
library(shiny)
library(reticulate)

py <- import("py_extension_b")

current_url <- NULL
dados <- NULL

# Carrega arquivo JSON
json_data <- jsonlite::read_json("/content/seade-repositorio.json")

# Carrega categories & input_categories para exebir na tela
state.categories <- list()
state.input_categories <- list()
for(category_item in json_data$categories) {
    cat <- list(
      category = category_item$category,
      title = category_item$title,
      detail = category_item$detail,
      datasets = category_item$datasets
    )

    state.categories <- list.append(state.categories, cat)
    state.input_categories <- list.append(state.input_categories, cat$category)
}

ui <- fluidPage(
  titlePanel("Repositório SEADE"),
  sidebarLayout(
    sidebarPanel(
      selectizeInput('e0', 'Selecione o Produto',
        choices = state.input_categories),
      uiOutput("e1"),
      uiOutput("e2")
      ),
    mainPanel(
      helpText('Detalhes do recurso:'),
      shinycssloaders::withSpinner(
        verbatimTextOutput('ex_out'),
      ),
      selectizeInput("view", "Selecione uma visualização",
        choices = c("Descrição estatística", "Matriz de dispersão", "Histograma 2a coluna", "Análise exploratória de dados",
          "Correlação de Pearson",
          "Mapa de calor (heatmap) da matriz de correlação de Pearson")),
      h3(textOutput("caption", container = span)),
      # Only show this panel if the plot type is a histogram
      verbatimTextOutput("summary"),
      conditionalPanel(
        condition = "input.view == 'Descrição estatística'",
        shinycssloaders::withSpinner(
          tableOutput("table")
        )
      ),
      conditionalPanel(
        condition = "input.view == 'Matriz de dispersão'",
        shinycssloaders::withSpinner(
          imageOutput("image1")
        )
      ),
      conditionalPanel(
        condition = "input.view == 'Histograma 2a coluna'",
        shinycssloaders::withSpinner(
          imageOutput("image2")
        )
      ),
      conditionalPanel(
        condition = "input.view == 'Análise exploratória de dados'",
        shinycssloaders::withSpinner(
          htmlOutput("html")
        )
      ),
      conditionalPanel(
        condition = "input.view == 'Correlação de Pearson'",
        shinycssloaders::withSpinner(
          tableOutput("table2")
        )
      ),
      conditionalPanel(
        condition = "input.view == 'Mapa de calor (heatmap) da matriz de correlação de Pearson'",
        shinycssloaders::withSpinner(
          imageOutput("image")
        )
      )

    )
  )
)

server <- function(input, output, session) {

  target_category <- list()

  output$caption <- renderText({
    input$view
  })

  output$summary <- renderPrint({
    target_resource <- find_target_resource()

    if (is.null(target_resource) || is.null(target_resource$url)) {
      return()
    }

    print("")
    output$table <- renderTable({
        NULL
      })
    output$image1 <- renderImage({
      list(src = "",
          contentType = 'image/png',
          alt = "ploty.scatter_matrix")
    }, deleteFile = TRUE)
    output$image2 <- renderImage({
      list(src = "",
          contentType = 'image/png',
          alt = "ploty.histogram")
    }, deleteFile = TRUE)
    output$html <- renderUI({
        includeHTML(path = "")
      })
    output$table2 <- renderTable({
      NULL
    })
    output$image <- renderImage({
      list(src = "",
          contentType = 'image/png',
          width = 900,
          height = 800,
          alt = "sns.heatmap")
    }, deleteFile = TRUE)

    if(input$view == "Descrição estatística"){
      describe_data = py$describe_data(target_resource$url)
      print(describe_data)
      output$table <- renderTable({
        describe_data
      })
    }

    if(input$view == "Matriz de dispersão"){
      scatter_image_path = py$scatter_data(target_resource$url, target_resource$title)
      output$image1 <- renderImage({
        list(src = scatter_image_path,
            contentType = 'image/png',
            alt = "ploty.scatter_matrix")
      }, deleteFile = TRUE)
    }

    if(input$view == "Histograma 2a coluna"){
      histogram_image_path = py$histogram_data(target_resource$url)
      output$image2 <- renderImage({
        list(src = histogram_image_path,
            contentType = 'image/png',
            alt = "ploty.histogram")
      }, deleteFile = TRUE)
    }

    if(input$view == "Análise exploratória de dados"){
      report_html_path = py$view_data(target_resource$url)
      output$html <- renderUI({
        includeHTML(path = report_html_path)
      })
    }

    if(input$view == "Correlação de Pearson"){
      pearson_data = py$pearson_data(target_resource$url)
      print(pearson_data)
      output$table2 <- renderTable({
        pearson_data
      })
    }

    if(input$view == "Mapa de calor (heatmap) da matriz de correlação de Pearson"){
      heatmap_image_path = py$heatmap_pearson_data(target_resource$url)
      output$image <- renderImage({
        list(src = heatmap_image_path,
            contentType = 'image/png',
            width = 900,
            height = 800,
            alt = "sns.heatmap")
      }, deleteFile = TRUE)
    }
  })

  output$e1 <- renderUI({

    # Identifica produto selecionado
    target_category <- list()
    for(item in state.categories) {
      if (input$e0 == item$category) {
        target_category <- list(
          category = item$category,
          title = item$title,
          detail = item$detail,
          datasets = item$datasets
        )
        break
      }
    }

    # Carrega input_datasets para exibir na tela
    input_datasets <- list()
    for (dataset_item in target_category$datasets) {
        input_datasets <- list.append(input_datasets, dataset_item$name)
    }

    selectizeInput(
      "inputConjunto",
      "Selecione o Conjunto de Dados",
      choices = input_datasets
      )
  })

  output$e2 <- renderUI({
    # Identifica produto selecionado

    if (is.null(input$e0) || is.null(input$inputConjunto)) {
      return()
    }

    target_category <- list()
    for(item in state.categories) {
      if (input$e0 == item$category) {
        target_category <- list(
          category = item$category,
          title = item$title,
          detail = item$detail,
          datasets = item$datasets
        )
        break
      }
    }

    #Identifica dataset selecionado
    input_resources <- list()
    for (dataset_item in target_category$datasets) {
      if (input$inputConjunto == dataset_item$name) {
        for (resource_item in dataset_item$resources) {
          input_resources <- list.append(input_resources, resource_item$resource_title)
        }
        break
      }
    }

    selectizeInput(
      "inputRecurso",
      "Selecione o Recurso",
      choices = input_resources
      )
  })

  output$ex_out <- renderPrint({

    # Identifica produto, dataset e recurso selecionado
    target_resource <- find_target_resource()

    target_resource
    #str(target_resource, simplify = FALSE)
  })

  find_target_resource <- function() {
    target_resource <- list()

    if (is.null(input$e0) || is.null(input$inputConjunto) || is.null(input$inputRecurso)){
      return (target_resource)
    }

    for (category_item in state.categories) {
      if (input$e0 == category_item$category) {
        for (dataset_item in category_item$datasets) {
          if (input$inputConjunto == dataset_item$name) {
            for (resource_item in dataset_item$resources) {
              if (input$inputRecurso == resource_item$resource_title) {
                target_resource <- list(
                  title = resource_item$resource_title,
                  format = resource_item$format,
                  url = resource_item$download_url,
                  description = resource_item$desctiption
                )
                current_url <- resource_item$download_url
                break
              }
            }
            break
          }
        }
        break
      }
    }

    return(target_resource)
  }

}

shinyApp(ui, server, options = list(host = "127.0.0.1", port = 8501))

Overwriting app.r


## 2.3. Inicia servidor R Shiny

In [8]:
!echo -n "Password/Enpoint IP for localtunnel is: " && curl ipv4.icanhazip.com

Password/Enpoint IP for localtunnel is: 35.221.165.98


In [9]:
!echo -n "Password/Enpoint IP for localtunnel is: " && curl ifconfig.me

Password/Enpoint IP for localtunnel is: 35.221.165.98

In [None]:
!npx localtunnel --port 8501 & Rscript app.r


Attaching package: ‘shiny’

The following object is masked from ‘package:jsonlite’:

    validate

[K[?25hnpx: installed 22 in 2.292s
[?25hyour url is: https://eager-needles-bow.loca.lt
[?25h[?25h[?25h[?25h[?25h[?25h[?25h[?25h[?25h
Listening on http://127.0.0.1:8501
Done! Use 'show' commands to display/save. : 100% 1/1 [00:00<00:00,  1.34it/s]
Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
[31m✖[39m Path: /content/SWEETVIZ_REPORT.html
[34mℹ[39m Use `tags$iframe()` to include an HTML document. You can either ensure `path` is accessible in your app or document (see e.g. `shiny::addResourcePath()`) and pass the relative path to the `src` argument. Or you can read the contents of `path` and pass the contents to `srcdoc`.
  file("") only supports open = "w+" and open = "w+b": using the former
  101: [37mdetect_html_document[39m
  100: [37mincludeHTML[39m
   99