# 1. Backend em Python
Raspa dados (*web-scrap*) para visualização de repositório da [Fundação Sistema Estadual de Análise de Dados Estatísticos](https://www.seade.gov.br/)

## 1.1. Instala dependências

In [None]:
pip install urllib3 && pip freeze | grep -e urllib3

urllib3==2.0.7


## 1.2. Realiza raspagem de dados

In [None]:
from bs4 import BeautifulSoup
import datetime
import json
import os
import pytz
import sys
import urllib3

urllib3.disable_warnings()

# Desativar a verificação do certificado SSL
urllib3_pool_manager = urllib3.PoolManager(cert_reqs='CERT_NONE')

base_url = "https://repositorio.seade.gov.br/"

supported_formats = ["csv", "CSV"]


def get_page_html_by_url(target_url):
  http_response = urllib3_pool_manager.request("GET", target_url)

  return BeautifulSoup(http_response.data, 'html.parser')

def get_products_dict_from_products_page(target_html):
  category_items = target_html.find_all('li', class_='media-item')

  result = {}
  for item in category_items:
      title = item.find('h2', class_='media-heading').text.strip()
      link = item.find('a', class_='media-view')['href']
      result[title] = link

  return result

def get_product_page_main_title(target_page):
  parent_element = target_page.find('form', id='group-datasets-search-form')
  target_element = parent_element.find_next('h1')

  return f"{target_element.text.strip()}"


def get_dataset_list_from_datasets_page(target_html):
  dataset_ul_element = target_html.find("ul", class_="dataset-list")
  dataset_li_elements = dataset_ul_element.find_all("li", class_="dataset-item")

  datasets = []
  for dataset_li_element in dataset_li_elements:
    li_div_element = dataset_li_element.find_next("div", class_="dataset-content")
    url = li_div_element.a["href"]
    name = li_div_element.a.text
    description = li_div_element.find_next("div").text
    resources = dataset_li_element.find_next('ul', class_="dataset-resources")

    formats = []
    if resources is not None:
        for format in resources.find_all('a'):
          formats.append(format['data-format'])
    else:
        continue

    dataset = {
          'url': url,
          'name': name,
          'description': description,
          'formats': formats
      }
    datasets.append(dataset)

  return datasets


def filter_dataset_list_by_format(dataset_list):
  result = []
  for dataset_item in dataset_list:
    for format in dataset_item['formats']:
      if format in supported_formats:
        result.append(dataset_item)

  return result


def get_dataset_resources_detail_from_dataset_page(target_html):
  resource_ul_element = target_html.find("ul", class_="resource-list")
  resource_li_elements = resource_ul_element.find_all("li", class_="resource-item")

  resource_files_list = []
  for resource_li_element in resource_li_elements:
    try:
      resource_name = resource_li_element.a["title"]
      resource_format = resource_li_element.a.find("span", class_="format-label").text
      resource_description = resource_li_element.p.text.strip()
      resource_url = resource_li_element.div.ul.find("a", class_="resource-url-analytics")["href"]

      if resource_format not in supported_formats:
        continue

      last_dot_index = resource_url.rfind('.')
      if resource_url[last_dot_index + 1:] not in supported_formats:
        continue

      try:
        response = urllib3_pool_manager.request("GET", resource_url)
        is_url_available = response.status == 200
      except Exception as e:
        print(f"ERROR3 falha ao acessar recurso:: {resource_url}")
        print(f"\nContinuando...")
        is_url_available = False

      if is_url_available != True:
        continue

      resource_file_info = {
        "resource_title": resource_name,
        "format": resource_format,
        "download_url": resource_url,
        "desctiption": resource_description
      }
      resource_files_list.append(resource_file_info)
    except Exception as e:
      print(f"\nERROR4 falha ao indexar:: {resource_name}, {resource_format}, {resource_url}")
      print(f"\nContinuando...")

  return resource_files_list


def get_dataset_resources_detail_list_from_dataset_list(dataset_filtered_list):
  result = []
  for dataset_item in dataset_filtered_list:
    try:
      target_html = get_page_html_by_url(f"{base_url}{dataset_item['url']}")
      dataset_resources_detail = get_dataset_resources_detail_from_dataset_page(target_html)

      if len(dataset_resources_detail) > 0:
        dataset = {
            "name": dataset_item["name"],
            "entity": "",
            "description": dataset_item["description"],
            "resources": dataset_resources_detail,
            "last_update": ""
        }
        result.append(dataset)

    except Exception as e:
      print(f"ERROR2: {base_url}{product_item[1]}")
      continue

  return result


def generate_json_file(products_dict):
  products = []

  datetime_now = datetime.datetime.now(pytz.timezone('Etc/GMT+3'))
  print(f"Raspagem de dados p/ pré-processamento iniciado em: {datetime_now}")
  print(f"\n\nEstimativa de finalização: {datetime_now + datetime.timedelta(minutes=15)}\n\n")
  total_items = len(products_dict.items())
  for idx, product_item in enumerate(products_dict.items()):

    # Calcula o percentual de conclusão
    progress_percent = (idx + 1) / total_items * 100
    sys.stdout.write(f"\rCarregando... {progress_percent:.2f}%")
    sys.stdout.flush()

    dataset_list = []
    try:
      product_page_html = get_page_html_by_url(f"{base_url}{product_item[1]}")
      product_page_main_title = get_product_page_main_title(product_page_html)
      dataset_complete_list = get_dataset_list_from_datasets_page(product_page_html)
      dataset_filtered_list = filter_dataset_list_by_format(dataset_complete_list)

      if len(dataset_filtered_list) < 1:
        continue
    except Exception as e:
      print(f"ERROR1: {base_url}{product_item[1]}")
      continue

    dataset_list = get_dataset_resources_detail_list_from_dataset_list(dataset_filtered_list)
    content_category = product_item[0]
    content_title = product_page_main_title

    if len(dataset_list) < 1:
        continue

    category = {
        "category": content_category,
        "title": content_title,
        "detail": f"{len(dataset_list)} compatível(is) com web-scrap (formato .csv)",
        "datasets": dataset_list
    }

    products.append(category)

    result = {
        "categories": products
    }

  json_file_path = "seade-repositorio.json"
  if os.path.exists(json_file_path):
      os.remove(json_file_path)

  with open(json_file_path, "w", encoding="utf-8") as outfile:
        json.dump(result, outfile, indent=4, ensure_ascii=False)

  return json_file_path

###

products_page_html = get_page_html_by_url(f"{base_url}{'/group'}")
products_page_html

products_dict = get_products_dict_from_products_page(products_page_html)
products_dict

product_page_html = get_page_html_by_url(f"{base_url}{products_dict['Seade Municípios']}")
product_page_html

product_page_main_title = get_product_page_main_title(product_page_html)
product_page_main_title

products_dict = get_products_dict_from_products_page(products_page_html)
print(f"Concluído com sucesso! {generate_json_file(products_dict)}")

Raspagem de dados p/ pré-processamento iniciado em: 2024-04-20 18:07:01.778294-03:00


Estimativa de finalização: 2024-04-20 18:22:01.778294-03:00


Carregando... 66.67%
ERROR4 falha ao indexar:: Dicionário de dados, CSV, https://repositorio.seade.gov.br/dataset/0cbb90f4-7ff8-4824-9e42-2bfb4738458c/resource/186b9c52-44ea-4b49-85f4-809ab3dcbddb/download/educacao_ideb_mun.csv

Continuando...
Carregando... 100.00%
ERROR4 falha ao indexar:: Página do produto no site do Seade, HTML, https://repositorio.seade.gov.br/dataset/6427da21-0c71-4a30-9042-d5608dd6b13f/resource/9f1a730d-ba19-4d2e-a47c-c532dcdf614e/download/leia-me_pesquisa_trajetorias_ocupacionais.pdf

Continuando...
Concluído com sucesso! seade-repositorio.json


# 2. Frontend em R
Utiliza dados raspados anteriormente para facilitar visualização e início da etapa de pré-processamento

## 2.1. Instala shiny-server e localtunnel na vm

In [None]:
!sudo apt-get install r-base
!sudo su - -c "R -e \"install.packages(c('shiny', 'ggExtra', 'rlist', 'reticulate'), repos='https://cran.rstudio.com/')\""
!sudo apt-get install gdebi-core
!wget https://download3.rstudio.org/ubuntu-18.04/x86_64/shiny-server-1.5.21.1012-amd64.deb
!sudo gdebi -n shiny-server-1.5.21.1012-amd64.deb
!pip install pandas sweetviz
!npm install localtunnel

## 2.2. Cria servidor R Shiny (app.r)

In [None]:
%%writefile app.r
library(shiny)
library(bslib)
library(dplyr)
library(ggplot2)
library(ggExtra)

penguins_csv <- "https://raw.githubusercontent.com/jcheng5/simplepenguins.R/main/penguins.csv"

df <- readr::read_csv(penguins_csv)
# Find subset of columns that are suitable for scatter plot
df_num <- df |> select(where(is.numeric), -Year)

ui <- page_sidebar(
  sidebar = sidebar(
    varSelectInput("xvar", "X variable", df_num, selected = "Bill Length (mm)"),
    varSelectInput("yvar", "Y variable", df_num, selected = "Bill Depth (mm)"),
    checkboxGroupInput(
      "species", "Filter by species",
      choices = unique(df$Species),
      selected = unique(df$Species)
    ),
    hr(), # Add a horizontal rule
    checkboxInput("by_species", "Show species", TRUE),
    checkboxInput("show_margins", "Show marginal plots", TRUE),
    checkboxInput("smooth", "Add smoother"),
  ),
  plotOutput("scatter")
)

server <- function(input, output, session) {
  subsetted <- reactive({
    req(input$species)
    df |> filter(Species %in% input$species)
  })

  output$scatter <- renderPlot({
    p <- ggplot(subsetted(), aes(!!input$xvar, !!input$yvar)) + list(
      theme(legend.position = "bottom"),
      if (input$by_species) aes(color = Species),
      geom_point(),
      if (input$smooth) geom_smooth()
    )

    if (input$show_margins) {
      margin_type <- if (input$by_species) "density" else "histogram"
      p <- ggExtra::ggMarginal(p, type = margin_type, margins = "both",
        size = 8, groupColour = input$by_species, groupFill = input$by_species)
    }

    p
  }, res = 100)
}

shinyApp(ui, server, options = list(host = "127.0.0.1", port = 8501))

Writing app.r


## 2.3. Inicia servidor R Shiny

In [None]:
!echo -n "Password/Enpoint IP for localtunnel is: " && curl ipv4.icanhazip.com

Password/Enpoint IP for localtunnel is: 34.125.234.98


In [None]:
!echo -n "Password/Enpoint IP for localtunnel is: " && curl ifconfig.me

In [None]:
!npx localtunnel --port 8501 & Rscript app.r

## py_extension.py


In [242]:
%%writefile py_extension_e.py

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sweetviz
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

def describe_data(arquivo):
  df = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
  return df.describe()

def view_data(arquivo):
  dados = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
  eda = sweetviz.analyze(dados)
  eda.show_html()
  return "/content/SWEETVIZ_REPORT.html"

def pearson_data(arquivo):
  dados = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
  df = dados.corr(method='pearson', min_periods=1, numeric_only=True)
  return df

def heatmap_pearson_data(arquivo):
  dados = pd.read_csv(arquivo, encoding='iso-8859-1', sep=';', on_bad_lines='skip')
  plt.figure(figsize=(20,20))
  matriz_correlacao = dados.corr(method='pearson', min_periods=1, numeric_only=True)
  mask = np.triu(np.ones_like(matriz_correlacao, dtype=np.bool_))
  sns.heatmap(dados.corr(method='pearson', min_periods=1, numeric_only=True), mask=mask, square = True, annot=True, vmin=-1, vmax=1)
  fig = "heatmap.png"
  plt.savefig(fig)

  return "/content/heatmap.png"

Writing py_extension_e.py


## Quick test

In [155]:
# activate R magic
%load_ext rpy2.ipython

In [None]:
%%R
library(reticulate)

py <- import("py_extension")

csv_file_path <- "/content/divisoes_regionais_esp.csv"

describe_data = py$describe_data(csv_file_path)
print(describe_data)

view_data = py$view_data(csv_file_path)
print(view_data)

pearson_data = py$pearson_data(csv_file_path)
print(pearson_data)

#masked_pearson_data = py$masked_pearson_data(csv_file_path)
#print(masked_pearson_data)

heatmap_pearson_data = py$heatmap_pearson_data(csv_file_path)
#print(heatmap_pearson_data)

## Final

In [None]:
!echo -n "Password/Enpoint IP for localtunnel is: " && curl ifconfig.me

Password/Enpoint IP for localtunnel is: 34.74.104.74

In [None]:
!npx localtunnel --port 8501 & Rscript app_json.r

In [252]:
%%writefile app_json.r
library(jsonlite)
library(rlist)
library(shiny)
library(reticulate)

py <- import("py_extension_e")

current_url <- NULL
dados <- NULL

# Carrega arquivo JSON
json_data <- jsonlite::read_json("/content/seade-repositorio.json")

# Carrega categories & input_categories para exebir na tela
state.categories <- list()
state.input_categories <- list()
for(category_item in json_data$categories) {
    cat <- list(
      category = category_item$category,
      title = category_item$title,
      detail = category_item$detail,
      datasets = category_item$datasets
    )

    state.categories <- list.append(state.categories, cat)
    state.input_categories <- list.append(state.input_categories, cat$category)
}

ui <- fluidPage(
  titlePanel("Repositório SEADE"),
  sidebarLayout(
    sidebarPanel(
      selectizeInput('e0', 'Selecione o Produto',
        choices = state.input_categories),
      uiOutput("e1"),
      uiOutput("e2")
      ),
    mainPanel(
      helpText('Detalhes do recurso:'),
      verbatimTextOutput('ex_out'),
      selectizeInput("view", "Selecione uma visualização",
        choices = c("Describe Data", "View Data", "#Pearson Data", "Heat Map Pearson Data")),
      h3(textOutput("caption", container = span)),
      verbatimTextOutput("summary"),
      tableOutput("table"),
      htmlOutput("html"),
      #tableOutput("table2"),
      imageOutput("image")
    )
  )
)

server <- function(input, output, session) {

  target_category <- list()

  output$caption <- renderText({
    input$view
  })

  output$summary <- renderPrint({
    target_resource <- find_target_resource()

    #shinyjs::hide("table")
    #shinyjs::hide("html")

    if(input$view == "Describe Data"){
      describe_data = py$describe_data(target_resource$url)
      output$table <- renderTable({
        describe_data
      })
    }
    if(input$view == "View Data"){
      report_html_path = py$view_data(target_resource$url)
      output$html <- renderUI({
        includeHTML(path = report_html_path)
      })
    }

    #if(input$view == "Pearson Data"){
    #  pearson_data = py$pearson_data(target_resource$url)
    #  print(pearson_data)
    #  output$table2 <- renderTable({
    #    pearson_data
    #  })
    #}

    if(input$view == "Heat Map Pearson Data"){
      heatmap_image_path = py$heatmap_pearson_data(target_resource$url)
      output$image <- renderImage({
        list(src = heatmap_image_path,
            contentType = 'image/png',
            width = 400,
            height = 300,
            alt = "This is alternate text")
      }, deleteFile = TRUE)
    }
  })

  output$e1 <- renderUI({

    # Identifica produto selecionado
    target_category <- list()
    for(item in state.categories) {
      if (input$e0 == item$category) {
        target_category <- list(
          category = item$category,
          title = item$title,
          detail = item$detail,
          datasets = item$datasets
        )
        break
      }
    }

    # Carrega input_datasets para exibir na tela
    input_datasets <- list()
    for (dataset_item in target_category$datasets) {
        input_datasets <- list.append(input_datasets, dataset_item$name)
    }

    selectizeInput(
      "inputConjunto",
      "Selecione o Conjunto de Dados",
      choices = input_datasets
      )
  })

  output$e2 <- renderUI({
    # Identifica produto selecionado
    target_category <- list()
    for(item in state.categories) {
      if (input$e0 == item$category) {
        target_category <- list(
          category = item$category,
          title = item$title,
          detail = item$detail,
          datasets = item$datasets
        )
        break
      }
    }

    #Identifica dataset selecionado
    input_resources <- list()
    for (dataset_item in target_category$datasets) {
      if (input$inputConjunto == dataset_item$name) {
        for (resource_item in dataset_item$resources) {
          input_resources <- list.append(input_resources, resource_item$resource_title)
        }
        break
      }
    }

    selectizeInput(
      "inputRecurso",
      "Selecione o Recurso",
      choices = input_resources
      )
  })

  output$ex_out <- renderPrint({

    # Identifica produto, dataset e recurso selecionado
    target_resource <- find_target_resource()

    target_resource
    #str(target_resource, simplify = FALSE)
  })

  find_target_resource <- function() {
    target_resource <- list()

    for (category_item in state.categories) {
      if (input$e0 == category_item$category) {
        for (dataset_item in category_item$datasets) {
          if (input$inputConjunto == dataset_item$name) {
            for (resource_item in dataset_item$resources) {
              if (input$inputRecurso == resource_item$resource_title) {
                target_resource <- list(
                  title = resource_item$resource_title,
                  format = resource_item$format,
                  url = resource_item$download_url,
                  description = resource_item$desctiption
                )
                current_url <- resource_item$download_url
                break
              }
            }
            break
          }
        }
        break
      }
    }

    return(target_resource)
  }

}

shinyApp(ui, server, options = list(host = "127.0.0.1", port = 8501))

Overwriting app_json.r
