# 1. Backend em Python
Raspa dados (*web scraping*) para visualização do repositório da [Fundação Sistema Estadual de Análise de Dados Estatísticos](https://www.seade.gov.br/)

## 1.1. Instala dependências

In [1]:
pip install urllib3 && pip freeze | grep -e urllib3

urllib3==2.0.7


## 1.2. Realiza raspagem de dados

In [None]:
from bs4 import BeautifulSoup
import datetime
import json
import os
import pytz
import sys
import urllib3

# Silence urllib3 warnings (presumably the ones triggered by the unverified
# HTTPS requests configured just below).
urllib3.disable_warnings()

# Disable SSL certificate verification.
# NOTE(review): with CERT_NONE any certificate is accepted — confirm this is
# still required for the target site.
urllib3_pool_manager = urllib3.PoolManager(cert_reqs='CERT_NONE')

# Root URL of the repository being scraped. It ends with "/" while the paths
# appended to it in this file (e.g. '/group') start with "/", so the requested
# URLs contain a double slash.
base_url = "https://repositorio.seade.gov.br/"

# Format labels / URL extensions accepted by the filtering functions.
supported_formats = ["csv", "CSV"]


def get_page_html_by_url(target_url):
  """Fetch ``target_url`` with a GET request and return the body parsed as HTML.

  The request goes through the module-level ``urllib3_pool_manager``. The
  response status is not inspected: whatever body arrives is handed to
  BeautifulSoup's ``html.parser``.
  """
  raw_body = urllib3_pool_manager.request("GET", target_url).data
  parsed_page = BeautifulSoup(raw_body, 'html.parser')
  return parsed_page

def get_products_dict_from_products_page(target_html):
  """Map each product title found on the products page to its link.

  Every ``<li class="media-item">`` contributes one entry: the stripped text
  of its ``<h2 class="media-heading">`` is the key and the ``href`` of its
  ``<a class="media-view">`` is the value. A repeated title keeps the last
  link seen.
  """
  return {
      entry.find('h2', class_='media-heading').text.strip():
          entry.find('a', class_='media-view')['href']
      for entry in target_html.find_all('li', class_='media-item')
  }

def get_product_page_main_title(target_page):
  """Return the stripped text of the first ``<h1>`` after the dataset search form.

  The form is located by ``id='group-datasets-search-form'``; the heading is
  whatever ``find_next('h1')`` yields from there.
  """
  search_form = target_page.find('form', id='group-datasets-search-form')
  heading_text = search_form.find_next('h1').text
  return heading_text.strip()


def get_dataset_list_from_datasets_page(target_html):
  """Collect the datasets listed on a product (group) page.

  Args:
    target_html: parsed page (BeautifulSoup-like object) expected to hold a
      ``<ul class="dataset-list">`` made of ``<li class="dataset-item">``.

  Returns:
    A list of dicts with the keys ``url``, ``name``, ``description`` and
    ``formats`` (the ``data-format`` attribute of every resource link).
    Items without their own resource list or title link are skipped, and an
    empty list is returned when the page has no dataset list at all.
  """
  dataset_ul_element = target_html.find("ul", class_="dataset-list")
  if dataset_ul_element is None:
    # Page without a dataset listing: nothing to collect.
    return []

  datasets = []
  for dataset_li_element in dataset_ul_element.find_all("li", class_="dataset-item"):
    # find() searches only inside this <li>. find_next() keeps walking the
    # rest of the document, so an item lacking its own block used to borrow
    # the block that belongs to the following item.
    resources = dataset_li_element.find("ul", class_="dataset-resources")
    if resources is None:
      continue

    li_div_element = dataset_li_element.find("div", class_="dataset-content")
    link = li_div_element.a if li_div_element is not None else None
    if link is None:
      continue

    # The description is the first <div> after the content block, but only
    # when that <div> still lives inside the current item.
    description_element = li_div_element.find_next("div")
    description_in_item = description_element is not None and any(
        parent is dataset_li_element for parent in description_element.parents)

    datasets.append({
        'url': link["href"],
        'name': link.text,
        'description': description_element.text if description_in_item else "",
        'formats': [anchor['data-format'] for anchor in resources.find_all('a')],
    })

  return datasets


def filter_dataset_list_by_format(dataset_list, formats=None):
  """Keep the datasets that offer at least one accepted format.

  Args:
    dataset_list: list of dataset dicts, each carrying a ``formats`` list.
    formats: optional collection of accepted format labels. Defaults to the
      module-level ``supported_formats``.

  Returns:
    A new list holding every matching dataset exactly once, in input order.
  """
  accepted = supported_formats if formats is None else formats
  # any() stops at the first hit, so a dataset exposing several accepted
  # resources is no longer appended once per matching entry.
  return [
      dataset_item
      for dataset_item in dataset_list
      if any(fmt in accepted for fmt in dataset_item['formats'])
  ]


def get_dataset_resources_detail_from_dataset_page(target_html):
  """Extract the downloadable, supported resources listed on a dataset page.

  A resource is kept when its format label and the extension of its URL are
  both in ``supported_formats`` and a GET request to the URL answers with
  status 200.

  Args:
    target_html: parsed dataset page (BeautifulSoup-like object).

  Returns:
    A list of dicts with the keys ``resource_title``, ``format``,
    ``download_url`` and ``description``. The list is empty when the page has
    no ``<ul class="resource-list">`` or when nothing qualifies.
  """
  resource_ul_element = target_html.find("ul", class_="resource-list")
  if resource_ul_element is None:
    return []

  resource_files_list = []
  for resource_li_element in resource_ul_element.find_all("li", class_="resource-item"):
    # Reset for every item: the error message below must never show values
    # left over from a previous item, nor hit names that were never bound.
    resource_name = resource_format = resource_url = None
    try:
      resource_name = resource_li_element.a["title"]
      resource_format = resource_li_element.a.find("span", class_="format-label").text
      resource_description = resource_li_element.p.text.strip()
      resource_url = resource_li_element.div.ul.find("a", class_="resource-url-analytics")["href"]
    except Exception:
      # Best effort: a malformed item is reported and skipped.
      print(f"\nERROR4 falha ao indexar:: {resource_name}, {resource_format}, {resource_url}")
      print("\nContinuando...")
      continue

    if resource_format not in supported_formats:
      continue

    # Text after the last dot of the URL must also be a supported extension.
    last_dot_index = resource_url.rfind('.')
    if resource_url[last_dot_index + 1:] not in supported_formats:
      continue

    try:
      response = urllib3_pool_manager.request("GET", resource_url)
      is_url_available = response.status == 200
    except Exception:
      print(f"ERROR3 falha ao acessar recurso:: {resource_url}")
      print("\nContinuando...")
      is_url_available = False

    if not is_url_available:
      continue

    resource_files_list.append({
        "resource_title": resource_name,
        "format": resource_format,
        "download_url": resource_url,
        # Key was misspelled "desctiption"; the Shiny frontend in this
        # notebook reads "description".
        "description": resource_description,
    })

  return resource_files_list


def get_dataset_resources_detail_list_from_dataset_list(dataset_filtered_list):
  """Fetch each dataset page and attach the details of its usable resources.

  Args:
    dataset_filtered_list: list of dataset dicts with the keys ``url``,
      ``name`` and ``description``.

  Returns:
    A list of dicts with the keys ``name``, ``entity``, ``description``,
    ``resources`` and ``last_update``. Datasets whose page yields no usable
    resource, or whose processing fails, are left out.
  """
  result = []
  for dataset_item in dataset_filtered_list:
    dataset_url = f"{base_url}{dataset_item['url']}"
    try:
      target_html = get_page_html_by_url(dataset_url)
      dataset_resources_detail = get_dataset_resources_detail_from_dataset_page(target_html)

      if dataset_resources_detail:
        result.append({
            "name": dataset_item["name"],
            "entity": "",
            "description": dataset_item["description"],
            "resources": dataset_resources_detail,
            "last_update": ""
        })
    except Exception:
      # Report the URL of the dataset being processed. The handler used to
      # print ``product_item[1]``, a name that does not exist in this
      # function, so every failure ended in a NameError instead.
      print(f"ERROR2: {dataset_url}")
      continue

  return result


def generate_json_file(products_dict):
  """Scrape every product category and dump the result to a JSON file.

  Args:
    products_dict: mapping of category name -> category path (appended to
      ``base_url``).

  Returns:
    The path of the written file (``"seade-repositorio.json"``, relative to
    the current working directory). The file always holds a
    ``{"categories": [...]}`` object, with an empty list when no category
    qualifies.
  """
  products = []

  # Fixed UTC-03:00 offset: the same offset the pytz zone 'Etc/GMT+3' yields
  # (the "Etc" zone names carry an inverted sign).
  local_timezone = datetime.timezone(datetime.timedelta(hours=-3))
  datetime_now = datetime.datetime.now(local_timezone)
  print(f"Raspagem de dados p/ pré-processamento iniciado em: {datetime_now}")
  print(f"\n\nEstimativa de finalização: {datetime_now + datetime.timedelta(minutes=15)}\n\n")

  total_items = len(products_dict)
  for idx, (content_category, product_path) in enumerate(products_dict.items()):
    # Progress is reported on a single line that is rewritten in place.
    progress_percent = (idx + 1) / total_items * 100
    sys.stdout.write(f"\rCarregando... {progress_percent:.2f}%")
    sys.stdout.flush()

    product_url = f"{base_url}{product_path}"
    try:
      product_page_html = get_page_html_by_url(product_url)
      content_title = get_product_page_main_title(product_page_html)
      dataset_complete_list = get_dataset_list_from_datasets_page(product_page_html)
      dataset_filtered_list = filter_dataset_list_by_format(dataset_complete_list)
    except Exception:
      # Best effort: a category that cannot be read is reported and skipped.
      print(f"ERROR1: {product_url}")
      continue

    if not dataset_filtered_list:
      continue

    dataset_list = get_dataset_resources_detail_list_from_dataset_list(dataset_filtered_list)
    if not dataset_list:
      continue

    products.append({
        "category": content_category,
        "title": content_title,
        "detail": f"{len(dataset_list)} compatível(is) com web-scrap (formato .csv)",
        "datasets": dataset_list
    })

  # Built after the loop so it exists even when no category qualified; it
  # used to be assigned only inside the loop, leaving the name unbound.
  result = {
      "categories": products
  }

  # Mode "w" truncates an existing file, so no prior removal is needed.
  json_file_path = "seade-repositorio.json"
  with open(json_file_path, "w", encoding="utf-8") as outfile:
    json.dump(result, outfile, indent=4, ensure_ascii=False)

  return json_file_path

###

# Fetch the page that lists every product group.
products_page_html = get_page_html_by_url(f"{base_url}{'/group'}")
# Bare expression, apparently left over from interactive inspection.
products_page_html

# Map of product title -> product link.
products_dict = get_products_dict_from_products_page(products_page_html)
products_dict

# Exploratory fetch of a single, hard-coded product; raises KeyError if the
# 'Seade Municípios' title is not present in products_dict.
product_page_html = get_page_html_by_url(f"{base_url}{products_dict['Seade Municípios']}")
product_page_html

product_page_main_title = get_product_page_main_title(product_page_html)
product_page_main_title

# Rebuild the map from the page fetched above (no new request) and run the
# full scrape; the printed value is the path of the generated JSON file.
products_dict = get_products_dict_from_products_page(products_page_html)
print(f"Concluído com sucesso! {generate_json_file(products_dict)}")

Raspagem de dados p/ pré-processamento iniciado em: 2024-04-18 22:32:51.134220-03:00


Estimativa de finalização: 2024-04-18 22:47:51.134220-03:00


Carregando... 66.67%

# 2. Frontend em R
Utiliza dados raspados anteriormente para facilitar visualização e início da etapa de pré-processamento

## 2.1. Instala shiny-server e localtunnel na VM

In [None]:
# Install R from the distribution packages.
# NOTE(review): no -y flag here (unlike `gdebi -n` below) — confirm apt-get
# does not stop to ask for confirmation in this environment.
!sudo apt-get install r-base
# Install the R packages as root.
# NOTE(review): only shiny and ggExtra are requested, while app.r also loads
# bslib, dplyr and ggplot2 and calls readr — verify they end up installed.
!sudo su - -c "R -e \"install.packages(c('shiny', 'ggExtra'), repos='https://cran.rstudio.com/')\""
# gdebi is used below to install the downloaded .deb package.
!sudo apt-get install gdebi-core
# Download and install Shiny Server 1.5.21.1012 (ubuntu-18.04 build); -n runs gdebi non-interactively.
!wget https://download3.rstudio.org/ubuntu-18.04/x86_64/shiny-server-1.5.21.1012-amd64.deb
!sudo gdebi -n shiny-server-1.5.21.1012-amd64.deb
# localtunnel is started later with `npx localtunnel` to forward port 8501.
!npm install localtunnel

## 2.2. Cria servidor R Shiny

In [None]:
%%writefile app.r
library(shiny)
library(bslib)
library(dplyr)
library(ggplot2)
library(ggExtra)

# Sample dataset (penguins CSV), downloaded from GitHub when the script runs.
penguins_csv <- "https://raw.githubusercontent.com/jcheng5/simplepenguins.R/main/penguins.csv"

# readr is called through `::`, so it must be installed even though it is
# not attached with library() above.
df <- readr::read_csv(penguins_csv)
# Find subset of columns that are suitable for scatter plot
df_num <- df |> select(where(is.numeric), -Year)

# UI: a sidebar with the plot controls and a main area holding the scatter plot.
ui <- page_sidebar(
  sidebar = sidebar(
    # Axis pickers are limited to the numeric columns kept in df_num.
    varSelectInput("xvar", "X variable", df_num, selected = "Bill Length (mm)"),
    varSelectInput("yvar", "Y variable", df_num, selected = "Bill Depth (mm)"),
    # One checkbox per distinct species, all checked initially.
    checkboxGroupInput(
      "species", "Filter by species",
      choices = unique(df$Species),
      selected = unique(df$Species)
    ),
    hr(), # Add a horizontal rule
    # Display toggles read by the server's renderPlot.
    checkboxInput("by_species", "Show species", TRUE),
    checkboxInput("show_margins", "Show marginal plots", TRUE),
    checkboxInput("smooth", "Add smoother"),
  ),
  # Filled by output$scatter in the server function.
  plotOutput("scatter")
)

# Server logic: filters the data by the checked species and renders the scatter plot.
server <- function(input, output, session) {
  # Rows of df whose Species is among the checked boxes; req() stops the
  # reactive while no species is selected.
  subsetted <- reactive({
    req(input$species)
    df |> filter(Species %in% input$species)
  })

  output$scatter <- renderPlot({
    # !! injects the columns chosen in the xvar / yvar pickers into aes().
    # The two `if` entries evaluate to NULL when their checkbox is off.
    p <- ggplot(subsetted(), aes(!!input$xvar, !!input$yvar)) + list(
      theme(legend.position = "bottom"),
      if (input$by_species) aes(color = Species),
      geom_point(),
      if (input$smooth) geom_smooth()
    )

    # Optional marginal plots: densities when coloured by species,
    # histograms otherwise.
    if (input$show_margins) {
      margin_type <- if (input$by_species) "density" else "histogram"
      p <- ggExtra::ggMarginal(p, type = margin_type, margins = "both",
        size = 8, groupColour = input$by_species, groupFill = input$by_species)
    }

    p
  }, res = 100)
}

# Serve the app on 127.0.0.1:8501, the port the localtunnel command in this
# notebook forwards.
shinyApp(ui, server, options = list(host = "127.0.0.1", port = 8501))

## 2.3. Inicia servidor R Shiny

In [None]:
# Print the label followed by the address returned by ipv4.icanhazip.com;
# per the label, this is the value localtunnel asks for as password.
!echo -n "Password/Enpoint IP for localtunnel is: " && curl ipv4.icanhazip.com

Password/Enpoint IP for localtunnel is: 34.125.234.98


In [None]:
# Same lookup as the previous cell, using ifconfig.me as the address service.
!echo -n "Password/Enpoint IP for localtunnel is: " && curl ifconfig.me

In [None]:
# `&` sends localtunnel to the background so Rscript can run app.r in the
# same cell; both use port 8501.
!npx localtunnel --port 8501 & Rscript app.r

## Frontend para o JSON

### Quick test

In [None]:
# activate R magic
%load_ext rpy2.ipython

In [None]:
%%R
install.packages("rlist")

In [None]:
%%R
library(jsonlite) # JSON parsing
library(rlist)

# Load the scraped JSON file from /content.
# NOTE(review): read_json presumably returns nested lists here (no
# simplification) — the `item$category` access below relies on that.
json_data <- jsonlite::read_json("/content/seade-repositorio.json")
#json_data$categories[[1]]$datasets[[1]]$resources[[3]]$desctiption

#print(json_data$categories[[1]]$category)
#print(json_data$categories[[1]]$title)
#print(json_data$categories[[1]]$detail)

# Collect the name of every category with rlist's list.append, printing the
# list length before and after the loop.
categories <- list()
print(length(categories))
for(item in json_data$categories)
{
    categories <- list.append(categories, item$category)
}
print(length(categories))

In [None]:
%%R
library(dplyr)
library(jsonlite)
library(tidyjson)

# Path of the JSON file
# NOTE(review): tidyjson is loaded above but no visible cell installs it —
# confirm it is available in this environment.
caminho_arquivo <- "/content/seade-repositorio.json"

# Read the file content into a vector of strings (one per line)
linhas <- readLines(caminho_arquivo)

# Join the lines into a single string
conteudo_json <- paste(linhas, collapse = "")

# Keep the JSON content in a character variable
conteudo_string <- as.character(conteudo_json)

# 1. Tidy the JSON data
#conteudo_string %>% spread_all

# 2. Tidy the JSON data
#conteudo_string %>% gather_object %>% json_types %>% count(name, type)

# 3. Tidy the JSON data
#conteudo_string %>% enter_object(categories)

# 4. Tidy the JSON data
#conteudo_string %>% enter_object(categories) %>% gather_array

# 5. Tidy the JSON data
# Only this variant is active: enter "categories", one row per array element,
# spread the fields into columns and report the JSON type of each row.
conteudo_string %>% enter_object(categories) %>% gather_array %>% spread_all %>% json_types

In [None]:
%%R
library(jsonlite)
library(tibble)

# Path of the JSON file written by the scraping step
caminho_arquivo <- "/content/seade-repositorio.json"

# Parse straight from the file. simplifyVector = FALSE keeps every category
# as a named list, which is the shape the `x$field` accessors below need.
json_data <- fromJSON(txt = caminho_arquivo, simplifyVector = FALSE)

# Take the categories from the file just parsed. This assignment used to be
# commented out, so the code below ran on whatever `categories` object an
# earlier cell had left in the R session.
categories <- json_data$categories

# Wrap the list in a one-column tibble (list-column) for a quick look
weather <- tibble(categories)
weather

# Build a tibble with one row per category; vapply guarantees one string
# per field and an empty character column when there are no categories.
meu_tibble <- tibble(
  category = vapply(categories, function(x) x$category, character(1)),
  title = vapply(categories, function(x) x$title, character(1)),
  detail = vapply(categories, function(x) x$detail, character(1))
)

# Show the tibble
print(meu_tibble)


### Final

In [None]:
# Print the label followed by the address returned by ifconfig.me; per the
# label, this is the value localtunnel asks for as password.
!echo -n "Password/Enpoint IP for localtunnel is: " && curl ifconfig.me

Password/Enpoint IP for localtunnel is: 34.125.3.141

In [None]:
%%writefile app_json.r
library(shiny)
library(jsonlite)

# Hard-coded sample of the JSON structure; this script does not read
# seade-repositorio.json.
# NOTE(review): the sample is wrapped in a top-level array, whereas the
# scraper's generate_json_file dumps a top-level {"categories": ...} object —
# confirm which shape the app is meant to consume.
json_data <- '[{"categories":[{"category":"COVID-19","title":"2 conjuntos de dados encontrados","detail":"2 compatível(is)","datasets":[{"name":"Covid-19 Semanal","entity":"","description":"Dados semanais ","resources":[{"resource_title":"Casose","format":"CSV","download_url":"https://repositorio.seade.gov.br/dicvariaveis_dados_covid_sp.csv","description":"Dicionário"}],"last_update":""}]}]}]'

# Extract the category names from the JSON text.
#   json_data: JSON string shaped as [ { "categories": [ {"category": ...}, ... ] } ]
#   returns:   character vector with one name per category (empty when none)
get_category_names <- function(json_data) {
  # simplifyVector = FALSE keeps the document as nested lists. With the
  # default simplification the outer array is turned into a data frame, so
  # `parsed_data[[1]]$categories` does not reach the categories and no name
  # is returned.
  parsed_data <- jsonlite::fromJSON(json_data, simplifyVector = FALSE)
  categories <- parsed_data[[1]]$categories
  vapply(categories, function(cat) cat$category, character(1))
}

# Shiny app user interface: three chained pickers on the left and the
# details of the chosen resource on the right.
ui <- fluidPage(
  titlePanel("Navegação na Estrutura JSON"),
  sidebarLayout(
    sidebarPanel(
      # Categories are known up front; the dataset and resource pickers start
      # empty and are filled by the server through updateSelectInput.
      selectInput("category", "Selecione a categoria:", choices = get_category_names(json_data)),
      selectInput("dataset", "Selecione o conjunto de dados:", choices = NULL),
      selectInput("resource", "Selecione o recurso:", choices = NULL),
      width = 3
    ),
    mainPanel(
      h4("Detalhes do recurso selecionado:"),
      # One text box per field of the selected resource.
      verbatimTextOutput("resource_title_output"),
      verbatimTextOutput("format_output"),
      verbatimTextOutput("download_url_output"),
      verbatimTextOutput("description_output"),
      width = 9
    )
  )
)

# Shiny app server: keeps the dataset and resource pickers in sync with the
# selection above them and prints the fields of the selected resource.
server <- function(input, output, session) {
  # Parse once, as nested lists. With the default simplification the document
  # becomes data frames and the record lookups below cannot work.
  categories <- jsonlite::fromJSON(json_data, simplifyVector = FALSE)[[1]]$categories

  # First record of `items` whose `field` equals `value`, or NULL when there
  # is no selection or no match.
  find_by <- function(items, field, value) {
    if (is.null(value) || length(items) == 0) {
      return(NULL)
    }
    matches <- Filter(function(item) identical(item[[field]], value), items)
    if (length(matches) == 0) NULL else matches[[1]]
  }

  # Values of `field` across `items` as a character vector (empty when
  # `items` is NULL or empty).
  field_values <- function(items, field) {
    vapply(items, function(item) {
      value <- item[[field]]
      if (is.null(value)) NA_character_ else as.character(value)
    }, character(1))
  }

  # Selection chain: category -> dataset -> resource. Each step yields NULL
  # when the step before it has no match, and `NULL$field` is NULL in R.
  selected_category <- reactive({
    find_by(categories, "category", input$category)
  })
  selected_dataset <- reactive({
    find_by(selected_category()$datasets, "name", input$dataset)
  })
  selected_resource <- reactive({
    find_by(selected_dataset()$resources, "resource_title", input$resource)
  })

  # Refill the dataset picker whenever the category changes.
  observe({
    updateSelectInput(session, "dataset",
      choices = field_values(selected_category()$datasets, "name"))
  })

  # Refill the resource picker whenever the dataset changes.
  observe({
    updateSelectInput(session, "resource",
      choices = field_values(selected_dataset()$resources, "resource_title"))
  })

  output$resource_title_output <- renderPrint({
    selected_resource()$resource_title
  })

  output$format_output <- renderPrint({
    selected_resource()$format
  })

  output$download_url_output <- renderPrint({
    selected_resource()$download_url
  })

  output$description_output <- renderPrint({
    selected_resource()$description
  })
}

# Run the Shiny app on 127.0.0.1:8501, the port the localtunnel command in
# this notebook forwards.
#shinyApp(ui, server)
shinyApp(ui, server, options = list(host = "127.0.0.1", port = 8501))


In [None]:
# `&` sends localtunnel to the background so Rscript can run app_json.r in
# the same cell; both use port 8501.
!npx localtunnel --port 8501 & Rscript app_json.r