# Data Scraping
**CRIMES VIOLENTOS LETAIS E INTENCIONAIS – CVLI (SSPDS/CE)**  
>Autor:       Erivando Sena  
E-mail:      erivandosena@gmail.com   
Criado:        03/08/2019  
Atualizado:  14/02/2023  

### Setup R and packages

In [1]:
# Session-wide settings. The layout below is Windows-specific: the user
# library lives under %LOCALAPPDATA%\R\win-library.
options(timeout = 180)
path_rlibrary <- file.path(Sys.getenv("LOCALAPPDATA"), "R", "win-library", fsep = "\\")
dir.create(path_rlibrary, recursive = TRUE, showWarnings = FALSE)
.libPaths(path_rlibrary)

# Update outdated packages in the user library.
# update.packages() performs its own "is anything outdated?" check against the
# library and repository given here, so no separate old.packages() call (which
# would query the default repositories and every library path) is needed.
suppressPackageStartupMessages({
  update.packages(
    lib.loc = path_rlibrary,
    repos = "http://cran.rstudio.com/",
    ask = FALSE,
    checkBuilt = FALSE
  )
})

# Put the R binaries first on PATH for this session, unless already present.
# Entries are compared literally: a Windows path is full of backslashes and
# dots, which must not be interpreted as a regular expression.
path_r <- normalizePath(file.path(R.home("bin")))
current_path <- Sys.getenv("PATH")
if (!(path_r %in% strsplit(current_path, ";", fixed = TRUE)[[1]])) {
  Sys.setenv(PATH = paste(path_r, current_path, sep = ";"))
  # To persist the change system-wide (requires elevation):
  # system(sprintf('setx PATH "%s;%s" -m', path_r, current_path), intern = TRUE)
}

# R_USER is read by the history helpers (.First / .Last).
path_ruser <- file.path(Sys.getenv("LOCALAPPDATA"), "R", fsep = "\\")
Sys.setenv(R_USER = path_ruser)

# Save history
# .First: restore the command history written by a previous session.
# The history file is a plain log of typed commands, so it is loaded with
# loadhistory() rather than source(), which would re-execute every line.
# Front ends without a history mechanism make loadhistory() fail; that is
# reported as a warning instead of aborting start-up.
.First <- function() {
  history_file <- file.path(Sys.getenv("R_USER"), ".Rhistory")
  if (file.exists(history_file)) {
    tryCatch(
      utils::loadhistory(history_file),
      error = function(e) {
        warning(
          "Could not load history from ", history_file, ": ",
          conditionMessage(e),
          call. = FALSE
        )
      }
    )
  }
  invisible(NULL)
}
# .Last: write the session's command history to <R_USER>/.Rhistory.
.Last <- function() {
  savehistory(file.path(Sys.getenv("R_USER"), ".Rhistory"))
}

# Setup JAVA_HOME
# Points JAVA_HOME at the JDK 1.8.0_202 folder under
# "<SystemDrive>/Program Files/Java" and puts its bin directory first on PATH.
Sys.setenv(
  JAVA_HOME = normalizePath(
    file.path(Sys.getenv("SystemDrive"), "Program Files", "Java", "jdk1.8.0_202")
  )
)
java_path <- normalizePath(file.path(Sys.getenv("JAVA_HOME"), "bin"))
Sys.setenv(PATH = paste(java_path, Sys.getenv("PATH"), sep = ";"))

# Show the R user directory and the Java version found on PATH.
# "java -version" reports on stderr, hence stderr = TRUE.
cat(paste(path_ruser, "\n"))
java_version_output <- system2("java", args = "-version", stdout = FALSE, stderr = TRUE)
cat(paste(java_version_output, "\n"))

C:\Users\Erivando\AppData\Local\R 
java version "1.8.0_202" 
 Java(TM) SE Runtime Environment (build 1.8.0_202-b08) 
 Java HotSpot(TM) 64-Bit Server VM (build 25.202-b08, mixed mode) 


In [2]:
# Packages used by the notebook. An entry containing "/" is a GitHub
# repository ("owner/package") and is installed through remotes.
pacotes_analise <- c(
  "remotes", "installr", "tidyverse", "rvest", "stringi", "readxl", "lubridate", "xml2", "leaflet",
  "htmlwidgets", "ggthemes", "forecast", "prophet", "mice", "magrittr", "kableExtra", "ggmap", "stringr",
  "purrr", "dplyr", "readr", "pdftools", "rJava", "ropensci/tabulizer"
)

suppressPackageStartupMessages({
  # Pass 1: install whatever is not installed yet.
  for (pkg in pacotes_analise) {
    pkg_extra <- sub("ropensci/", "", pkg)
    if (pkg_extra %in% rownames(installed.packages())) {
      next
    }
    if (grepl("/", pkg)) {
      remotes::install_github(pkg, INSTALL_opts = "--no-multiarch")
    } else {
      install.packages(pkg)
    }
  }

  # Pass 2: attach every package that is now available; packages whose
  # installation failed are skipped.
  for (pkg in pacotes_analise) {
    pkg_extra <- sub("ropensci/", "", pkg)
    if (!(pkg_extra %in% rownames(installed.packages()))) {
      next
    }
    library(pkg_extra, character.only = TRUE)
  }
})

"package 'remotes' was built under R version 4.3.2"
"package 'installr' was built under R version 4.3.2"
"package 'tidyverse' was built under R version 4.3.2"
"package 'ggplot2' was built under R version 4.3.2"
"package 'tibble' was built under R version 4.3.2"
"package 'tidyr' was built under R version 4.3.2"
"package 'readr' was built under R version 4.3.2"
"package 'purrr' was built under R version 4.3.2"
"package 'dplyr' was built under R version 4.3.2"
"package 'stringr' was built under R version 4.3.2"
"package 'forcats' was built under R version 4.3.2"
"package 'lubridate' was built under R version 4.3.2"
"package 'rvest' was built under R version 4.3.2"
"package 'stringi' was built under R version 4.3.2"
"package 'readxl' was built under R version 4.3.2"
"package 'xml2' was built under R version 4.3.2"
"package 'leaflet' was built under R version 4.3.2"
"package 'htmlwidgets' was built under R version 4.3.2"
"package 'ggthemes' was built under R version 4.3.2"
"package 'forecas

### Folder configuration

In [3]:
# Working folders, relative to the current working directory:
# downloaded PDFs go to dir_docs, exported CSVs to dir_dados.
dir_docs <- "Documentos"
dir_dados <- "Dados"
# Vector with the folders that must exist before the notebook runs
diretorios <- c(dir_dados, dir_docs)
for (diretorio in diretorios) {
  if (dir.exists(diretorio)) {
    next
  }
  dir.create(diretorio, recursive = TRUE)
}

### Functions

In [4]:
# Extract the per-year links from the HTML of the statistics index page.
#
# url_site: address of the index page.
# Returns a data frame built from the attributes of the '.grid a.box' anchors
# (minus the "class" and "target" attributes) plus a column "ano" holding the
# last four characters of each '.grid p' caption. A first-column value of "#"
# is replaced by NA.
# NOTE(review): assumes the first remaining column is the link (href) and that
# anchors and captions line up one-to-one - confirm against the live page.
extrai_lista_anos <- function(url_site) {
  pagina <- xml2::read_html(url_site)

  # Anchor attributes, one row per anchor; rows with NA are dropped and the
  # final row is discarded.
  nos_links <- rvest::html_nodes(pagina, '.grid a.box')
  df_links <- purrr::map_df(purrr::map(nos_links, xml2::xml_attrs), ~as.list(.))
  df_links <- na.omit(df_links)
  df_links <- df_links[-c(nrow(df_links)), ]

  # Captions: strip spaces, keep the trailing four characters (the year) and
  # discard the last entry (the 2013 item, per the original note).
  legendas <- rvest::html_text(rvest::html_nodes(pagina, '.grid p'))
  legendas <- gsub(" ", "", legendas)
  rotulos_ano <- substr(legendas, nchar(legendas) - 3, nchar(legendas))
  rotulos_ano <- rotulos_ano[-length(rotulos_ano)]
  df_titulos <- setNames(as.data.frame(rotulos_ano), "ano")

  # Combine links and years, then drop the presentation-only attributes
  df_urls_anos <- data.frame(df_links, df_titulos)
  manter <- !(names(df_urls_anos) %in% c("class", "target"))
  df_urls_anos <- df_urls_anos[, manter]
  df_urls_anos[, 1][df_urls_anos[, 1] == "#"] <- NA

  df_urls_anos
}

# Download the documents of every selected year.
#
# url_site: kept for interface compatibility; not used by the body.
# anos:     data frame with the year page URL in column 1 and the year label
#           in column 2 (as produced by extrai_lista_anos()).
#
# Side effects: every sub-folder of dir_docs is deleted first, then each year
# page is read and its documents are downloaded, pausing between requests.
# Returns NULL invisibly.
obtem_arquivos <- function(url_site, anos) {
  pastas <- list.dirs(dir_docs, full.names = TRUE, recursive = FALSE)
  unlink(pastas, recursive = TRUE)

  # seq_len() keeps the loop from running on rows 1 and 0 when no year is selected
  for (indice in seq_len(nrow(anos))) {
    print(paste("Lendo codigo HTML da pagina Web:", anos[indice, 1], "-", anos[indice, 2], sep = " "))
    df_listas <- extrai_lista_documentos(anos[indice, 1])
    Sys.sleep(1)
    download_documentos(df_listas)
    Sys.sleep(10)
  }

  invisible(NULL)
}

# Extract the document links and titles from the HTML of one year page.
#
# url_pagina: address of the year page.
# Returns a data frame built from the attributes of the '.-Verde a.box'
# anchors (minus "class", "target" and "rel") plus a column "mes" with the
# cleaned '.-Verde h3' titles. A first-column value of "#" is replaced by NA.
# NOTE(review): assumes the first remaining column is the link (href) and that
# anchors and titles line up one-to-one - confirm against the live page.
extrai_lista_documentos <- function(url_pagina) {
  pagina <- xml2::read_html(url_pagina)

  nos_links <- rvest::html_nodes(pagina, '.-Verde a.box')
  df_links <- purrr::map_df(purrr::map(nos_links, xml2::xml_attrs), ~as.list(.))

  # Titles: remove accents, turn dashes and slashes into " - ", then strip
  # every space so the result can serve as a file name.
  titulos <- rvest::html_text(rvest::html_nodes(pagina, '.-Verde h3'))
  titulos <- remove_acentos(titulos)
  titulos <- gsub("–", " - ", titulos)
  titulos <- gsub("/", " - ", titulos)
  titulos <- gsub(" ", "", titulos)
  df_titulos <- setNames(as.data.frame(titulos), "mes")

  # Combine links and titles, then drop the presentation-only attributes
  df_urls_nomes <- data.frame(df_links, df_titulos)
  manter <- !(names(df_urls_nomes) %in% c("class", "target", "rel"))
  df_urls_nomes <- df_urls_nomes[, manter]
  df_urls_nomes[, 1][df_urls_nomes[, 1] == "#"] <- NA

  df_urls_nomes
}

# Download the listed documents into per-year folders under dir_docs.
#
# d_frame_lista: data frame with columns "href" (download URL, NA when the
#                document is not published) and "mes" (document title whose
#                last four characters are the year), as produced by
#                extrai_lista_documentos().
#
# The list is walked row by row, so two documents sharing the same title no
# longer produce a multi-row match (and a length > 1 `if` condition).
# Documents whose year is not listed on the index page are skipped; files
# already on disk are not downloaded again. Returns NULL invisibly.
download_documentos <- function(d_frame_lista) {
  anos <- extrai_lista_anos(URL_site)
  anos_validos <- as.character(anos$ano)

  for (linha in seq_len(nrow(d_frame_lista))) {
    titulo <- as.character(d_frame_lista$mes[linha])
    url <- d_frame_lista$href[linha]

    # The year is the trailing four characters of the title
    ano <- substr(titulo, nchar(titulo) - 3, nchar(titulo))
    if (!(ano %in% anos_validos)) {
      next
    }

    diretorio <- file.path(".", dir_docs, ano)
    if (!dir.exists(diretorio)) {
      dir.create(diretorio, recursive = TRUE)
    }

    nome_arquivo_completo <- obtem_nome_arquivos(ano, titulo)
    if (file.exists(nome_arquivo_completo)) {
      print(paste0("Arquivo baixado: ", titulo))
    } else if (!is.na(url)) {
      # mode = "wb" keeps the PDF bytes intact on Windows
      download.file(url, destfile = nome_arquivo_completo, mode = "wb", quiet = FALSE)
    } else {
      print(paste("Arquivo indisponivel: ", titulo, sep = " "))
    }
  }

  invisible(NULL)
}

# Build the local path of a document's PDF.
#
# dir_superior: sub-folder of dir_docs (the year).
# nome_relacao: document title; "/", ".", "," and spaces become "_".
# Returns "./<dir_docs>/<dir_superior>/<sanitised title>.pdf".
obtem_nome_arquivos <- function(dir_superior, nome_relacao) {
  nome_arquivo <- gsub("[/. ,]", "_", nome_relacao)
  file.path(".", dir_docs, dir_superior, paste0(nome_arquivo, ".pdf"))
}

# Remove accents by transliterating the text with stringi's "latin-ascii"
# transform. Non-character input is converted with as.character() first.
remove_acentos <- function(obj_str) {
  texto <- if (is.character(obj_str)) obj_str else as.character(obj_str)
  stringi::stri_trans_general(str = texto, "latin-ascii")
}

# Clean one extracted table and give it the standard column layout.
#
# df: a data frame, or something as.data.frame() can convert to one (e.g. a
#     list holding a single table).
#
# The first column is discarded and the next seven are kept and renamed with
# the names in the global vector `cabecalho` (which must hold 7 names and be
# defined before this function is called). Every column is converted to
# character, empty strings become NA and rows containing any NA are dropped.
#
# Returns the cleaned data frame, or NULL when the table has fewer than seven
# columns after the first one is removed (such tables are ignored downstream).
processar_dataframe <- function(df) {
  df <- as.data.frame(df)

  # Column 1 plus the seven data columns are required. Checking the width
  # first avoids single-column results collapsing into a plain vector.
  if (ncol(df) < 8) {
    return(NULL)
  }

  df <- df[, 2:8, drop = FALSE]
  names(df) <- cabecalho

  # Convert before blanking so labels (not internal codes) are kept for any
  # column type, then mark empty cells as missing.
  df[] <- lapply(df, function(coluna) {
    coluna <- as.character(coluna)
    coluna[!is.na(coluna) & coluna == ""] <- NA_character_
    coluna
  })

  na.omit(df)
}

### Data collection

In [5]:
# Index page listing one statistics page per year
URL_site <- "https://www.sspds.ce.gov.br/estatisticas-2-2-2-2-2-2/"
anos <- extrai_lista_anos(URL_site)

# Years to process; use c("2023") to restrict the run to a single year
anos_filtro <- anos[anos$ano %in% c("2023","2024"), ]
# anos_filtro <- anos # Enable all years

# Download the PDFs, then read every table of every page into lista_df
obtem_arquivos(URL_site, anos_filtro)
vetor_anos <- as.numeric(anos_filtro$ano)
lista_df <- list()

for (iano in seq_along(vetor_anos)) {
  data_frame_meses <- extrai_lista_documentos(anos_filtro[iano, 1])
  num_docs <- nrow(data_frame_meses)
  for (ndoc in seq_len(num_docs)) {
    # Same naming rule the downloader uses, so both sides agree on the path
    arquivo <- obtem_nome_arquivos(vetor_anos[iano], data_frame_meses$mes[ndoc])
    if (!file.exists(arquivo)) {
      # Documents that are not published are never downloaded
      message("Arquivo nao encontrado, ignorando: ", arquivo)
      next
    }
    paginas <- pdf_info(arquivo)$pages
    for (npage in seq_len(paginas)) {
      tabela <- extract_tables(file = arquivo, pages = npage, method = "stream", guess = TRUE, output = "data.frame", encoding = "UTF-8")
      # extract_tables() yields one data frame per detected table; store each
      # table as its own element instead of nesting the whole page result.
      if (is.data.frame(tabela)) {
        tabela <- list(tabela)
      }
      lista_df <- c(lista_df, tabela)
    }
  }
}

[1] "Lendo codigo HTML da pagina Web: https://www.sspds.ce.gov.br/estatisticas-2023 - 2023"


### Data processing

In [6]:
# Clear previous exports, then clean and merge the extracted tables
arquivos_csv <- list.files(dir_dados, pattern = "\\.csv$", full.names = TRUE)
file.remove(arquivos_csv)

# Column names given to every cleaned table (read by processar_dataframe)
cabecalho <- c("AIS", "MUNICIPIO_HOMICIDIO", "NATUREZA_HOMICIDIO", "ARMA_UTILIZADA", "DATA_HOMICIDIO", "SEXO", "IDADE")
lista_df_processada <- lapply(lista_df, processar_dataframe)

# bind_rows() accepts the whole list at once and skips NULL entries (tables
# that were rejected during cleaning).
df_final <- bind_rows(lista_df_processada)
# To also export each cleaned table on its own, write lista_df_processada[[i]]
# to file.path(dir_dados, paste0(i, "cvli.csv")) for every i.

# Sort chronologically and add a sequential ID as the first column
df_final <- df_final %>%
  arrange(as.Date(DATA_HOMICIDIO, format = "%d/%m/%Y")) %>%
  mutate(ID = seq_len(nrow(.))) %>%
  select(ID, everything())

# Export dataset
write.csv(df_final, file = file.path(dir_dados, "cvli.csv"), row.names = FALSE)

### Dataset analysis

In [7]:
# Read the exported dataset back from disk and show it
dados_cvli <- read_csv(
  file = file.path(dir_dados, "cvli.csv"),
  show_col_types = FALSE
)
print(dados_cvli)

[90m# A tibble: 2,845 × 8[39m
      ID AIS    MUNICIPIO_HOMICIDIO NATUREZA_HOMICIDIO ARMA_UTILIZADA
   <dbl> <chr>  <chr>               <chr>              <chr>
 1     1 AIS 23 Paracuru            HOMICIDIO DOLOSO   Arma de fogo
 2     2 AIS 23 Paracuru            HOMICIDIO DOLOSO   Arma de fogo
 3     3 AIS 07 Fortaleza           HOMICIDIO DOLOSO   Arma de fogo
 4     4 AIS 19 Missão Velha        HOMICIDIO DOLOSO   Arma de fogo
 5     5 AIS 19 Crato               HOMICIDIO DOLOSO   Arma de fogo
 6     6 AIS 18 Jaguaribe           FEMINICÍDIO        Arma branca
 7     7 AIS 25 Pacajus             HOMICIDIO DOLOSO   Arma branca
 8     8 AIS 13 Aquiraz             HOMICIDIO DOLOSO   Arma de fogo
 9     9 AIS 11 Caucaia             HOMICIDIO DOLOSO   Arma de fogo
[90m10[39m 