## Instala módulos

In [4]:
# !pip3 install pytz
# !pip3 install bamboolib
# !pip3 install pyspark
# !pip3 install spark

## Importa bibliotecas

In [5]:
import os
import time
import datetime 
import pytz
import logging
import pandas as pd
import csv
import enum

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException
from pyspark.sql.functions import current_timestamp

# Root logger; the download loop uses it to record unexpected exceptions.
logger=logging.getLogger()
# NOTE(review): `spark` is neither imported nor created in this notebook, so this line
# relies on a session object injected by the runtime (e.g. Databricks) — confirm.
# Enables the Arrow-based pandas <-> Spark conversion path.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

ModuleNotFoundError: No module named 'pyspark'

## Define diretórios

In [None]:
# Current working directory with a trailing slash, so file names can be appended
# to it directly. The download loop looks for downloaded CSV files here.
LOCAL_DIRECTORY = f"{os.getcwd()}/"
# Base location under which the landing-zone datasets are written.
DATABASE_PATH = "."

## Define tipos de download

In [None]:
class DownloadType(enum.Enum):
    """Datasets this notebook can download; download_file branches on the member."""
    # RS: solid-waste data ("resíduos sólidos").
    RS = 1
    # AE: water and sewage data ("água e esgoto").
    AE = 2

## Cria uma nova instância do webdriver

In [None]:
def create_driver_instance(
    driver_path='/tmp/chrome/latest/chromedriver_linux64/chromedriver',
    chrome_binary="/tmp/chrome/latest/chrome-linux/chrome",
    start_url='http://app4.mdr.gov.br/serieHistorica/',
    implicit_wait_seconds=8,
):
    """Start a headless Chrome session and open the start page.

    Args:
        driver_path: Path to the chromedriver executable.
        chrome_binary: Path to the Chrome/Chromium binary to launch.
        start_url: Page loaded right after the browser starts.
        implicit_wait_seconds: Implicit wait applied to element lookups.

    Returns:
        The ready-to-use selenium Chrome WebDriver. The caller owns it and is
        responsible for calling ``quit()``.

    Raises:
        Whatever selenium raises if the browser cannot start or the page
        cannot be loaded; in the latter case the browser is shut down first.
    """
    service = Service(driver_path)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = chrome_binary
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)

    wd_chrome = webdriver.Chrome(service=service, options=chrome_options)
    try:
        wd_chrome.get(start_url)
        wd_chrome.implicitly_wait(implicit_wait_seconds)
    except Exception:
        # The browser process is already running at this point; without this
        # it would be orphaned because the caller never receives the handle.
        wd_chrome.quit()
        raise

    return wd_chrome

## Verifica se um determinado path existe

In [None]:
def path_exists(path):
    """Report whether ``path`` can be listed through ``dbutils.fs.ls``.

    Args:
        path: Path handed unchanged to ``dbutils.fs.ls``.

    Returns:
        True when the listing succeeds, False when it raises any ordinary
        exception. NOTE(review): this means failures other than "not found"
        (e.g. permission errors) are also reported as False.
    """
    try:
        dbutils.fs.ls(path)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate so the notebook can still be interrupted.
        return False
    return True

## Trata exceções

In [None]:
def try_click(element):
    """Click ``element``, retrying while the click is intercepted or not interactable.

    Up to 12 attempts are made, pausing 0.5s, 1.0s, ... 5.5s between them.
    If the last attempt also fails, a message is printed and the function
    returns normally without raising. Always returns None.
    """
    for step in range(1, 13):
        pause = 0.5 * step
        try:
            element.click()
        except (ElementClickInterceptedException, ElementNotInteractableException):
            if pause >= 6:
                # Out of retries: report and give up without raising.
                print(f"Erro ao tentar clicar no elemento: {element.text}")
                return
            time.sleep(pause)
        else:
            return

In [None]:
def try_find_element(xpath, driver):
    """Look up an element by XPath, retrying while it is not present.

    Up to 12 lookups are made, pausing 0.5s, 1.0s, ... 5.5s between them.

    Args:
        xpath: XPath expression passed to ``driver.find_element``.
        driver: Object exposing ``find_element(by, value)``.

    Returns:
        The located element, or None (after printing a message) when every
        lookup raised NoSuchElementException.
    """
    for attempt in range(1, 13):
        pause = attempt * 0.5
        try:
            return driver.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            if pause >= 6:
                # Out of retries: report and fall back to returning None.
                print(f"Erro no xpath: {xpath}")
                return None
            time.sleep(pause)

In [None]:
def add_bold_string(string):
    """Wrap ``string`` in the ANSI escape codes that switch bold on and back off."""
    bold_on, reset = '\033[1m', '\033[0m'
    return '{}{}{}'.format(bold_on, string, reset)

In [None]:
def fix_null_bytes(file_path):
    """Load a ';'-delimited file that contains NUL bytes into a DataFrame.

    The file is read as bytes, every NUL byte is removed, the remainder is
    decoded as ISO-8859-1 and parsed as CSV. The first row is the header.
    All values are kept as strings.

    Data rows may carry one extra trailing field (a trailing ';'); that field
    is discarded. Rows shorter than the header are padded with None.

    Args:
        file_path: Path of the file to read.

    Returns:
        pandas.DataFrame with one column per header field.

    Raises:
        ValueError: If the file has no rows at all, or a data row has more
            than one field beyond the header width.
    """
    import io  # local import: the notebook's import cell does not provide io

    with open(file_path, 'rb') as file:
        raw = file.read()

    text = raw.replace(b'\x00', b'').decode('ISO-8859-1')
    # StringIO(newline='') lets the csv module do its own line handling, so
    # quoted fields containing line breaks stay in one record.
    rows = list(csv.reader(io.StringIO(text, newline=''), delimiter=';'))
    if not rows:
        raise ValueError(f"No rows found in {file_path!r}")

    header = rows[0]
    width = len(header)

    records = []
    for line_number, row in enumerate(rows[1:], start=2):
        if len(row) > width + 1:
            raise ValueError(
                f"Row {line_number} of {file_path!r} has {len(row)} fields, "
                f"expected at most {width + 1}"
            )
        # Drop the optional trailing field and pad short rows to header width.
        records.append(row[:width] + [None] * (width - len(row)))

    return pd.DataFrame(records, columns=header)

## Baixa dados por tipo

In [None]:
def download_file(driver, download_type):
    section1 = try_find_element('//*[@id="menu-item-mun"]/i', driver)  # Municípios
    try_click(section1)

    section2 = try_find_element('//*[@id="oculto"]/li[2]/a', driver) # Informações e indicadores municipais consolidados
    try_click(section2)    

    button1 = try_find_element('//*[@id="frm_filtros"]/fieldset[1]/p[1]/button', driver)  # Tipo Informação
    try_click(button1)

    if (download_type == DownloadType.RS):
        button2 = try_find_element('//*[@id="multiselect_menu_tipo_info"]/ul/li[3]/label', driver) # Municípios com informações de resíduo sólido
        try_click(button2)

    elif (download_type == DownloadType.AE):
        button2 = try_find_element('//*[@id="multiselect_menu_tipo_info"]/ul/li[2]/label/span', driver) # Municípios com informações de água e esgoto
        try_click(button2)

    button3 = try_find_element('//*[@id="frm_filtros"]/fieldset[1]/p[2]/button', driver) # Ano de Referência
    try_click(button3)   
    
    button4 = try_find_element('//*[@id="multiselect_menu_ano_ref"]/div/ul/li[1]/a/span[2]', driver) # Marcar todos
    try_click(button4)   
    
    no_button = try_find_element('//*[@id="div_realize_consulta"]/div', driver)
    try_click(no_button)
     
    button5 = try_find_element('//*[@id="frm_filtros"]/fieldset[2]/p[1]/button/span[1]', driver) # Região
    try_click(button5)

    button6 = try_find_element('//*[@id="multiselect_menu_cod_reg_geo"]/ul/li[5]/label', driver) # Sul
    try_click(button6)      
    
    try_click(no_button)
       
    button7 = try_find_element('//*[@id="frm_filtros"]/fieldset[2]/p[2]/button', driver) # Estado
    try_click(button7)

    button8 = try_find_element('//*[@id="multiselect_menu_sgl_est"]/ul/li[3]/label', driver) # Santa Catarina
    try_click(button8)      
    
    try_click(no_button)
       
    button9 = try_find_element('//*[@id="frm_filtros"]/fieldset[3]/p/button/span[1]', driver) # Municípios
    try_click(button9)

    button10 = try_find_element('//*[@id="multiselect_menu_cod_mun"]/div/ul/li[1]/a/span[2]', driver) # Marcar Todos
    try_click(button10)       
    
    try_click(no_button)
    
    button11 = try_find_element('//*[@id="bt_gerar"]/span/i', driver) # Continuar
    try_click(button11)

    button12 = try_find_element('//*[@id="frm_colunas"]/p[1]/button/span[1]', driver) # Famílias de Informações e Indicadores
    try_click(button12)

    if (download_type == DownloadType.RS):
        inputElement = try_find_element('//*[@id="multiselect_menu_cod_fam_info"]/div/div/input', driver)
        inputElement.send_keys('rs')
    elif (download_type == DownloadType.AE):
        inputElement = try_find_element('//*[@id="multiselect_menu_cod_fam_info"]/div/div/input', driver)
        inputElement.send_keys('ae')
    
    button13 = try_find_element('//*[@id="multiselect_menu_cod_fam_info"]/div/ul/li[1]/a/span[2]', driver) # Marcar Todos
    try_click(button13)

    if (download_type == DownloadType.RS):
        button14 = try_find_element('//*[@id="multiselect_menu_cod_fam_info"]/ul/li[11]/label', driver) # RS - Informações fianceiras
        try_click(button14)

    elif (download_type == DownloadType.AE):
        button14 = try_find_element('//*[@id="multiselect_menu_cod_fam_info"]/ul/li[2]/label/span', driver) # AE - Informações financeiras
        try_click(button14)

        button15 = try_find_element('//*[@id="multiselect_menu_cod_fam_info"]/ul/li[6]/label/span', driver) # AE - Indicadores econômico-financeiros e administrativos
        try_click(button15)

    try_click(no_button)

    try_click(button11)

    button14 = try_find_element('//*[@id="bt_relatorio"]', driver) # Gerar planilha
    try_click(button14)

    button15 = try_find_element('/html/body/div[14]/div[2]/a[1]', driver)
    try_click(button15)
    
    button16 = try_find_element('/html/body/div[14]/div[2]/a[2]', driver) # Fechar
    try_click(button16)

    button17 = try_find_element('//*[@id="menu-item-home"]/i', driver)
    try_click(button17)


## Baixa todos os arquivos

In [None]:
my_timezone = "Brazil/East"
brazil_tz = pytz.timezone(my_timezone)
current_date = datetime.datetime.now(brazil_tz)  # not used in this cell

# How long to wait for the browser to finish writing the CSV, and how often
# to look. The previous unbounded busy loop could spin forever at full CPU.
DOWNLOAD_TIMEOUT_SECONDS = 600
DOWNLOAD_POLL_SECONDS = 1

# Per download type: (label printed to the user, dataset name prefix).
DATASET_INFO = {
    DownloadType.RS: ("Resíduos Sólidos", "residuos_solidos"),
    DownloadType.AE: ("Água e Esgoto", "agua_esgoto"),
}


def _wait_for_new_csv(directory, known_files,
                      timeout=DOWNLOAD_TIMEOUT_SECONDS, poll=DOWNLOAD_POLL_SECONDS):
    """Return the name of a .csv file in ``directory`` that is not in ``known_files``.

    Polls the directory until such a file shows up. Every new file is
    inspected, so a temporary download file appearing alongside the CSV does
    not hide it.

    Raises:
        TimeoutError: If no new .csv file appears within ``timeout`` seconds.
    """
    deadline = time.monotonic() + timeout
    while True:
        new_files = set(os.listdir(directory)) - known_files
        new_csvs = sorted(name for name in new_files if name.endswith(".csv"))
        if new_csvs:
            return new_csvs[0]
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"No new .csv file appeared in {directory!r} within {timeout} seconds"
            )
        time.sleep(poll)


# Bound before the try block so the cleanup below is safe even when the
# driver could not be created.
driver = None
try:
    driver = create_driver_instance()

    for download_type in DownloadType:
        label, dataset_prefix = DATASET_INFO[download_type]
        print(f"{label}: Download iniciado...\n")

        # Snapshot BEFORE triggering the download; otherwise a download that
        # finishes quickly is already part of the snapshot and never detected.
        old_set_download_path = set(os.listdir(LOCAL_DIRECTORY))
        download_file(driver, download_type)
        filename = _wait_for_new_csv(LOCAL_DIRECTORY, old_set_download_path)

        print('Download concluído!\n')
        df = fix_null_bytes(LOCAL_DIRECTORY + filename)
        os.remove(LOCAL_DIRECTORY + filename)

        # Keep only values that are plain year numbers and compare them
        # numerically rather than as strings.
        reference_year_values = [
            int(value)
            for value in df['Ano de Referência'].unique()
            if isinstance(value, str) and value.isdecimal()
        ]
        if not reference_year_values:
            raise ValueError(f"No numeric reference year found in {filename!r}")

        init_year = min(reference_year_values)
        final_year = max(reference_year_values)

        dataset_name = f'{dataset_prefix}_{init_year}_{final_year}'
        # os.path.join supplies the separator that plain concatenation with
        # DATABASE_PATH was missing.
        landing_zone_path = os.path.join(DATABASE_PATH, dataset_name)
        new_filename = add_bold_string(dataset_name)

        if not path_exists(landing_zone_path):
            df_spark = spark.createDataFrame(df)
            df_spark = df_spark.withColumn('timestamp_ingestao', current_timestamp())

            print(f"Salvando o arquivo {new_filename} na landing-zone...\n")
            df_spark.write.format("parquet").mode("overwrite").save(landing_zone_path)

            print(f"O arquivo foi salvo na landing-zone!\n")

        else:
            print(f"O arquivo {new_filename} já está presente na landing-zone!\n")

except Exception as e:
    logger.exception(e)
finally:
    # Always release the browser, on success as well as on failure.
    if driver is not None:
        driver.quit()

In [0]:
# Show what is currently in the local directory (the cell's displayed output).
os.listdir(LOCAL_DIRECTORY)

In [0]:
# Delete every .csv and .crdownload file found directly in the local directory.
files = os.listdir(LOCAL_DIRECTORY)
for filename in files:
    if not filename.endswith(('.csv', '.crdownload')):
        continue
    os.remove(LOCAL_DIRECTORY + filename)