In [1]:
import os
import pandas as pd
import glob
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import random

In [2]:
class Paraguay:
    '''
    Download every disclosure document listed for one id on the
    portaldjbr.contraloria.gov.py portal and store it as "data/<id>_<year>.pdf".

    Intended call order: setUp() -> click() -> downloadFiles().
    downloadFiles() always closes the browser, even when it fails.

    The class uses packages shown as below:
        BeautifulSoup4  4.9.3;
        glob2  0.7;
        selenium  3.141.0;
        pandas  1.2.0;    
    '''

    ## Location of the chromedriver binary handed to selenium
    DRIVER_PATH= "/Applications/chromedriver"
    ## Folder, relative to the working directory, that receives the downloads
    DOWNLOAD_DIR= "data"
    ## Longest time (seconds) to wait for a single download to show up
    DOWNLOAD_TIMEOUT= 20
    ## Pause (seconds) between two looks at the download folder
    POLL_INTERVAL= 0.5
    ## Endings of files that are still being written (Chrome uses .crdownload)
    PARTIAL_SUFFIXES= (".crdownload", ".tmp")


    def __init__(self, id):
        '''Remember the id that will be typed into the portal's search form.'''
        self.id= id


    def setUp(self):
        '''Start Chrome and let it save files into DOWNLOAD_DIR without asking.'''

        ## Make sure the target folder exists before the browser is started
        download_dir= os.path.join(os.getcwd(), self.DOWNLOAD_DIR)
        os.makedirs(download_dir, exist_ok= True)

        ## Set download options
        opts= Options()
        opts.add_experimental_option("prefs",{"download.prompt_for_download": False,
                                              "download.directory_upgrade": True,
                                              "safebrowsing.enabled": True})
        ## Initiate the driver
        self.driver= webdriver.Chrome(self.DRIVER_PATH, options= opts)

        ## Set Chrome to trust downloads
        self.driver.command_executor._commands["send_command"]= ("POST", '/session/$sessionId/chromium/send_command')
        params= {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': download_dir}}
        self.driver.execute("send_command", params)


    def click(self):
        '''Open the portal, type the id into the search form and submit it.'''

        ## Access the URL
        url= "https://portaldjbr.contraloria.gov.py/portal-djbr/"
        self.driver.get(url)

        ## Use xpath to fill the id form
        element= self.driver.find_element_by_xpath('//*[@id="app"]/div[2]/div/div/div/div[2]/div[2]/input')
        element.send_keys(self.id)

        ## Find and click the search button
        search_button= self.driver.find_element_by_xpath('//*[@id="app"]/div[2]/div/div/div/div[2]/div[3]/button')
        search_button.click()


    def downloadFiles(self):
        '''
        Click every download button of the result page and rename each
        downloaded file to "<id>_<year>.pdf" inside DOWNLOAD_DIR.

        Several documents of the same year get the suffixes "_1", "_2", ...
        Prints one progress line per document, or a notice when the page
        lists no document at all. Returns None.
        '''

        try:
            docs_info= [cell.text for cell in self.driver.find_elements_by_xpath("//td")]

            ## The table is read as rows of four cells: cell 0 is the name, cell 2 the year
            names= docs_info[0::4]
            year= docs_info[2::4]
            filenames= [str(self.id) + "_" + value for value in year]

            download_buttons= self.driver.find_elements_by_class_name("p-button-success")

            ## Check whether the database holds any disclosure for this id.
            ## This has to happen before the loop: an empty list never enters it.
            if len(download_buttons)==0:
                print("There is no "+ str(self.id)+ "'s records.")
                return

            for index, button in enumerate(download_buttons):

                ## Randomly sleep
                time.sleep(random.uniform(1,3))

                ## Print the (name, year) for check
                print("Job "+ str(index+1)+ ": The file is from "+ names[index]+ " (" + str(self.id) + ")" + " in "+ year[index]+ ".")

                ## Remember what is already in the folder so the new file can be told apart
                known_files= set(glob.glob(os.path.join(self.DOWNLOAD_DIR, "*")))

                ## Click the download button
                button.click()

                ## Wait until the download shows up (at most DOWNLOAD_TIMEOUT seconds)
                latest= self._waitForDownload(known_files)
                if latest is None:
                    ## Renaming some older file instead would mislabel it, so skip this job
                    print("Job "+ str(index+1)+ ": no finished download appeared within "+ str(self.DOWNLOAD_TIMEOUT)+ " seconds; skipped.")
                    continue

                ## Rename the file by "id_year", with a counter when the year repeats
                os.rename(latest, self._freeName(filenames[index]))

        finally:
            ## Close the browser no matter how the method ends
            self.driver.quit()


    def _waitForDownload(self, known_files):
        '''Return the path of the finished file that is not in known_files, or None on timeout.'''
        pattern= os.path.join(self.DOWNLOAD_DIR, "*")
        deadline= time.monotonic() + self.DOWNLOAD_TIMEOUT
        while True:
            ## Partial files are ignored; presumably Chrome renames them once complete -- TODO confirm
            fresh= [path for path in glob.glob(pattern)
                    if path not in known_files and not path.endswith(self.PARTIAL_SUFFIXES)]
            if fresh:
                return max(fresh, key=os.path.getctime)
            if time.monotonic() >= deadline:
                return None
            time.sleep(self.POLL_INTERVAL)


    def _freeName(self, stem):
        '''Return the first unused path among stem.pdf, stem_1.pdf, stem_2.pdf, ...'''
        base= os.path.join(self.DOWNLOAD_DIR, stem)
        candidate= base + ".pdf"
        counter= 0
        ## Never reuse an existing name: os.rename may silently replace the old file
        while os.path.exists(candidate):
            counter+= 1
            candidate= base + "_" + str(counter) + ".pdf"
        return candidate
        

## Vice presidents (VP)

In [7]:
## Read the vice-president id list from the csv file
## Note: the ids 832988 and 652236 also appear among the legislators
vp_csv= "~/Dropbox/disclosures-data/Charlie-Zhang/Paraguay/input/vp-id-list.csv"
vp= pd.read_csv(vp_csv)
## Plain Python list of the "id" column, shown as the cell output
vp_lists= vp["id"].tolist()
vp_lists

[389672, 652236, 352252, 832988]

In [8]:
## Scrape every vice president: one browser session per id.
## The loop variable is not called "id" so the builtin id() stays usable in later cells.
for person_id in vp_lists:
    vice= Paraguay(id= person_id)
    vice.setUp()
    ## Pause between starting the browser and opening the portal
    time.sleep(3)
    vice.click()
    ## Pause between submitting the search and reading the result table
    time.sleep(5)
    vice.downloadFiles()

Job 1: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 1998.
Job 2: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 1998.
Job 3: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 1999.
Job 4: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 2000.
Job 5: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 2003.
Job 6: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 2003.
Job 7: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 2008.
Job 8: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 2012.
Job 9: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 2012.
Job 10: The file is from AMANCIO OSCAR DENIS SANCHEZ (389672) in 2013.
Job 1: The file is from JUAN EUDES AFARA MACIEL (652236) in 2014.
Job 2: The file is from JUAN EUDES AFARA MACIEL (652236) in 2018.
Job 3: The file is from JUAN EUDES AFARA MACIEL (652236) in 2018.
Job 4: The file is from JUANEUDES AFARAMACIEL (652236) in 2001.
Job 5: The file is from JUANEUDES AFA

## Cabinet

In [4]:
## Read the cabinet id list from the csv file
cabinet_csv= "~/Desktop/Paraguay/cabinet-id-list.csv"
cabinet= pd.read_csv(cabinet_csv)
## Plain Python list of the "id" column, shown as the cell output
cabinet_lists= cabinet["id"].tolist()
cabinet_lists

[404976,
 453913,
 238610,
 1336946,
 1047386,
 677755,
 1123122,
 409636,
 1388499,
 85848,
 931976,
 1699606,
 219847,
 232663,
 913111,
 992145,
 637256,
 1527620,
 446027,
 232627,
 1164164,
 316204,
 3569632,
 694926,
 838771,
 1123415,
 618781,
 725119,
 402938,
 2102663,
 435101,
 3548659,
 648993,
 380218,
 2849697,
 619890,
 1467955,
 1786719,
 346742,
 608726,
 3727699,
 1890292,
 923198,
 920868,
 487021,
 686959,
 876798,
 835848,
 902726,
 1384728,
 3400701,
 3382061,
 766742,
 3031787,
 1056514,
 886807,
 3492337,
 1431114,
 391746,
 2209022,
 770933,
 640913,
 2352438,
 1217833,
 1194695,
 1099091,
 1260233,
 2513962,
 2864308,
 905999,
 610668,
 916313,
 1742102,
 987143,
 852751,
 745882,
 1137105,
 346942,
 2974481,
 1362380,
 603957]

In [5]:
## Check whether there are overlappings between vp and cabinet 
## Drop from the cabinet list every id that is already in the vice-president list.
## The loop variable is not called "id" so the builtin id() stays usable in later cells.
for person_id in vp_lists:
    if person_id in cabinet_lists:
        print("Yes")
        ## list.remove() deletes one occurrence only; repeat so a duplicated id is dropped entirely
        while person_id in cabinet_lists:
            cabinet_lists.remove(person_id)

In [8]:
## Scrape every remaining cabinet member: one browser session per id.
## The loop variable is not called "id" so the builtin id() stays usable in later cells.
for person_id in cabinet_lists:
    cab= Paraguay(id= person_id)
    cab.setUp()
    ## Pause between starting the browser and opening the portal
    time.sleep(3)
    cab.click()
    ## Pause between submitting the search and reading the result table
    time.sleep(5)
    ## NOTE: downloadFiles() has no return statement with a value, so status is always None
    status= cab.downloadFiles()

There is no 404976's records.
Job 1: The file is from MARTIN LUIS,BURT,ARTAZA (453913) in 2001.
Job 2: The file is from MARTIN LUIS,BURT,ARTAZA (453913) in 2012.
Job 3: The file is from MARTIN LUIS,BURT,ARTAZA (453913) in 2013.
There is no 238610's records.
Job 1: The file is from NURIA RENINSEB ISNARDI MARTINEZ (1336946) in 2018.
Job 2: The file is from NURIA RENINSEB ISNARDI MARTINEZ (1336946) in 2019.
Job 3: The file is from NURIA RENINSEB ISNARDI DE PEREIRA (1336946) in 2002.
Job 4: The file is from NURIA RENINSEB ISNARDI MARTINEZ (1336946) in 1999.
Job 5: The file is from NURIA RENINSEB ISNARDI MARTINEZ (1336946) in 2012.
Job 1: The file is from HUMBERTO RUBEN PERALTA BEAUFORT (1047386) in 2018.
Job 2: The file is from HUMBERTORUBEN PERALTABEAUFORT (1047386) in 2001.
Job 3: The file is from HUMBERTORUBEN PERALTABEAUFORT (1047386) in 2006.
Job 4: The file is from HUMBERTORUBEN PERALTABEAUFORT (1047386) in 2013.
Job 5: The file is from HUMBERTORUBEN PERALTABEAUFORT (1047386) in 2014

KeyboardInterrupt: 