In [1]:
import os
import pdfplumber

import fitz
fitz.TOOLS.mupdf_display_errors(False) # Ignore the warning

import pandas as pd
import numpy as np

## `Class`

Here are some notices about this class:
   - As `pdfplumber` can extract texts from well-formatted scanned pdf files, the parsing list only includes files from 2015 or later; 
   - As for files from 2013 and 2014, 2013 files are almost all images, while 2014 has several text files;


In [2]:
class Extract:
    """
    The `Extract` class is to extract information from pdf files. 
    
    It contains four functions, `generate_parsing_list`, `doubleCheck`, `getTexts`, and `getInfos`.
    
    """
    
    def __init__(self, path, identifier):
        """
        The constructor for Extract class.
        
        Parameters: 
        path (str): The path that stores all the downloaded pdf files.
        identifier (str): vp, cabinet, or legislators
        
        """
         
        self.path= path
        self.identifier= identifier
        self.parsing_list= []
        self.doublecheck_list= []
        self.checked_list= []
        self.txt_paths= []

    def generate_parsing_list(self):

        files= os.listdir(self.path + self.identifier + "/")

        # Check and delete .DS_Store (if exists)
        if ".DS_Store" in files:
            files.remove(".DS_Store")

        # Check each file's year and create the parsing list
        for file in files:
            year= file.split("_")[1]
            if ".pdf" in year:
                year= int(year.replace(".pdf", ""))

                # Check whether the year is later than 2014, because 2013 and 2014 are mixed mode
                if (year > 2014):
                    self.parsing_list.append(file)
                
                # Create a list containing 2013 and 2014 for doublecheck
                if (year== 2013 or year== 2014):             
                    self.doublecheck_list.append(file)
            else:
                
                if (int(year) > 2014):
                    self.parsing_list.append(file)
                    
                if (int(year)== 2013 or int(year)== 2014):
                    self.doublecheck_list.append(file)
            
        
    def doubleCheck(self): 
        """
        The function check the text-page ratio to distinguish the text-format from scanned pdf files.
        
        Returns:
            unparsed: a list of pdf files identified as the scanned pdf files in 2013 and 2014.
        
        """

        for i in self.doublecheck_list: 
                  
            total_page_area= 0.0
            total_text_area= 0.0

            try: 
                checkpath= self.path + self.identifier + "/" + i   
                doc= fitz.open(checkpath) 
                
                for page_num, page in enumerate(doc):
                    
                    total_page_area= total_page_area + abs(page.rect)
                    text_area= 0.0

                    for b in page.getTextBlocks():
                        
                        # Rectangle where block text appears
                        r= fitz.Rect(b[:4])  
                        text_area= text_area + abs(r)
                    
                    # Get the text ratio of the whole page area
                    total_text_area= total_text_area + text_area              
                    text_page_ratio= total_text_area / total_page_area

                # The normal ratio is below 1, and scanned document's ratio usually is larger than 1
                if (text_page_ratio < 1):
                    self.checked_list.append(i)
                        
                doc.close()

            # If EOPError exists, pass the document examination process
            except: 
                continue
        
        unparsed= list(set(self.doublecheck_list)- set(self.checked_list))
                
        return unparsed
                

    def getTexts(self):
        """
        The function is to extract texts from pdf files and to store them in the respective folders.
        
        """
        
        final_list= self.checked_list+ self.parsing_list
        
        for i in final_list:
            try:
                filepath= self.path + self.identifier + "/" + i
                pdf= pdfplumber.open(filepath)

                # Extract the each line of each page of the pdf files
                # Set the x-/y- tolerance as 4 (defacult as 3) to include more distant texts             
                texts= []
                for page in pdf.pages:
                    texts.append(page.extract_text(x_tolerance=4, y_tolerance=4).splitlines())

                output_path= ('/Users/charliezhang/Dropbox/disclosures-data/Charlie-Zhang/Paraguay/output/'
                                + self.identifier + "/")
                
                if not os.path.exists(output_path):
                    os.makedirs(output_path)

                txt_path= output_path + i.replace(".pdf", ".txt")
                
                # Store the txt file as part of the output
                with open(txt_path, "w") as f:
                    for page in texts:
                        for line in page:
                            f.write(line + "\n")
                
                self.txt_paths.append(txt_path)
                         

            except:
                print(i+ " cannot be parsed. Because it could be an image.")
                
    
    def getInfos(self):
        
        """
        The function is to read texts from intermediate txt files.
        
        Returns:
            infos: a dictionary can easily transformed to the DataFrame. 
        
        """
        reason= []
        personid= []
        dates= []
        first= []
        last= []
        dob= []
        marital_status= []
        educ=[]
        academic= []
        institutions= []
        departments= []
        positions= []
        deposits= []
        vehicles= []
        immoveables= []
        moveables=[]
        cattle_crops= []
        debts= []
        monthly_incomes= []
        annual_incomes= []
        
        
        # List the files in the assigned directory -> produce the absolute path for reading
        output_path= ('/Users/charliezhang/Dropbox/disclosures-data/Charlie-Zhang/Paraguay/output/'
                                + self.identifier + "/")
        
        relative_path= os.listdir(output_path)
        
        if ".DS_Store" in relative_path: 
            relative_path.remove(".DS_Store")
            
        txtpaths= [output_path+ i for i in relative_path]
        
        for i in txtpaths:
            with open(i, "r") as f:
                doc= f.read().splitlines()
                
                # Create lists to store matched lines
                date_lines= []
                name_lines= []
                institution_lines= []
                academic_lines= []
                department_lines= []
                position_lines= []

                deposit_lines= []
                vehicle_lines= []
                moveable_lines= []
                immoveable_lines= []
                cattle_crop_lines= []
                debt_lines= []
                monthly_income_lines= []
                annual_income_lines= []

                for index, line in enumerate(doc):
                    
                    # Use index to restrict multiple matches
                    # Often, it is the second/third line of the first page
                    if index < 5 and line.find("MOTIVO DE LA") != -1:
                        motive= line.replace("MOTIVO DE LA", "")
                        reason.append(motive.strip())

                    if line.find("RECEPCIONADO") != -1:
                        date_lines.append(line)

                    ## Use two position to check to aviod mismatching
                    if line.find("CÉDULA DE") != -1 and line.find("RUC")!= -1:
                        if line.find("IDENTIDAD") != -1: 
                            id= line.replace("CÉDULA DE IDENTIDAD:", "").replace("RUC", "").split(":")
                            personid.append(id[0].strip())
                        
                        else:
                            id= line.replace("CÉDULA DE", "").replace("RUC", "").split(":")
                            personid.append(id[0].strip())

                    # Use first and last name to restrict other matches
                    if line.find("NOMBRE:") != -1 and line.find("APELLIDOS") != -1:
                        name_lines.append(line)

                    if line.find("FECHA DE NAC.") != -1:
                        birth= line.replace("FECHA DE NAC.: ", "").replace("NACIONALIDAD", "").split(":")
                        dob.append(birth[0].strip())

                    if line.find("ESTADO CIVIL") != -1:
                        marital= line.replace("ESTADO CIVIL: ", "").replace("CELULAR 2", "").split(":")
                        marital_status.append(marital[0].strip())

                    if line.find("E MAIL") != -1 and line.find("GRADO ACADEM") != -1:
                        education= line.split(":")
                        educ.append(education[2].strip())        

                    if line.find("TITULO OBTENIDO") != -1:
                        academic_lines.append(line)

                    if line.find("INSTITUCIÓN:") != -1 and line.find("DIRECCIÓN") != -1:
                        institution_lines.append(line)

                    if line.find("TIPO") != -1 and line.find("DEPENDENCIA") != -1 :
                        department_lines.append(line)

                    if line.find("CARGO") != -1 and line.find("CATEGORÍA") != -1:
                        position_lines.append(line)

                    if line.find("TOTAL DEPÓSITOS:") != -1:
                        deposit_lines.append(line)

                    if line.find("TOTAL INMUEBLES:") != -1:
                        immoveable_lines.append(line)

                    if line.find("TOTAL MUEBLES:") != -1:
                        moveable_lines.append(line)

                    if line.find("TOTAL VEHÍCULOS:") != -1:
                        vehicle_lines.append(line)

                    if line.find("TOTAL ACTIVIDAD") != -1:
                        cattle_crop_lines.append(line)

                    if line.find("TOTALE") != -1:
                        debt_lines.append(line)
                        
                    if line.find("TOTAL INGRESOS MENSUALES") != -1:
                        monthly_income_lines.append(line)

                    if line.find("TOTAL INGRESOS ANUALES") != -1:
                        annual_income_lines.append(line)

                
                if len(date_lines) != 0:
                    if "RECEPCIONADO EL" in date_lines[0]:
                        date= date_lines[0].split(": ")[-1].split(" ")[0]
                        dates.append(date.strip())
                    else: 
                        date= date_lines[0].split(" ")[-2]
                        dates.append(date.strip())

                if len(name_lines) != 0:
                    name= name_lines[0].replace("NOMBRE: ", "").replace("APELLIDOS", "").split(":")
                    first.append(name[0].strip())
                    last.append(name[1].strip())

                
                if len(institution_lines) != 0:
                    inst_strs= ""
            
                    for line in institution_lines:            
                        institution= line.replace("INSTITUCIÓN:", "").replace("DIRECCIÓN", "").split(":")
                        inst= institution[0].strip() + " "
                        inst_strs += inst
                    institutions.append(inst_strs)
                    
                else: 
                    institutions.append(np.NaN)

                
                if len(department_lines) != 0:              
                    dept_strs= ""

                    for index, line in enumerate(department_lines):
                        department= line.replace("TELÉFONO", "").split(":")
                        dept= str(index+1) + ": " + department[2].strip() + " "
                        dept_strs += dept

                    departments.append(dept_strs)
            
                else:
                    departments.append(np.NaN)

                
                if len(position_lines) != 0:
                    posn_strs= ""

                    for index, line in enumerate(position_lines):
                        position= line.split(":")[-1]
                        posn= str(index+1) + ": " + position.strip() + " "
                        posn_strs += posn 

                    positions.append(posn_strs)
                
                else:
                    positions.append(np.NaN)
                
                
                ## If one has academic title, get it; else, return np.nan
                if len(academic_lines) == 0:
                    academic.append(np.NaN)                 
                
                else:
                    title= line.replace("TITULO OBTENIDO:", "")        
                    if bool(title and title.strip())== True:
                        academic.append(np.NaN)
                    else: 
                        academic.append(title[-1])

                if len(deposit_lines) != 0:
                    deposit= deposit_lines[-1].replace("TOTAL DEPÓSITOS:", "")
                    deposits.append(deposit.strip())
                else: 
                    deposits.append(np.NaN)


                if len(vehicle_lines) != 0:
                    vehicle= vehicle_lines[-1].replace("TOTAL VEHÍCULOS:", "")
                    vehicles.append(vehicle.strip())
                    
                else:
                    vehicles.append(np.NaN)

                if len(immoveable_lines) != 0:
                    immoveable= immoveable_lines[-1].replace("TOTAL INMUEBLES:", "")
                    immoveables.append(immoveable.strip())
                else: 
                    immoveables.append(np.NaN)

                if len(moveable_lines) != 0:
                    moveable= moveable_lines[-1].replace("TOTAL MUEBLES:", "")
                    moveables.append(moveable.strip())
                else:
                    moveables.append(np.NaN)

                if len(cattle_crop_lines) == 0:
                    cattle_crops.append(np.NaN)
                else:
                    cattle_crop= cattle_crop_lines[-1].replace("TOTAL ACTIVIDAD", "")
                    cattle_crops.append(cattle_crop.strip())

                if len(debt_lines) != 0:
                    debt= debt_lines[-1].split(" ")[-1]
                    debts.append(debt.strip())
                else:
                    debts.append(np.NaN)

                if len(monthly_income_lines) != 0:
                    monthly_income= monthly_income_lines[-1].replace("TOTAL INGRESOS MENSUALES", "")
                    monthly_incomes.append(monthly_income.strip())
                else: 
                    monthly_incomes.append(np.NaN)


                if len(annual_income_lines) != 0:
                    annual_income= annual_income_lines[-1].replace("TOTAL INGRESOS ANUALES", "")
                    annual_incomes.append(annual_income.strip())
                else: 
                    annual_incomes.append(np.NaN)



        # Compile the dictionary
        info= {"reason": reason,
               "id": personid,
               "date": dates,
               "first": first,
               "last": last,
               "dob": dob,
               "marital": marital_status,
               "educ": educ,
               "academic": academic,
               "institution": institutions,
               "department": departments,
               "position": positions,
               "deposit": deposits,
               "immoveable": immoveables,
               "vehicle": vehicles,
               "cattle_crop": cattle_crops,
               "moveable": moveables,
               "debt": debts,
               "income_month": monthly_incomes,
               "income_annual": annual_incomes}
        return info

In [4]:
help(Extract)

Help on class Extract in module __main__:

class Extract(builtins.object)
 |  Extract(path, identifier)
 |  
 |  The `Extract` class is to extract information from pdf files. 
 |  
 |  It contains four functions, `generate_parsing_list`, `doubleCheck`, `getTexts`, and `getInfos`.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path, identifier)
 |      The constructor for Extract class.
 |      
 |      Parameters: 
 |      path (str): The path that stores all the downloaded pdf files.
 |      identifier (str): vp, cabinet, or legislators
 |  
 |  doubleCheck(self)
 |      The function check the text-page ratio to distinguish the text-format from scanned pdf files.
 |      
 |      Returns:
 |          unparsed: a list of pdf files identified as the scanned pdf files in 2013 and 2014.
 |  
 |  generate_parsing_list(self)
 |  
 |  getInfos(self)
 |      The function is to read texts from intermediate txt files.
 |      
 |      Returns:
 |          infos: a dictionary can easily 

## VP

In [5]:
vp= Extract(path= "/Users/charliezhang/Dropbox/disclosures-data/Charlie-Zhang/Paraguay/data/",
            identifier= "vp")
vp.generate_parsing_list()
vp_unparsed_1314= vp.doubleCheck()
vp.getTexts()

In [6]:
vp_infos= pd.DataFrame(vp.getInfos())
vp_infos= vp_infos.sort_values(by = "id").reset_index().drop(columns= "index")

In [7]:
vp_infos

Unnamed: 0,reason,id,date,first,last,dob,marital,educ,academic,institution,department,position,deposit,immoveable,vehicle,cattle_crop,moveable,debt,income_month,income_annual
0,ASUNCION AL CARGO,352252,11/05/2018,ALICIA BEATRIZ,PUCHETA VDA DE CORREA,14/01/1950,VIUDO/A,TERCIARIO,,2 UNIVERSIDAD NACIONAL DE ASUNCION 3 VICEPRESI...,1: FACULTAD DE DERECHO 2: VICEPRESIDENCIA,1: DOCENTE 2: VICEPRESIDENTA,22000234,1650000000,177000000,,970000000,20493849,46497794,57572661
1,BAJA DEL CARGO,352252,17/08/2018,ALICIA BEATRIZ,PUCHETA VDA DE CORREA,14/01/1950,VIUDO/A,TERCIARIO,,2 UNIVERSIDAD NACIONAL DE ASUNCION 3 VICEPRESI...,1: FACULTAD DE DERECHO 2: VICEPRESIDENCIA,1: DOCENTE 2: VICEPRESIDENTA,22000234,1650000000,177000000,,970000000,43420137,46497794,57572661
2,BAJA DEL CARGO,352252,09/05/2018,ALICIA BEATRIZ,PUCHETA VDA DE CORREA,14/01/1950,VIUDO/A,TERCIARIO,,1 CORTE SUPREMA DE JUSTICIA 3 UNIVERSIDAD NACI...,1: MINISTRA DE LA CORTE SUPREMA DE JUSTICIA 2:...,1: MINISTRO DE LA CORTE SUPREMA DE JUSTICIA 2:...,22000234,1650000000,177000000,,970000000,20493849,57572661,57572661
3,OTRO,652236,27/06/2014,JUAN EUDES,AFARA MACIEL,19/08/1960,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 VICEPRESIDENCIA DE LA REPUBLICA,1: VICE PRESIDENCIA,1: VICEPRESIDENTE DE LA REPÚBLICA,825000000,8540700000,398800000,970100000.0,350000000,2392000000,91163080,2855257975
4,ASUNCION AL CARGO,652236,12/07/2018,JUAN EUDES,AFARA MACIEL,19/08/1960,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: SENADO DE LA NACIÓN,1: SENADOR NACIONAL,380314869,8590000000,396550000,1171000000.0,350000000,2399248000,87774840,2851869735
5,BAJA DEL CARGO,652236,26/04/2018,JUAN EUDES,AFARA MACIEL,19/08/1960,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 VICEPRESIDENCIA DE LA REPUBLICA,1: VICE PRESIDENCIA,1: VICEPRESIDENTE DE LA REPÚBLICA,1218988384,8590000000,396550000,1171000000.0,350000000,2458314687,90927900,2855257975
6,BAJA DEL CARGO,832988,24/07/2017,HUGO ADALBERTO,VELAZQUEZ MORENO,03/09/1967,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE DIPUTADOS,1: DIPUTADO NACIONAL,1: CESE PRESIDENCIA DE LA H. CAMARA DE DIPUTADOS,5907651,938708000,910816795,,162364635,297106617,37076740,8835658985
7,OTRO,832988,05/05/2020,HUGO ADALBERTO,VELAZQUEZ MORENO,03/09/1967,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 VICEPRESIDENCIA DE LA REPUBLICA,1: VICEPRESIDENCIA,1: VICEPRESIDENTE DE LA REPÚBLICA,344000,938708000,919442639,,172864635,283820373,32000000,625500000
8,ASUNCION AL CARGO,832988,30/08/2018,HUGO ADALBERTO,VELAZQUEZ MORENO,03/09/1967,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 VICEPRESIDENCIA DE LA REPUBLICA,1: VICEPRESIDENCIA,1: VICEPRESIDENTE DE LA REPÚBLICA,344000,938708000,919442639,,172864635,283820373,32000000,625500000
9,BAJA DEL CARGO,832988,18/04/2018,HUGO ADALBERTO,VELAZQUEZ MORENO,03/09/1967,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE DIPUTADOS,1: CAMARA DE DIPUTADOS,1: DIPUTADO NACIONAL,526446,938708000,910816795,,162364635,234061333,37076740,798597200


In [8]:
vp_infos.to_csv("vp_infos.csv")

## Cabinet

In [9]:
cab= Extract(path= "/Users/charliezhang/Dropbox/disclosures-data/Charlie-Zhang/Paraguay/data/",
            identifier= "cabinet")
cab.generate_parsing_list()
cab_unparsed_1314= cab.doubleCheck()
cab.getTexts()

In [10]:
cab_unparsed_1314

['640913_2013.pdf',
 '916313_2014_1.pdf',
 '1047386_2014.pdf',
 '835848_2013.pdf',
 '838771_2014_1.pdf',
 '2102663_2014.pdf',
 '1699606_2014.pdf',
 '446027_2013.pdf',
 '618781_2013_1.pdf',
 '1047386_2014_1.pdf',
 '453913_2013.pdf',
 '725119_2014_1.pdf',
 '725119_2013.pdf',
 '916313_2014.pdf',
 '1217833_2014.pdf',
 '1362380_2014.pdf',
 '1388499_2013_1.pdf',
 '1890292_2013.pdf',
 '409636_2013.pdf',
 '232663_2013_1.pdf',
 '920868_2013_1.pdf',
 '694926_2013.pdf',
 '1047386_2013_1.pdf',
 '2849697_2013_1.pdf',
 '232627_2013.pdf',
 '1467955_2013.pdf',
 '619890_2013_1.pdf',
 '232627_2014.pdf',
 '1699606_2013.pdf',
 '232663_2013_2.pdf',
 '920868_2013.pdf',
 '380218_2014.pdf',
 '1890292_2014.pdf',
 '694926_2014.pdf',
 '745882_2013.pdf',
 '1137105_2014.pdf',
 '618781_2014_1.pdf',
 '1388499_2013.pdf',
 '619890_2013.pdf',
 '1467955_2014.pdf',
 '2849697_2013.pdf',
 '725119_2014.pdf',
 '902726_2013.pdf',
 '1047386_2013.pdf',
 '402938_2014.pdf',
 '618781_2013.pdf',
 '435101_2013.pdf',
 '619890_2014.pd

In [11]:
cab_infos= pd.DataFrame(cab.getInfos())

In [12]:
for key, value in cab.getInfos().items():
    print(key, len(value))

reason 206
id 206
date 206
first 206
last 206
dob 206
marital 206
educ 206
academic 206
institution 206
department 206
position 206
deposit 206
immoveable 206
vehicle 206
cattle_crop 206
moveable 206
debt 206
income_month 206
income_annual 206


In [13]:
cab_infos= cab_infos.sort_values(by = "id").reset_index().drop(columns= "index")

In [14]:
cab_infos

Unnamed: 0,reason,id,date,first,last,dob,marital,educ,academic,institution,department,position,deposit,immoveable,vehicle,cattle_crop,moveable,debt,income_month,income_annual
0,BAJA DEL CARGO,1047386,22/08/2018,HUMBERTO RUBEN,PERALTA BEAUFORT,22/01/1971,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 SECRETARIA DE LA FUNCION PUBLICA,1: DESPACHO MINISTRO,1: SECRETARIO EJECUTIVO,,,,,225000000,106500000,42851200,24851200
1,PRESENTACIÓN: ASUNCION AL CARGO,1056514,30/11/2016,ARIEL,MARTINEZ FERNANDEZ,30/03/1979,SOLTERO/A,TERCIARIO,,1 MINISTERIO DEL INTERIOR 2 MINISTERIO PUBLICO...,1: VICE MINISTERIO DE ASUNTOS POLITICOS 2: DEL...,1: VICEMINISTRO DEL PODER EJECUTIVO 2: AGENTE ...,5.000.000,1.250.000.000,,,165.000.000,,24.693.600,
2,PRESENTACIÓN: ASUNCION AL CARGO,1056514,30/11/2016,ARIEL,MARTINEZ FERNANDEZ,30/03/1979,SOLTERO/A,TERCIARIO,,1 MINISTERIO DEL INTERIOR 2 MINISTERIO PUBLICO...,1: VICE MINISTERIO DE ASUNTOS POLITICOS 2: DEL...,1: VICEMINISTRO DEL PODER EJECUTIVO 2: AGENTE ...,5.000.000,1.250.000.000,,,165.000.000,,24.693.600,
3,PRESENTACIÓN: Actualizacion,1056514,13/05/2015,ARIEL,MARTINEZ FERNANDEZ,30/03/1979,SOLTERO/A,TERCIARIO,,1 MINISTERIO PUBLICO FISCALIA GENERAL DEL ESTADO,1: DELITOS INFORMATICOS,1: AGENTE FISCAL,20.823.736,1.250.000.000,100.0,,150.000.000,,20.327.600,20.327.600
4,OTRO,1056514,13/05/2015,ARIEL,MARTINEZ FERNANDEZ,30/03/1979,SOLTERO/A,TERCIARIO,,1 MINISTERIO PUBLICO FISCALIA GENERAL DEL ESTADO,1: DELITOS INFORMATICOS,1: AGENTE FISCAL,20823736,1250000000,100000,,150000000,,20327600,20327600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,OTRO,987143,29/08/2019,ROQUE ALBERTO,SOTELO CHAPARRO,06/05/1967,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 COMANDO DE LAS FUERZAS MILITARES 2 PRESIDENC...,1: GABINETE MILITAR 2: GABINETE MILITAR,1: MINISTRO JEFE DEL GABINETE MILITAR 2: MINIS...,110000000,1630000000,165000000,,136000000,43844796,34970310,25151200
202,OTRO,987143,19/11/2018,ROQUE ALBERTO,SOTELO CHAPARRO,06/05/1967,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 COMANDO DE LAS FUERZAS MILITARES,1: GABINETE MILITAR,1: MINISTRO JEFE DEL GABINETE MILITAR,442000000,730000000,90000000,,81000000,71286000,52900000,39500000
203,OTRO,987143,29/11/2016,ROQUE ALBERTO,SOTELO CHAPARRO,06/05/1967,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 MINISTERIO DE DEFENSA NACIONAL,1: FUERZAS ARMADAS DE LA NACION,1: AGREGADO DE DEFENSA DE PARAGUAY EN ALEMANIA,442000000,730000000,90000000,,81000000,71286000,52900000,39500000
204,BAJA DEL CARGO,992145,21/08/2018,FABRIZIO,CALIGARIS RAMOS,05/05/1976,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 PRESIDENCIA DE LA REPUBLICA,1: SECRETARIA DE INFORMACION Y COMUNICACION,1: SECRETARIO EJECUTIVO,421920638,3600000000,330045030,,250000000,113400000,39851200,


In [15]:
cab_infos.to_csv("cabinet_infos.csv")

## Legislators

In [16]:
legis= Extract(path= "/Users/charliezhang/Dropbox/disclosures-data/Charlie-Zhang/Paraguay/data/",
            identifier= "legislators")
legis.generate_parsing_list()
legis_unparsed_1314= legis.doubleCheck()

In [17]:
legis.getTexts()

1094794_2014_1.pdf cannot be parsed. Because it could be an image.
95732_2018.pdf cannot be parsed. Because it could be an image.
1163876_2020.pdf cannot be parsed. Because it could be an image.


In [18]:
legis_info= legis.getInfos()

In [19]:
for key, value in legis_info.items():
    print(key, ":", len(value))

reason : 614
id : 614
date : 614
first : 614
last : 614
dob : 614
marital : 614
educ : 614
academic : 614
institution : 614
department : 614
position : 614
deposit : 614
immoveable : 614
vehicle : 614
cattle_crop : 614
moveable : 614
debt : 614
income_month : 614
income_annual : 614


In [20]:
legis_infos= pd.DataFrame(legis_info)

In [21]:
legis_infos= legis_infos.sort_values(by = "id").reset_index().drop(columns= "index")

In [22]:
legis_infos

Unnamed: 0,reason,id,date,first,last,dob,marital,educ,academic,institution,department,position,deposit,immoveable,vehicle,cattle_crop,moveable,debt,income_month,income_annual
0,PRESENTACIÓN: RECTIFICATIVA DJ 16-07-2018,1000687,08/05/2020,ENRIQUE FAUSTO,BACCHETTA CHIRIANI,15/07/1967,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: CAMARA DE SENADORES,1: SENADOR NACIONAL,73.942.496,3.042.664.128,39.460.000,,895.000.000,1.879.230.319,54.759.872,896.273.382
1,OTRO,1000687,08/05/2020,ENRIQUE FAUSTO,BACCHETTA CHIRIANI,15/07/1967,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: CAMARA DE SENADORES,1: SENADOR NACIONAL,49637265,1904636642,201160000,,895000000,206380581,78569395,351045872
2,OTRO,1000687,08/05/2020,ENRIQUE FAUSTO,BACCHETTA CHIRIANI,15/07/1967,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: CAMARA DE SENADORES,1: SENADOR NACIONAL,73942496,3042664128,39460000,,895000000,1879230319,54759872,896273382
3,PRESENTACIÓN: RECTIFICATIVA DJ 29-10-2012,1000687,08/05/2020,ENRIQUE FAUSTO,BACCHETTA CHIRIANI,15/07/1967,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 CONSEJO DE LA MAGISTRATURA,1: CONSEJO DE LA MAGISTRATURA,1: PRESIDENTE O MIEMBRO DEL CONSEJO DE LA MAGI...,48.043.098,1.904.636.642,201.160.000,,895.000.000,108.543.947,62.740.648,540.000.000
4,PRESENTACIÓN: RECTIFICATIVA DJ 19-07-2013,1000687,08/05/2020,ENRIQUE FAUSTO,BACCHETTA CHIRIANI,15/07/1967,CASADO/A SIN SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: CAMARA DE SENADORES,1: SENADOR NACIONAL,49.637.265,1.904.636.642,201.160.000,,895.000.000,206.380.581,78.569.395,351.045.872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,ASUNCION AL CARGO,989122,11/09/2017,JORGE ANTONIO,OVIEDO MATTO,13/06/1966,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: VICEPRESIDENCIA PRIMERA,1: SENADOR NACIONAL,27000000,1395750000,474397000,,60000000,3202609,72273392,532724840
610,ASUNCION AL CARGO,989122,15/07/2018,JORGE ANTONIO,OVIEDO MATTO,13/06/1966,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: SENADOR DE LA NACION,1: SENADOR DE LA NACION,41718760,3085750000,474397000,,60000000,73273291,52000000,
611,BAJA DEL CARGO,989122,28/09/2018,JORGE ANTONIO,OVIEDO MATTO,13/06/1966,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: SENADOR DE LA NACION,1: SENADOR DE LA NACION,818031,3325750000,390000000,,60000000,63298164,79530865,240000000
612,BAJA DEL CARGO,989122,25/01/2018,JORGE ANTONIO,OVIEDO MATTO,13/06/1966,CASADO/A CON SEPARACION DE BIENES,TERCIARIO,,1 HONORABLE CAMARA DE SENADORES,1: VICEPRESIDENCIA PRIMERA,1: SENADOR NACIONAL,3396075,3085750000,474397000,,60000000,1597029,70649398,33940888


In [23]:
legis_infos.to_csv("legislators_infos.csv")

In [24]:
legis_unparsed_1314

['753777_2013.pdf',
 '660812_2013.pdf',
 '650349_2013.pdf',
 '2188426_2013.pdf',
 '571502_2013.pdf',
 '1412612_2013.pdf',
 '981087_2013.pdf',
 '384004_2013.pdf',
 '481344_2013.pdf',
 '3984772_2013_1.pdf',
 '2480063_2013.pdf',
 '1009116_2013.pdf',
 '738164_2013.pdf',
 '380566_2013.pdf',
 '2846714_2013.pdf',
 '488492_2013.pdf',
 '1378996_2013_2.pdf',
 '875792_2013.pdf',
 '736233_2013.pdf',
 '719614_2013.pdf',
 '2952067_2013.pdf',
 '1255411_2013.pdf',
 '280307_2013.pdf',
 '1843567_2013.pdf',
 '1464359_2013.pdf',
 '1054280_2013.pdf',
 '1115731_2013.pdf',
 '3542359_2013.pdf',
 '1737242_2013.pdf',
 '1001174_2013.pdf',
 '823435_2013.pdf',
 '1378996_2013_3.pdf',
 '1886495_2013.pdf',
 '1555610_2013.pdf',
 '1189251_2013.pdf',
 '705190_2013.pdf',
 '1897783_2013.pdf',
 '1274778_2014.pdf',
 '262677_2013.pdf',
 '3984772_2013.pdf',
 '1708767_2013.pdf',
 '1025360_2013.pdf',
 '367979_2013.pdf',
 '2188425_2013.pdf',
 '229430_2013.pdf',
 '796721_2013.pdf',
 '710600_2013.pdf',
 '615516_2013.pdf',
 '137082