In [1]:
import pandas as pd
import numpy as np
import tabula

# This time, all pages have the same format, hence we only need to implement and explore it once!

In [2]:
# tabula returns a list of dataframes for the page; only the first one
# holds the table we care about.
l = tabula.read_pdf(
    'pbe-veicular-2022.pdf',
    pages=1,
    area=['20.13', '6.17', '93.16', '89.71'],
    relative_area=True,
)

# Renumber the rows first, then discard rows and columns that are entirely empty.
df = (
    l[0]
    .reset_index(drop=True)
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

# Replace the extracted header with plain positional labels 0..n-1.
df.columns = list(range(len(df.columns)))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,Sub Compacto,CAOA CHERY,ICAR EQ1,TEC,Elétrico,Elétrico,A,S,E,E,...,\,\,\,\,488,368,48,A,A,-
1,Sub Compacto,FIAT,500E,ICON,Elétrico,Elétrico,A-1,S,E,E,...,\,\,\,\,473,404,46,A,A,-
2,Sub Compacto,FIAT,MOBI,EASY,1.0-8V,Combustão,M-5,N,M,F,...,0,88,98,110,142,155,144,B,B,-
3,Sub Compacto,FIAT,MOBI,TREKKING,1.0-8V,Combustão,M-5,S,H,F,...,0,93,96,104,135,150,150,C,B,-
4,Sub Compacto,FIAT,MOBI,LIKE,1.0-8V,Combustão,M-5,S,H,F,...,0,93,96,104,135,150,150,C,B,-


In [3]:
# Sanity check: the cleaned first page must contain exactly 148 rows.
assert len(df.index) == 148

In [4]:
# Expected number of rows per category (column 0) on the first page.
s = pd.Series(
    {'Sub Compacto': 9, 'Compacto': 27, 'Médio': 91, 'Grande': 21},
    name=0,
).sort_index()

# Every one of the four categories must match its expected count.
assert (df.groupby(0)[0].count() == s).all()

In [5]:
class DataCleaning:
    """
    Extracts one page of a PDF table with tabula and cleans it.

    Cleaning consists of dropping fully-empty rows/columns, naming the
    columns after ``columns`` and casting the numeric columns.

    :param pdf: path of the PDF file to read (defaults to the 2022 file).
    :param area: the four area values handed to ``tabula.read_pdf`` together
        with ``relative_area=True`` (defaults to the values used for this PDF).
    """

    # Final column names, in the order the columns come out of the extraction.
    columns = [
        'categoria', 'marca', 'modelo', 'versao', 'motor', 'transmissao_velocidades',
        'ar_cond', 'direcao_assistida', 'propulsao', 'combustivel',
        'poluentes_nmhc_g_km', 'poluentes_co_g_km', 'poluentes_nox_g_km', 'poluentes_reducao_relativa',
        'efeito_estufa_etanol_co2_fossil_g_km', 'efeito_estufa_gasolina_diesel_fossil_co2',
        'km_litro_etanol_cidade', 'km_litro_etanol_estrada',
        'km_litro_gasolina_diesel_eletrico_cidade', 'km_litro_gasolina_diesel_eletrico_estrada',
        'consumo_energetico_mj_km',
        'pbe_classificacao_relativa_categoria', 'pbe_classificacao_absoluta_geral',
        'selo_conpet_eficiencia_energetica'
    ]
    # Columns whose text values (decimal comma) are converted to numbers.
    to_numeric = [
        'poluentes_nmhc_g_km', 'poluentes_co_g_km', 'poluentes_nox_g_km',
        'efeito_estufa_etanol_co2_fossil_g_km', 'efeito_estufa_gasolina_diesel_fossil_co2',
        'km_litro_etanol_cidade', 'km_litro_etanol_estrada',
        'km_litro_gasolina_diesel_eletrico_cidade', 'km_litro_gasolina_diesel_eletrico_estrada',
        'consumo_energetico_mj_km'
    ]
    # Numeric columns that are further converted to int64; missing values become 0.
    to_integer = ['efeito_estufa_etanol_co2_fossil_g_km', 'efeito_estufa_gasolina_diesel_fossil_co2']

    def __init__(self, pdf='pbe-veicular-2022.pdf', area=('20.13', '6.17', '93.16', '89.71')):
        # Defaults reproduce the previously hard-coded values, so DataCleaning()
        # keeps working exactly as before.
        self.pdf = pdf
        self.area = list(area)

    def get_df(self, page):
        """
        Extract ``page`` from the PDF and return it as a cleaned dataframe.

        :param page: page selector forwarded to ``tabula.read_pdf(pages=...)``.

        :return: dataframe with named and cast columns. The row index is the
            one left after dropping empty rows (it is not renumbered).
        :raises IndexError: if the extraction yields no table for the page.
        :raises ValueError: if the table does not have the expected number of columns.
        """
        tables = tabula.read_pdf(self.pdf, pages=page, area=self.area, relative_area=True)
        if not tables:
            # Same exception type as the former bare `l[0]`, but with context.
            raise IndexError(f'no table extracted from page {page} of {self.pdf!r}')

        # Only the first extracted table is used; fully-empty rows/columns are noise.
        df = tables[0].dropna(axis=0, how='all').dropna(axis=1, how='all')

        df = self.assign_columns(df)
        df = self.cast(df)
        return df

    def assign_columns(self, df):
        """
        Return a copy of ``df`` whose columns are renamed to ``self.columns``.

        :param df: dataframe with as many columns as ``self.columns``.

        :return: renamed copy; the input dataframe is left untouched.
        :raises ValueError: if the number of columns does not match.
        """
        if len(df.columns) != len(self.columns):
            raise ValueError(
                f'expected {len(self.columns)} columns, got {len(df.columns)}'
            )
        df = df.copy()
        df.columns = self.columns
        return df

    def cast(self, df):
        """
        Return a copy of ``df`` with the numeric columns converted.

        Columns in ``to_numeric`` have ',' replaced by '.' and are parsed as
        numbers; anything unparseable becomes NaN. Columns in ``to_integer``
        then get NaN replaced by 0 and are cast to int64.

        :param df: dataframe with the columns named in ``self.columns``.

        :return: converted copy; the input dataframe is left untouched.
        """
        df = df.copy()

        # casting to numeric
        for c in self.to_numeric:
            # regex=False: the comma is a literal character, not a pattern.
            as_text = df[c].astype(str).str.replace(',', '.', regex=False)
            df[c] = pd.to_numeric(as_text, errors='coerce')

        # casting to integer
        for c in self.to_integer:
            df[c] = df[c].fillna(0).astype('int64')

        return df
    
# Build the cleaner and try it on the first page of the PDF.
dc = DataCleaning()
df = dc.get_df(page=1)
df

Unnamed: 0,categoria,marca,modelo,versao,motor,transmissao_velocidades,ar_cond,direcao_assistida,propulsao,combustivel,...,efeito_estufa_etanol_co2_fossil_g_km,efeito_estufa_gasolina_diesel_fossil_co2,km_litro_etanol_cidade,km_litro_etanol_estrada,km_litro_gasolina_diesel_eletrico_cidade,km_litro_gasolina_diesel_eletrico_estrada,consumo_energetico_mj_km,pbe_classificacao_relativa_categoria,pbe_classificacao_absoluta_geral,selo_conpet_eficiencia_energetica
0,Sub Compacto,CAOA CHERY,ICAR EQ1,TEC,Elétrico,Elétrico,A,S,E,E,...,0,0,,,48.8,36.8,0.48,A,A,-
1,Sub Compacto,FIAT,500E,ICON,Elétrico,Elétrico,A-1,S,E,E,...,0,0,,,47.3,40.4,0.46,A,A,-
2,Sub Compacto,FIAT,MOBI,EASY,1.0-8V,Combustão,M-5,N,M,F,...,0,88,9.8,11.0,14.2,15.5,1.44,B,B,-
3,Sub Compacto,FIAT,MOBI,TREKKING,1.0-8V,Combustão,M-5,S,H,F,...,0,93,9.6,10.4,13.5,15.0,1.50,C,B,-
4,Sub Compacto,FIAT,MOBI,LIKE,1.0-8V,Combustão,M-5,S,H,F,...,0,93,9.6,10.4,13.5,15.0,1.50,C,B,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,Grande,CHEVROLET,CRUZE HB,PRE HB AT,1.4T-16V,Combustão,A-6,S,E,F,...,0,117,7.0,9.2,10.4,13.3,1.91,C,C,-
144,Grande,CHEVROLET,CRUZE HB,PRE2 HB AT,1.4T-16V,Combustão,A-6,S,E,F,...,0,117,7.0,9.2,10.4,13.3,1.91,C,C,-
145,Grande,CHEVROLET,CRUZE HB,RS HB AT,1.4T-16V,Combustão,A-6,S,E,F,...,0,117,7.0,9.2,10.4,13.3,1.91,C,C,-
146,Grande,JEEP,COMPASS,S 4XE,1.3 16V T,Plug-in,A-6,S,E,G,...,0,47,,,25.4,24.2,0.80,A,A,SIM


In [6]:
# Extract and clean pages 1 to 7 of the PDF, one dataframe per page.
df_list = [dc.get_df(page) for page in range(1, 8)]

# checking the first and last lines of the last page
# since the table has a different format.
# display() is required here: a bare expression in the middle of a cell is
# evaluated and thrown away, so without it this check was never shown.
display(df_list[-1].iloc[[0, -1], :])

# Stack all pages, renumber the rows and persist the result.
df_final = pd.concat(df_list).reset_index(drop=True)
df_final.to_csv('pbe_2022.csv', index=False)

In [7]:
# This dataframe has the same problem as the 2021 case:
# some rows are split across two or more lines.
# To solve this, I will reuse the same function to merge the split lines back together.

def concat_lines(df, save='temp.csv'):
    """
    Merge rows that were split over two consecutive lines back into one row.

    A row is treated as the first fragment of a split record when more than
    half of its cells are null. Its non-null cells are kept in order, the
    first cell of the following row is appended (after a space) to the last
    kept cell, and the remaining non-null cells of the following row are
    added after it. The following row is consumed by the merge.

    Ex.:
    row  = [a, b, c, nan, nan, nan, nan, nan]
    row2 = [d, e, f, g, h, i, nan, nan]

    returns [a, b, 'c d', e, f, g, h, i]

    Notes:
    - Only one following row is merged per pass; run the function again for
      records split over more lines.
    - The last row is always kept unchanged (there is nothing to merge it with).
    - Null cells are dropped while merging, so values are packed to the left
      rather than kept under their original column.
    - A sparse row without any non-null cell carries no data and is skipped.
    - If the following row starts with a null cell, nothing is appended to the
      last kept cell; the rest of that row is still added.

    :param df: dataframe whose records may be split over consecutive rows.
    :param save: unused; kept so existing calls keep working (the CSV dump is disabled).

    :return: new dataframe with the same columns and a fresh 0..n-1 index.
    """
    n_rows = df.index.size
    half_width = len(df.columns) / 2

    data = []
    i = 0
    while i < n_rows:
        row = df.iloc[i]

        # Complete rows, and the last row whatever it looks like, are copied as is.
        if i == n_rows - 1 or row.isnull().sum() <= half_width:
            data.append(row.tolist())
            i += 1
            continue

        kept = [cell for cell in row if not pd.isnull(cell)]
        if not kept:
            # Nothing to carry over: drop the empty row and handle the next
            # one normally instead of failing on kept[-1].
            i += 1
            continue

        follower = df.iloc[i + 1]
        first = follower.iloc[0]
        if not pd.isnull(first):
            # The cell cut by the line break continues in the follower's first cell.
            kept[-1] += ' ' + first
        kept += [cell for cell in follower.iloc[1:] if not pd.isnull(cell)]
        data.append(kept)

        # Both the fragment and its follower have been consumed.
        i += 2

    df2 = pd.DataFrame(data, columns=df.columns).reset_index(drop=True)
    # df2.to_csv(save, index=False)
    return df2

# Reload the exported CSV, run the line merge twice in a row,
# and write the result back over the same file.
df_final = concat_lines(concat_lines(pd.read_csv('pbe_2022.csv')))
df_final.to_csv('pbe_2022.csv', index=False)

# df_final.to_excel('pbe_2022.xlsx')
df_final

Unnamed: 0,categoria,marca,modelo,versao,motor,transmissao_velocidades,ar_cond,direcao_assistida,propulsao,combustivel,...,efeito_estufa_etanol_co2_fossil_g_km,efeito_estufa_gasolina_diesel_fossil_co2,km_litro_etanol_cidade,km_litro_etanol_estrada,km_litro_gasolina_diesel_eletrico_cidade,km_litro_gasolina_diesel_eletrico_estrada,consumo_energetico_mj_km,pbe_classificacao_relativa_categoria,pbe_classificacao_absoluta_geral,selo_conpet_eficiencia_energetica
0,Sub Compacto,CAOA CHERY,ICAR EQ1,TEC,Elétrico,Elétrico,A,S,E,E,...,0.0,0.0,,,48.8,36.8,0.48,A,A,-
1,Sub Compacto,FIAT,500E,ICON,Elétrico,Elétrico,A-1,S,E,E,...,0.0,0.0,,,47.3,40.4,0.46,A,A,-
2,Sub Compacto,FIAT,MOBI,EASY,1.0-8V,Combustão,M-5,N,M,F,...,0.0,88.0,9.8,11.0,14.2,15.5,1.44,B,B,-
3,Sub Compacto,FIAT,MOBI,TREKKING,1.0-8V,Combustão,M-5,S,H,F,...,0.0,93.0,9.6,10.4,13.5,15.0,1.5,C,B,-
4,Sub Compacto,FIAT,MOBI,LIKE,1.0-8V,Combustão,M-5,S,H,F,...,0.0,93.0,9.6,10.4,13.5,15.0,1.5,C,B,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,Esportivo,PORSCHE,Taycan,Turbo,Elétrico,Elétrico,--,S,E,E,...,0.0,0.0,,,24.6,23.8,0.84,A,A,-
781,Esportivo,PORSCHE,Taycan,Turbo S,Elétrico,Elétrico,--,S,E,E,...,0.0,0.0,,,23.5,22.9,0.87,A,A,-
782,Esportivo,PORSCHE,Taycan,--,Elétrico,Elétrico,--,S,E,E,...,0.0,0.0,,,22.4,23.5,0.89,A,A,-
783,Esportivo,PORSCHE,Taycan,4CT,Elétrico,Elétrico,--,S,E,E,...,0.0,0.0,,,23.0,22.1,0.9,A,A,-


In [8]:
# df_final[(df_final['categoria'] == 'Extra Grande') & (df_final['marca'] == 'AUDI')]
# df_final[df_final['categoria'] == 'Plus']
# Dump rows 190 to 201 (by position) to a spreadsheet for manual inspection.
df_final.iloc[190:202, :].to_excel(excel_writer='asdf.xlsx')

In [9]:
# This time, it looks like a few columns got clumped together.
# The columns seem to be separated by spaces.
# Notice that a few values in the transmissao_velocidades column contain entries separated by spaces ('A - 6' instead of 'A-6').
# I will fix this in the script.

# def split_columns(df, columns):
    
#     df_list = []
#     ncol = 0
#     for c in df.columns:
        
#         if c in columns:
            
#             temp_df = df[c].str.replace(r'\s-\s', '-', regex=True).str.split(' ', expand=True)
#             temp_df.columns = [ncol + i for i in range(len(temp_df.columns))]
#             ncol = temp_df.columns[-1]+1
            
#             df_list.append(temp_df)
#         else:
#             temp_df = df[[c]]
#             temp_df.columns = [ncol]
#             ncol += 1
            
#             df_list.append(temp_df)
        
#     df = pd.concat(df_list, axis=1)
#     df.columns = [i for i in range(len(df.columns))]
    
#     return df

# df = split_columns(df, [5, 8, 9, 10, 11, 12])
# df.head()

In [10]:
# Now we can apply the same data conversion that was applied in the other script.
# However, let's finish by writing a single function that does it all.

# class DataCleaning:
#     """
#     Cleans the pdf 'pbe-veicular-2022.pdf'.
#     """
    
#     columns = [
#         'categoria', 'marca', 'modelo', 'versao', 'motor', 'transmissao_velocidades',
#         'ar_cond', 'direcao_assistida', 'propulsao', 'combustivel',
#         'poluentes_nmhc_g_km', 'poluentes_co_g_km', 'poluentes_nox_g_km', 'poluentes_reducao_relativa',
#         'efeito_estufa_etanol_co2_fossil_g_km', 'efeito_estufa_gasolina_diesel_fossil_co2',
#         'km_litro_etanol_cidade', 'km_litro_etanol_estrada',
#         'km_litro_gasolina_diesel_eletrico_cidade', 'km_litro_gasolina_diesel_eletrico_estrada',
#         'consumo_energetico_mj_km',
#         'pbe_classificacao_relativa_categoria', 'pbe_classificacao_absoluta_geral',
#         'selo_conpet_eficiencia_energetica'
#     ]
#     to_numeric = [
#         'poluentes_nmhc_g_km', 'poluentes_co_g_km', 'poluentes_nox_g_km',
#         'efeito_estufa_etanol_co2_fossil_g_km', 'efeito_estufa_gasolina_diesel_fossil_co2',
#         'km_litro_etanol_cidade', 'km_litro_etanol_estrada',
#         'km_litro_gasolina_diesel_eletrico_cidade', 'km_litro_gasolina_diesel_eletrico_estrada',
#         'consumo_energetico_mj_km'
#     ]
#     to_integer = ['efeito_estufa_etanol_co2_fossil_g_km', 'efeito_estufa_gasolina_diesel_fossil_co2']
    
#     def __init__(self, pdf: str):
#         """
#         :param pdf: file name.
#         """
#         self.pdf = pdf
        
#     def get_df(self, page: int) -> pd.DataFrame:
#         """
#         Get page as a dataframe
        
#         :param page: page to extract.
        
#         return pd.DataFrame
#         """
        
#         l = tabula.read_pdf(self.pdf, pages=page)
#         df = l[0]

#         # The data seems to start at the 21st row
#         df = df.loc[21:].reset_index(drop=True)
        
#         display(df.tail())
        
#         df.columns = [i for i in range(len(df.columns))]
#         df = split_columns(df, [5, 8, 9, 10, 11, 12])
        
#         display(df.tail())
        
#         df = self.assign_columns(df)
#         df = self.cast(df)
        
#         display(df.tail())

#         return df
    
#     def assign_columns(self, df):
#         df.columns = self.columns
#         return df
        
#     def cast(self, df):
#         # casting to numeric
#         for c in self.to_numeric:
#             df[c] = df[c].astype(str).str.replace(',', '.')
#             df[c] = pd.to_numeric(df[c], errors='coerce')

#         # casting to integer
#         for c in self.to_integer:
#             df[c] = df[c].fillna(0)
#             df[c] = df[c].astype('int64')
            
#         return df

# dc = DataCleaning('pbe-veicular-2022.pdf')
# df = dc.get_df(page=1)

# display(df.dtypes)
# df.reset_index(drop=True)