## Understanding the relationship between CERC and CIP participants

Let's start by extracting the participants from the publicly available lists, beginning with CERC.

In [97]:
import tabula
import pandas as pd

# Parse every page of the CERC participant list. The result is indexed by
# position below, one DataFrame per extracted table.
cerc_pages = tabula.read_pdf("./input/LISTA-PARTICIPANTES-CERC-ARRANJOS-DE-PAGAMENTO-1.pdf", pages="all")

# In the first table the real header was parsed as the first data row:
# promote that row to column labels and remove it from the data.
first_page = cerc_pages[0]
cerc_header = first_page.iloc[0]
first_page = first_page.drop(first_page.index[0])
first_page.columns = cerc_header

# In every following table the opposite happened: the first data row was
# parsed as the column labels. Put it back as row 0 and reuse the labels of
# the first table. All tables after the first are handled, however many the
# PDF yields, instead of a hard-coded count.
# NOTE(review): the recovered values come back as column-label strings; check
# against the PDF that pandas did not alter them while parsing the header.
cerc_frames = [first_page]
for i in range(1, len(cerc_pages)):
    page = cerc_pages[i].copy()  # leave the raw extraction untouched
    page.loc[-1] = page.columns  # temporary label -1 ...
    page.index = page.index + 1  # ... becomes 0 once every label is shifted
    page = page.sort_index()     # moves the recovered row to the top
    page.columns = first_page.columns
    cerc_frames.append(page)

# ignore_index=True already yields a clean 0..n-1 index, so the former
# `df_cerc.reset_index(drop=True)` call (whose result was discarded) is gone.
df_cerc = pd.concat(cerc_frames, ignore_index=True)

# Remove the first column of the combined table
df_cerc = df_cerc.drop(columns=df_cerc.columns[:1])

# Renaming the columns
df_cerc.columns = ["corporate_name", "cnpj", "profile"]

# Remove dashes, slashes and dots in cnpj column
df_cerc["cnpj"] = df_cerc["cnpj"].str.replace("[./-]", "", regex=True)



We perform the same operation for CIP:

In [241]:
# Extract every table from the CIP participants PDF.
df_cip = tabula.read_pdf("./input/Participantes Homologados.pdf", pages="all")

# The tables at index 0 and 4 each start with three rows that are discarded here.
for i in (0, 4):
    df_cip[i] = df_cip[i].drop(df_cip[i].index[:3])


In [239]:
# Preview the first rows of the table at index 4 to see how it was parsed.
df_cip[4].head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,SLC - RELAÇÃO DE PARTICIPANTES,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,,,PRODUÇÃO,,,,,,,
1,,,,,,,Papel,,,,
2,No de Partic.,CNPJ,ISPB,Razão Social,,,,,,,
3,356,33.172.537/0001.98,33172537,BANCO J.P. MORGAN S.A.,,X,,X,,,
4,357,33.254.319/0001.00,33254319,BANCO LOSANGO S.A. - BANCO MULTIPLO,,,X,,,,


In [210]:
# Summarise the columns of the table at index 0 before trimming it.
df_cip[0].info()

# Drop the columns at positions 0, 1 and 12 from table 0, and the column at
# position 0 from table 4.
df_cip[0] = df_cip[0].drop(columns=df_cip[0].columns[[0, 1, 12]])
df_cip[4] = df_cip[4].drop(columns=df_cip[4].columns[[0]])

# Label the ten columns that remain in table 0.
df_cip[0].columns = [
    "cnpj",
    "ispb",
    "corporate_name",
    "is_iap",
    "is_acquirer",
    "is_bank_account_owner",
    "is_issuer",
    "is_settler",
    "is_subacquirer",
    "is_subacquirer_receiver",
]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 3 to 89
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                0 non-null      float64
 1   Atualizado em 11.08.2022  87 non-null     object 
 2   Unnamed: 1                87 non-null     object 
 3   Unnamed: 2                87 non-null     object 
 4   Unnamed: 3                87 non-null     object 
 5   Unnamed: 4                4 non-null      object 
 6   Unnamed: 5                15 non-null     object 
 7   Unnamed: 6                19 non-null     object 
 8   Unnamed: 7                38 non-null     object 
 9   Unnamed: 8                22 non-null     object 
 10  Unnamed: 9                7 non-null      object 
 11  Unnamed: 10               32 non-null     object 
 12  Unnamed: 11               0 non-null      float64
dtypes: float64(2), object(11)
memory usage: 9.0+ KB


In [223]:
# Display the first rows of the table at index 4 to check its layout.
df_cip[4].head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,SLC - RELAÇÃO DE PARTICIPANTES,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,,,PRODUÇÃO,,,,,,,
1,,,,,,,Papel,,,,
2,No de Partic.,CNPJ,ISPB,Razão Social,,,,,,,
3,356,33.172.537/0001.98,33172537,BANCO J.P. MORGAN S.A.,,X,,X,,,
4,357,33.254.319/0001.00,33254319,BANCO LOSANGO S.A. - BANCO MULTIPLO,,,X,,,,


In [193]:
# For the tables at index 1, 2, 3 and 5: discard the first two rows, drop the
# columns at positions 0, 1 and 12, then label the ten columns that remain.
# Position 12 must exist, so each of these tables needs at least 13 columns.
for i in (1, 2, 3, 5):
    df_cip[i] = df_cip[i].drop(index=df_cip[i].index[:2])
    df_cip[i] = df_cip[i].drop(columns=df_cip[i].columns[[0, 1, 12]])
    df_cip[i].columns = [
        "cnpj",
        "ispb",
        "corporate_name",
        "is_iap",
        "is_acquirer",
        "is_bank_account_owner",
        "is_issuer",
        "is_settler",
        "is_subacquirer",
        "is_subacquirer_receiver",
    ]

IndexError: index 12 is out of bounds for axis 0 with size 11

In [242]:
# Build the final CIP table from the per-table extraction held in `df_cip`.
# Rows are not trimmed from the tables at index 0 and 4 in this cell.
# When the cell finishes, `df_cip` is a single DataFrame, so running it again
# without reloading the PDF would fail with an opaque KeyError on `df_cip[0]`;
# raise an explicit error instead.
if isinstance(df_cip, pd.DataFrame):
    raise RuntimeError(
        "df_cip has already been concatenated; re-run the cell that loads the PDF first"
    )

# Single definition of the target column names, shared by every table so the
# schema cannot drift between copies of the list.
CIP_COLUMNS = [
    "cnpj",
    "ispb",
    "corporate_name",
    "is_iap",
    "is_acquirer",
    "is_bank_account_owner",
    "is_issuer",
    "is_settler",
    "is_subacquirer",
    "is_subacquirer_receiver",
]

# Remove unnecessary columns from the tables at index 0 and 4. Table 4 has a
# different layout, so only its first column is dropped.
df_cip[0] = df_cip[0].drop(df_cip[0].columns[[0, 1, 12]], axis=1)
df_cip[4] = df_cip[4].drop(df_cip[4].columns[[0]], axis=1)

# Naming the columns correctly
df_cip[0].columns = CIP_COLUMNS
df_cip[4].columns = CIP_COLUMNS

# Handle all the remaining tables at once (everything except index 0 and 4):
# drop the first two rows, drop the columns at positions 0, 1 and 12, rename.
for i in (1, 2, 3, 5):
    df_cip[i] = df_cip[i].drop(df_cip[i].index[:2])
    df_cip[i] = df_cip[i].drop(df_cip[i].columns[[0, 1, 12]], axis=1)
    df_cip[i].columns = CIP_COLUMNS

# Stack tables 0..5 in order into one frame with a fresh 0..n-1 index.
df_cip = pd.concat([df_cip[i] for i in range(6)], ignore_index=True)

# Remove dashes, slashes and dots in cnpj column
df_cip["cnpj"] = df_cip["cnpj"].str.replace("[./-]", "", regex=True)

In [243]:
# Summarise the combined CIP table: row count, column names, non-null counts and dtypes.
df_cip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   cnpj                     420 non-null    object
 1   ispb                     420 non-null    object
 2   corporate_name           420 non-null    object
 3   is_iap                   11 non-null     object
 4   is_acquirer              63 non-null     object
 5   is_bank_account_owner    64 non-null     object
 6   is_issuer                116 non-null    object
 7   is_settler               46 non-null     object
 8   is_subacquirer           60 non-null     object
 9   is_subacquirer_receiver  220 non-null    object
dtypes: object(10)
memory usage: 32.9+ KB


In [132]:
# Drop the columns at positions 0, 1 and 12 from the table at index 1.
df_cip[1] = df_cip[1].drop(columns=df_cip[1].columns[[0, 1, 12]])

In [134]:
# Label the ten columns of the table at index 1.
df_cip[1].columns = [
    "cnpj",
    "ispb",
    "corporate_name",
    "is_iap",
    "is_acquirer",
    "is_bank_account_owner",
    "is_issuer",
    "is_settler",
    "is_subacquirer",
    "is_subacquirer_receiver",
]

In [170]:
# Trial concatenation of the tables at index 0, 1 and 2 into one frame with a fresh index.
df_cip_test = pd.concat([df_cip[n] for n in (0, 1, 2)], ignore_index=True)

In [237]:
# Summarise the table at index 4.
# NOTE(review): the recorded output is `KeyError: 4` — presumably `df_cip` had
# already been replaced by the concatenated DataFrame when this ran; confirm,
# and reload the PDF before re-running.
df_cip[4].info()

KeyError: 4