## Understanding the relationship between CERC and CIP participants

Let's start extracting the list of participants from publicly available lists starting with CERC.

In [9]:
import tabula
import pandas as pd

df_cerc = tabula.read_pdf("./input/LISTA-PARTICIPANTES-CERC-ARRANJOS-DE-PAGAMENTO-1.pdf", pages="all")

# In first dataframe transform the first row in header
df_cerc[0].columns = df_cerc[0].iloc[0]
df_cerc[0] = df_cerc[0].drop(df_cerc[0].index[0])

# Transform the header in the first row
for i in range(1,7):
    df_cerc[i].loc[-1] = df_cerc[i].columns
    df_cerc[i].index = df_cerc[i].index + 1
    df_cerc[i].sort_index(inplace=True)
    df_cerc[i].columns = df_cerc[0].columns

df_cerc = pd.concat([df_cerc[0], df_cerc[1], df_cerc[2], df_cerc[3], df_cerc[4], df_cerc[5], df_cerc[6]], ignore_index=True)

# Remove the first column in all the dataframes
df_cerc = df_cerc.drop(df_cerc.columns[:1], axis=1)

df_cerc.reset_index(drop=True)

# Renaming the columns
df_cerc.columns = ["corporate_name", "cnpj", "profile"]

# Remove dashes and dots in cnpj column
df_cerc["cnpj"] = df_cerc["cnpj"].str.replace("[./-]", "", regex=True)



The same operation we perform with CIP:

In [10]:
df_cip = tabula.read_pdf("./input/Participantes Homologados.pdf", pages="all")

# Remove first three rows from the first page
df_cip[0] = df_cip[0].drop(df_cip[0].index[:3])
df_cip[4] = df_cip[4].drop(df_cip[4].index[:3])

# Remove unnecessary columns from the first page and forth page
df_cip[0] = df_cip[0].drop(df_cip[0].columns[[0,1,12]], axis=1)
df_cip[4] = df_cip[4].drop(df_cip[4].columns[[0]], axis=1)

# Naming the columns correctly
df_cip[0].columns = ["cnpj", "ispb", "corporate_name", "is_iap", "is_acquirer", \
    "is_bank_account_owner", "is_issuer", "is_settler", "is_subacquirer", "is_subacquirer_receiver"]
df_cip[4].columns = df_cip[0].columns

# Handling all the pages at one expcept the 4th page (!!!)
for i in (1,2,3,5):
    df_cip[i] = df_cip[i].drop(df_cip[i].index[:2])
    df_cip[i] = df_cip[i].drop(df_cip[i].columns[[0,1,12]], axis=1)
    df_cip[i].columns = ["cnpj", "ispb", "corporate_name", "is_iap", "is_acquirer", \
        "is_bank_account_owner", "is_issuer", "is_settler", "is_subacquirer", "is_subacquirer_receiver"]

df_cip = pd.concat([df_cip[0], df_cip[1], df_cip[2], df_cip[3], df_cip[4], df_cip[5]], ignore_index=True)

df_cip["cnpj"] = df_cip["cnpj"].str.replace("[./-]", "", regex=True)

In [11]:
# Select only subacquirers from CIP original list
df_sub_cip = df_cip.loc[(df_cip['is_subacquirer'] == 'X') | (df_cip['is_subacquirer_receiver'] == 'X')]

In [12]:
df_sub_cip

Unnamed: 0,cnpj,ispb,corporate_name,is_iap,is_acquirer,is_bank_account_owner,is_issuer,is_settler,is_subacquirer,is_subacquirer_receiver
2,00063960000109,00063960,WMB SUPERMERCADOS DO BRASIL LTDA.,,,,,,,X
10,00714671000114,00714671,EWALLY TECNOLOGIA E SERVIÇOS S.A.,,X,,,,X,
11,00776574000156,00776574,B2W COMPANHIA DIGITAL,,,,,,,X
12,00776574000660,00776574,B2W COMPANHIA DIGITAL,,,,,,X,
14,00860640000171,00860640,INGRESSO.COM LTDA,,,,,,,X
...,...,...,...,...,...,...,...,...,...,...
499,29257606000199,29257606,PAYSHOPX,,,,,,,X
501,21301803000156,21301803,POTENCIAL TECNOLOGIA LTDA,,,,,,,X
504,12481100000166,12481100,BIZ2U PAGAMENTOS LTDA,,,,,,,X
505,37313312000165,37313312,POCKET PAY SOLUÇÕES EM PAGAMENTOS LTDA,,,,,,,X


In [13]:
# Merging CIP and CERC dataframes
df_merged = pd.merge(df_cerc, df_cip, on="cnpj", how="inner")

# Removing NaN values
df_merged.fillna({'is_iap':False, 'is_acquirer':False, 'is_bank_account_owner':False, 'is_issuer':False,\
    'is_settler':False, 'is_subacquirer': False, 'is_subacquirer_receiver':False}, inplace=True)

# Substitute X value by True
entity_columns = ["is_iap", "is_acquirer", "is_bank_account_owner", "is_issuer", "is_settler", "is_subacquirer", "is_subacquirer_receiver"]
df_merged[entity_columns] = df_merged[entity_columns].replace({"X":True})

# Selecting only subacquirers and subacquirers receivers
df_sub = df_merged.loc[(df_merged['is_subacquirer'] == True) | (df_merged['is_subacquirer_receiver'] == True)]

In [17]:

df_cip_sub_only = df_sub_cip[~df_sub_cip.cnpj.isin(df_sub.cnpj)]

# Removing NaN values
df_cip_sub_only.fillna({'is_iap':False, 'is_acquirer':False, 'is_bank_account_owner':False, 'is_issuer':False,\
    'is_settler':False, 'is_subacquirer': False, 'is_subacquirer_receiver':False}, inplace=True)

# Substitute X value by True
df_cip_sub_only[entity_columns] = df_cip_sub_only[entity_columns].replace({"X":True})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cip_sub_only[entity_columns] = df_cip_sub_only[entity_columns].replace({"X":True})


In [7]:
occur = df_sub.groupby(['is_subacquirer', 'is_subacquirer_receiver']).size()

In [8]:
df_sub.to_csv("./output/subacquirers_br.csv", index=False)