In [63]:
from pathlib import Path

import pandas as pd

# Reading and cleaning of the dataset

Reading of the different parts of the dataset. There are 5 different files that are converted into pandas DataFrames:
- `Entities.csv`, `Officers.csv`, `Intermediaries.csv` are dedicated to the three types of actors encountered in the database. Entities refer to asset providers and officers to financial actors (company, private client, ...). Intermediaries refer to actors putting clients and financial service providers in contact.
- `Addresses.csv` describes all the addresses contained in the database; those addresses are linked to officers.
- `all_edges.csv` describes the relationships between the items of the database described before, that is, entities, officers, intermediaries and addresses. Four different kinds of relationships are described in this dataset: 'registered address', 'shareholder of', 'beneficiary of' and 'intermediary of'.



In [64]:
# Load the five raw CSV exports in one pass. Every file is read with
# dtype='object', and the files are read in the order the names are listed.
entities, intermediaries, officers, addresses, all_edges = (
    pd.read_csv(csv_path, dtype='object')
    for csv_path in (
        "./data/data_csv/Entities.csv",
        "./data/data_csv/Intermediaries.csv",
        "./data/data_csv/Officers.csv",
        "./data/data_csv/Addresses.csv",
        "./data/data_csv/all_edges.csv",
    )
)

### Dataset description

Now we print the DataFrames' columns and size in order to have a rough idea of their content.

In [65]:
# Print a short summary (shape and column index) for each loaded DataFrame.
frames_to_describe = [
    ('entities:', entities),
    ('intermediaries:', intermediaries),
    ('officers:', officers),
    ('addresses:', addresses),
    ('all_edges:', all_edges),
]

for position, (heading, frame) in enumerate(frames_to_describe):
    if position:
        # Blank line between consecutive summaries, none after the last one.
        print()
    print(heading)
    print('\tshape:', frame.shape)
    print('\tcolumns:', frame.columns)

entities:
	shape: (495038, 21)
	columns: Index(['name', 'original_name', 'former_name', 'jurisdiction',
       'jurisdiction_description', 'company_type', 'address', 'internal_id',
       'incorporation_date', 'inactivation_date', 'struck_off_date',
       'dorm_date', 'status', 'service_provider', 'ibcRUC', 'country_codes',
       'countries', 'note', 'valid_until', 'node_id', 'sourceID'],
      dtype='object')

intermediaries:
	shape: (24177, 10)
	columns: Index(['name', 'internal_id', 'address', 'valid_until', 'country_codes',
       'countries', 'status', 'node_id', 'sourceID', 'note'],
      dtype='object')

officers:
	shape: (370854, 8)
	columns: Index(['name', 'icij_id', 'valid_until', 'country_codes', 'countries',
       'node_id', 'sourceID', 'note'],
      dtype='object')

addresses:
	shape: (151605, 8)
	columns: Index(['address', 'icij_id', 'valid_until', 'country_codes', 'countries',
       'node_id', 'sourceID', 'note'],
      dtype='object')

all_edges:
	shape: (1535552, 

### Cleaning up

Rows of `entities`, `intermediaries` and `officers` whose `name` or `countries` column contains a NaN value are dropped because they cannot be used in the analysis. We also drop rows of `addresses` whose `address` or `countries` column contains a NaN value.

In [66]:
def drop_rows_missing(frame, required_columns, label):
    """Return `frame` without the rows that have a NaN in any required column.

    Parameters
    ----------
    frame : pandas.DataFrame
        The DataFrame to filter; it is not modified.
    required_columns : list of str
        Columns that must be non-null for a row to be kept.
    label : str
        Name used in the printed "rows dropped" report.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame containing only the rows where every column in
        `required_columns` is non-null.
    """
    # dropna(subset=...) removes a row as soon as one of the listed columns is
    # NaN, which matches filtering on each column's notnull() mask in turn.
    cleaned = frame.dropna(subset=required_columns)
    print(len(frame) - len(cleaned), 'rows dropped in', label)
    return cleaned


# The three actor tables need both a name and a country to be kept.
entities = drop_rows_missing(entities, ['name', 'countries'], 'entities')
intermediaries = drop_rows_missing(intermediaries, ['name', 'countries'], 'intermediaries')
officers = drop_rows_missing(officers, ['name', 'countries'], 'officers')

# Addresses have no `name` column; `address` plays that role here.
addresses = drop_rows_missing(addresses, ['address', 'countries'], 'addresses')

176682 rows dropped in entities
1515 rows dropped in intermediaries
118856 rows dropped in officers
891 rows dropped in addresses


### Writing clean datasets in new files

In order to avoid carrying out the preprocessing again, we write the DataFrames to new files.

In [67]:
# Persist the DataFrames so the preprocessing does not have to be repeated.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
Path('./data/data_clean_csv').mkdir(parents=True, exist_ok=True)

# index = False: the row index is not written to the CSV files.
entities.to_csv('./data/data_clean_csv/entities_clean.csv', index = False)
intermediaries.to_csv('./data/data_clean_csv/intermediaries_clean.csv', index = False)
officers.to_csv('./data/data_clean_csv/officers_clean.csv', index = False)
addresses.to_csv('./data/data_clean_csv/addresses_clean.csv', index = False)
# NOTE(review): in the cells visible here all_edges is written without any
# filtering, so it may still reference rows dropped from the other tables.
# Confirm that downstream code handles such edges.
all_edges.to_csv('./data/data_clean_csv/all_edges_clean.csv', index = False)