In [1]:
from density_maps import *
import folium
import pandas as pd
import numpy as np
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import branca
import panama_papers_aux
from IPython.display import Markdown

# I. Data Preprocessing

## Merging of the 4 leaks folders

The Paradise Papers are not included for the moment.
This short piece of code converts the new version of the dataset into the file layout of the former one.

In [2]:
# Leak folders to merge; the Paradise Papers are left out for now.
leaks = ['bahamas_leaks', 'offshore_leaks', 'panama_papers']  # paradise_papers
month_code = '2017-11-17'


def _read_leak_csv(leak, suffix):
    """Read `<leak>.<suffix>.csv` from the folder of `leak`, keeping every column as `object`.

    Parameters
    ----------
    leak : str
        Name of the leak, used both in the folder name and in the file name.
    suffix : str
        Middle part of the file name, e.g. 'edges' or 'nodes.entity'.

    Returns
    -------
    pandas.DataFrame
    """
    path = './data/data_csv/csv_' + leak + '.' + month_code + '/' + leak + '.' + suffix + '.csv'
    return pd.read_csv(path, dtype=object)


def _merge_leaks(suffix):
    """Stack the `suffix` file of every leak in `leaks` into one DataFrame.

    All frames are collected first and concatenated once: `DataFrame.append` no longer
    exists in pandas 2.0 and re-copied the accumulated frame at every iteration.
    `ignore_index=True` gives the result a fresh 0..n-1 index.
    """
    return pd.concat([_read_leak_csv(leak, suffix) for leak in leaks], ignore_index=True)


def _strip_node_prefix(columns):
    """Remove a leading 'n.' from every column label.

    The pattern is a raw string and `regex=True` is explicit because the default of
    `Series.str.replace` became a literal match in pandas 2.0, which would leave the
    prefix in place without any error. The `^` anchor keeps the substitution from
    touching an 'n.' that appears in the middle of a label.
    """
    return columns.str.replace(r'^n\.', '', regex=True)


edges_df = _merge_leaks('edges')
addresses_df = _merge_leaks('nodes.address')
entities_df = _merge_leaks('nodes.entity')
intermediaries_df = _merge_leaks('nodes.intermediary')
officers_df = _merge_leaks('nodes.officer')

# The edge file keeps its column labels unchanged.
# NOTE: the index is deliberately still written (pandas default), so the files keep the
# same layout as before; it shows up as the 'Unnamed: 0' column when they are read back.
edges_df.to_csv('./data/data_csv/all_edges.csv')

# Node files: strip the 'n.' prefix from the column labels before writing.
for nodes_df, merged_path in [
    (addresses_df, './data/data_csv/Addresses.csv'),
    (entities_df, './data/data_csv/Entities.csv'),
    (intermediaries_df, './data/data_csv/Intermediaries.csv'),
    (officers_df, './data/data_csv/Officers.csv'),
]:
    nodes_df.columns = _strip_node_prefix(nodes_df.columns)
    nodes_df.to_csv(merged_path)

## Reading and cleaning of the dataset

Reading of the different parts of the dataset. There are 5 different files that are converted into pandas DataFrames:
- `Entities.csv`, `Officers.csv` and `Intermediaries.csv` are dedicated to the three types of actors encountered in the database. Entities refer to asset providers and officers to financial actors (company, private client, ...). Intermediaries refer to actors putting clients and financial service providers in contact.
- `Addresses.csv` describes all the addresses contained in the database; those addresses are linked to officers.
- `all_edges.csv` describes the relationships between the items of the database described above, namely entities, officers, intermediaries and addresses. Four different kinds of relationships are described in this dataset: 'registered address', 'shareholder of', 'beneficiary of' and 'intermediary of'.



In [3]:
# Load the five merged CSV files in one pass; every column is read as `object`
# so that no value is re-interpreted by pandas.
entities, intermediaries, officers, addresses, all_edges = (
    pd.read_csv(merged_csv_path, dtype='object')
    for merged_csv_path in (
        "./data/data_csv/Entities.csv",
        "./data/data_csv/Intermediaries.csv",
        "./data/data_csv/Officers.csv",
        "./data/data_csv/Addresses.csv",
        "./data/data_csv/all_edges.csv",
    )
)

### Dataset description

Now we print the DataFrames' columns and size in order to have a rough idea of their content.

In [4]:
# Print the shape and the column labels of each DataFrame, with one blank line
# between two consecutive summaries (and none after the last one).
frames_to_describe = [
    ('entities', entities),
    ('intermediaries', intermediaries),
    ('officers', officers),
    ('addresses', addresses),
    ('all_edges', all_edges),
]

for position, (label, frame) in enumerate(frames_to_describe):
    if position > 0:
        print()
    print(label + ':')
    print('\tshape:', frame.shape)
    print('\tcolumns:', frame.columns)

entities:
	shape: (495038, 19)
	columns: Index(['Unnamed: 0', 'labels(n)', 'valid_until', 'country_codes', 'countries',
       'node_id', 'sourceID', 'address', 'name', 'jurisdiction_description',
       'service_provider', 'jurisdiction', 'closed_date', 'incorporation_date',
       'ibcRUC', 'type', 'status', 'company_type', 'note'],
      dtype='object')

intermediaries:
	shape: (24177, 19)
	columns: Index(['Unnamed: 0', 'labels(n)', 'valid_until', 'country_codes', 'countries',
       'node_id', 'sourceID', 'address', 'name', 'jurisdiction_description',
       'service_provider', 'jurisdiction', 'closed_date', 'incorporation_date',
       'ibcRUC', 'type', 'status', 'company_type', 'note'],
      dtype='object')

officers:
	shape: (370854, 19)
	columns: Index(['Unnamed: 0', 'labels(n)', 'valid_until', 'country_codes', 'countries',
       'node_id', 'sourceID', 'address', 'name', 'jurisdiction_description',
       'service_provider', 'jurisdiction', 'closed_date', 'incorporation_date'

### Cleaning up

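Rows of `entities`, `intermediaries` and `officers` whose `name` or `countries` column contains a NaN value are dropped because these rows cannot be exploited in the analysis. We also drop the rows of `addresses` whose `address` or `countries` column has a NaN value. In all four DataFrames, rows whose `country_codes` value is `XXX` are dropped as well, and the country codes `REU` and `MTQ` are replaced by `FRA`.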

In [5]:
def _clean_country_rows(frame, required_column, label):
    """Drop unusable rows of `frame` and map the codes 'REU' and 'MTQ' to 'FRA'.

    A row is kept only if `required_column` and `countries` are both non-null and
    `country_codes` is different from 'XXX'. The number of dropped rows is printed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `required_column`, `countries` and `country_codes`.
    required_column : str
        Column that must be non-null for a row to be kept ('name' or 'address').
    label : str
        Name of the DataFrame, used in the printed message only.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; `frame` itself is not modified.
    """
    rows_before = frame.shape[0]
    keep = (
        frame[required_column].notnull()
        & frame['countries'].notnull()
        & (frame['country_codes'] != 'XXX')
    )
    # Work on an explicit copy so the assignment below does not write into a slice.
    cleaned = frame[keep].copy()

    # Only the `country_codes` cell is rewritten. The previous form,
    # `df[df['country_codes'] == 'REU'] = 'FRA'`, overwrote EVERY column of the matching
    # rows with 'FRA' (name, node_id, ...), which destroyed those records.
    # NOTE(review): `countries` keeps its original value for these rows — confirm whether
    # it should be aligned with the new code as well.
    cleaned['country_codes'] = cleaned['country_codes'].replace({'REU': 'FRA', 'MTQ': 'FRA'})

    print(rows_before - cleaned.shape[0], 'rows dropped in', label)
    return cleaned


entities = _clean_country_rows(entities, 'name', 'entities')

intermediaries = _clean_country_rows(intermediaries, 'name', 'intermediaries')

officers = _clean_country_rows(officers, 'name', 'officers')

addresses = _clean_country_rows(addresses, 'address', 'addresses')

187862 rows dropped in entities
1909 rows dropped in intermediaries
158254 rows dropped in officers
1247 rows dropped in addresses


### Writing clean datasets in new files

So that the preprocessing does not have to be carried out again, we write the DataFrames to new files.

In [6]:
# Persist every cleaned DataFrame next to the others, without the index column.
clean_outputs = [
    (entities, './data/data_clean_csv/entities_clean.csv'),
    (intermediaries, './data/data_clean_csv/intermediaries_clean.csv'),
    (officers, './data/data_clean_csv/officers_clean.csv'),
    (addresses, './data/data_clean_csv/addresses_clean.csv'),
    (all_edges, './data/data_clean_csv/all_edges_clean.csv'),
]

for clean_frame, clean_path in clean_outputs:
    clean_frame.to_csv(clean_path, index=False)

# II. Density Maps

This widget displays maps showing the distribution of officers, intermediaries and entities across the world.
The user can specify which of these data sets to display, and for which region (either the entire world or a specific one).

The user can also choose between 3 geoJSON files of different weights, and thus of different levels of precision. With the lightest geoJSON, the processing time is reduced and the map is more responsive, but some small countries are missing.
With the heaviest geoJSON, all the countries are present, but the processing can take a long time and the map can be less responsive if the computer is not powerful enough. If the user wishes to see a small country that is not in the lightest geoJSON, they can choose a heavier geoJSON and restrict the geoJSON area to a continent so that it does not involve too much computation time.

The color scale is logarithmic because the number of actors in a country can differ a lot, and the presence of outliers gives an unreadable map with a linear scale.

In [7]:
# Choices offered by the three dropdowns of the density-map widget; the keys are the
# keyword arguments handed to `create_density_map`.
density_map_choices = dict(
    query=['entities', 'officers', 'intermediaries'],
    json_weight=['light', 'mid_weight', 'heavy'],
    region=['World', 'Europe', 'Asia', 'Americas', 'Africa', 'Oceania'],
)

interact(create_density_map, **density_map_choices)

<function density_maps.create_density_map>

# III. Connection Queries

## Simple queries

### Reading clean data

In [8]:
# Reload the cleaned CSV files in one pass, again with every column read as `object`.
entities, intermediaries, officers, addresses, all_edges = (
    pd.read_csv(clean_csv_path, dtype='object')
    for clean_csv_path in (
        "./data/data_clean_csv/entities_clean.csv",
        "./data/data_clean_csv/intermediaries_clean.csv",
        "./data/data_clean_csv/officers_clean.csv",
        "./data/data_clean_csv/addresses_clean.csv",
        "./data/data_clean_csv/all_edges_clean.csv",
    )
)

### DataFrame dictionary creation

We now create a dictionary that contains all the DataFrames:

In [9]:
# Lookup table from item type (plus 'all_edges') to the matching DataFrame.
df_dictionary = dict(
    Entity=entities,
    Intermediary=intermediaries,
    Officer=officers,
    Address=addresses,
    all_edges=all_edges,
)

### List of available countries

`available_countries` is a list containing all the countries that can be used as a filter for the query.

In [10]:
# Keep only the single-country values of `officers['countries']` (entries containing ';'
# are skipped), sort them, and put the 'All' choice first.
single_country_names = sorted(
    country for country in officers['countries'].unique() if ';' not in country
)
available_countries = ['All'] + single_country_names
len(available_countries)

206

### GUI to get the relationships of a particular item

The `get_relationships` function is meant to find all the relationships of a particular item of the database, represented as a single-row DataFrame that is passed as input (`df_queried_item`). It outputs a DataFrame showing all the incoming and outgoing relationships involving the queried item.
This function is then used through a GUI at the end of this notebook.

In [11]:
# NOTE(review): presumably loads the capital-city coordinates that the connection map
# relies on — confirm in panama_papers_aux.
panama_papers_aux.Item.readCapitalCoordinates()

In [12]:
options_type_dic = {
    'Entities': 'entities',
    'Intermediaries': 'intermediaries',
    'Officers': 'officers',
}

# Input controls of the search form: item type, free-text name and country filter.
option_list = ['Entity', 'Intermediary', 'Officer']
widgetDFName = widgets.Dropdown(options=option_list, description='Type')
widgetQueriedName = widgets.Text(value='Type name here', description='Name')
widgetCountry = widgets.Dropdown(options=available_countries, description='Country')

# 1st widget: binds the three controls to `panama_papers_aux.search_by_name`;
# the DataFrame dictionary is passed as a fixed (non-interactive) argument.
first_widget = interactive(
    panama_papers_aux.search_by_name,
    queried_name=widgetQueriedName,
    dictionary=fixed(df_dictionary),
    df_name=widgetDFName,
    country=widgetCountry,
)

# Button 1
button1 = widgets.Button(description="Apply Filter")


def show_results(b):
    """Fill the 'Name options' dropdown with the names returned by the current search.

    Parameters
    ----------
    b : the clicked button, passed by `on_click`; not used.

    Reads `first_widget.result` and writes its `name` column into `widgetResult.options`.
    """
    matches = first_widget.result
    if matches is None:
        # No search result is available yet (the search has not produced a value):
        # show an empty dropdown instead of failing on `None['name']`.
        widgetResult.options = []
        return
    widgetResult.options = matches['name'].values.tolist()


button1.on_click(show_results)

# 2nd widget
def name_selection(selected_name):
    """Return the rows of the current search result whose `name` equals `selected_name`.

    Parameters
    ----------
    selected_name : value currently selected in the 'Name options' dropdown.

    Returns
    -------
    The filtered search result, or None when `first_widget.result` is None
    (no search result available yet).
    """
    matched_df = first_widget.result
    if matched_df is None:
        # Nothing to filter; avoids subscripting None.
        return None
    return matched_df[matched_df['name'] == selected_name]


widgetResult = widgets.Dropdown(description = 'Name options')
second_widget = interactive(name_selection, selected_name = widgetResult)

# Button 2
def print_selected_result(b):
    """Display the name, the relationship table and the connection map of the selected item.

    Parameters
    ----------
    b : the clicked button, passed by `on_click`; not used.

    Does nothing when no row is selected, i.e. when `second_widget.result` is None
    or has zero rows.
    """
    selection = second_widget.result
    if selection is None or selection.shape[0] == 0:
        # Previously a None result raised AttributeError on `.shape`.
        return

    # Instantiating an Item object corresponding to the query,
    # then instantiating an ItemNetwork object from that Item object.
    type_ = widgetDFName.value
    queried_item = panama_papers_aux.Item.fromSingleLineDF(selection, type_)
    queried_item_network = panama_papers_aux.ItemNetwork(queried_item, df_dictionary)

    # ItemNetwork DataFrame and Folium map of the connections
    network_df = queried_item_network.getDF()
    connection_map = queried_item_network.getMap()

    # The two argument-less `display()` calls that used to sit here showed nothing
    # and were removed; the single call below renders everything.
    display(Markdown('### ' + queried_item.name), network_df, connection_map)


button2 = widgets.Button(description="Display Relationships")
button2.on_click(print_selected_result)

To test the widget, you can try the query 'Trump' as an officer or an entity.

In [15]:
# Render the four controls in order: the search form, the "Apply Filter" button,
# the "Name options" dropdown and the "Display Relationships" button.
# NOTE(review): `handle` is not used anywhere in the visible notebook — confirm it is needed.
handle = display(first_widget, button1, second_widget, button2)