In [154]:
import pandas as pd
import numpy as np
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Simple queries

### Reading clean data

In [155]:
# Load the five cleaned tables in one pass. dtype='object' turns off pandas'
# type inference, so the node ids are read the same way in every table and can
# be compared with `==` between the node tables and the edge table.
entities, intermediaries, officers, addresses, all_edges = (
    pd.read_csv(csv_path, dtype='object')
    for csv_path in (
        "./data/data_clean_csv/entities_clean.csv",
        "./data/data_clean_csv/intermediaries_clean.csv",
        "./data/data_clean_csv/officers_clean.csv",
        "./data/data_clean_csv/addresses_clean.csv",
        "./data/data_clean_csv/all_edges_clean.csv",
    )
)

### DataFrame dictionary creation

We now create a dictionary that contains all the DataFrames:

In [156]:
# Table name -> DataFrame lookup, so that a table can be selected by its string key.
dictionary = {
    'entities': entities,
    'intermediaries': intermediaries,
    'officers': officers,
    'addresses': addresses,
    'all_edges': all_edges,
}

### List of available countries

`available_countries` is a list containing all the countries that can be used as a filter for the query.

In [157]:
# Build the country filter options: 'All' followed by the sorted distinct
# values of officers['countries'] that do not contain ';' (presumably the
# separator used when a row lists several countries -- confirm against the
# cleaning step).
# dropna() is required because unique() keeps NaN, and `';' not in NaN` would
# raise TypeError (a float is not iterable).
available_countries = ['All'] + sorted(
    country
    for country in officers['countries'].dropna().unique()
    if ';' not in country
)
len(available_countries)

208

### Query function definition

Query function that can be applied to `entities`, `intermediaries`, `officers` and `addresses`. The results can be filtered by selecting a particular country.

In [158]:
def search_by_name(queried_name, dictionary, df_name, country):
    """Search one table for rows whose ``name`` contains a given text.

    Parameters
    ----------
    queried_name : str or object exposing a ``value`` attribute
        Text to look for. A plain string runs the full search (country filter
        and result cap). Any other object is treated as widget-like and its
        ``value`` attribute is used as the text.
    dictionary : dict
        Maps table names to DataFrames.
    df_name : str
        Key of the table to search in ``dictionary``. The table must have a
        ``name`` column, and a ``countries`` column when a country is given.
    country : str
        Country to restrict the search to, or ``'All'`` for no restriction.
        Ignored when ``queried_name`` is not a string.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        String query: the matching rows, limited to the first 100.
        Widget-like query: the matching values of the ``name`` column,
        without any limit.

    Notes
    -----
    Matching is a case-insensitive *literal* substring test. The text is not
    interpreted as a regular expression, so characters such as ``(``, ``.`` or
    ``&`` in a company or country name match themselves instead of raising or
    matching unrelated rows. Rows with a missing name or country never match.
    The input table is not modified.
    """
    max_results = 100  # cap on the number of rows returned for a string query
    table = dictionary[df_name]

    def contains(column, text):
        # regex=False: treat user input / country names literally.
        # na=False: missing values yield False rather than NaN, which keeps
        # the result a valid boolean mask.
        return column.str.contains(text, case=False, regex=False, na=False)

    if isinstance(queried_name, str):
        # Boolean indexing returns new frames, so no defensive copy of the
        # whole table is needed.
        if country != 'All':
            table = table[contains(table['countries'], country)]
        return table[contains(table['name'], queried_name)].iloc[:max_results]

    names = table['name']
    return names[contains(names, queried_name.value)]

### Get the relationships of a particular item

The `get_relationships` function finds all the relationships of a particular item of the database. The item is passed as input in the form of a single-row DataFrame. This function is then used through a GUI interaction.

In [327]:
def _find_node(node_id):
    """Look up ``node_id`` in the node tables.

    The module-level tables are searched in the order entities,
    intermediaries, officers, addresses; the first table containing the id
    wins.

    Returns a ``(node_id, name, country, type)`` tuple, or ``None`` when the id
    is in none of the tables.
    """
    node_tables = (
        ('entities', entities),
        ('intermediaries', intermediaries),
        ('officers', officers),
        ('addresses', addresses),
    )
    for node_type, table in node_tables:
        match = table[table['node_id'] == node_id]
        if not match.empty:
            # The label of an address node is read from its 'address' column,
            # for in-nodes as well as out-nodes.
            label_column = 'address' if node_type == 'addresses' else 'name'
            return (node_id, match[label_column].values[0],
                    match['countries'].values[0], node_type)
    return None


def get_relationships(df_answer, queried_type=None):
    """List every edge that starts or ends at the queried node.

    Parameters
    ----------
    df_answer : pandas.DataFrame
        Frame describing the queried node; only its first row is used. Must
        have the columns ``node_id``, ``name`` and ``countries``.
    queried_type : str, optional
        Table name reported as the type of the queried node. When omitted, the
        current value of the module-level ``widgetDFName`` dropdown is used.

    Returns
    -------
    pandas.DataFrame
        One row per edge of ``all_edges`` touching the node, with the columns
        ``in_node, in_name, in_country, in_type, relationship, out_node,
        out_name, out_country, out_type``. Edges pointing at the queried node
        come first, then edges leaving it. The frame is empty (but has these
        columns) when the node has no edges.

    Notes
    -----
    Reads the module-level tables ``all_edges``, ``entities``,
    ``intermediaries``, ``officers`` and ``addresses``. An edge whose other end
    is found in none of the node tables is reported with ``print`` and left
    out of the result.
    """
    if queried_type is None:
        queried_type = widgetDFName.value

    queried_node_id = df_answer['node_id'].values[0]
    queried = (queried_node_id, df_answer['name'].values[0],
               df_answer['countries'].values[0], queried_type)

    output_columns = ['in_node', 'in_name', 'in_country', 'in_type', 'relationship',
                      'out_node', 'out_name', 'out_country', 'out_type']

    in_relationship_df = all_edges[all_edges['node_2'] == queried_node_id]
    out_relationship_df = all_edges[all_edges['node_1'] == queried_node_id]

    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0 and appending row by row
    # copies the whole frame on every iteration.
    records = []

    # Edges pointing at the queried node: the other end is the in-node.
    for in_node_id, relationship in zip(in_relationship_df['node_1'].values,
                                        in_relationship_df['rel_type'].values):
        in_node = _find_node(in_node_id)
        if in_node is None:
            print('Unfound in-node in entities, intermediaries and officers:', in_node_id)
        else:
            records.append(in_node + (relationship,) + queried)

    # Edges leaving the queried node: the other end is the out-node.
    for out_node_id, relationship in zip(out_relationship_df['node_2'].values,
                                         out_relationship_df['rel_type'].values):
        out_node = _find_node(out_node_id)
        if out_node is None:
            # Report the id that is actually missing (the out-node).
            print('Unfound out-node in entities, intermediaries and officers:', out_node_id)
        else:
            records.append(queried + (relationship,) + out_node)

    return pd.DataFrame(records, columns=output_columns)

### GUI

In [330]:
# Input widgets of the search form. The 'Type' dropdown shows capitalised
# labels and yields the matching key of `dictionary` as its value.
df_name_dic = {
    'Entities': 'entities',
    'Intermediaries': 'intermediaries',
    'Officers': 'officers',
}
widgetDFName = widgets.Dropdown(description='Type', options=df_name_dic)
widgetQueriedName = widgets.Text(description='Name', value='Type name here')
widgetCountry = widgets.Dropdown(description='Country', options=available_countries)

# 1st widget: binds the three inputs above to search_by_name; `dictionary` is
# passed through fixed() so it is handed over as-is instead of becoming a widget.
first_widget = interactive(
    search_by_name,
    queried_name=widgetQueriedName,
    dictionary=fixed(dictionary),
    df_name=widgetDFName,
    country=widgetCountry,
)

# Button 1: copies the names of the current search result into the
# 'Name options' dropdown.
def show_results(b):
    """Click handler: fill ``widgetResult`` with the names found by the search.

    ``b`` is the clicked button; it is not used.
    """
    matched_names = first_widget.result['name'].values.tolist()
    widgetResult.options = matched_names


button1 = widgets.Button(description="Apply Filter")
button1.on_click(show_results)

# 2nd widget: dropdown of matched names, bound to name_selection.
widgetResult = widgets.Dropdown(description = 'Name options')


def name_selection(selected_name):
    """Return the rows of the latest search result named ``selected_name``.

    The search result is read from ``first_widget.result``; the returned frame
    is empty when no row carries that exact name.
    """
    latest_matches = first_widget.result
    is_selected = latest_matches['name'] == selected_name
    return latest_matches[is_selected]


second_widget = interactive(name_selection, selected_name=widgetResult)

# Button 2: shows the relationship table of the currently selected row.
def print_selected_result(b):
    """Click handler: display the relationships of the selected item.

    Does nothing when the current selection is empty. ``b`` is the clicked
    button; it is not used.
    """
    selection = second_widget.result
    if selection.shape[0] == 0:
        return
    display(get_relationships(selection))


button2 = widgets.Button(description="Display relationships")
button2.on_click(print_selected_result)

# Lay out the form: search inputs, filter button, name dropdown, display button.
display(first_widget, button1, second_widget, button2)

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

Unnamed: 0,in_node,in_name,in_country,in_type,relationship,out_node,out_name,out_country,out_type
0,11010040,Y.T. LO & CO,Hong Kong,intermediaries,intermediary of,10136155,TRUMP BASE HOLDINGS LTD.,Hong Kong,entities
1,12217409,WONG SIU HONG,Hong Kong,officers,shareholder of,10136155,TRUMP BASE HOLDINGS LTD.,Hong Kong,entities
