In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import itertools
from tqdm import tqdm
from scipy import stats
from geolite2 import geolite2 

%config InlineBackend.figure_format = 'retina' 

In [99]:
def countries_in_subgraph(G, subgraph_idx, field='country'):
    '''
    Given a NetworkX subgraph, generates the list of countries seen. 

    Parameters
    ----------
    G (nx.Graph): the original graph
    subgraph_idx (int): index number of the graph component

    Returns
    -------
    num_nodes (int): number of nodes in the subgraph
    all_countries (list): list of all countries seen in the subgraph
    '''
    subgraphs = [nx.subgraph(G,c) for c in nx.connected_components(G)]
    k = subgraphs[subgraph_idx]
    # k = subgraph_list[subgraph_idx]
    all_countries = []
    all_nodes = []
    num_nodes = len(k.nodes)
    for node in k.nodes(data=True):
        all_nodes.append(node[0])
        try:
            if str(node[1][field]) != 'nan':
                all_countries.append(node[1][field])
        except:
            pass
    return num_nodes, all_countries, all_nodes

### My method

My method of synthetic locations performs the following steps.

1. Create a network of worker-miner-owner relationships using all historical data. 
2. Perform a lookup of the country from `geolite2`.
3. My synthetic location algorithm
    * Take the top 1000 connected subnetworks (can be more, but depends on how much spare time you have). 
    * For each subnetwork, label all miners by the modal location. (For example, if there are 10 miners in the subnetwork, 5 from China and 1 from Singapore, then all 10 miners are designated from China.)

In [135]:
# 0. Preliminaries
# 0a. Load lookup database
geo = geolite2.reader()

def extract_country(x):
    out = np.nan
    try:
        # out = geo.get(x)['country']['names']['en']
        out = geo.get(x)['country']['iso_code']
    except:
        pass
    return out

# 0b. Load worker relationship data
data = pd.read_csv('../data/worker_relationship_jun_19.csv', index_col=[0])
dsplt = data['multi_addresses'].str.split('/', expand=True)
dsplt = dsplt[dsplt.columns[2]]
data['ip'] = dsplt
del dsplt

# 0c. Extract country
data['country'] = data['ip'].apply(extract_country)

# 1. Load network
# 1a. Add edges
G = nx.Graph()
for m,c in zip(data.miner_id, data.country):
    G.add_node(m, country=c)
G.add_edges_from([mi, wi] for mi, wi in data[['miner_id', 'worker_id']].drop_duplicates().values)
G.add_edges_from([mi, oi] for mi, oi in data[['miner_id', 'owner_id']].drop_duplicates().values)

# 1b. Remove small components
small_components = sorted(nx.connected_components(G), key=len)[:-800]
G.remove_nodes_from(itertools.chain.from_iterable(small_components))


In [136]:
subgraphs = [nx.subgraph(G,c) for c in nx.connected_components(G)]
number_of_subgraphs = len(subgraphs)
n_node_list = []
mode_country_list = []
n_geolocated_list = []
full_node_list = []

for n in tqdm(range(number_of_subgraphs)):
    n_nodes, all_c, all_nodes = countries_in_subgraph(G, n)
    n_node_list.append(n_nodes)
    full_node_list.append(all_nodes)
    if len(all_c) > 0:
        mode_country_list.append(stats.mode(all_c).mode[0])
    else: 
        mode_country_list.append('None')
    n_geolocated_list.append(len(all_c))

geo_df = pd.DataFrame(np.array([n_node_list, mode_country_list, n_geolocated_list, full_node_list]).T, 
                      columns=['num_nodes', 'mode_country', 'num_geolocated_nodes', 'associated_nodes'])    
geo_df.num_geolocated_nodes = pd.to_numeric(geo_df.num_geolocated_nodes)
geo_df.num_nodes = pd.to_numeric(geo_df.num_nodes)

100%|████████████████████████████████| 800/800 [10:03<00:00,  1.33it/s]
  geo_df = pd.DataFrame(np.array([n_node_list, mode_country_list, n_geolocated_list, full_node_list]).T,


In [25]:
geo_df[geo_df.num_geolocated_nodes > 0].sort_values(by='num_geolocated_nodes', ascending=False)
geo_df['num_geolocated_nodes'].sum() / len(data), geo_df['num_nodes'].sum() / len(data)

(0.0008979145211122554, 0.9533486740473739)

In [137]:
geo_df.to_csv('beng_method.csv')

### Jim Pick method

This method combines Jim Pick's inferred location with our own historical network data. 

1. Create a network of worker-miner-owner relationships using all historical data. 
2. Join this network table to Jim Pick's synthetic locations JSON.
    * Note that I have calculated a column called `country_multi`, which contains all the countries seen for the given miner.
3. My synthetic location algorithm
    * Take the top 1000 connected subnetworks (can be more, but depends on how much spare time you have). 
    * (not fully implemented) Find out which are the countries seen in this subnetwork

In [35]:
# 0. Read Jim's JSON file
locations_jimpick = pd.read_json('https://geoip.feeds.provider.quest/synthetic-locations-latest.json')
locations_df = pd.json_normalize(locations_jimpick.providerLocations)
locations_df.index = locations_df.provider
locations_df['country_multi'] = locations_df['country']

In [73]:
# 1. Impute Jim's synthetic locations
for prov in locations_df.provider.unique():
    try:
        if len(locations_df.loc[prov].country.unique())<2:
            locations_df.country_multi.loc[prov]  = locations_df.loc[prov].country.unique()[0]
        else:
            locations_df.country_multi.loc[prov]  = str(locations_df.loc[prov].country.unique())
    except:
        locations_df.country_multi.loc[prov]   = locations_df.loc[prov].country
lookup_df = locations_df[['provider', 'country_multi']].reset_index(drop=True)        

In [91]:
# 2. Join our network table to Jim's locations
data = pd.read_csv('../data/worker_relationship_jun_19.csv', index_col=[0])
df_out = data.merge(lookup_df, how='left', left_on='miner_id', right_on='provider')

In [117]:
# 3. Load network
# 3a. Add edges
G = nx.Graph()
for m,c in zip(df_out.miner_id, df_out.country_multi):
    G.add_node(m, country=c)
G.add_edges_from([mi, wi] for mi, wi in data[['miner_id', 'worker_id']].drop_duplicates().values)
G.add_edges_from([mi, oi] for mi, oi in data[['miner_id', 'owner_id']].drop_duplicates().values)

# 3b. Remove small components
small_components = sorted(nx.connected_components(G), key=len)[:-800]
G.remove_nodes_from(itertools.chain.from_iterable(small_components))


In [118]:
subgraphs = [nx.subgraph(G,c) for c in nx.connected_components(G)]
number_of_subgraphs = len(subgraphs)
n_node_list = []
mode_country_list = []
n_geolocated_list = []
full_node_list = []

for n in tqdm(range(number_of_subgraphs)):
    n_nodes, all_c, all_nodes = countries_in_subgraph(G, n, field='country')
    n_node_list.append(n_nodes)
    full_node_list.append(all_nodes)
    if len(all_c) > 0:
        mode_country_list.append(stats.mode(all_c).mode[0])
    else: 
        mode_country_list.append('None')
    n_geolocated_list.append(len(all_c))

geo_df = pd.DataFrame(np.array([n_node_list, mode_country_list, n_geolocated_list, full_node_list]).T, 
                      columns=['num_nodes', 'mode_country', 'num_geolocated_nodes', 'associated_nodes'])    
geo_df.num_geolocated_nodes = pd.to_numeric(geo_df.num_geolocated_nodes)
geo_df.num_nodes = pd.to_numeric(geo_df.num_nodes)

100%|████████████████████████████████| 800/800 [09:18<00:00,  1.43it/s]
  geo_df = pd.DataFrame(np.array([n_node_list, mode_country_list, n_geolocated_list, full_node_list]).T,


In [119]:
geo_df[geo_df.num_geolocated_nodes > 0].sort_values(by='num_geolocated_nodes', ascending=False)
geo_df['num_geolocated_nodes'].sum() / len(data), geo_df['num_nodes'].sum() / len(data)

(0.0030606333676622037, 0.9508882595262615)

In [133]:
geo_df.to_csv('jimpick_method.csv')