In [None]:
#IMPORT
# Load GraphLab Create, its graph toolkits and the numeric/plotting stack in
# one place so that a missing dependency is reported before any data is read.
try:
    import graphlab as gl
    import numpy as np
    import graphlab.aggregate as agg
    import matplotlib.pyplot as plt
    from matplotlib import rcParams
    import datetime as dt
    from graphlab import degree_counting
    from graphlab import connected_components
    from graphlab import pagerank
    from graphlab import shortest_path
    from graphlab import triangle_counting
    from graphlab import label_propagation
    from graphlab import kcore
    from graphlab import graph_coloring
    print('success')
except ImportError as err:
    # Only import failures are translated (a bare ``except`` would also swallow
    # KeyboardInterrupt); the original cause is kept in the message so the
    # reader can see which library is actually missing.
    raise ImportError("Key libraries cannot be loaded. ({})".format(err))

success


In [None]:
import os

# Location of the on-disk SFrame with the raw transaction records.
transaction_data = '../code/graph-code/data/blocks_417500_424572/'

# Guard on the very path that is loaded, so the check and the load cannot
# drift apart when the dataset location changes.
if os.path.exists(transaction_data):
    sf = gl.SFrame(transaction_data)
    # TAKE SAMPLE (22% of the rows, fixed seed)
    s = sf.sample(0.22, seed=1)
    # Round-trip through pandas to drop rows missing either endpoint and to
    # zero-fill the remaining missing values.
    df = s.to_dataframe()
    df = df[df['input_address'].notnull()]
    df = df[df['output_address'].notnull()]
    df = df.fillna(0)
    sf_transactions = gl.SFrame(df)
    # One directed edge per remaining row: input_address -> output_address.
    g = gl.SGraph().add_edges(sf_transactions, src_field='input_address', dst_field='output_address')
    print(len(sf))
    print(len(df))
    print(g)
else:
    print('cant find data')

In [None]:
# Show the first five rows of the cleaned transaction sample.
print(df.head(n=5))

In [None]:
#LOAD TAGGED DATA

import json
import pandas as pd

# Scraper output: one JSON file per source of tagged addresses.
tag_dir = '../code/identity_scraper/output/'
whoiswho = '../code/identity_scraper/output/whoiswho.json'
blockinfo = '../code/identity_scraper/output/b_info_out.json'
# Kept separate from ``explorer`` (the parsed data) so one name never holds
# both a path and the loaded content.
explorer_path = '../code/identity_scraper/output/explorer.json'


def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path) as json_data:
        return json.load(json_data)


if not os.path.exists(tag_dir):
    # Nothing below can work without the tag files; fail here with a clear
    # message instead of a NameError on the concatenation further down.
    raise IOError('cant find data: {}'.format(tag_dir))

who = _load_json(whoiswho)
block = _load_json(blockinfo)
explorer = _load_json(explorer_path)

# The three sources are concatenated with ``+``.
# NOTE(review): this assumes each file parses to a list — confirm.
total = who + block + explorer
print(len(total))

In [None]:
#CREATE TAG DATAFRAME
import re

# Both patterns are applied to ``str(record)``:
#   - the address is the first single-quoted substring (captured without quotes)
#   - the category is the first single digit that follows a whitespace character
ADDRESS_PATTERN = re.compile(r"'(.*?)'")
CATEGORY_PATTERN = re.compile(r'(?<=\s)[0-9]')


def _parse_tag(record):
    """Extract ``{'address': str, 'category': int}`` from one scraped record.

    Raises:
        ValueError: if the record's string form contains no quoted address
            or no whitespace-preceded digit.
    """
    text = str(record)
    addr = ADDRESS_PATTERN.search(text)
    num = CATEGORY_PATTERN.search(text)
    if addr is None or num is None:
        raise ValueError('unparseable tag record: {!r}'.format(record))
    # group(1) is the text between the quotes, so no quote stripping is
    # needed afterwards (str.replace returns a new string and was a no-op
    # when its result was discarded).
    return {'address': addr.group(1), 'category': int(num.group(0))}


categories = [_parse_tag(i) for i in total]

# Explicit columns keep the frame well-formed even when ``total`` is empty.
df_tags = pd.DataFrame(categories, columns=['address', 'category'])
df_tags = df_tags.drop_duplicates()
print(df_tags.head(n=2))
print(len(df_tags))

In [None]:
# Persist the de-duplicated tag table as comma-separated text (the default separator).
df_tags.to_csv('df_tags')

In [None]:
# Rebuild the transaction frame from the sample, keeping only rows where both
# the input and the output address are present (no zero-filling this time).
df = s.to_dataframe()
has_both_addresses = df['input_address'].notnull() & df['output_address'].notnull()
df = df[has_both_addresses]

In [None]:
#ADD TAGS TO TRANSACTION RECORDS
# Pivot the tags once: one row per address, one column per category code,
# each cell holding the number of (address, category) tag rows.
tag_counts = df_tags.groupby(['address', 'category'])['category'].size().unstack()

# The same pivot is joined twice, prefixed by the side of the transaction it describes.
input_tags = tag_counts.add_prefix('input_').reset_index()
output_tags = tag_counts.add_prefix('output_').reset_index()

df_transactions = pd.merge(
    df, input_tags, how='left', left_on='input_address', right_on='address'
).fillna(0)

df_transactions = pd.merge(
    df_transactions, output_tags, how='left', left_on='output_address', right_on='address'
).fillna(0)

In [None]:
#CREATE NODES WITH TAGS FROM TRANSACTIONS
# Category code (suffix of the input_N / output_N columns) paired with the
# name used in the node field names.
TAG_CATEGORIES = (
    (1, 'gambling'),
    (2, 'charity'),
    (3, 'finance'),
    (4, 'services'),
    (5, 'junk'),
    (6, 'pools'),
)


def _field_map(side, prefix):
    """Return (node field, transaction column) pairs for one side of a transaction."""
    return [('{}_{}'.format(prefix, label), '{}_{}'.format(side, code))
            for code, label in TAG_CATEGORIES]


def _node_record(row, address_column, fields):
    """Build the node dict for the address found in ``address_column`` of ``row``."""
    record = {'address': row[address_column], 'value': row['value']}
    for node_field, column in fields:
        record[node_field] = row[column]
    record['transaction_count'] = 1
    return record


# Deriving both sides from the same table guarantees every ``sent_*`` field
# reads an ``output_*`` column (``sent_pools`` previously read ``input_6``).
input_fields = _field_map('input', 'received')
output_fields = _field_map('output', 'sent')

# Every transaction contributes two node records: one for its input address
# and one for its output address.
nodes = []
for index, i in df_transactions.iterrows():
    nodes.append(_node_record(i, 'input_address', input_fields))
    nodes.append(_node_record(i, 'output_address', output_fields))

In [None]:
# Tabulate the node records; fields absent from a record (the received_* or
# sent_* half) become 0.
df_nodes = pd.DataFrame(nodes).fillna(0)

In [None]:
# Keep only node records whose address appears in the tag table.
# NOTE(review): the join also brings in the tags' `category` column, which the
# later group-by sum will add up like any other numeric column — confirm intended.
df_merged = pd.merge(df_nodes, df_tags, how='inner', on='address')
for frame in (df_merged, df_nodes):
    print(len(frame))

In [None]:
from sklearn import preprocessing

# Replace each address string with the integer label the encoder assigns to it.
le = preprocessing.LabelEncoder()
df_merged['address'] = le.fit_transform(df_merged['address'])

In [None]:
# Map the integer labels back to the original address strings.
address_codes = df_merged['address']
df_merged['address'] = le.inverse_transform(address_codes)

In [None]:
# Collapse to one row per address by summing every numeric column, then turn
# the address index back into an ordinary column.
df_merged = df_merged.groupby('address').sum().reset_index()

In [None]:
# (flag column, summed tag-count column) pairs; an address is flagged when its
# summed count is strictly greater than 1.
flag_sources = (
    ('is_gambler', 'sent_gambling'),
    ('is_charity', 'sent_charity'),
    ('is_finance', 'sent_finance'),
    ('is_junk', 'sent_junk'),
    ('is_pools', 'sent_pools'),
    ('is_services', 'sent_services'),
)
for flag_column, count_column in flag_sources:
    df_merged[flag_column] = df_merged[count_column] > 1

In [None]:
df_merged.save('output/nodes/temp')

In [None]:
try:
    transaction_data = 'output/nodes/temp/'
    df_merged = gl.SFrame(transaction_data)
    print(len(df_merged))
except:
    print('error')

In [None]:
#DEGREES

deg = degree_counting.create(g)
deg_graph = deg['graph'] # a new SGraph with degree data attached to each vertex
in_degree = deg_graph.vertices[['__id', 'in_degree']]
out_degree = deg_graph.vertices[['__id', 'out_degree']]

#PAGERANK

pr = pagerank.create(g)
pr_out = pr['pagerank']

#CONNECTED COMPONENT

# Uses the toolkit imported at the top of the notebook, like the other metrics.
cc = connected_components.create(g)
wcc = cc['component_id']

#K-CORE

kc = kcore.create(g)
# Stored under its own name: rebinding `kcore` here would shadow the imported
# toolkit module and make this cell fail with an AttributeError when re-run.
core_ids = kc['core_id']

# Convert every per-vertex result to pandas so it can be joined onto df_merged.
df_pagerank = pr_out.to_dataframe()
df_in = in_degree.to_dataframe()
df_out = out_degree.to_dataframe()
df_wcc = wcc.to_dataframe()
df_kcore = core_ids.to_dataframe()

# Left-join each metric on the vertex id so that every tagged address keeps
# its row even when the graph has no value for it.
# NOTE(review): each join also carries its `__id` key across, so suffixed
# copies (`__id_x`, `__id_y`, `__id`) accumulate; the next cell drops them by
# name, which is why the join order and keys are left unchanged here.
vertex_metrics = (
    (df_pagerank, ['__id', 'pagerank', 'delta']),
    (df_in, ['__id', 'in_degree']),
    (df_out, ['__id', 'out_degree']),
    (df_wcc, ['__id', 'component_id']),
    (df_kcore, ['__id', 'core_id']),
)
for metric_frame, metric_columns in vertex_metrics:
    df_merged = df_merged.merge(metric_frame[metric_columns], how='left', left_on='address', right_on='__id')

print(df_merged.sort_values('sent_finance', ascending=False).head(1))

In [None]:
# Average value per transaction for every address.
df_merged['avg_value'] = df_merged['value'] / df_merged['transaction_count']
# Remove the join-key copies left behind by the metric merges.
df_merged = df_merged.drop(['__id_x', '__id_y', '__id'], axis=1)
# Export the finished table as comma-separated text (the default separator).
df_merged.to_csv('output/computed_dataset.csv')

In [None]:
# Wrap the final pandas table in an SFrame and store it on disk.
computed_output = gl.SFrame(data=df_merged)
computed_output.save('output/nodes/data')

In [None]:
# Print the SFrame row count, then the DataFrame row count.
for table in (computed_output, df_merged):
    print(len(table))