# Visualising the database using iGraph

In [2]:
import pandas as pd 
import numpy as np
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.model_selection
import sklearn.cluster
import matplotlib
from matplotlib import pyplot as plt
import py2neo
import seaborn as sb ##includes convenient heatmaps and boxplots
import scipy as sp
import pylab as pl
import igraph
#import cairocffi as cairo
import cairo
import pickle

In [60]:
def query_database(query):
    """Run a Cypher query against the project's Neo4j database via py2neo.

    Credentials are taken from the ``NEO4J_USER`` and ``NEO4J_PASSWORD``
    environment variables so that no secret is stored in the notebook.

    Parameters
    ----------
    query : str
        Cypher text passed unchanged to ``py2neo.Graph.run``.

    Returns
    -------
    Whatever ``py2neo.Graph.run`` returns for the query.

    Raises
    ------
    RuntimeError
        If either environment variable is not set.
    """
    import os  # local import keeps this cell self-contained

    try:
        db_user = os.environ["NEO4J_USER"]
        db_password = os.environ["NEO4J_PASSWORD"]
    except KeyError as missing:
        raise RuntimeError(
            "Set the NEO4J_USER and NEO4J_PASSWORD environment variables "
            "before querying the database."
        ) from missing

    # REMEMBER TO BE CONNECTED TO IMPERIAL WIFI!
    graph_db = py2neo.Graph("https://dsi-bitcoin.doc.ic.ac.uk:7473/db/data/", auth=(db_user, db_password))
    return graph_db.run(query)

def get_block_data(first_block, last_block):
    """Build the Cypher query that lists input/output pairings for a block range.

    The query matches transactions mined in blocks whose ``height`` lies in
    ``[first_block, last_block]`` and returns, per row, the input address,
    output address, input value, output value and the node ids of the spent
    output, the input, the transaction and the created output.

    Parameters
    ----------
    first_block, last_block : int
        Inclusive block-height bounds. They are coerced with ``int()`` because
        they are interpolated directly into the query text.

    Returns
    -------
    str
        The Cypher query text.

    Raises
    ------
    ValueError, TypeError
        If a bound cannot be converted to an integer.
    """
    first_block = int(first_block)
    last_block = int(last_block)
    query_string = """
                    MATCH (b:Block) <-[:MINED_IN]- (t:Tx) <-[:IN]- (txi:TxIn) <-[:UNLOCK]- (iadr:Address)
                    WHERE b.height >= {} AND b.height <= {}
                    MATCH (txi) <-[:SPENT]- (txo_in:TxOut) 
                    MATCH (oadr:Address) <-[:LOCK]- (txo_out:TxOut) <-[:OUT]- (t) 
                    
                    RETURN iadr.address as iadr, oadr.address as oadr, txo_in.value as input_val, txo_out.value as output_val, ID(txo_in) as id_txo_in, ID(txi) as id_txi, ID(t) as id_t, ID(txo_out) as id_txo_out
                    
                    """.format(first_block, last_block)
    return query_string

In [61]:
# Fetch the input/output pairings for block 400000 and load them into a DataFrame;
# the column names are the RETURN aliases of the query built by get_block_data.
result = query_database(get_block_data(400000,400000))
df = result.to_data_frame()
# Offline alternative kept for reference: load a previously exported CSV instead.
#df = pd.read_csv('./csv/400000addr.csv')
#df[[col for col in df.columns if not 'Unnamed' in col]]

# Per-transaction counts, broadcast onto every row of that transaction:
# num_txo = number of distinct id_txo_out, num_txi = number of distinct id_txi.
df['num_txo'] = df.groupby('id_t')['id_txo_out'].transform('nunique')
df['num_txi'] = df.groupby('id_t')['id_txi'].transform('nunique')

## Finding users by using heuristic measures

## Heuristic 1

In [62]:
from collections import defaultdict

#CHANGE LATER TO QUERY DATABASE INSTEAD
def iadrs_from_tx(id_t):
    """Return the set of ``iadr`` values on the rows of ``df`` whose ``id_t`` equals *id_t*."""
    rows_of_tx = df["id_t"] == id_t
    return set(df.loc[rows_of_tx, "iadr"])

def oadrs_from_tx(id_t):
    """Return the set of ``oadr`` values on the rows of ``df`` whose ``id_t`` equals *id_t*."""
    rows_of_tx = df["id_t"] == id_t
    return set(df.loc[rows_of_tx, "oadr"])

def tx_from_iadr(iadr):
    """Return the set of ``id_t`` values on the rows of ``df`` whose ``iadr`` equals *iadr*."""
    rows_of_address = df["iadr"] == iadr
    return set(df.loc[rows_of_address, "id_t"])

#FUNCTION TO WHICH YOU GIVE AN INPUT ADDRESS AND GET USER 
def get_user(input_adr):
    """Cluster the addresses that spend together with *input_adr* (heuristic 1).

    Starting from *input_adr*, repeatedly look up every transaction the current
    address is an input of (``tx_from_iadr``) and pull in all other input
    addresses of those transactions (``iadrs_from_tx``) until nothing new appears.

    Parameters
    ----------
    input_adr
        Address to start the search from. It is always part of the result,
        even when it is not an input of any transaction.

    Returns
    -------
    User
        ``User(addresses, transactions)`` built from the collected input
        addresses and the ids of the transactions visited.
    """
    from collections import deque  # O(1) pops from the left, unlike list.pop(0)

    to_inv = deque([input_adr])
    user_iadrs = set()
    seen_txs = set()
    while to_inv:
        current_iadr = to_inv.popleft()
        user_iadrs.add(current_iadr)
        for id_t in tx_from_iadr(current_iadr):
            if id_t in seen_txs:
                continue
            seen_txs.add(id_t)
            iadrs = iadrs_from_tx(id_t)
            # Queue only addresses not collected yet, then record them all so
            # they are never queued a second time.
            to_inv.extend(iadrs.difference(user_iadrs))
            user_iadrs.update(iadrs)

    return User(user_iadrs, seen_txs)

# Notebook-wide list of User objects, filled in by the code below. A user's
# position in this list is used as that user's integer id later on.
users = []

class User:
    """A group of addresses treated as one owner, with the transactions linked to it.

    Attributes (all sets, each an independent copy of what was passed in):
      adr          -- addresses given to the constructor
      sending_tx   -- transaction ids given to the constructor
      cadr         -- starts empty
      receiving_tx -- starts as a copy of the same transaction ids

    NOTE(review): receiving_tx is seeded with the *sending* transactions; confirm
    this is intended rather than an empty set.
    """

    def __init__(self, adrs, txs):
        tx_ids = set(txs)
        self.adr = set(adrs)
        self.cadr = set()
        self.sending_tx = tx_ids
        self.receiving_tx = set(tx_ids)

# Bitcoin: a pseudo-user whose only address is the string '0'. Rows of df whose
# iadr is '0' are treated as mining transactions below.
bitcoin = User({str(0)}, set()) #Make user object with bitcoin iadr (which is 0)
seen_miner_iadrs_tx = defaultdict(set) #Maps each address paid by a mining tx to the ids of those txs

for index, row in df[df['iadr'] == str(0)].iterrows(): #Going through all mining txs 
    bitcoin.sending_tx.add(row['id_t']) #Adding tx id to bitcoin user's txs
    seen_miner_iadrs_tx[row['oadr']].add(row['id_t']) # Register the tx id under the paid address; defaultdict creates the key on first use
    
users.append(bitcoin) # Add bitcoin user (index 0 in users)
already_seen_iadr = {str(0)}  # Bitcoin iadr has already been seen

# make miners users
# NOTE(review): unlike the loop over df.iadr, this loop does not skip addresses that
# are already in already_seen_iadr, so two payout addresses linked by a common
# input would yield two overlapping users -- confirm this cannot happen.
# NOTE(review): the mining tx ids are added to the miner's *sending_tx* even though
# the miner is the recipient of those txs -- confirm this is intended.
for adr, id_ts in seen_miner_iadrs_tx.items(): #Go through dictionary for every adr (miner) and the txs that paid it
    miner = get_user(adr) #Cluster the addresses that spend together with this address
    miner.adr.add(adr) #Make sure adrs and txs are added in
    miner.sending_tx.update(id_ts)
    users.append(miner)
    already_seen_iadr.update(miner.adr)#Mark the miner's addresses as seen so that no second user is created for them
    

## Build a user for every input address that is not yet part of one.
# Each new user's addresses are marked as seen, so later rows with the same
# addresses are skipped.
for input_adr in df['iadr']:
    if input_adr in already_seen_iadr:
        continue
    user = get_user(input_adr)
    users.append(user)
    already_seen_iadr.update(user.adr)


# Look up which user owns an address.
def user_from_oadr(oadr):
    """Return the index in ``users`` of the first user whose ``adr`` contains *oadr*.

    Returns ``None`` when no user has that address.
    """
    matching_indices = (idx for idx, candidate in enumerate(users) if oadr in candidate.adr)
    return next(matching_indices, None)
        
# edges[(sender index, receiver index)] counts, over each sender's sending_tx,
# the output addresses that belong to a known user.
edges = defaultdict(int)

for i, user in enumerate(users):
    for tx_id in user.sending_tx:
        known_outputs = (oadr for oadr in oadrs_from_tx(tx_id) if oadr in already_seen_iadr)
        for oadr in known_outputs:
            edges[(i, user_from_oadr(oadr))] += 1
            

## Heuristic 2

In [63]:
import pickle
# NOTE(review): 'outfile' is produced outside this notebook; pickle.load executes
# arbitrary code, so only load a file you created yourself.
with open('outfile', 'rb') as fp:
    not_seen = pickle.load(fp)

# Output addresses that occur on exactly one row of df ...
oadr_counts = df.oadr.value_counts()
appeared_once_o = list(oadr_counts[oadr_counts == 1].index)
# ... and every address that occurs as an input.
all_iadrs = list(df.iadr.value_counts().index)

# Single-occurrence output addresses that are never used as an input.
o_never_used_as_i = set(appeared_once_o).difference(all_iadrs)

In [64]:
from collections import Counter

# Sending tx ids of every user except users[0] (the bitcoin pseudo-user, whose
# tx ids are shared with the miners).
tx_ids = [tx_id for user in users[1:] for tx_id in user.sending_tx]

# Every address / change address of every user, flattened.
ads = [ad for user in users for ad in user.adr]
cads = [cad for user in users for cad in user.cadr]

# If the most frequent element occurs once, nothing is shared between users.
if Counter(tx_ids).most_common(1)[0][1] == 1:
    print("No repeats txid")

if Counter(ads).most_common(1)[0][1] == 1:
    print("No repeats ads")

No repeats txid
No repeats ads


In [65]:
# Heuristic 2 (change addresses): for each transaction a user sends, collect the
# outputs that are in not_seen and in o_never_used_as_i; when exactly one output
# qualifies it is recorded as one of the user's change addresses (cadr).
for i, user in enumerate(users):
    temp = set()
    for tx_id in user.sending_tx:
        o = oadrs_from_tx(tx_id)
        potential_cadr = [oadr for oadr in o if oadr in not_seen and oadr in o_never_used_as_i]
        if len(potential_cadr) == 1:
            temp.add(potential_cadr[0])
    user.cadr.update(temp)

In [None]:
# Per-transaction sanity summary: distinct and total counts of addresses and
# input/output ids.
tx_summary = df.groupby('id_t').agg({'oadr':['nunique', 'count'],'iadr':['nunique', 'count'],'id_txo_in':['nunique', 'count'],'id_txo_out':['nunique', 'count']})

# 'input_user' / 'output_user' are only added to df by the "Construct User Graph"
# cell further down, so exporting them here on a fresh top-to-bottom run would
# raise a KeyError. Export only when they exist.
user_columns = ['input_user', 'output_user']
if set(user_columns).issubset(df.columns):
    df.to_csv('users.csv', columns=user_columns)
else:
    print("users.csv not written: run the 'Construct User Graph' cell first.")

# Last expression so the notebook actually displays the summary.
tx_summary

In [66]:
import os  # stdlib; used only to make sure the output directory exists

# Persist the inferred users. The path matches the one the TESTING cell loads
# from ('./pickles/users.pickle'); the other pickles of this notebook live in
# the same directory.
os.makedirs('./pickles', exist_ok=True)
with open('./pickles/users.pickle', 'wb') as handle:
    pickle.dump(users, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Construct User Graph

In [67]:
#Construct User Graph
# Build address -> user-index lookups once instead of re-scanning the whole column
# for every user. setdefault keeps the lowest user index when an address belongs to
# several users, which is what the previous per-user replacement produced.
input_user_of = {}
output_user_of = {}
for i, user in enumerate(users):
    for address in user.adr:
        input_user_of.setdefault(address, i)
        output_user_of.setdefault(address, i)
    # Change addresses only identify a user on the output side.
    for address in user.cadr:
        output_user_of.setdefault(address, i)

# Replace each address by the index of its user; addresses that belong to no user
# are kept unchanged, so the columns may hold a mix of ints and address strings.
df['input_user'] = df['iadr'].map(lambda address: input_user_of.get(address, address))
df['output_user'] = df['oadr'].map(lambda address: output_user_of.get(address, address))

# Record every transaction under the user that receives one of its outputs.
# np.integer is accepted as well because pandas yields numpy integers when the
# whole column is integer-typed.
for tx_id, output_user in df[['id_t','output_user']].values:
    if isinstance(output_user, (int, np.integer)):
        users[output_user].receiving_tx.add(tx_id)

In [68]:
# One row per (sending user, transaction output): a transaction with several inputs
# repeats each output once per input, so duplicates are dropped first. input_val is
# not meaningful on the remaining rows because most input rows are gone.
edges_df0 = df.drop_duplicates(subset=['input_user', 'id_txo_out'])

# Sum of output_val for every (input_user, output_user) pair.
edges_df = (
    edges_df0
    .groupby(['input_user', 'output_user'])
    .apply(lambda group: group['output_val'].sum())
    .reset_index()
)

# (input_user, output_user, amount) triples for igraph.Graph.TupleList.
tups = [tuple(row) for row in edges_df.itertuples(index=False)]

with open('./pickles/tups.pickle', 'wb') as handle:
    pickle.dump(tups, handle, protocol=pickle.HIGHEST_PROTOCOL)

# An alternative that was tried and left disabled: sum output_val / num_txi over
# the un-deduplicated df and assert that it gives the same edges.

In [15]:
# Spot check: transactions that users[1] sent and users[934] received.
# NOTE(review): the indices are hard-coded and depend on the order in which users
# was built for this particular block.
users[1].sending_tx.intersection(users[934].receiving_tx)

{113001822}

In [None]:
# Print the index of every user that consists of exactly one address.
for i,user in enumerate(users):
    if len(user.adr) ==1:
        print(i)

In [None]:
# Only the last expression of a cell is displayed, so this head() shows nothing.
df.head()
# Count which Python types occur in the iadr column.
df['iadr'].apply(type).value_counts()

## User Feature Extraction

In [69]:
# Sending-side features: one row per input_user.
input_feature_names = {
    'id_txo_out_nunique': 'unique_sent',        # distinct outputs paid
    'oadr_nunique': 'unique_sent_adr',          # distinct addresses paid
    'output_user_nunique': 'unique_sent_user',  # distinct users paid (out degree)
    'id_t_nunique': 'tx1',                      # distinct transactions sent
    'input_val_max': 'max_sent',
    'input_val_min': 'min_sent'
}

user_input_df = df.groupby('input_user').agg({
    'id_txo_out': 'nunique',
    'oadr':'nunique',
    'output_user': 'nunique',
    'id_t': 'nunique',
    'input_val': ['max', 'min']
})

# Flatten the (column, aggregation) header into 'column_aggregation' names,
# then switch to the short feature names above.
user_input_df.columns = ['_'.join(col) for col in user_input_df.columns]
user_input_df = user_input_df.rename(columns=input_feature_names)

# Every input is repeated on one row per output of its transaction, hence the
# division by num_txo before summing per user.
sent_per_row = df['input_val'] / df['num_txo']
user_input_df['total_sent'] = sent_per_row.groupby(df['input_user']).sum()

user_input_df.head()

Unnamed: 0_level_0,max_sent,min_sent,unique_sent_adr,tx1,unique_sent_user,unique_sent,total_sent
input_user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2.32804,0.9849,1,1,1,1,9.38559
2,1.7137,1.2503,1,1,1,1,8.96339
3,0.064797,0.003142,1,1,1,1,0.233997
4,0.400641,0.239484,1,1,1,1,2.065325
5,1.34841,0.0002,1,1,1,1,4.046738


In [70]:
# Receiving-side features: one row per output_user.
output_feature_names = {
    'id_txi_nunique': 'unique_rec',            # distinct inputs that paid this user
    'iadr_nunique': 'unique_rec_adr',          # distinct addresses that paid this user
    'iadr_<lambda>': 'is_miner',               # True when any paying address is '0'
    'input_user_nunique': 'unique_rec_user',   # distinct users that paid (in degree)
    'id_t_nunique': 'tx2',                     # distinct transactions received
    'output_val_max': 'max_rec',
    'output_val_min': 'min_rec'
}

user_out_df = df.groupby('output_user').agg({
    'id_txi': 'nunique',
    'iadr': ['nunique', lambda x: (x == str(0)).any()],
    'input_user': 'nunique',
    'id_t': 'nunique',
    'output_val': ['max', 'min']
})

# Flatten the (column, aggregation) header into 'column_aggregation' names,
# then switch to the short feature names above.
user_out_df.columns = ['_'.join(col) for col in user_out_df.columns]
user_out_df = user_out_df.rename(columns=output_feature_names)

# Every output is repeated on one row per input of its transaction, hence the
# division by num_txi before summing per user.
received_per_row = df['output_val'] / df['num_txi']
user_out_df['total_rec'] = received_per_row.groupby(df['output_user']).sum()

user_out_df.head()

Unnamed: 0_level_0,unique_rec_adr,is_miner,unique_rec_user,max_rec,min_rec,tx2,unique_rec,total_rec
output_user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,6,False,2,9.43,0.165496,2,6,9.595496
13,2,False,2,5.822,5.026,2,2,10.848
58,1,False,1,0.002605,0.002605,1,1,0.002605
59,2,False,2,0.003954,0.001324,2,2,0.005278
60,2,False,2,0.005303,0.001324,2,2,0.006627


In [71]:
# Merge input and output user features (left join: one row per sending user)
user_df = user_input_df.merge(user_out_df, how='left', left_index=True, right_index=True)

# Add the miners that never appear as a sender. Miners that also sent are already
# in user_df through the merge; appending them again would create a second,
# all-zero row under the same index.
miners = user_out_df[user_out_df['is_miner']]
miners_not_yet_listed = miners[~miners.index.isin(user_df.index)]
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2;
# sort=False keeps the column order of user_df.
user_df = pd.concat([user_df, miners_not_yet_listed], sort=False)

# Name index
user_df.index.name = 'user'

# Fill in NA values: users without receiving-side features are not miners and
# all remaining missing features count as 0. Assigning the result back avoids
# relying on in-place modification through a column selection.
user_df['is_miner'] = user_df['is_miner'].fillna(False).astype(bool)
user_df = user_df.fillna(0)

# New columns
# NOTE(review): a transaction the user both sends and receives in is counted in
# tx1 and in tx2, i.e. twice in num_tx -- confirm that is intended.
user_df['num_tx'] = user_df['tx1'] + user_df['tx2']

# (user, total_sent) pairs; Series.items replaces the removed Series.iteritems.
user_total_sent = list(user_df['total_sent'].items())
users_identified = list(user_df.index.values)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [72]:
# Totals over the whole frame. Inputs are repeated once per output of their
# transaction (and outputs once per input), hence the divisions before summing.
total_amt_spent = (df['input_val'] / df['num_txo']).sum() ##Total amount spent in this block?
total_amt_recieved = (df['output_val'] / df['num_txi']).sum() ##Total amount received in this block?

## Draw User Graph

In [101]:
import math

# Directed user graph: each (input_user, output_user, amount) triple becomes an
# edge carrying an 'amount' attribute; vertex identifiers are stored in 'user'.
ug = igraph.Graph.TupleList(tups,directed=True,vertex_name_attr='user',edge_attrs=['amount'])

# Export for external graph tools.
ug.write_graphml('./graphml/user.graphml')

In [110]:
# Attach to every vertex the list of its addresses that appear in dic.
# NOTE(review): dic must be the address -> service dictionary that is built (or
# loaded) in the "Label graph with services" section further down.
for u in ug.vs():
    all_addr = []
    node = u['user']
    if node in range(len(users)):
        # The vertex is a user index: keep the user's addresses found in dic.
        owned = users[node].adr.union(users[node].cadr)
        all_addr.extend(address for address in owned if address in dic)
    elif node in dic:
        # The vertex is a bare address; the address is the vertex id itself.
        # (Previously this appended `address`, a leftover from an earlier loop.)
        all_addr.append(node)

    # Vertices without any known address get an empty string.
    u['tatti'] = all_addr if all_addr else ''

In [42]:
# Backup graph: annotate the stored address graph with service labels and users.
ag = igraph.Graph.Read("../Graphs/400000_addr.graphml", format = "graphml")

for vertex in ag.vs:
    addr = vertex["name"]
    # Value from dic when the address is a key there, otherwise empty.
    vertex["label"] = dic[addr] if addr in dic else ""

    # Indices of all users holding this address; the last one is kept, and an
    # address held by nobody stays its own 'user'.
    owners = [j for j, user in enumerate(users) if addr in user.adr or addr in user.cadr]
    vertex["user"] = owners[-1] if owners else addr

    # An address should never belong to more than one user.
    if len(owners) > 1:
        print("Aw shit")


ag.write_graphml('./fuckme.graphml')

In [101]:
#Pagerank
# NOTE(review): this cell used an undefined name `ig`; the user graph `ug` is the
# graph in this notebook that carries the 'user' vertex attribute, so it is used
# here -- confirm that is the intended graph.
nodes = ug.vs['user']
pr = ug.pagerank()
# Vertex id -> PageRank score. The pairs are no longer stored in `result`, which
# holds the database query result earlier in the notebook.
d = dict(zip(nodes, pr))
# Keep only the user vertices (integer ids); string ids are bare addresses.
new_d = {k:v for k,v in d.items() if not isinstance(k,str)}
pagerank_df = pd.DataFrame.from_dict(new_d,orient='index')

In [None]:
# NOTE(review): this cell referenced three names that are not defined anywhere in
# the notebook: `ig` (replaced by the user graph `ug`), `new_list` (replaced by the
# `size` list computed below) and `layout`. The original layout algorithm is not
# recorded; Fruchterman-Reingold ("fr") is used as a placeholder -- confirm.
layout = ug.layout("fr")

visual_style = {}
visual_style["layout"] = layout
visual_style["bbox"]= (10000, 10000)
visual_style["margin"] = 50
#visual_style["autocurve"] = True
visual_style["vertex_label"] = ug.vs['user']
visual_style['edge_arrow_size'] = 0.05
visual_style['edge_width'] = [0.03*i for i in ug.es['amount']]
visual_style['vertex_label_size'] = 5
visual_style['edge_curved'] = 1
visual_style['keep_aspect_ratio'] = True

# Vertex size grows with the logarithm of the amount the user sent. Bare addresses
# (string ids) and users with nothing sent get size 1; the size is also floored at
# 1 because log() is undefined at 0 and negative for amounts below 1.
size = []
d = dict(user_total_sent)
for k in ug.vs['user']:
    if isinstance(k, str):
        size.append(1)
    else:
        total_sent = d.get(k, 0)
        size.append(max(1, 20*math.log(total_sent)) if total_sent > 0 else 1)

visual_style["vertex_size"] = size

fileName = "USER.png"
p = igraph.Plot(fileName, bbox=(10000, 10000), background="white")
p.add(ug, **visual_style)
p.save(fileName)

## Clustering

In [None]:
#Normalize data
# Standardise every column of user_df with StandardScaler and wrap the result
# back into a DataFrame with the original labels.

scaler = sklearn.preprocessing.StandardScaler()
scaled_cluster = scaler.fit_transform(user_df)
cluster_scaled = pd.DataFrame(scaled_cluster, columns=user_df.columns, index=user_df.index)


# Pairwise correlation of the scaled features.
data_corr = cluster_scaled.corr()
sb.heatmap(data_corr, cmap = 'bwr') #heatmap of correlation matrix

In [None]:
from scipy.spatial.distance import cdist  # explicit: `import scipy as sp` alone does not guarantee sp.spatial is loaded

#split data into train and test sets
clus_train, clus_test = sklearn.model_selection.train_test_split(cluster_scaled, test_size=0.3, random_state=123)

clusters = range(1,10)
meandist=[]

# For each k, record the mean distance of the training points to their nearest
# cluster centre. random_state makes the curve reproducible between runs.
for k in clusters:
    model = sklearn.cluster.KMeans(n_clusters = k, random_state=123)
    model.fit(clus_train)
    nearest_centre_dist = cdist(clus_train, model.cluster_centers_, 'euclidean').min(axis=1)
    meandist.append(nearest_centre_dist.mean())

plt.plot(clusters, meandist, '-o')
#plt.subplot(2,1,1)
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
plt.show()  # was `plt.show` without parentheses, which never rendered the figure

# # Convert DataFrame to matrix
# mat = cluster_scaled.values
# distorsions = []
# x = range(2, 20)
# for k in x:
#     #Perform K Means
#     kmeans = sk.cluster.KMeans(n_clusters=k)
#     kmeans.fit(mat)
#     distorsions.append(kmeans.inertia_)

# fig = plt.figure(figsize=(15, 5))
# plt.plot(x, distorsions)
# plt.grid(True)
# plt.title('Elbow curve')
# plt.show
# plt.xticks(x)

In [None]:
plt.gcf().clear()
# Two clusters; random_state makes the assignment reproducible between runs.
model = sklearn.cluster.KMeans(n_clusters = 2, random_state=123)
model.fit(clus_train)
clusassign = model.predict(clus_train)

colors = ['red', 'blue']

#Principal Component Analysis: project the training data onto two components
# for plotting, coloured by cluster label.
pca_2 = sklearn.decomposition.PCA(2)
plot_columns = pca_2.fit_transform(clus_train)    
plt.scatter(x=plot_columns[:,0],y=plot_columns[:,1],c=model.labels_,cmap = matplotlib.colors.ListedColormap(colors),edgecolors = 'none')
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 2 clusters')
plt.show()  # was `plt.show` without parentheses, which never rendered the figure


# Get cluster assignment labels
labels = model.labels_
# Format results as a DataFrame
# The key previously contained a stray backtick ('cluster_label`').
# NOTE(review): 'transaction_id' holds the index of clus_train, which is the user
# index of user_df rather than a transaction id -- consider renaming.
data = {'transaction_id':clus_train.index,'cluster_label':labels}
results = pd.DataFrame(data)

## Label graph with services


In [None]:
#Dictionary structure - 
#'Service': list of that service's addresses last used in `block`
# NOTE(review): the next cell rebuilds `dic` the other way round
# (address -> service); that is the form the graph-labelling code uses.
data = pd.read_csv('./wallet_explorer/wexplorer.csv')
service = data['Col'].tolist()

block = 400000
dic = {}
for i in service:
    # Use a dedicated name: assigning to `df` here would overwrite the notebook's
    # main transaction DataFrame.
    # NOTE(review): read_pickle unpickles the file, so only load trusted files.
    service_df = pd.read_pickle('./wallet_explorer/data_2/{}'.format(i))
    addr = service_df.loc[service_df['last used in block'] == block, 'address'].tolist()
    dic[i] = addr

# Services that have at least one matching address.
dic2 = {k:v for k,v in dic.items() if len(v)!=0}

In [26]:
#Dictionary structure - 
#'Address': 'Service'
data = pd.read_csv('./wallet_explorer/wexplorer.csv')
service = data['Col'].tolist()

block = 400000
dic = {}
for i in service:
    # Use a dedicated name: assigning to `df` here would overwrite the notebook's
    # main transaction DataFrame.
    # NOTE(review): read_pickle unpickles the file, so only load trusted files.
    service_df = pd.read_pickle('./wallet_explorer/data_2/{}'.format(i))
    addr = service_df.loc[service_df['last used in block'] == block, 'address'].tolist()
    # Map every address of this service that was last used in `block` to the service.
    for a in addr:
        dic[a] = i

In [29]:
import pickle

# Persist the address -> service dictionary; the TESTING cell loads it from this path.
with open('./pickles/service_dic.pickle', 'wb') as handle:
    pickle.dump(dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

## TESTING

In [126]:
import igraph
import py2neo
import pickle

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

class User:
    """A group of addresses treated as one owner, with the transactions linked to it.

    Defined again in this cell so that the pickled users can be loaded without
    running the rest of the notebook.

    Attributes (all sets, each an independent copy of what was passed in):
      adr          -- addresses given to the constructor
      sending_tx   -- transaction ids given to the constructor
      cadr         -- starts empty
      receiving_tx -- starts as a copy of the same transaction ids
    """

    def __init__(self, adrs, txs):
        tx_ids = set(txs)
        self.adr = set(adrs)
        self.cadr = set()
        self.sending_tx = tx_ids
        self.receiving_tx = set(tx_ids)

def query_database(query):
    """Run a Cypher query against the project's Neo4j database via py2neo.

    Credentials are taken from the ``NEO4J_USER`` and ``NEO4J_PASSWORD``
    environment variables so that no secret is stored in the notebook.

    Parameters
    ----------
    query : str
        Cypher text passed unchanged to ``py2neo.Graph.run``.

    Returns
    -------
    Whatever ``py2neo.Graph.run`` returns for the query.

    Raises
    ------
    RuntimeError
        If either environment variable is not set.
    """
    import os  # local import keeps this cell self-contained

    try:
        db_user = os.environ["NEO4J_USER"]
        db_password = os.environ["NEO4J_PASSWORD"]
    except KeyError as missing:
        raise RuntimeError(
            "Set the NEO4J_USER and NEO4J_PASSWORD environment variables "
            "before querying the database."
        ) from missing

    # REMEMBER TO BE CONNECTED TO IMPERIAL WIFI!
    graph_db = py2neo.Graph("https://dsi-bitcoin.doc.ic.ac.uk:7473/db/data/", auth=(db_user, db_password))
    return graph_db.run(query)

def get_block_data(first_block, last_block):
    """Build the Cypher query that lists input/output address pairs for a block range.

    The query matches transactions mined in blocks whose ``height`` lies in
    ``[first_block, last_block]`` and returns one (input address, output address)
    pair per row.

    Parameters
    ----------
    first_block, last_block : int
        Inclusive block-height bounds. They are coerced with ``int()`` because
        they are interpolated directly into the query text.

    Returns
    -------
    str
        The Cypher query text.

    Raises
    ------
    ValueError, TypeError
        If a bound cannot be converted to an integer.
    """
    first_block = int(first_block)
    last_block = int(last_block)
    query_string = """
                    MATCH (b:Block) <-[:MINED_IN]- (t:Tx) <-[:IN]- (txi:TxIn) <-[:UNLOCK]- (iadr:Address)
                    WHERE b.height >= {} AND b.height <= {}
                    MATCH (txi) <-[:SPENT]- (txo_in:TxOut)
                    MATCH (oadr:Address) <-[:LOCK]- (txo_out:TxOut) <-[:OUT]- (t)

                    RETURN iadr.address as iadr, oadr.address as oadr
                    """.format(first_block, last_block)
    return query_string

def address_graph(result,dic,users):
    """Annotate the stored address graph with service labels and user ids.

    Reads ``../Graphs/400000_addr.graphml``, sets two attributes on every vertex
    and writes the result to ``./graphml/addr.graphml``.

    Parameters
    ----------
    result
        Unused; kept so existing calls keep working. (Building the graph from a
        query result instead of the stored file was tried and left disabled.)
    dic : dict
        Looked up by vertex ``name``; the value becomes the vertex ``Label``.
    users : sequence
        Objects with ``adr`` and ``cadr`` collections of addresses.

    Returns
    -------
    The annotated graph. Each vertex gets ``Label`` (``dic[name]`` or ``""``) and
    ``user`` (``str`` of the index of the last user holding the address, or the
    address itself when no user holds it).
    """
    ag = igraph.Graph.Read("../Graphs/400000_addr.graphml", format = "graphml")

    # Address -> user index, built once instead of scanning every user for every
    # vertex. Later users overwrite earlier ones, so the last holder wins.
    owner_of = {}
    for j, user in enumerate(users):
        for address in user.adr:
            owner_of[address] = j
        for address in user.cadr:
            owner_of[address] = j

    for vertex in ag.vs:
        a = vertex["name"]
        vertex["Label"] = dic.get(a, "")
        vertex["user"] = str(owner_of.get(a, a))

    ag.write_graphml('./graphml/addr.graphml')
    return ag

def user_graph(tups):
    """Build the directed user graph and relabel its vertices with known addresses.

    Uses the notebook-level ``users`` list and ``dic`` dictionary. Writes the
    graph to ``./graphml/user.graphml`` and prints its vertex attribute names.

    Parameters
    ----------
    tups : iterable of (source, target, amount)
        Edge list passed to ``igraph.Graph.TupleList``; ``amount`` becomes an
        edge attribute and the vertex identifiers are stored in ``user``.

    Returns
    -------
    The graph. Each vertex's ``user`` attribute is replaced by the vertex's
    addresses that are keys of ``dic``, each formatted as ``" address "`` and
    concatenated, or by ``''`` when there are none.

    NOTE(review): overwriting ``user`` discards the original vertex identifier;
    confirm that is intended (a separate attribute would keep both).
    """
    ug = igraph.Graph.TupleList(tups,directed=True,vertex_name_attr='user',edge_attrs=['amount'])

    for u in ug.vs():
        node = u['user']
        if node in range(len(users)):
            # The vertex is a user index: keep the user's addresses found in dic.
            owned = users[node].adr.union(users[node].cadr)
            known = [address for address in owned if address in dic]
        elif node in dic:
            # The vertex is a bare address; the address is the vertex id itself.
            # (Previously this formatted `address`, which is unbound at this point.)
            known = [node]
        else:
            known = []

        u['user'] = ''.join(" {} ".format(address) for address in known)

    print(ug.vertex_attributes())

    ug.write_graphml('./graphml/user.graphml')
    return ug

# Reload the artefacts written by earlier cells and rebuild both graphs.
# NOTE(review): pickle.load executes arbitrary code; only load files produced by
# this notebook.
with open('./pickles/tups.pickle', 'rb') as handle:
    tups = pickle.load(handle)
with open('./pickles/service_dic.pickle', 'rb') as handle:
    dic = pickle.load(handle)
with open('./pickles/users.pickle', 'rb') as handle:
    users = pickle.load(handle)
#result = query_database(get_block_data(400000,400000))
# Placeholder: address_graph does not use its first argument.
result = 0
new_ag = address_graph(result,dic,users)
new_ug = user_graph(tups)


['user']


In [121]:
# Vertex attributes of the stored address graph before any annotation.
ag = igraph.Graph.Read("../Graphs/400000_addr.graphml", format = "graphml")
ag.vertex_attributes()

['id', 'name']

In [122]:
# Vertex attributes of the graph returned by address_graph.
new_ag.vertex_attributes()

['id', 'user', 'name', 'Label']

In [123]:
# Round trip: re-read the file written by address_graph and list its vertex attributes.
newer_ag = igraph.Graph.Read("./graphml/addr.graphml", format = "graphml")
newer_ag.vertex_attributes()

  return reader(f, *args, **kwds)


['id', 'user', 'name', 'Label']

In [127]:
# Vertex attributes of the graph returned by user_graph.
new_ug.vertex_attributes()

['user']

In [None]:
# Round trip: re-read the file written by user_graph.
newer_ug = igraph.Graph.Read("./graphml/user.graphml", format = "graphml")
# Not the last expression of the cell, so this value is not displayed.
newer_ug.vertex_attributes()

# Print every vertex of the in-memory user graph.
for i in new_ug.vs():
    print(i)