# Visualising the database using iGraph

In [61]:
import pandas as pd 
import numpy as np
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.model_selection
import sklearn.cluster
import matplotlib
from matplotlib import pyplot as plt
import py2neo
#import seaborn as sb ##includes convenient heatmaps and boxplots
import scipy as sp
import pylab as pl
import igraph
import cairo


In [2]:
def query_database(query):
    # REMEMBER TO BE CONNECTED TO IMPERIAL WIFI!
    graph_db = py2neo.Graph("https://dsi-bitcoin.doc.ic.ac.uk:7473/db/data/", auth=("guest_ro", "imperialO_nly"))
    return graph_db.run(query)

def get_block_data(first_block, last_block):
    query_string = """
                    MATCH (b:Block) <-[:MINED_IN]- (t:Tx) <-[:IN]- (txi:TxIn) <-[:UNLOCK]- (iadr:Address)
                    WHERE b.height >= {} AND b.height <= {}
                    MATCH (txi) <-[:SPENT]- (txo_in:TxOut) 
                    MATCH (oadr:Address) <-[:LOCK]- (txo_out:TxOut) <-[:OUT]- (t) 
                    
                    RETURN iadr.address as iadr, oadr.address as oadr, txo_in.value as input_val, txo_out.value as output_val, ID(txo_in) as id_txo_in, ID(txi) as id_txi, ID(t) as id_t, ID(txo_out) as id_txo_out
                    
                    """.format(first_block, last_block)
    return query_string

In [62]:
#result = query_database(get_block_data(400000,400000))
#df = result.to_data_frame()
df = pd.read_csv('block_400000.csv')

# tups1 = []
# tups2 = []
# for d in result:
#     tups1.append((d['iadr'],d['oadr']))

In [47]:
#import igraph.remote.gephi as igephi
ig2 = igraph.Graph.Read_GraphML('./filename.graphml')

## Finding users by using heuristic measures

In [None]:
from collections import defaultdict

users_dict = defaultdict(lambda: {'iadr': set(), 'oadr': set()})
address_dict = defaultdict(lambda: {'user':set()})

## Heuristic 1

In [63]:
from collections import defaultdict

#CHANGE LATER TO QUERY DATABASE INSTEAD
def iadrs_from_tx(id_t):
    return set(df['iadr'][df["id_t"] == id_t])

def oadrs_from_tx(id_t):
    return set(df['oadr'][df["id_t"] == id_t])

def tx_from_iadr(iadr):
    return set(df['id_t'][df["iadr"] == iadr])

#FUNCTION TO WHICH YOU GIVE AN INPUT ADDRESS AND GET USER 
def get_user(input_adr):
    to_inv = [input_adr]
    user_iadrs = set()
    seen_txs = set()
    while to_inv:
        current_iadr = to_inv.pop(0)
        user_iadrs.add(current_iadr)
        for id_t in tx_from_iadr(current_iadr):

            if id_t not in seen_txs:
                
                seen_txs.add(id_t)
                iadrs = iadrs_from_tx(id_t)
                to_inv += iadrs.difference(user_iadrs) #Adding addr
                user_iadrs.update(iadrs)
            
    return User(user_iadrs, seen_txs)

#LIST OF LISTS OF USER AND THEIR ASSOCIATED ADDRESSES
users = []

class User:
    def __init__(self, adrs, txs):
        self.adr = set(adrs)
        self.tx = set(txs)
        self.cadr = set()
        

#ASSOCIATE INPUT ADDRESS AND TX WITH EACH USER IN BLOCK
already_seen_iadr = set()
for input_adr in df.iadr:
    if input_adr not in already_seen_iadr:
        user = get_user(input_adr)
        users.append(user)
        already_seen_iadr.update(user.adr)

#TAKES OUTPUT ADDRESS AND GIVES USER THAT HAS THAT ADDRESS AS INPUT
def user_from_oadr(oadr):
    for i,user in enumerate(users):
        if oadr in user.adr:
            return i
        
edges = defaultdict(int)

#WHICH USERS IN HAVE TRANSACTED WITH EACH OTHER
for i,user in enumerate(users):
    for tx_id in user.tx:
        for oadr in oadrs_from_tx(tx_id):
            if oadr in already_seen_iadr:

                    edges[(i, user_from_oadr(oadr))]+=1  
            

## Heuristic 2

In [64]:
import pickle
with open ('outfile', 'rb') as fp:
    not_seen = pickle.load(fp)
    
appeared_once_o= list(df.oadr.value_counts()[df.oadr.value_counts()==1].index)
all_iadrs= list(df.iadr.value_counts().index)

o_never_used_as_i = set(appeared_once_o).difference(all_iadrs)

In [65]:
from collections import Counter

tx_ids = []
for i,user in enumerate(users):
    for tx_id in user.tx:
        tx_ids.append(tx_id)
        
ads = []
for i,user in enumerate(users):
    for ad in user.adr:
        ads.append(ad)
        
cads = []
for i,user in enumerate(users):
    for cad in user.cadr:
        cads.append(cad)

if Counter(tx_ids).most_common(10)[0][1] == 1:
    print("No repeats")
    
if Counter(ads).most_common(10)[0][1] == 1:
    print("No repeats")

No repeats
No repeats


In [74]:
for i,user in enumerate(users):
    temp = set()
    for tx_id in user.tx:
        o = oadrs_from_tx(tx_id)
        potential_cadr = []
        for oadr in o:
            if oadr in not_seen and oadr in o_never_used_as_i:
                potential_cadr.append(oadr)
        if len(potential_cadr)==1:
            temp.add(potential_cadr[0])
    user.cadr.update(temp)   

In [None]:
df.groupby('id_t').agg({'oadr':['nunique', 'count'],'iadr':['nunique', 'count'],'id_txo_in':['nunique', 'count'],'id_txo_out':['nunique', 'count']})

In [None]:
df.to_csv('users.csv', columns=['input_user','output_user'])

In [77]:
#Construct User Graph
df['input_user'] = df['iadr']
df['output_user'] = df['oadr']
for i, user in enumerate(users):
    df['input_user'] = df['input_user'].apply(lambda x: i if x in user.adr else x)
    df['output_user'] = df['output_user'].apply(lambda x: i if x in user.adr else x)
    df['output_user'] = df['output_user'].apply(lambda x: i if x in user.cadr else x)

# can't trust input_val column now
# because dropped lots of inputs
df_new = df.drop_duplicates(['input_user', 'id_txo_out'])
df_new = df_new.groupby(['input_user', 'output_user']).apply(lambda group: group['output_val'].sum()).reset_index()
tups = []
for i in range(0, df_new.shape[0]):
    tups.append((df_new.at[i, 'input_user'],df_new.at[i, 'output_user']))

In [82]:
import math

ig = igraph.Graph.TupleList(tups,directed=True)

layout = ig.layout_kamada_kawai()
visual_style = {}
visual_style["layout"] = layout
visual_style["bbox"]= (10000, 10000)
visual_style["margin"] = 50
visual_style["autocurve"] = True
visual_style["arrow_size"] = 0.01

#visual_style["vertex_label"] = ig.vs['label']
#visual_style['edge_width'] = [0.03*i for i in ig.es['weight']]
#visual_style['edge_color'] = [color[i] for i in ig.es['platform']]
visual_style['keep_aspect_ratio'] = True

size = []
for i in ig.degree():
    if i > 1:
        size.append(20*math.log(i))
    else:
        size.append(i)
visual_style["vertex_size"] = size

p = igraph.Plot("user_graph.png", bbox=(10000, 10000), background="white")
p.add(ig, **visual_style)
# p.redraw()

fileName = 'hi.png'
context = cairo.Context(p.surface)
context.set_font_size(60)
title = "Address Graph of Block 400000"
#drawer = TextDrawer(context, text=title, halign=TextDrawer.CENTER)
#drawer.draw_at(x=1745, y=100, width=600)
p.save(fileName)
#ig.write_graphml('./testing.graphml')

KeyboardInterrupt: 

In [79]:
ig.write_graphml('./hello2.graphml')

## Feature extraction per user

In [None]:
df[[col for col in df.columns if not 'Unnamed' in col]].head()

In [120]:
user_in_df = df.groupby('input_user').agg({
    'id_txo_out': 'nunique',
    'id_txi': 'nunique',
    'input_val': ['max', 'min', 'sum']
}).rename({
    
})

user_in_df['input_val']['sum'] = user_df['input_val']['sum'] / user_df['id_txo_out']['nunique']

user_out_df = df.groupby('output_user').agg({
    'output_val': 'sum',
    'id_txo_in': 'nunique'
}).rename({
})

user_out_df['output_val'] = user_out_df['output_val'] / user_out_df['id_txo_in']

user_df.merge(user_out_df, how='left', left_index=True, right_index=True).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,"(input_val, max)","(input_val, min)","(input_val, sum)","(id_txo_out, nunique)","(id_txi, nunique)",id_txo_in,output_val
input_user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,0.0,0.0,1,1,,
1,2.32804,0.9849,9.38559,1,6,6.0,7.885916
2,1.7137,1.2503,8.96339,1,6,,
3,0.064798,0.003142,0.233997,1,3,,
4,0.400641,0.239484,2.065325,1,6,,


In [192]:
user_in_df = (df.groupby('input_user').agg({
    'id_txo_out': 'nunique',
    'input_val': ['max', 'min', 'sum'],
    'id_txi': 'nunique'
}).rename(columns = {
    'sum':'total_out',
    'max':'max_out',
    'min':'min_out',
    'nunique': 'in_degree' 
}))


user_in_df['input_val']['total_out'] = user_in_df['input_val']['total_out'] / user_in_df['id_txo_out']
# cluster['total_val'] = cluster['total_val']/cluster['id_txo_out']

user_out_df = df.groupby('output_user').agg({
    'output_val': ['max', 'min', 'sum'],
    'id_txi': 'nunique',
    'id_txo_out': 'nunique'
}).rename(columns = {
    'sum':'total_in',
    'max':'max_in',
    'min':'min_in',
    'nunique': 'out_degree'
})

user_out_df['output_val']['total_in'] = user_out_df['output_val']['total_in']  / user_out_df['id_txi']

user_in_df.merge(user_out_df, how='left', left_index=True, right_index=True).head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
  return_indexers=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,input_val,input_val,input_val,id_txo_out,id_txi,output_val,output_val,output_val,id_txi,id_txo_out
Unnamed: 0_level_1,max_out,min_out,total_out,in_degree,in_degree,max_in,min_in,total_in,out_degree,out_degree
input_user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,0.0,0.0,0.0,1,1,,,,,
1,2.32804,0.9849,9.38559,1,6,9.43,0.165496,47.315496,6.0,2.0
2,1.7137,1.2503,8.96339,1,6,,,,,
3,0.064798,0.003142,0.233997,1,3,,,,,
4,0.400641,0.239484,2.065325,1,6,,,,,


In [189]:
user_out_df.head()

Unnamed: 0_level_0,input_val,input_val,input_val,id_txi
Unnamed: 0_level_1,max_out,min_out,total_out,in_degree
output_user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,255.175,0.011502,264.621969,6
13,14.5957,8.56087,23.15657,2
58,0.003954,0.003954,0.003954,1
59,0.005303,0.003954,0.009256,2
60,0.006651,0.005303,0.011954,2


In [121]:
col_names =  ['id','total_in','total_out','in_degree','out_degree','max_in', 'max_out','min_in','min_out']
cluster  = pd.DataFrame(columns = col_names)

for u in users: 
    
    out_degree=0
    in_degree=0
    in_vals=[]
    out_vals=[]
    for a in u.adr:
        d = df.loc[df['iadr'] == a]
        out_degree=out_degree+d.shape[0]
        out_vals.extend(list(d['input_val']))

        d = df.loc[df['oadr'] == a]
        in_degree=in_degree+d.shape[0]
        in_vals.extend(list(d['output_val']))
    
    if len(u.cadr) != 0:
        for c in u.cadr:
            d = df.loc[df['oadr'] == c]
            in_degree=in_degree+d.shape[0]
            in_vals.extend(list(d['output_val'])) 

    ratio=in_degree/out_degree
    total_out_val = sum(out_vals)
    total_in_val = sum(in_vals)
    max_out = max(out_vals) if out_vals else 0
    max_in= max(in_vals) if in_vals else 0
    min_in= min(in_vals) if in_vals else 0
    min_out= min(out_vals) if out_vals else 0
    
    data =  {'id':users.index(u),'total_in':total_in_val,'total_out':total_out_val
             ,'in_degree':in_degree,'out_degree':out_degree,'max_in':max_in,'max_out':max_out,
             'min_in':min_in,'min_out':min_out}
    cluster = cluster.append(pd.DataFrame(data,index=[0])) 
    
cluster = cluster.set_index('id')

In [122]:
cluster.head()

Unnamed: 0_level_0,in_degree,max_in,max_out,min_in,min_out,out_degree,total_in,total_out
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0.0,0.0,0.0,0.0,1,0.0,0.0
1,6,9.43,2.32804,0.165496,0.9849,6,47.315496,9.38559
2,0,0.0,1.7137,0.0,1.2503,6,0.0,8.96339
3,0,0.0,0.064798,0.0,0.003142,9,0.0,0.233997
4,0,0.0,0.400641,0.0,0.239484,6,0.0,2.065325


In [None]:
col_names =  ['id','total_in','total_out','in_degree','out_degree','max_in', 'max_out','min_in','min_out']
# cluster  = pd.DataFrame(columns = col_names)
cluster = []

for u in users: 
    
    out_degree=0
    in_degree=0
    in_vals=[]
    out_vals=[]
    for a in u.adr:
        d = df.loc[df['iadr'] == a]
        out_degree=out_degree+d.shape[0]
        out_vals.extend(list(d['input_val']))

        d = df.loc[df['oadr'] == a]
        in_degree=in_degree+d.shape[0]
        in_vals.extend(list(d['output_val']))
    
    belongs_to_user = df[df['iadr'].apply(lambda address: address in u.adr)]
    
    if len(u.cadr) != 0:
        for c in u.cadr:
            d = df.loc[df['oadr'] == c]
            in_degree=in_degree+d.shape[0]
            in_vals.extend(list(d['output_val'])) 

    ratio=in_degree/out_degree
    total_out_val = sum(out_vals)
    total_in_val = sum(in_vals)
    max_out = max(out_vals) if out_vals else 0
    max_in= max(in_vals) if in_vals else 0
    min_in= min(in_vals) if in_vals else 0
    min_out= min(out_vals) if out_vals else 0
    
    data =  {'id':users.index(u),'total_in':total_in_val,'total_out':total_out_val
             ,'in_degree':in_degree,'out_degree':out_degree,'max_in':max_in,'max_out':max_out,
             'min_in':min_in,'min_out':min_out}
    cluster = cluster.append(pd.DataFrame(data,index=[0])) 

cluster = pd.concat(cluster)
cluster = cluster.set_index('id')

In [None]:
#Normalize data

scaler = sklearn.preprocessing.StandardScaler()
scaled_cluster = scaler.fit_transform(cluster)
cluster_scaled = pd.DataFrame(scaled_cluster, columns=cluster.columns, index=cluster.index)


data_corr = cluster_scaled.corr()
sb.heatmap(data_corr, cmap = 'bwr') #heatmap of correlation matrix

In [None]:
#split data into train and test sets
clus_train, clus_test = sklearn.model_selection.train_test_split(cluster_scaled, test_size=0.3, random_state=123)

clusters = range(1,10)
meandist=[]

for k in clusters:
    model = sklearn.cluster.KMeans(n_clusters = k)
    model.fit(clus_train)
    clusassign=model.predict(clus_train)
    meandist.append(sum(np.min(sp.spatial.distance.cdist(clus_train,model.cluster_centers_,'euclidean'),axis=1))/clus_train.shape[0])
    
plt.plot(clusters, meandist, '-o')
#plt.subplot(2,1,1)
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
plt.show

# # Convert DataFrame to matrix
# mat = cluster_scaled.values
# distorsions = []
# x = range(2, 20)
# for k in x:
#     #Perform K Means
#     kmeans = sk.cluster.KMeans(n_clusters=k)
#     kmeans.fit(mat)
#     distorsions.append(kmeans.inertia_)

# fig = plt.figure(figsize=(15, 5))
# plt.plot(x, distorsions)
# plt.grid(True)
# plt.title('Elbow curve')
# plt.show
# plt.xticks(x)

In [None]:
plt.gcf().clear()
model = sklearn.cluster.KMeans(n_clusters = 2)
model.fit(clus_train)
clusassign = model.predict(clus_train)

colors = ['red', 'blue']

#Principal Component Analysis
pca_2 = sklearn.decomposition.PCA(2)
plot_columns = pca_2.fit_transform(clus_train)    
plt.scatter(x=plot_columns[:,0],y=plot_columns[:,1],c=model.labels_,cmap = matplotlib.colors.ListedColormap(colors),edgecolors = 'none')
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 2 clusters')
plt.show


# Get cluster assignment labels
labels = model.labels_
# Format results as a DataFrame
data = {'transaction_id':clus_train.index,'cluster_label':labels}
results = pd.DataFrame(data)

In [None]:
import collections as c

count = c.defaultdict(int) 

for user in users:
    count[len(user)]+=1

In [None]:
plt.plot(count.keys(),count.values())

In [None]:
matches = set(df.iadr).intersection(df.oadr)
print(matches