In [None]:
import pandas as pd 
import numpy as np
import sklearn.cluster as sk

import py2neo

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Load the transaction table that the rest of the notebook works on.
# NOTE(review): presumably a CSV export of the get_block_data() query result
# (columns iadr, oadr, input_val, output_val, id_txo_in, id_txi, id_t,
# id_txo_out) - confirm provenance.
df = pd.read_csv('block_400000.csv')

# Query Database

In [None]:
def query_database(query):
    """Run a Cypher query against the Bitcoin graph database.

    The endpoint and credentials default to the values that were previously
    hard-coded here, but can be overridden through the environment variables
    BITCOIN_DB_URI, BITCOIN_DB_USER and BITCOIN_DB_PASSWORD so that other
    credentials never need to be written into the notebook.

    Parameters
    ----------
    query : str
        Cypher statement to execute.

    Returns
    -------
    Whatever ``py2neo.Graph.run`` returns for the statement.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    import os

    # REMEMBER TO BE CONNECTED TO IMPERIAL WIFI!
    uri = os.environ.get("BITCOIN_DB_URI", "https://dsi-bitcoin.doc.ic.ac.uk:7473/db/data/")
    # NOTE(review): the fallbacks below are the credentials that were
    # previously embedded in this cell; prefer supplying them via the
    # environment.
    user = os.environ.get("BITCOIN_DB_USER", "guest_ro")
    password = os.environ.get("BITCOIN_DB_PASSWORD", "imperialO_nly")

    graph_db = py2neo.Graph(uri, auth=(user, password))
    return graph_db.run(query)

In [None]:
def get_block_data(first_block, last_block):
    """Build the Cypher query that lists input/output pairs for a block range.

    Parameters
    ----------
    first_block, last_block : int (or anything ``int()`` accepts)
        Inclusive bounds on ``b.height``.

    Returns
    -------
    str
        Cypher text returning, per row, the input address, output address,
        input value, output value and the node ids of the spent output, the
        transaction input, the transaction and the created output.

    Raises
    ------
    ValueError, TypeError
        If a bound cannot be converted to an integer.
    """
    # The bounds are interpolated into the query text, so force them to be
    # integers; an arbitrary string must never end up inside the Cypher.
    first_height = int(first_block)
    last_height = int(last_block)
    query_string = """
                    MATCH (b:Block) <-[:MINED_IN]- (t:Tx) <-[:IN]- (txi:TxIn) <-[:UNLOCK]- (iadr:Address)
                    WHERE b.height >= {} AND b.height <= {}
                    MATCH (txi) <-[:SPENT]- (txo_in:TxOut) 
                    MATCH (oadr:Address) <-[:LOCK]- (txo_out:TxOut) <-[:OUT]- (t)
                    
                    RETURN iadr.address as iadr, oadr.address as oadr, txo_in.value as input_val, txo_out.value as output_val, ID(txo_in) as id_txo_in, ID(txi) as id_txi, ID(t) as id_t, ID(txo_out) as id_txo_out
                    """.format(first_height, last_height)
    return query_string

In [None]:
# Occurrences of each input address, most frequent first.
print(
    df.iadr
      .value_counts()
      .sort_values(ascending=False)
)

In [None]:
# Occurrences of each output address, most frequent first.
print(
    df.oadr
      .value_counts()
      .sort_values(ascending=False)
)

In [None]:
# Number of distinct output addresses. nunique() gives the same count as the
# length of the value_counts() table without building and sorting that table.
df['oadr'].nunique()

In [None]:
# Number of distinct input addresses. nunique() gives the same count as the
# length of the value_counts() table without building and sorting that table.
df['iadr'].nunique()

In [None]:
#Check if any columns are unique
# Print the column name next to each flag so the output can be read without
# counting lines against the column order.
for column in df:
    print(column, df[column].is_unique)

In [None]:
# Rows whose input address is the single address of interest.
df1 = df[df.iadr == '1BQLNJtMDKmMZ4PyqVFfRuBNvoGhjigBKF']

# Feature extraction

In [None]:
# First-draft feature matrix: one 8-column row per transaction, stacked into a
# NumPy array.
# NOTE(review): feature_extract is defined in the following cell, so that cell
# must have been run first; a later cell rebuilds `features` as a DataFrame.
tx_groups = df.groupby('id_t')  # split once instead of re-filtering df per id
feature_rows = []
# Series.iteritems() was removed in pandas 2.0; the value_counts() index gives
# the same transaction ids in the same (most frequent first) order.
for tx_id in df.id_t.value_counts().index:
    data = tx_groups.get_group(tx_id)
    if data.input_val.sum() == 0:
        # Skip transactions whose inputs carry no value.
        continue
    # feature_extract takes the transaction id as well as its rows.
    feature_rows.append(feature_extract(tx_id, data).values)
features = np.concatenate([np.array([], dtype=np.int64).reshape(0,8)] + feature_rows)

In [None]:
def feature_extract(id_t,df):
    """Summarise the rows of one transaction as a single-row feature frame.

    Parameters
    ----------
    id_t : scalar
        Identifier stored in the ``id`` column of the result.
    df : pandas.DataFrame
        Rows belonging to the transaction; must contain the columns
        ``input_val``, ``id_txi``, ``id_txo_out``, ``iadr`` and ``oadr``.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with columns ``id``, ``total_val``, ``ratio``,
        ``num_in``, ``num_out``, ``degree``, ``num_unique_addr_in`` and
        ``num_unique_addr_out``.
    """
    # NOTE(review): input_val is summed over every row of `df`; if the rows are
    # input x output combinations each input is counted once per output -
    # confirm this is the intended "total value".
    total_val = df.input_val.sum()

    # Count distinct values once, and only for the columns that are needed,
    # instead of calling df.nunique() on the whole frame four times.
    distinct = df[['id_txi', 'id_txo_out', 'iadr', 'oadr']].nunique()
    num_in = distinct['id_txi']
    num_out = distinct['id_txo_out']
    num_unique_addr_in = distinct['iadr']
    num_unique_addr_out = distinct['oadr']

    ratio = num_in/float(num_out)
    degree = num_in + num_out

    data =  {'id':id_t,'total_val':total_val,'ratio':ratio,'num_in':num_in,'num_out':num_out,'degree':degree,'num_unique_addr_in':num_unique_addr_in,'num_unique_addr_out':num_unique_addr_out}
    df_ = pd.DataFrame(data,index=[0])
    return df_
    

In [None]:
# Build the per-transaction feature table (indexed by transaction id) and, as
# a by-product, the list of transaction ids with their summed output values.
col_names =  ['id','total_val','ratio','num_in','num_out','degree','num_unique_addr_in','num_unique_addr_out']
tx_ids = []
tx_id_val = []
feature_frames = []

tx_groups = df.groupby('id_t')  # split once instead of re-filtering df per id
# Series.iteritems() was removed in pandas 2.0; the value_counts() index gives
# the same transaction ids in the same (most frequent first) order.
for tx_id in df.id_t.value_counts().index:
    data = tx_groups.get_group(tx_id)
    tx_ids.append(tx_id)
    tx_id_val.append(data.output_val.sum())
    if data.input_val.sum() == 0:
        # No features for transactions whose inputs carry no value.
        continue
    feature_frames.append(feature_extract(tx_id, data))

# DataFrame.append was removed in pandas 2.0, and the old call passed the
# string 'sort=True' positionally as ignore_index. Collect the rows and
# concatenate once, keeping the ignore_index behaviour that call had.
if feature_frames:
    features = pd.concat(feature_frames, ignore_index=True)[col_names]
else:
    features = pd.DataFrame(columns=col_names)

features = features.set_index('id')

# Evaluating K Means clustering performance using elbow method

In [None]:
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

# Convert DataFrame to matrix
mat = features.values
distorsions = []
x = range(2, 20)
for k in x:
    #Perform K Means
    # Fixed seed so the curve is the same on every run.
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(mat)
    # inertia_ is the value plotted on the y axis for this k.
    distorsions.append(kmeans.inertia_)

fig = plt.figure(figsize=(15, 5))
plt.plot(x, distorsions)
plt.grid(True)
plt.title('Elbow curve')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
# Ticks must be set before the figure is shown, and show has to be called:
# the bare attribute `plt.show` did nothing.
plt.xticks(x)
plt.show()

# Run K Means and assign centroids to transactions

In [None]:
# Convert DataFrame to matrix
mat = features.values
# Using sklearn (fixed seed so the cluster assignment is reproducible)
km = sk.KMeans(n_clusters=4, random_state=0)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
data = {'transaction_id':features.index,'cluster_label':labels}
# pandas is imported as `pd` in this notebook; the bare name `pandas` is undefined.
results = pd.DataFrame(data)

# Trying to visualize K Means

In [None]:
from sklearn.decomposition import PCA
import pylab as pl

# Project the feature matrix onto its first two principal components.
pca = PCA(n_components=2).fit(mat)
pca_2d = pca.transform(mat)
pl.figure('Reference Plot')
pl.scatter(pca_2d[:, 0], pca_2d[:, 1])

# Fixed seed so the colouring is reproducible.
km = sk.KMeans(n_clusters=4, random_state=0)
km.fit(mat)
pl.figure('K-means with 4 clusters')
# Colour by the 4-cluster model fitted just above. `kmeans` is the leftover
# model from the last iteration of the elbow loop, not this one.
pl.scatter(pca_2d[:, 0], pca_2d[:, 1], c=km.labels_)
pl.show()



# Determining statistics of the inputs and outputs of a transaction

In [None]:
# Distinct values of each id/address column, ordered as value_counts() orders
# them (most frequent first). Series.iteritems() was removed in pandas 2.0,
# so the value_counts() index is read directly.
iadr = df.iadr.value_counts().index.tolist()
txi_ids = df.id_txi.value_counts().index.tolist()
oadr = df.oadr.value_counts().index.tolist()
tx_ids = df.id_t.value_counts().index.tolist()
txo_in_ids = df.id_txo_in.value_counts().index.tolist()
txo_out_ids = df.id_txo_out.value_counts().index.tolist()

# Summed output_val per transaction, aligned with tx_ids. One groupby replaces
# re-filtering the whole frame once per transaction id.
tx_id_val = df.groupby('id_t').output_val.sum().reindex(tx_ids).tolist()


In [None]:
# Sizes of the id lists, one per line: spent outputs, inputs, transactions,
# created outputs.
print(
    len(txo_in_ids),
    len(txi_ids),
    len(tx_ids),
    len(txo_out_ids),
    sep='\n',
)

In [None]:
# Ids that occur both as a spent output and as a created output.
matches = set(txo_in_ids) & set(txo_out_ids)
print(len(matches))

In [None]:
# Number of distinct input addresses, then distinct output addresses.
print(len(iadr), len(oadr), sep='\n')

In [None]:
# Addresses that occur both as an input address and as an output address.
matches = set(iadr) & set(oadr)
len(matches)

# Visualising the database using iGraph


In [None]:
import igraph
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
# Fetch the rows for block 339000 and turn each one into two edges:
# (spent output -> transaction) and (transaction -> created output).
result = query_database(get_block_data(339000, 339000))

tups1, tups2 = [], []
for record in result:
    tups1.append((record['id_txo_in'], record['id_t']))
    tups2.append((record['id_t'], record['id_txo_out']))


In [None]:
# NOTE(review): `result` has already been iterated by the loop in the previous
# cell, and it is not clear that the object returned by py2neo's run() offers
# a get_graph() method - confirm this call works as intended.
result.get_graph()

In [None]:
# Graph of the (id_txo_in, id_t) pairs, drawn with the "fr" layout.
ig = igraph.Graph.TupleList(tups1)

visual_style = {
    "vertex_size": 0.5,
    "layout": "fr",
}

igraph.plot(ig, **visual_style)

In [None]:
# Number of vertices in the graph built from the (id_txo_in, id_t) pairs.
ig.vcount()

In [None]:
# Graph of the (id_t, id_txo_out) pairs; vertex names are stored under the
# 'tx' attribute instead of the default one.
ig2 = igraph.Graph.TupleList(tups2, vertex_name_attr = 'tx')
# TODO: an unfinished assignment `g.vs['out'] = ` used to sit here. It had no
# right-hand side (a SyntaxError), and `g` is only created in a later cell.
# Restore it once the intended value is known.
igraph.plot(ig2,**visual_style)

In [None]:
# Start from an empty graph; the cells below add vertices to it.
g = igraph.Graph()

In [None]:
# Scratch edge between vertices referred to as 'A' and 'B', with a weight attribute.
# NOTE(review): g is created empty in the previous cell, so 'A' and 'B'
# presumably do not exist as vertices yet - confirm this call does not raise.
g.add_edge('A','B',weight = 20)


In [None]:
#Add the distinct created-output ids (txo_out_ids) to the graph as vertices.
#NOTE(review): this comment used to mention transactions and their values, but only output ids are added here.

g.add_vertices(txo_out_ids)





In [None]:
# Add the distinct spent-output ids (txo_in_ids) to the graph as vertices.
g.add_vertices(txo_in_ids)

In [None]:
# Number of distinct created-output ids.
len(txo_out_ids)

In [None]:
# Number of distinct transaction ids.
len(tx_ids)

In [None]:
# Number of distinct spent-output ids.
len(txo_in_ids)

In [None]:
# Current number of vertices in g (compare with the list sizes above).
g.vcount()

In [None]:
# Inspect the vertex at index 3 and its attributes.
g.vs[3]

In [None]:
#Insert tx inputs into graph
g.add_vertices(txo_in_ids)
# NOTE(review): the two assignments below target g.vs, i.e. every vertex
# currently in g, not only the vertices added on the previous line - confirm
# that is intended.
g.vs["type"] = 'in'
g.vs["in_ids"] = txo_in_ids
# Remove the "name" vertex attribute when it is present. An explicit check
# replaces the former blanket `except Exception: pass`, so unrelated errors
# are no longer hidden.
if "name" in g.vertex_attributes():
    del g.vs["name"]

In [None]:
#Insert tx outputs into graph
g.add_vertices(txo_out_ids)
# NOTE(review): the two assignments below target g.vs, i.e. every vertex
# currently in g, so a "type" set by an earlier cell is overwritten with
# 'out' - confirm that is intended.
g.vs["type"] = 'out'
g.vs["out_ids"] = txo_out_ids
# Remove the "name" vertex attribute when it is present. An explicit check
# replaces the former blanket `except Exception: pass`, so unrelated errors
# are no longer hidden.
if "name" in g.vertex_attributes():
    del g.vs["name"]

In [None]:
# Plot colour for each value of the vertex "type" attribute.
color_dict = {
    "tx": "green",
    "in": "orange",
    "out": 'blue',
}

In [None]:
# Plot options for g: vertex size, "fr" layout, and one colour per vertex
# looked up from its "type" attribute.
visual_style = {
    "vertex_size": 2,
    "layout": "fr",
    "vertex_color": [color_dict[vertex_type] for vertex_type in g.vs["type"]],
}

In [None]:
# Draw g using the options collected in visual_style.
igraph.plot(g, **visual_style)

In [None]:
# Build a weighted graph with one edge per distinct (name1, name2) pair, the
# weight being the number of rows in which that pair occurs.
tx_id = []

# NOTE(review): the column names 'name1'/'name2' are kept from the original
# cell; the other cells only use iadr/oadr/id_* columns, so confirm which two
# columns are meant to be the edge endpoints.
# A grouped count replaces the row-by-row loop, which called df.index() (an
# Index is not callable), used the removed df.ix accessor, appended to
# edgelist/weights lists that were never created, and searched the edge list
# once per row. sort=False keeps pairs in order of first appearance.
edge_counts = df.groupby(['name1', 'name2'], sort=False).size()
edgelist = list(edge_counts.index)
weights = edge_counts.tolist()

# TupleList creates the vertices named in the pairs; add_edges on an empty
# graph would have nothing to connect. The bare name `Graph` was also never
# imported; the class lives in the igraph module.
G = igraph.Graph.TupleList(edgelist)
G.es['weight'] = weights