In [None]:
import os

import numpy as np
import pandas as pd 
import py2neo
import pylab as pl
import scipy as sp
import scipy.spatial.distance  # makes sp.spatial.distance resolvable
import seaborn as sb ##includes convenient heatmaps and boxplots
import sklearn as sk
import sklearn.cluster  # a bare `import sklearn` does not guarantee its subpackages are loaded
import sklearn.decomposition
import sklearn.model_selection
import sklearn.preprocessing
import urllib3
from matplotlib import pyplot as plt

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Query Database

In [None]:
def query_database(query):
    """Run the Cypher ``query`` against the Bitcoin graph database.

    The endpoint and credentials default to the read-only guest account and
    can be overridden, without editing the notebook, through the environment
    variables ``NEO4J_URI``, ``NEO4J_USER`` and ``NEO4J_PASSWORD``.

    Returns whatever ``py2neo.Graph.run`` returns for the query.
    """
    # REMEMBER TO BE CONNECTED TO IMPERIAL WIFI!
    uri = os.environ.get("NEO4J_URI", "https://dsi-bitcoin.doc.ic.ac.uk:7473/db/data/")
    user = os.environ.get("NEO4J_USER", "guest_ro")
    password = os.environ.get("NEO4J_PASSWORD", "imperialO_nly")
    graph_db = py2neo.Graph(uri, auth=(user, password))
    return graph_db.run(query)

def get_block_data(first_block, last_block):
    """Build the Cypher query for blocks ``first_block``..``last_block`` (inclusive).

    The query returns one row per matched combination of input address,
    spent output, transaction and output address. Only the query text is
    returned; nothing is executed here.
    """
    pad = " " * 20  # every line of the query text carries this indentation
    clauses = (
        "MATCH (b:Block) <-[:MINED_IN]- (t:Tx) <-[:IN]- (txi:TxIn) <-[:UNLOCK]- (iadr:Address)",
        "WHERE b.height >= {} AND b.height <= {}".format(first_block, last_block),
        "MATCH (txi) <-[:SPENT]- (txo_in:TxOut) ",
        "MATCH (oadr:Address) <-[:LOCK]- (txo_out:TxOut) <-[:OUT]- (t)",
        "",
        "RETURN iadr.address as iadr, oadr.address as oadr, txo_in.value as input_val, txo_out.value as output_val, ID(txo_in) as id_txo_in, ID(txi) as id_txi, ID(t) as id_t, ID(txo_out) as id_txo_out",
    )
    return "\n" + "".join(pad + clause + "\n" for clause in clauses) + pad

def write_to_csv(result,string):
    """Save a query result as ``<string>.csv``, or print a warning if it has no rows.

    ``result`` must offer ``to_data_frame()``; the file is written as UTF-8
    without the index column.
    """
    frame = result.to_data_frame()
    if not frame.empty:
        frame.to_csv('{}.csv'.format(string), encoding='utf-8', index=False)
        return
    print("Something went wrong, there is no data for this/these blocks")

def seen_before1(block, adr):
    """Tell whether ``adr`` is locked by an output of a transaction mined below ``block``.

    Returns True when the query finds at least one such block, False otherwise.
    """
    query_string = """
                    MATCH (a:Address) <-[:LOCK]- (to:TxOut) <-[:OUT]- (t1:Tx)-[:MINED_IN]->(b1:Block)
                    WHERE a.address = "{adr}" AND b1.height<{block}
                    RETURN b1.height LIMIT 1
                    """.format(block=block, adr=adr)

    # A single returned row is enough: the address has been seen before.
    return not query_database(query_string).to_data_frame().empty

def seen_before2(block, adr):
    """Tell whether ``adr`` unlocks an input of a transaction mined below ``block``.

    Returns True when the query finds at least one such block, False otherwise.
    """
    query_string = """
                    MATCH (a:Address) -[:UNLOCK]-> (ti:TxIn) -[:IN]-> (t2:Tx)-[:MINED_IN]->(b2:Block)
                    WHERE a.address = "{adr}" AND b2.height<{block}
                    RETURN b2.height LIMIT 1
                    """.format(block=block, adr=adr)

    # A single returned row is enough: the address has been seen before.
    return not query_database(query_string).to_data_frame().empty



In [None]:
# Fetch the input/output rows of block 400000 and keep them as a DataFrame
block_query = get_block_data(400000,400000)
result = query_database(block_query)
df = result.to_data_frame()

In [None]:
#Report, column by column, whether all of its values are distinct
for column in df.columns:
    column_is_unique = df[column].is_unique
    print(column_is_unique)

In [None]:
# Output addresses ranked by the number of rows that mention them
oadr_counts = df['oadr'].value_counts()
print(oadr_counts.sort_values(ascending=False))
# Rows whose input address is the one picked out for a closer look
df1 = df[df['iadr'] == '1BQLNJtMDKmMZ4PyqVFfRuBNvoGhjigBKF']

# CLUSTERING

In [None]:
# One row per transaction (id_t): summed input value plus the number of
# distinct inputs, outputs, input addresses and output addresses.
# NOTE(review): df appears to hold one row per (input, output) combination,
# so the input_val sum may count each input once per output - confirm.
cluster = (
    df.groupby('id_t')
    .agg({
        'input_val': 'sum',
        'id_txi': 'nunique',
        'id_txo_out': 'nunique',
        'iadr': 'nunique',
        'oadr': 'nunique',
    }).rename(columns = {
        'iadr':'num_unique_addr_in',
        'oadr':'num_unique_addr_out'
    })
)

# Derived features: inputs per output, and total number of inputs and outputs
cluster['ratio'] = cluster['id_txi'] / cluster['id_txo_out']
cluster['degree'] = cluster['id_txi'] + cluster['id_txo_out']

cluster = cluster.drop(columns=['id_txi', 'id_txo_out'])

#Normalize data
scaler = sk.preprocessing.StandardScaler()
scaled_cluster = scaler.fit_transform(cluster)
cluster_scaled = pd.DataFrame(scaled_cluster, columns=cluster.columns, index=cluster.index)

data_corr = cluster_scaled.corr()
# The heatmap gets its own figure so the elbow curve below is not drawn on top of it
plt.figure()
sb.heatmap(data_corr, cmap = 'bwr') #heatmap of correlation matrix
plt.title('Correlation of transaction features')

#split data into train and test sets
clus_train, clus_test = sk.model_selection.train_test_split(cluster_scaled, test_size=0.3, random_state=123)

clusters = range(1,10)
meandist=[]

for k in clusters:
    # Fixed seed so that re-running the cell reproduces the same curve
    model = sk.cluster.KMeans(n_clusters = k, random_state=123)
    model.fit(clus_train)
    # Distance from every training point to its closest centroid
    nearest = sp.spatial.distance.cdist(clus_train, model.cluster_centers_, 'euclidean').min(axis=1)
    meandist.append(nearest.mean())

plt.figure()
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')

# Evaluating K Means clustering performance using elbow method

In [None]:
# Convert DataFrame to matrix
mat = cluster_scaled.values
distorsions = []
x = range(2, 20)
for k in x:
    #Perform K Means (fixed seed so the curve is reproducible)
    kmeans = sk.cluster.KMeans(n_clusters=k, random_state=123)
    kmeans.fit(mat)
    # inertia_ of the fitted model is the distortion plotted for this k
    distorsions.append(kmeans.inertia_)

fig = plt.figure(figsize=(15, 5))
plt.plot(x, distorsions)
plt.grid(True)
plt.title('Elbow curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
# Ticks are set before show(); the original never called show (missing parentheses)
plt.xticks(x)
plt.show()

# Run K Means and assign centroids to transactions

In [None]:
# Using sklearn (fixed seed so the cluster labels are reproducible across runs)
km = sk.cluster.KMeans(n_clusters=5, random_state=123)
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame: one row per transaction with its cluster label
data = {'transaction_id':cluster_scaled.index,'cluster_label':labels}
results = pd.DataFrame(data)

# Trying to visualize K Means

In [None]:
# Project the scaled features onto their first two principal components
pca = sk.decomposition.PCA(n_components=2).fit(mat)
pca_2d = pca.transform(mat)
pl.figure('Reference Plot')
pl.scatter(pca_2d[:, 0], pca_2d[:, 1])

km = sk.cluster.KMeans(n_clusters=4, random_state=123)
km.fit(mat)
pl.figure('K-means with 4 clusters')
# Colour by the 4-cluster model fitted just above; `kmeans` is the leftover
# model from the last iteration of the elbow loop, not this one.
pl.scatter(pca_2d[:, 0], pca_2d[:, 1], c=km.labels_)
pl.show()

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the scaled features; seeded so re-runs give the same layout
mat_tsne = TSNE(n_components=2, random_state=123).fit_transform(mat)

# Finding users by using heuristic measures

In [None]:
from collections import defaultdict

# Any new key starts as a dict holding two empty sets, under 'iadr' and 'oadr'
users_dict = defaultdict(lambda: {'iadr': set(), 'oadr': set()})
# Any new key starts as a dict holding one empty set under 'user'
address_dict = defaultdict(lambda: {'user':set()})

In [None]:
# Number of distinct input addresses per transaction
iadr = (
    df.groupby('id_t')
    .agg({
        'iadr': 'nunique'
    })
)

# users_dict = {user: iadr, oadr for (user, iadr, oadr) in df['id_']}

df.head()

In [None]:
# Toy example: record input and output addresses for one user
u = 123
inputs = [1,2,3,4,5]
outputs = [3,4,5,6]

# New users_dict entries only have the keys 'iadr' and 'oadr'; any other key raises KeyError
users_dict[u]['iadr'].update(inputs)
users_dict[u]['oadr'].update(outputs)

In [None]:
def iadrs_from_tx(id_t):
    """Toy lookup: the set of input addresses of transaction ``id_t``.

    Only transactions 1 and 2 are known; any other id raises KeyError.
    """
    toy_inputs = {1: [7, 8], 2: [7, 9]}
    return set(toy_inputs[id_t])

def tx_from_iadr(iadr):
    """Toy lookup: the set of transactions that use ``iadr`` as an input address.

    Only addresses 7 to 10 are known; any other address raises KeyError.
    """
    toy_transactions = {7: [1, 2], 8: [1], 9: [2], 10: [3]}
    return set(toy_transactions[iadr])

In [None]:
# Display the first rows of df
df.head()

In [None]:
from collections import defaultdict

#CHANGE LATER TO QUERY DATABASE INSTEAD
def iadrs_from_tx(id_t):
    """Set of 'iadr' values on the rows of ``df`` whose 'id_t' equals ``id_t``."""
    return set(df.loc[df["id_t"] == id_t, 'iadr'])

def oadrs_from_tx(id_t):
    """Set of 'oadr' values on the rows of ``df`` whose 'id_t' equals ``id_t``."""
    return set(df.loc[df["id_t"] == id_t, 'oadr'])

def tx_from_iadr(iadr):
    """Set of 'id_t' values on the rows of ``df`` whose 'iadr' equals ``iadr``."""
    return set(df.loc[df["iadr"] == iadr, 'id_t'])

#FUNCTION TO WHICH YOU GIVE AN INPUT ADDRESS AND GET USER 
def get_user(input_adr):
    """Collect the user that owns ``input_adr``.

    Starting from ``input_adr``, repeatedly follows "address -> transactions
    it is an input of -> all input addresses of those transactions" until no
    new address turns up.

    Returns a ``User`` holding every input address and transaction reached.
    """
    # Work list used as a stack: the visiting order does not affect the final
    # sets, and popping from the end avoids the O(n) cost of list.pop(0).
    to_inv = [input_adr]
    user_iadrs = set()
    seen_txs = set()
    while to_inv:
        current_iadr = to_inv.pop()
        user_iadrs.add(current_iadr)
        for id_t in tx_from_iadr(current_iadr):
            if id_t in seen_txs:
                continue
            seen_txs.add(id_t)
            iadrs = iadrs_from_tx(id_t)
            # Queue only addresses not met before, then mark them as met so
            # that no address is ever queued twice.
            to_inv.extend(iadrs.difference(user_iadrs))
            user_iadrs.update(iadrs)

    return User(user_iadrs, seen_txs)

#LIST OF User OBJECTS, ONE PER GROUP OF LINKED INPUT ADDRESSES (FILLED BY THE LOOP BELOW)
users = []

class User:
    """A group of addresses attributed to one user.

    Attributes are plain sets:
      iadr - input addresses of the user
      tx   - transactions those addresses are inputs of
      cadr - change addresses, empty until filled in by later analysis
    """

    def __init__(self, iadrs, txs):
        self.iadr = set(iadrs)
        self.tx = set(txs)
        self.cadr = set()

    def __len__(self):
        """Number of input addresses, so that ``len(user)`` works in the statistics cells."""
        return len(self.iadr)
        

#BUILD ONE USER PER NOT-YET-ASSIGNED INPUT ADDRESS OF THE BLOCK
already_seen_iadr = set()
for input_adr in df.iadr:
    if input_adr in already_seen_iadr:
        continue  # this address already belongs to an earlier user
    user = get_user(input_adr)
    users.append(user)
    already_seen_iadr |= user.iadr

#TAKES OUTPUT ADDRESS AND GIVES USER THAT HAS THAT ADDRESS AS INPUT
def user_from_oadr(oadr):
    """Index in ``users`` of the first user that has ``oadr`` among its input addresses.

    Returns None when no user has it.
    """
    return next(
        (idx for idx, candidate in enumerate(users) if oadr in candidate.iadr),
        None,
    )
        
# (sender index, receiver index) -> number of matching output addresses
edges = defaultdict(int)

#COUNT WHICH USERS HAVE TRANSACTED WITH EACH OTHER
for i,user in enumerate(users):
    for tx_id in user.tx:
        for oadr in oadrs_from_tx(tx_id):
            if oadr not in already_seen_iadr:
                continue  # output address does not belong to any known user
            receiver = user_from_oadr(oadr)
            if i != receiver:
                edges[(i, receiver)] += 1
            

In [None]:
# Change-address heuristic: for a transaction that pays none of the user's own
# input addresses, an output address is recorded as the user's change address
# when it is the ONLY output address never seen before block 400000.
for i,user in enumerate(users):
    for tx_id in user.tx:
        o = oadrs_from_tx(tx_id)
        if user.iadr.intersection(o):
            continue
        # Reset per transaction so a candidate from a previous one cannot leak in
        ch_addr = None
        for oadr in o:
            adr = str(oadr)
            # "Seen before" means appearing earlier either as an output or as an input
            if seen_before1(400000, adr) or seen_before2(400000, adr):
                continue
            if ch_addr is not None:
                # A second never-seen address makes the choice ambiguous
                ch_addr = None
                break
            ch_addr = oadr
        if ch_addr is not None:
            user.cadr.add(ch_addr)

In [None]:
addresses1 = list(df.oadr.value_counts()[df.oadr.value_counts()>1].index)
addresses2 = list(df.oadr.value_counts()[df.oadr.value_counts()==1].index)

In [None]:
addresses = list(df.oadr.value_counts().index)

In [None]:
blk = 400000

not_seen =[]
import time

# Collect the output addresses that never appeared before block `blk`,
# printing how long the two database look-ups took for each address.
for adr in addresses:
    # Timer starts per address, so the first duration is not measured from the epoch
    started = time.time()
    if not (seen_before1(blk, adr) or seen_before2(blk, adr)):
        not_seen.append(adr)
    print(adr, " processed in ", time.time()-started)

In [None]:
blk = 400000
# Quick check on the first five repeated output addresses only
not_seen = [
    adr
    for adr in addresses1[:5]
    if not (seen_before1(blk, adr) or seen_before2(blk, adr))
]

In [None]:
#Average number of input addresses per user
sum(len(user.iadr) for user in users)/len(users)

In [None]:
import collections as c

# number of input addresses -> how many users have exactly that many
count = c.Counter(len(user.iadr) for user in users)

In [None]:
# Number of distinct output addresses in df
df.oadr.nunique()

In [None]:
# Plot in increasing key order; dict order would make the line zig-zag
sizes = sorted(count)
plt.plot(sizes, [count[size] for size in sizes])
plt.xlabel('Addresses per user')
plt.ylabel('Number of users')

In [None]:
# Addresses that occur both as an input and as an output address
matches = set(df.iadr) & set(df.oadr)
print(matches)


# Determining statistics of inputs, outputs of a transaction

In [None]:
# Distinct ids / addresses of the block. Each list is ordered as value_counts()
# orders its index (most frequent value first).
iadr = df.iadr.value_counts().index.tolist()
txi_ids = df.id_txi.value_counts().index.tolist()
oadr = df.oadr.value_counts().index.tolist()
tx_ids = df.id_t.value_counts().index.tolist()
txo_in_ids = df.id_txo_in.value_counts().index.tolist()
txo_out_ids = df.id_txo_out.value_counts().index.tolist()

# Summed output_val over the rows of each transaction, aligned with tx_ids.
# One groupby replaces a full DataFrame filter per transaction.
tx_id_val = df.groupby('id_t').output_val.sum().reindex(tx_ids).tolist()


In [None]:
print(len(txo_in_ids))
print(len(txi_ids))
print(len(tx_ids))
print(len(txo_out_ids))

In [None]:
matches = set(txo_in_ids).intersection(txo_out_ids)
print(len(matches))

In [None]:
print(len(iadr))
print(len(oadr))

In [None]:
matches = set(iadr).intersection(oadr)
len(matches)

# Visualising the database using iGraph


In [None]:
import igraph
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
result = query_database(get_block_data(400000,400000))

# Edge lists for plotting: (spent output -> transaction) and (transaction -> output).
# Both are filled in one pass over the result.
tups1, tups2 = [], []
for record in result:
    tups1.append((record['id_txo_in'], record['id_t']))
    tups2.append((record['id_t'], record['id_txo_out']))


In [None]:
# NOTE(review): confirm that the object returned by query_database() offers
# get_graph(); `result` has also already been iterated by the loop in the previous cell.
result.get_graph()

In [None]:
# Graph built from the (spent output, transaction) pairs
ig = igraph.Graph.TupleList(tups1)

# Plot options passed to igraph.plot as keyword arguments
visual_style = {"vertex_size": 0.5, "layout": "fr"}

igraph.plot(ig,**visual_style)

In [None]:
# Display the vertex count of ig
ig.vcount()

In [None]:
# Graph built from the (transaction, output) pairs; vertex names are stored under 'tx'
ig2 = igraph.Graph.TupleList(tups2, vertex_name_attr = 'tx')
# TODO: the original cell contained an unfinished `g.vs['out'] = ` assignment here
# (a SyntaxError); decide which attribute, if any, was meant to be set on ig2.
igraph.plot(ig2,**visual_style)

In [None]:
# Empty graph that the following cells add vertices and edges to
g = igraph.Graph()

In [None]:
# add_edge looks its endpoints up by name, so the two vertices must exist first
g.add_vertices(['A', 'B'])
g.add_edge('A','B',weight = 20)


In [None]:
#Add one vertex per entry of txo_out_ids to the graph

g.add_vertices(txo_out_ids)





In [None]:
#Add one vertex per entry of txo_in_ids to the graph
g.add_vertices(txo_in_ids)

In [None]:
# Length of txo_out_ids
len(txo_out_ids)

In [None]:
# Length of tx_ids
len(tx_ids)

In [None]:
# Length of txo_in_ids
len(txo_in_ids)

In [None]:
# Number of vertices currently in g
g.vcount()

In [None]:
# Inspect the vertex at index 3
g.vs[3]

In [None]:
#Insert tx inputs into graph
g.add_vertices(txo_in_ids)
g.vs["type"] = 'in'
g.vs["in_ids"] = txo_in_ids
try:
    del g.vs["name"]
except Exception: 
  pass    

In [None]:
#Insert tx outputs into graph
g.add_vertices(txo_out_ids)
g.vs["type"] = 'out'
g.vs["out_ids"] = txo_out_ids
try:
    del g.vs["name"]
except Exception: 
  pass    

In [None]:
color_dict = {"tx": "green", "in": "orange", "out":'blue'}

In [None]:
visual_style = {}
visual_style["vertex_size"] = 2
visual_style["layout"] = "fr"
visual_style["vertex_color"] = [color_dict[type] for type in g.vs["type"]]

In [None]:
igraph.plot(g, **visual_style)

In [None]:
from collections import Counter

# NOTE(review): the original cell read columns 'name1'/'name2', which the block
# query does not return; input and output addresses are assumed here - confirm.
SOURCE_COL, TARGET_COL = 'iadr', 'oadr'

# How many rows of df link each (source, target) pair, in first-seen order
edge_counts = Counter(zip(df[SOURCE_COL], df[TARGET_COL]))
edgelist = list(edge_counts)
weights = [edge_counts[edge] for edge in edgelist]

# TupleList creates the named vertices itself; with weights=True the third
# item of every tuple is stored as the edge attribute 'weight'.
G = igraph.Graph.TupleList(
    [(src, dst, weight) for (src, dst), weight in edge_counts.items()],
    weights=True,
)