In [3]:
import pandas as pd 
import pickle
import time
import py2neo
import urllib3
import os.path
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def query_database(query):
    """Run a Cypher query against the Bitcoin Neo4j server and return the py2neo result.

    The connection is created on the first call and reused afterwards, so the
    per-address lookups in the notebook do not rebuild a ``py2neo.Graph`` each time.

    Credentials are read from the ``NEO4J_USER`` and ``NEO4J_PASSWORD``
    environment variables instead of being committed in the notebook.

    Parameters
    ----------
    query : str
        Cypher query text.

    Returns
    -------
    The object returned by ``py2neo.Graph.run`` (the callers in this notebook
    use its ``to_data_frame()`` method).

    Raises
    ------
    RuntimeError
        If either credential environment variable is missing or empty.
    """
    # REMEMBER TO BE CONNECTED TO IMPERIAL WIFI!
    graph_db = getattr(query_database, "_graph_db", None)
    if graph_db is None:
        user = os.environ.get("NEO4J_USER")
        password = os.environ.get("NEO4J_PASSWORD")
        if not user or not password:
            raise RuntimeError(
                "Set the NEO4J_USER and NEO4J_PASSWORD environment variables "
                "before querying the database."
            )
        graph_db = py2neo.Graph("https://dsi-bitcoin.doc.ic.ac.uk:7473/db/data/", auth=(user, password))
        # Cache on the function object so later calls reuse the same connection.
        query_database._graph_db = graph_db
    return graph_db.run(query)

def get_block_data(blk):
    """Build the Cypher query listing input/output address pairs for one block.

    The query selects transactions mined in the block whose height equals
    ``blk``, follows each transaction input to the address that unlocks it and
    to the output it spends, and pairs that with every output address of the
    same transaction. Nothing is executed here; only the query text is returned.

    Returned columns: iadr, oadr, input_val, output_val, id_txo_in, id_txi,
    id_t, id_txo_out.
    """
    template = """
                    MATCH (b:Block) <-[:MINED_IN]- (t:Tx) <-[:IN]- (txi:TxIn) <-[:UNLOCK]- (iadr:Address)
                    WHERE b.height = {}
                    MATCH (txi) <-[:SPENT]- (txo_in:TxOut) 
                    MATCH (oadr:Address) <-[:LOCK]- (txo_out:TxOut) <-[:OUT]- (t)
                    
                    RETURN iadr.address as iadr, oadr.address as oadr, txo_in.value as input_val, txo_out.value as output_val, ID(txo_in) as id_txo_in, ID(txi) as id_txi, ID(t) as id_t, ID(txo_out) as id_txo_out
                    """
    return template.format(blk)

def get_coinbase(blk):
    """Build the Cypher query listing the coinbase outputs of one block.

    The query selects transactions that have a ``CBscript`` input node and
    returns the output addresses and values of those transactions, filtered to
    the block whose height equals ``blk``. Only the query text is returned.

    Returned columns: oadr, output_val, id_txi (the CBscript node id), id_t,
    id_txo_out.
    """
    template = """
                    MATCH (b:Block) <-[:MINED_IN]- (t:Tx)<-[:IN]-(cb:CBscript)
                    MATCH (oadr:Address) <-[:LOCK]- (txo_out:TxOut) <-[:OUT]- (t)
                    WHERE b.height = {}
                    RETURN                       oadr.address as oadr,                            txo_out.value as output_val,                          ID(cb) as id_txi, ID(t) as id_t, ID(txo_out) as id_txo_out             
                    """
    return template.format(blk)

def check_block(first_block, last_block):
    """Build the Cypher query returning every Block node in a height range.

    Both bounds are inclusive (``>= first_block`` and ``<= last_block``).
    Only the query text is returned; nothing is executed here.
    """
    template = """ 
                    MATCH (b:Block) 
                    WHERE b.height >= {} AND b.height <= {}
                    RETURN b
                    """
    return template.format(first_block, last_block)

def write_to_csv(result,string):
    """Dump a query result to ``<string>.csv``.

    ``result`` must expose ``to_data_frame()``. When the resulting frame is
    empty a message is printed and no file is written; otherwise the frame is
    saved as UTF-8 CSV without the index column.
    """
    frame = result.to_data_frame()

    # Nothing to save: report it and leave the filesystem untouched.
    if frame.empty:
        print("Something went wrong, there is no data for this/these blocks")
        return

    target = '{}.csv'.format(string)
    frame.to_csv(target, encoding='utf-8', index=False)
        
def seen_before_out(block, adr):
    """Tell whether ``adr`` locks an output of a transaction mined below height ``block``.

    Runs a ``LIMIT 1`` query through ``query_database`` and returns True when
    at least one matching row comes back, False otherwise.
    """
    query_string = """
                    MATCH (a:Address) <-[:LOCK]- (to:TxOut) <-[:OUT]- (t1:Tx)-[:MINED_IN]->(b1:Block)
                    WHERE a.address = "{adr}" AND b1.height<{block}
                    RETURN b1.height LIMIT 1
                    """.format(block=block, adr=adr)

    earlier_hits = query_database(query_string).to_data_frame()
    # Any returned row means the address appeared as an output in an earlier block.
    return not earlier_hits.empty

def seen_before_in(block, adr):
    """Tell whether ``adr`` unlocks an input of a transaction mined below height ``block``.

    Runs a ``LIMIT 1`` query through ``query_database`` and returns True when
    at least one matching row comes back, False otherwise.
    """
    query_string = """
                    MATCH (a:Address) -[:UNLOCK]-> (ti:TxIn) -[:IN]-> (t2:Tx)-[:MINED_IN]->(b2:Block)
                    WHERE a.address = "{adr}" AND b2.height<{block}
                    RETURN b2.height LIMIT 1
                    """.format(block=block, adr=adr)
    earlier_hits = query_database(query_string).to_data_frame()
    # Any returned row means the address appeared as an input in an earlier block.
    return not earlier_hits.empty

In [4]:
# Block heights to process; the loops below iterate over this range inclusively.
first_block, last_block = 400000, 410000

## Get block(s) data

In [55]:
# Download every block in [first_block, last_block] and pickle one DataFrame per block.
# This cell always re-queries the database, even when a pickle already exists.
starttime = time.time()

for block in range(first_block, last_block + 1):
    temptime = time.time()
    print(block)
    df = query_database(get_block_data(block)).to_data_frame()

    # Append coinbase txs to dataframe. The coinbase query has no input-side
    # columns, so fill them with placeholders. assign() adds them by name and
    # therefore also works when the coinbase result is empty (positional
    # insert() raised in that case).
    df_cb = query_database(get_coinbase(block)).to_data_frame()
    df_cb = df_cb.assign(iadr='0', id_txo_in=0, input_val=0)

    # DataFrame.append was removed in pandas 2.0; concat aligns columns by name
    # and ignore_index=True replaces the reset_index()/drop('index') pair.
    df = pd.concat([df, df_cb], ignore_index=True)

    # Write to a temporary file and rename, so an interrupted run never leaves
    # a truncated pickle that later cells would load as a valid cache entry.
    pickle_path = "../pickles/df/{}.pickle".format(block)
    df.to_pickle(pickle_path + ".tmp")
    os.replace(pickle_path + ".tmp", pickle_path)

    print(block, " processed in ", time.time()-temptime)

# The elapsed time covers the whole range, not just the last block.
print("Total time to process blocks {}-{}: {}".format(first_block, last_block, time.time()-starttime))

In [5]:
def all_block_data(block):
    """Return the transaction DataFrame for ``block``, using a pickle cache on disk.

    If ``../pickles/df/<block>.pickle`` exists it is loaded and returned.
    Otherwise the regular and coinbase queries are run, their results are
    combined into one frame, the frame is pickled to that path, and it is
    returned.

    Parameters
    ----------
    block : int
        Block height, as used in the query filters and the cache file name.

    Returns
    -------
    pandas.DataFrame
        Rows from the block-data query followed by the coinbase rows. Coinbase
        rows carry the placeholders ``iadr='0'``, ``id_txo_in=0`` and
        ``input_val=0``.
    """
    cache_path = "../pickles/df/{}.pickle".format(block)
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code; only load caches this notebook wrote.
        return pd.read_pickle(cache_path)

    df = query_database(get_block_data(block)).to_data_frame()

    # Append coinbase txs to dataframe. assign() adds the missing input-side
    # columns by name, so it also works when the coinbase result is empty
    # (positional insert() raised in that case).
    df_cb = query_database(get_coinbase(block)).to_data_frame()
    df_cb = df_cb.assign(iadr='0', id_txo_in=0, input_val=0)

    # DataFrame.append was removed in pandas 2.0; concat aligns columns by name
    # and ignore_index=True replaces the reset_index()/drop('index') pair.
    df = pd.concat([df, df_cb], ignore_index=True)

    # Write to a temporary file and rename, so an interrupted run cannot leave
    # a truncated pickle that the cache check above would later trust.
    df.to_pickle(cache_path + ".tmp")
    os.replace(cache_path + ".tmp", cache_path)
    return df

## Determine addresses (per Block) seen for the first time

In [6]:
# For each block, collect the output addresses that never appear (as input or
# output) in any earlier block, and pickle that list per block. Blocks whose
# result file already exists are skipped, so the cell can be resumed.
initialtime = time.time()
for block in range(first_block, last_block + 1):
    starttime = time.time()
    print(block)
    otc_path = '../pickles/otc/otc_{}.pickle'.format(block)
    if os.path.exists(otc_path):
        continue

    df = all_block_data(block)

    if df.empty:
        print("Block {} contained nothing".format(block))
        continue

    # Distinct output addresses of the block, most frequent first.
    addresses = list(df.oadr.value_counts().index)

    # `or` short-circuits, so the output-side query only runs when the
    # input-side query found nothing.
    not_seen = [
        adr for adr in addresses
        if not (seen_before_in(block, adr) or seen_before_out(block, adr))
    ]

    with open('../pickles/otc/out.txt', 'a') as f:
        print("Total time to process block {}: {}".format(block, time.time()-starttime), file=f)

    # Write to a temporary file and rename: an interrupted dump must not leave
    # a partial pickle behind, because the existence check above would then
    # skip this block on the next run.
    with open(otc_path + '.tmp', 'wb') as f:
        pickle.dump(not_seen, f)
    os.replace(otc_path + '.tmp', otc_path)

with open('../pickles/otc/out.txt', 'a') as f:
    print("Total time to process everything: {}".format(time.time()-initialtime), file=f)

400000
400001
400002
400003
400004
400005
400006
400007
400008
400009
400010
400011
400012
400013
400014
400015
400016
400017
400018
400019
400020
400021
400022
400023
400024
400025
400026
400027
400028
400029
400030
400031
400032
400033
400034
400035
400036
400037
400038
400039
400040
400041
400042
400043
400044
400045
400046
400047
400048
400049
400050
400051
400052
400053
400054
400055
400056
400057
400058
400059
400060
400061
400062
400063
400064
400065
400066
400067
400068
400069
400070
400071
400072
400073
400074
400075
400076
400077
400078
400079
400080
400081
400082
400083
400084
400085
400086
400087
400088
400089
400090
400091
400092
400093
400094
400095
400096
400097
400098
400099
400100
400101
400102
400103
400104
400105
400106
400107
400108
400109
400110
400111
400112
400113
400114
400115
400116
400117
400118
400119
400120
400121
400122
400123
400124
400125
400126
400127
400128
400129
400130
400131
400132
400133
400134
400135
400136
400137
400138
400139
400140
400141
400142

KeyboardInterrupt: 

In [None]:
# Load the pickled object stored under ../pickles/services/ (presumably a
# DataFrame describing mining pools — confirm against whatever wrote the file).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
pool_df = pd.read_pickle('../pickles/services/pools.pickle')
#print(pool_df['last used in block'].value_counts().sort_values(ascending=False))

In [None]:
# NOTE: relies on `df` already being defined by an earlier cell.
# Count each output address once and reuse the counts for both selections.
oadr_counts = df.oadr.value_counts()
addresses1 = list(oadr_counts[oadr_counts > 1].index)   # addresses appearing more than once
addresses2 = list(oadr_counts[oadr_counts == 1].index)  # addresses appearing exactly once

In [None]:
# Spot check on the first five repeated addresses: keep those with no earlier
# appearance as input or output. Uses `block` and `addresses1` from earlier cells.
not_seen = [
    adr
    for adr in addresses1[:5]
    if not (seen_before_in(block, adr) or seen_before_out(block, adr))
]