In [7]:
import pandas as pd 
from functools import reduce
import pickle
import time

# Inclusive range of block heights analysed in this notebook; both bounds are
# interpolated into the pickle file names read and written by later cells.
first_block, last_block = 401180, 401200

class User:
    """A group of addresses attributed to one entity, with its transaction ids.

    Attributes:
        adr: set built from ``adrs``.
        sending_tx: set built from ``txs``.
        cadr: set of change addresses; empty at construction.
        receiving_tx: set built from ``txs`` (a separate object from
            ``sending_tx``).
    """

    def __init__(self, adrs, txs):
        # Each attribute owns its own set so in-place updates never alias.
        self.adr, self.sending_tx = set(adrs), set(txs)
        # NOTE(review): receiving_tx starts with the same ids as sending_tx;
        # confirm that seeding it from ``txs`` is intended.
        self.cadr, self.receiving_tx = set(), set(txs)

In [8]:
# Load the transaction table pickled for the configured block range.
# NOTE: unpickling can execute arbitrary code, so only read trusted files.
df = pd.read_pickle('../pickles/df/{}_to_{}.pickle'.format(first_block, last_block))
#df = pd.read_csv('./csv/400000addr.csv')
#df[[col for col in df.columns if not 'Unnamed' in col]]

# For every row, the number of distinct output ids / input ids found in the
# row's transaction (grouped on id_t and broadcast back with transform).
for count_col, id_col in (('num_txo', 'id_txo_out'), ('num_txi', 'id_txi')):
    df[count_col] = df.groupby('id_t')[id_col].transform('nunique')

## Finding users by using heuristic measures

## Heuristic 1: addresses used together as inputs of a transaction belong to the same user

In [10]:
from collections import defaultdict

#CHANGE LATER TO QUERY DATABASE INSTEAD
def iadrs_from_tx(id_t):
    """Return the set of ``iadr`` values on the rows of ``df`` whose ``id_t`` equals ``id_t``."""
    rows_of_tx = df["id_t"] == id_t
    return set(df.loc[rows_of_tx, 'iadr'])

def oadrs_from_tx(id_t):
    """Return the set of ``oadr`` values on the rows of ``df`` whose ``id_t`` equals ``id_t``."""
    rows_of_tx = df["id_t"] == id_t
    return set(df.loc[rows_of_tx, 'oadr'])

def tx_from_iadr(iadr):
    """Return the set of ``id_t`` values on the rows of ``df`` whose ``iadr`` equals ``iadr``."""
    rows_with_input = df["iadr"] == iadr
    return set(df.loc[rows_with_input, 'id_t'])

#FUNCTION TO WHICH YOU GIVE AN INPUT ADDRESS AND GET USER 
def get_user(input_adr):
    """Build the User that owns ``input_adr`` by following shared inputs.

    Starting from ``input_adr``, every transaction in which a known address
    appears as an input is visited once, and all input addresses of that
    transaction are added to the same user. The search repeats for each newly
    found address until no new address turns up.

    Args:
        input_adr: address to start the search from.

    Returns:
        A ``User`` constructed from the collected input addresses (always
        containing ``input_adr``) and the ids of the visited transactions.
    """
    # Local import keeps the function usable without touching the import cell.
    from collections import deque

    # A deque gives O(1) removal from the front; list.pop(0) shifts every
    # remaining element on each call.
    to_inv = deque([input_adr])
    user_iadrs = {input_adr}
    seen_txs = set()
    while to_inv:
        current_iadr = to_inv.popleft()
        for id_t in tx_from_iadr(current_iadr):
            if id_t in seen_txs:
                continue
            seen_txs.add(id_t)
            iadrs = iadrs_from_tx(id_t)
            # Queue only addresses not yet attributed, then mark them so they
            # are never queued twice.
            to_inv.extend(iadrs - user_iadrs)
            user_iadrs |= iadrs

    return User(user_iadrs, seen_txs)

# List of User objects; index 0 is the bitcoin user created below.
users = []

starttime = time.time()

# Bitcoin user: owns the input address '0' found on the coinbase rows.
bitcoin = User({str(0)}, set())
# Maps each coinbase output address to the ids of the mining txs that paid it.
seen_miner_iadrs_tx = defaultdict(set)

# Walk the mining rows, reading just the two columns that are needed.
coinbase_rows = df[df['iadr'] == str(0)]
for miner_adr, id_t in zip(coinbase_rows['oadr'], coinbase_rows['id_t']):
    bitcoin.sending_tx.add(id_t)
    # defaultdict creates the set the first time a payout address is seen.
    seen_miner_iadrs_tx[miner_adr].add(id_t)

users.append(bitcoin)
# Addresses already attributed to some user; starts with the bitcoin address.
already_seen_iadr = {str(0)}

# make miners users
# One user per group of coinbase payout addresses.
for adr, id_ts in seen_miner_iadrs_tx.items():
    if adr in already_seen_iadr:
        # This payout address was already pulled into an earlier user (it is
        # co-spent with one of that user's addresses). Credit the coinbase txs
        # to that user rather than creating a second user that shares addresses.
        owner = next(existing for existing in users if adr in existing.adr)
        owner.receiving_tx.update(id_ts)
        continue
    miner = get_user(adr)  # all addresses co-spent (transitively) with adr
    miner.adr.add(adr)
    miner.receiving_tx.update(id_ts)  # the mining txs that paid this address
    users.append(miner)
    # Record the miner's addresses so no later user is built from them again.
    already_seen_iadr.update(miner.adr)

## ASSOCIATE INPUT ADDRESS AND TX WITH EACH USER IN BLOCK
# Build a user for every input address that no earlier user already owns.
for input_adr in df['iadr']:
    if input_adr in already_seen_iadr:
        continue  # address already attributed, nothing to do
    user = get_user(input_adr)
    already_seen_iadr |= user.adr
    users.append(user)


#TAKES OUTPUT ADDRESS AND GIVES USER THAT HAS THAT ADDRESS AS INPUT
def user_from_oadr(oadr):
    """Return the index in ``users`` of the first user whose ``adr`` holds ``oadr``.

    Returns ``None`` when no user in ``users`` has the address.
    """
    owners = (position for position, candidate in enumerate(users) if oadr in candidate.adr)
    return next(owners, None)
        
# (sender index, receiver index) -> number of (transaction, output address)
# pairs linking the two users.
edges = defaultdict(int)

#WHICH USERS HAVE TRANSACTED WITH EACH OTHER
for i, user in enumerate(users):
    for tx_id in user.sending_tx:
        for oadr in oadrs_from_tx(tx_id):
            if oadr not in already_seen_iadr:
                continue  # output address is not attributed to any user
            receiver = user_from_oadr(oadr)
            edges[(i, receiver)] += 1

print("Total time to process heuristic 1:", time.time()-starttime)

Total time to process heuristic 1: 740.1732258796692


## Heuristic 2: change addresses

In [11]:
# Load one pickled "otc" collection per block in the configured range.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
otc_dic = {}
for block in range(first_block,last_block+1,1):
    with open ('../pickles/otc/otc_{}.pickle'.format(block), 'rb') as fp:
        otc_dic[block] = pickle.load(fp)

# Chained symmetric difference: keeps the items that occur in an odd number of
# the per-block collections. The result is kept as a set, so the `in` tests and
# `.remove` calls made by the change-address loop are O(1) instead of a linear
# scan of a list. The empty-set initializer makes reduce safe when no block was
# loaded and does not change the result otherwise.
not_seen = reduce(set.symmetric_difference, (set(val) for val in otc_dic.values()), set())

#appeared_once_o= list(df.oadr.value_counts()[df.oadr.value_counts()==1].index)
#all_iadrs= list(df.iadr.value_counts().index)
#temp_df = df.loc[df['num_txo'] > 1]    #Don't regard any txs that only output to one address
#all_oadrs = set(df.oadr) #Remove all repeating adrs
#all_iadrs= set(df.iadr)
#o_never_used_as_i = set(appeared_once_o).difference(all_iadrs)

starttime = time.time()
#Change Transactions

# For each transaction a user sends: if exactly one of its output addresses is
# still in not_seen and is not already one of the user's addresses, that
# output is recorded as the user's change address.

for i, user in enumerate(users):
    cadrs_for_user = set()
    for tx_id in user.sending_tx:    #CHECK THIS
        potential_cadr = [
            oadr
            for oadr in oadrs_from_tx(tx_id)
            if oadr in not_seen and oadr not in user.adr
        ]
        if len(potential_cadr) != 1:
            continue  # zero or several candidates: no change address chosen
        (change_adr,) = potential_cadr
        cadrs_for_user.add(change_adr)
        # Remove it so the same change address is never given to another user.
        not_seen.remove(change_adr)
    user.cadr |= cadrs_for_user

print("Total time to process heuristic 2:", time.time()-starttime)

Total time to process heuristic 2: 32.02320313453674


In [None]:
from collections import Counter

# Sanity checks: flatten the per-user sets so entries shared between users
# show up as repeats.
# users[0] (the bitcoin user) is left out of the tx id check.
tx_ids = [tx_id for user in users[1:] for tx_id in user.sending_tx]
ads = [ad for user in users for ad in user.adr]
cads = [cad for user in users for cad in user.cadr]

# Collecting the items with count > 1 also works on an empty list, where
# indexing most_common(...)[0] would raise IndexError.
repeated_tx_ids = [tx_id for tx_id, count in Counter(tx_ids).items() if count > 1]
if not repeated_tx_ids:
    print("No repeats txid")
else:
    # Report the failure instead of staying silent.
    print("Repeated txid found:", len(repeated_tx_ids))

repeated_ads = [ad for ad, count in Counter(ads).items() if count > 1]
if not repeated_ads:
    print("No repeats ads")
else:
    print("Repeated ads found:", len(repeated_ads))

In [None]:
# Persist the list of users found so far for the configured block range.
with open('../pickles/users/users_{}_to_{}.pickle'.format(first_block, last_block), mode='wb') as users_file:
    pickle.dump(users, users_file)

In [None]:
#Read Users found
#with open('../pickles/users/users_{}_to_{}.pickle'.format(first_block,last_block), 'rb') as f:
#    users = pickle.load(f)

In [None]:
#df.groupby('id_t').agg({'oadr':['nunique', 'count'],'iadr':['nunique', 'count'],'id_txo_in':['nunique', 'count'],'id_txo_out':['nunique', 'count']})
#df.to_csv('users.csv', columns=['input_user','output_user'])

## Interactions between users (constructing user graph)

In [None]:
#Construct User Graph
starttime = time.time()

# Build address -> user-index lookups once, instead of re-scanning the whole
# column three times per user. setdefault keeps the lowest user index when an
# address appears under several users. Input addresses resolve through `adr`
# only; output addresses resolve through `adr` and then `cadr`.
input_owner = {}
output_owner = {}
for i, user in enumerate(users):
    for adr in user.adr:
        input_owner.setdefault(adr, i)
        output_owner.setdefault(adr, i)
    for cadr in user.cadr:
        output_owner.setdefault(cadr, i)

# Replace each address by its user's index; unattributed addresses are kept.
# Resolving in one step also means a substituted index can never be
# mistaken for an address on a later pass.
df['input_user'] = df['iadr'].apply(lambda adr: input_owner.get(adr, adr))
df['output_user'] = df['oadr'].apply(lambda adr: output_owner.get(adr, adr))

# Register every transaction with the user that owns its output address.
# Looking the owner up directly avoids an isinstance(..., int) test on the
# mixed column, which misses numpy integers when the column is all-integer.
for tx_id, oadr in zip(df['id_t'], df['oadr']):
    owner = output_owner.get(oadr)
    if owner is not None:
        users[owner].receiving_tx.add(tx_id)
print("Total time to construct user graph:", time.time()-starttime)

# Persist the frame (now carrying the user columns) and the updated users.
df.to_pickle("../pickles/df/{}_to_{}_users.pickle".format(first_block, last_block))
with open('../pickles/users/users_{}_to_{}.pickle'.format(first_block, last_block), mode='wb') as users_file:
    pickle.dump(users, users_file)
    
# can't trust input_val column now
# because dropped lots of inputs
# Keep one row per (input_user, output id) so an output is counted once per
# sending user, then total output_val for each (input_user, output_user) pair.
edges_df0 = df.drop_duplicates(subset=['input_user', 'id_txo_out'])
edges_df = (
    edges_df0
    .groupby(['input_user', 'output_user'])
    .apply(lambda pair_rows: pair_rows['output_val'].sum())
    .reset_index()
)
#edges_df0 = edges_df0.rename(columns={0: 'edge_amount'})

# edges_df2 = (
#     df.groupby(['input_user', 'output_user'])
#     .apply(lambda group: (group['output_val'] / group['num_txi']).sum())
#     .reset_index()
# )

# assert((edges_df == edges_df2).all())

# tups = []
# for i in range(0, edges_df.shape[0]):
#     tups.append((edges_df.at[i, 'input_user'],edges_df.at[i, 'output_user']))

# One (input_user, output_user, amount) tuple per edge, in row order.
tups = list(edges_df.itertuples(index=False, name=None))

with open("../pickles/user_graphs/{}_to_{}_users.pickle".format(first_block, last_block), mode='wb') as graph_file:
    pickle.dump(tups, graph_file)
#tups2 = [(input_user, output_user, amount) for (index, input_user, output_user, amount) in edges_df2.itertuples()]

# assert(all(tups == tups2))


In [None]:
# Quick inspection cell. NOTE: a notebook cell renders only its last
# expression, so the head() result below is computed but not displayed.
df.head()
# Count how many values of each Python type occur in the 'iadr' column.
df['iadr'].apply(type).value_counts()