## User Feature Extraction

In [1]:
import pandas as pd 
import time
import pickle
# Block range this notebook works on. Both values are substituted into the
# pickle file names read below, and the tagging cell iterates
# range(first_block, last_block + 1), i.e. the range is inclusive.
first_block = 401180
last_block = 401200

class User:
    """Container for the addresses and transactions attributed to one user.

    Attributes (all sets):
        adr: the addresses passed in as ``adrs``.
        cadr: starts empty; filled elsewhere.
        sending_tx: the transactions passed in as ``txs``.
        receiving_tx: an independent set, also seeded from ``txs``.
    """

    def __init__(self, adrs, txs):
        # Each attribute gets its own set object so that mutating one never
        # affects another. `adrs` is consumed first, then `txs` twice.
        self.adr = set(adrs)
        self.sending_tx = set(txs)
        # NOTE(review): receiving_tx is seeded with the same txs as
        # sending_tx -- confirm this is intended.
        self.receiving_tx = set(txs)
        self.cadr = set()
# Read the users found earlier and the DataFrame saved for the same block range.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles
# produced by this project. Objects inside the users pickle are presumably
# `User` instances, so the class above must be defined before loading.
users_pickle_path = '../pickles/users/users_{}_to_{}.pickle'.format(first_block, last_block)
with open(users_pickle_path, 'rb') as users_file:
    users = pickle.load(users_file)

df_pickle_path = '../pickles/df/{}_to_{}_users.pickle'.format(first_block, last_block)
df = pd.read_pickle(df_pickle_path)

  return f(*args, **kwds)


In [2]:
# Sender-side features: one row per `input_user`.
input_aggregations = {
    'id_txo_out': 'nunique',      # number of unique times paid out
    'oadr': 'nunique',            # number of unique output addresses paid
    'output_user': 'nunique',     # number of unique users paid (out-degree)
    'id_t': 'nunique',            # number of transactions involved in as sender
    'input_val': ['max', 'min'],  # largest / smallest input value
}
user_input_df = df.groupby('input_user').agg(input_aggregations)

# The aggregation yields two-level (column, function) labels; flatten them to
# "column_function" so they can be renamed below.
user_input_df.columns = list(map('_'.join, user_input_df.columns))

input_feature_names = {
    'id_txo_out_nunique': 'unique_sent',
    'oadr_nunique': 'unique_sent_adr',
    'output_user_nunique': 'unique_sent_user',  # (Out Degree)
    'id_t_nunique': 'tx1',
    'input_val_max': 'max_sent',
    'input_val_min': 'min_sent'
}
user_input_df = user_input_df.rename(columns=input_feature_names)

# Each row's input value is divided by that row's `num_txo` before summing per
# sender; the result is aligned to user_input_df by its `input_user` index.
sent_share_per_row = df['input_val'] / df['num_txo']
user_input_df['total_sent'] = sent_share_per_row.groupby(df['input_user']).sum()

# Receiver-side features: one row per `output_user`.
output_aggregations = {
    'id_txi': 'nunique',           # number of unique times paid in
    'iadr': 'nunique',             # number of unique input addresses that paid this user
    'input_user': 'nunique',       # number of unique users that paid in (in-degree)
    'id_t': 'nunique',             # number of transactions involved in as receiver
    'output_val': ['max', 'min'],  # largest / smallest output value
}
user_out_df = df.groupby('output_user').agg(output_aggregations)

# Flatten the two-level (column, function) labels to "column_function".
user_out_df.columns = list(map('_'.join, user_out_df.columns))

output_feature_names = {
    'id_txi_nunique': 'unique_rec',
    'iadr_nunique': 'unique_rec_adr',
    'input_user_nunique': 'unique_rec_user',  # (In Degree)
    'id_t_nunique': 'tx2',
    'output_val_max': 'max_rec',
    'output_val_min': 'min_rec'
}
user_out_df = user_out_df.rename(columns=output_feature_names)

# Each row's output value is divided by that row's `num_txi` before summing
# per receiver; the result is aligned to user_out_df by its `output_user` index.
received_share_per_row = df['output_val'] / df['num_txi']
user_out_df['total_rec'] = received_share_per_row.groupby(df['output_user']).sum()

# Merge input and output user features.
# The outer join keeps users that appear on only one side; the columns of the
# side they are missing from are NaN until the fill below.
user_df = user_input_df.merge(user_out_df, how='outer', left_index=True, right_index=True)

# Keep only the first len(users) rows (positional slice).
# NOTE(review): presumably index values beyond that are not real users and the
# merged index is sorted so that real users come first -- confirm.
user_df = user_df.iloc[:len(users)]

# Name index
user_df.index.name = 'user'

# Fill in NA values once, *before* deriving columns. The previous version
# computed num_tx both before and after the fill (the first result was NaN
# for one-sided users and was simply overwritten). fillna() returns a new
# frame, so later column assignments no longer write into an iloc slice.
user_df = user_df.fillna(0)

# New columns. tx1/tx2 are kept alongside their sum.
user_df['num_tx'] = user_df['tx1'] + user_df['tx2']

# A user is flagged as a miner when any row paying them has input address '0'.
temp = df.groupby('output_user').agg({
    'iadr': lambda x: (x == '0').any(),
})
temp = temp.rename(columns={'iadr': 'is_miner'})

# Align the flag to user_df's index; users with no incoming rows get False.
# Using reindex(fill_value=False) instead of a chained
# `user_df['is_miner'].fillna(False, inplace=True)` avoids the chained
# in-place write (a no-op under pandas copy-on-write) and yields a real
# boolean column rather than an object column of mixed True/False values.
user_df['is_miner'] = (
    temp['is_miner']
    .iloc[:len(users)]
    .reindex(user_df.index, fill_value=False)
    .astype(bool)
)

# Add column to indicate labelled users: everyone starts at 0.
user_df["activity"] = 0

# List of (user, total_sent) pairs.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (index, value) pairs and is also available in older pandas versions.
user_total_sent = list(user_df['total_sent'].items())

# Totals over every row of df (not only the rows kept in user_df).
# NOTE(review): the original author was unsure whether these are per-block
# totals -- confirm what they are meant to represent.
total_amt_spent = (df['input_val'] / df['num_txo']).sum()
total_amt_recieved = (df['output_val'] / df['num_txi']).sum()

The minimum supported version is 2.4.6



In [3]:
# Display the rows of users flagged as miners.
user_df.loc[user_df['is_miner'].eq(True)]

Unnamed: 0_level_0,tx1,unique_sent_adr,max_sent,min_sent,unique_sent_user,unique_sent,total_sent,unique_rec_adr,unique_rec_user,max_rec,min_rec,unique_rec,tx2,total_rec,num_tx,is_miner,activity
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
9447,1.0,1.0,25.6122,25.6122,1.0,1.0,25.6122,1.0,1.0,25.0837,25.0837,1.0,1.0,25.0837,2.0,True,0
6289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.225143,0.225143,1.0,1.0,0.225143,1.0,True,0
9417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,24.7908,24.7908,1.0,1.0,24.7908,1.0,True,0
9420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,25.1314,25.0525,3.0,3.0,75.2673,3.0,True,0
9424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,25.3741,25.0627,2.0,2.0,50.4368,2.0,True,0
9430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,25.1247,25.1003,2.0,2.0,50.225,2.0,True,0
9442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,25.369,25.369,1.0,1.0,25.369,1.0,True,0
9444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,25.4473,25.0065,10.0,10.0,252.3001,10.0,True,0
9453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,25.3107,25.3107,1.0,1.0,25.3107,1.0,True,0


## Tag Users with data from walletexplorer.com

In [4]:
# Build a lookup of labelled addresses, then flag every user that owns one.
# Dictionary structure -
# 'Address': 'Service'
data = pd.read_csv("../../wallet_explorer/wexplorergambling.csv")
service = data['Col'].tolist()

starttime = time.time()

# Read every service file exactly once and keep only the rows whose
# 'last used in block' lies in [first_block, last_block]. The previous version
# re-read each pickle from disk for every block in the range.
service_frames = []
for service_name in service:
    service_df = pd.read_pickle('../../wallet_explorer/gambling/{}'.format(service_name))
    in_range = service_df['last used in block'].between(first_block, last_block)
    service_frames.append(
        (service_name, service_df.loc[in_range, ['last used in block', 'address']])
    )

# Fill the dictionary in the same block-major / service-minor order as before,
# so an address listed more than once still ends up with the same label
# (last write wins).
dic_userlabels = {}
for block in range(first_block, last_block + 1):
    for service_name, frame in service_frames:
        addr = frame.loc[frame['last used in block'] == block, 'address'].tolist()
        for a in addr:
            dic_userlabels[a] = service_name
print("Total time to construct dictionary mapping addresses to labels:", time.time()-starttime)

# A user is labelled when any of their addresses (adr or cadr) is in the
# dictionary. isdisjoint() replaces the nested "for every user, for every
# labelled address" scan, which was slow enough to be interrupted by hand.
labelled_addresses = set(dic_userlabels)
tagged_users = [
    i for i, user in enumerate(users)
    if not labelled_addresses.isdisjoint(user.adr)
    or not labelled_addresses.isdisjoint(user.cadr)
]

# One masked assignment instead of a .loc write per user. Unlike
# `user_df.loc[i, 'activity'] = 1`, this never appends a new all-NaN row when
# a user position has no row in user_df.
user_df.loc[user_df.index.isin(tagged_users), 'activity'] = 1

KeyboardInterrupt: 