In [0]:
"""
Consensus using AI:
Extracting deep node features using unsupervised training
"""

# Terminal

In [None]:
"""
Run these commands in a terminal to generate txedges.dat
"""

In [0]:
%%bash

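# NOTE: every command below is commented out, so this cell does nothing as written.
# Run them without a leading `!`: that prefix is IPython shell-escape syntax and is
# not valid inside a %%bash cell or a plain terminal.

# wget "https://senseable2015-6.mit.edu/bitcoin/txin.dat.xz"
# wget "https://senseable2015-6.mit.edu/bitcoin/txout.dat.xz"

# git clone https://github.com/dkondor/txedges.git

# g++ -o txedge txedge.cpp -std=gnu++14 -O3 -march=native

# ./txedge -ix txin.dat.xz -ox txout.dat.xz > txedges.dat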

--2019-08-22 18:35:43--  https://senseable2015-6.mit.edu/bitcoin/txin.dat.xz
Resolving senseable2015-6.mit.edu (senseable2015-6.mit.edu)... 18.7.25.22
Connecting to senseable2015-6.mit.edu (senseable2015-6.mit.edu)|18.7.25.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7546653316 (7.0G) [application/x-ns-proxy-autoconfig]
Saving to: ‘txin.dat.xz’


2019-08-22 18:37:14 (83.9 MB/s) - ‘txin.dat.xz’ saved [7546653316/7546653316]



# Imports

In [1]:
# from google.colab import drive
# drive.mount('./gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at ./gdrive


In [2]:
# from google.colab import files
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import keras
import matplotlib.pyplot as plt
from IPython import display

Using TensorFlow backend.


# Data

In [None]:
"""
path_to_txedges: Path to the txedges.dat file, generated from txin.dat and txout.dat
                 with https://github.com/dkondor/txedges
                 (see the "Terminal" section above).
"""
# NOTE(review): this path lives under ./gdrive, but the drive.mount(...) call earlier in
# the notebook is commented out — confirm the file is reachable on a fresh kernel.
path_to_txedges = './gdrive/My Drive/txedges.dat'

In [0]:
"""
ln: Number of transactions to consider
strt: Transaction number to start from
"""
# Number of rows read from txedges.dat. It is also the sequence length of every
# model input, because each address history has one slot per row read.
ln = 10000
# Number of leading lines of txedges.dat that are skipped before reading.
strt = 300000

In [0]:
# Read `ln` tab-separated rows after skipping the first `strt` lines. header=None means
# no line is treated as column names, so columns are addressed by position below.
graph = pd.read_csv(path_to_txedges,delimiter='\t',skiprows=strt,nrows=ln,header=None)

In [0]:
"""
in_add: Receiver address of transaction
out_add: Sender address of transaction
tr_amt: Transaction amount in Satoshis (1e-8 BTC)
"""
# Positional columns 1, 2 and 3 of the edge list; column 0 is not read here.
# NOTE(review): a Bitcoin transaction *input* address is normally the spending side,
# which would make `in_add` the sender rather than the receiver — confirm the
# receiver/sender labelling against the column order that txedges writes.
in_add = graph.iloc[:,1].values
out_add = graph.iloc[:,2].values
tr_amt = graph.iloc[:,3].values

In [0]:
"""
Normalisation of transaction amount for better training
"""
# Log-compress the amounts, then min-max scale them; the result is a column vector
# of shape (n_rows, 1) because of the reshape(-1,1).
# NOTE(review): np.log returns -inf for a zero amount — confirm tr_amt is strictly
# positive for the selected rows.
mmsc = MinMaxScaler()
sc_tr_amt = mmsc.fit_transform(np.log(tr_amt).reshape(-1,1))

In [8]:
"""
add_dic: Address dictionary. Contains each address, with the info on transactions they participated in.
"""
# Build, for every address, the list of [signed scaled amount, row index] entries of
# the rows it appears in: +amount when it is the `in_add` side of a row, -amount when
# it is the `out_add` side.
add_dic = {}
# Iterate over the rows that were actually read: the file may hold fewer than `ln`
# rows after `strt`, in which case range(ln) would index past the end of the arrays.
n_rows = len(in_add)
# Flatten the (n_rows, 1) scaler output so that plain Python floats are stored instead
# of 1-element arrays; assigning a 1-element array into a scalar slot is deprecated in
# recent NumPy releases.
amounts = np.ravel(sc_tr_amt)
for i in tqdm(range(n_rows)):
    amt = float(amounts[i])
    add_dic.setdefault(in_add[i], []).append([amt,i])
    add_dic.setdefault(out_add[i], []).append([-amt,i])

100%|██████████| 10000/10000 [00:00<00:00, 278604.29it/s]


In [0]:
"""
num_add: Total number of addresses
"""
# One dictionary key per distinct address seen on either side of a row.
num_add = len(add_dic)

In [11]:
"""
inpt_arr: Input array serves as the real data to the GAN. It contains transaction history of each address.
"""
# One row per address (in add_dic insertion order), one column per transaction row.
# Slots for rows the address did not take part in stay at zero.
inpt_arr = np.zeros((num_add,ln))
for row,history in enumerate(tqdm(add_dic.values())):
    for amount,tx_idx in history:
        inpt_arr[row][tx_idx] = amount

100%|██████████| 6499/6499 [00:00<00:00, 294558.97it/s]


In [0]:
# Append a trailing feature axis so each address history has shape (ln, 1), matching
# the model inputs. The guard keeps the cell idempotent: without it, re-running the
# cell would append a further axis each time.
if inpt_arr.ndim == 2:
    inpt_arr = np.expand_dims(inpt_arr,-1)

In [0]:
"""
datagen function creates batches for training the Discriminator
"""
def datagen(inpt_arr,gen_arr,size,check_flag,name=None):
    """Build one discriminator batch from real and generated samples.

    Parameters
    ----------
    inpt_arr : array whose first axis indexes the real samples.
    gen_arr : array of generated samples, concatenated after the real ones.
    size : number of real samples to draw (uniformly, with replacement).
    check_flag : when truthy, the assembled batch is passed to ``check`` for plotting.
    name : optional file-name prefix forwarded to ``check``.

    Returns
    -------
    (inp, out) : ``inp`` holds the ``size`` sampled real rows followed by ``gen_arr``;
        ``out`` is an integer NumPy array with 1 for every real row and 0 for every
        generated row.
    """
    # Sample from the array that was passed in rather than relying on the global
    # address count, so the function works for any input array.
    real_idx = np.random.randint(low=0,high=len(inpt_arr),size=size)
    inp = np.concatenate([inpt_arr[real_idx],gen_arr])
    if check_flag:
        check(inp,name)
    # Size the fake labels from gen_arr itself so labels always line up with `inp`.
    out = np.concatenate([np.ones(size,dtype=int),np.zeros(len(gen_arr),dtype=int)])
    return (inp,out)

In [0]:
"""
It checks the difference between real and generated data, hence it's important for monitoring the training.
"""
def check(gen_arr,name=None):
    """Run both batch diagnostics: the balance plot, then the variance plot.

    ``gen_arr`` is the batch to inspect and ``name`` the optional file-name prefix
    handed to each diagnostic. Returns None.
    """
    for diagnostic in (check_balance,check_variance):
        diagnostic(gen_arr,name)

In [0]:
def check_balance(gen_arr,name=None):
    """Plot the sum of all values of every sample in a batch.

    Parameters
    ----------
    gen_arr : iterable of samples; each sample is summed with ``np.sum``.
    name : optional prefix; when given, the figure is also saved to
        ``name + '_bal.png'``.
    """
    sm = [np.sum(sample) for sample in gen_arr]
    fig = plt.figure()
    plt.title('Bal')
    plt.xlabel('Sample index')
    plt.ylabel('Sum of values')
    plt.plot(sm)
    if name is not None:
        fig.savefig(name+'_bal.png')
    plt.show()
    # Release the figure explicitly: this runs repeatedly during training and
    # plt.show() does not close figures under every backend.
    plt.close(fig)

In [0]:
def check_variance(gen_arr,name=None):
    """Plot the variance of the values of every sample in a batch.

    Parameters
    ----------
    gen_arr : iterable of samples; the variance is taken over all values of a sample.
    name : optional prefix; when given, the figure is also saved to
        ``name + '_var.png'``.
    """
    sm = [np.var(sample) for sample in gen_arr]
    fig = plt.figure()
    plt.title('Var')
    plt.xlabel('Sample index')
    plt.ylabel('Variance of values')
    plt.plot(sm)
    if name is not None:
        fig.savefig(name+'_var.png')
    plt.show()
    # Release the figure explicitly: this runs repeatedly during training and
    # plt.show() does not close figures under every backend.
    plt.close(fig)

# GAN

In [0]:
# keras.backend.clear_session()

## Discriminator

In [0]:
# Discriminator input: a sequence of `ln` time steps with one feature each.
disc_inpt = keras.layers.Input((ln,1))
# The LSTM and Dense layers are kept in their own variables because the Generator
# cell applies these same layer objects again when it builds main_model.
disc_lstm = keras.layers.CuDNNLSTM(32)
lstm_out = disc_lstm(disc_inpt)
# Single sigmoid unit; the batches built by datagen label real rows 1 and generated rows 0.
disc_dense = keras.layers.Dense(1,activation='sigmoid')
disc_out = disc_dense(lstm_out)
discriminator = keras.models.Model(disc_inpt,disc_out)

In [152]:
discriminator.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10000, 1)          0         
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 32)                4480      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 4,513
Trainable params: 4,513
Non-trainable params: 0
_________________________________________________________________


## Generator

In [0]:
# Generator input: a noise sequence of `ln` time steps with one feature each.
gen_inpt = keras.layers.Input((ln,1))
# return_sequences=True keeps one LSTM output per time step, so the Dense layer
# below produces one value per step and the output has the same (ln, 1) shape.
gen_lstm = keras.layers.CuDNNLSTM(32,return_sequences=True)
gen_lstm_out = gen_lstm(gen_inpt)
# tanh bounds every generated value to the open interval (-1, 1).
gen_dense = keras.layers.Dense(1,activation='tanh')
gen_out = gen_dense(gen_lstm_out)
generator = keras.models.Model(gen_inpt,gen_out)
# Combined model: the generator followed by the discriminator's own layer objects.
# main_model.layers[-2] and main_model.layers[-1] are therefore disc_lstm and
# disc_dense, and their weights are shared with `discriminator`.
main_lstm_out = disc_lstm(gen_out)
main_out = disc_dense(main_lstm_out)
main_model = keras.models.Model(gen_inpt,main_out)

In [155]:
main_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10000, 1)          0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 10000, 32)         4480      
_________________________________________________________________
dense_2 (Dense)              (None, 10000, 1)          33        
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 32)                4480      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 9,026
Trainable params: 9,026
Non-trainable params: 0
_________________________________________________________________


In [156]:
generator.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10000, 1)          0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 10000, 32)         4480      
_________________________________________________________________
dense_2 (Dense)              (None, 10000, 1)          33        
Total params: 4,513
Trainable params: 4,513
Non-trainable params: 0
_________________________________________________________________


## Training

### param

In [0]:
# Optimizer object passed to the compile() calls of the GAN training cell.
adam = keras.optimizers.Adam(lr=1e-5)

In [0]:
# NOTE(review): `adam_fast` is not referenced by any other cell visible in this
# notebook — confirm whether it was meant for discriminator pretraining.
adam_fast = keras.optimizers.Adam(lr=1e-2)

In [0]:
# Whether the discriminator pretraining cells run at all.
disc_pretrain = True
# Number of discriminator-only pretraining iterations.
disc_cycles = 10
# Number of alternating discriminator/generator iterations in GAN training.
cycles = 150
# Real samples per discriminator batch (an equal number of generated samples is
# appended); the generator step uses 2*batch_size noise samples.
batch_size = 128

In [0]:
def w_loss(y_true,y_pred):
    """Wasserstein-style loss: the mean of target times prediction.

    Minimising this with a target of +1 drives the predictions down, and with a
    target of -1 drives them up.
    """
    signed_scores = y_true*y_pred
    return keras.backend.mean(signed_scores)

In [0]:
# discriminator.load_weights('disc.h5')
# main_model.load_weights('main_model.h5')

In [0]:
"""
To monitor GAN training
"""
# Per-cycle histories appended to by the GAN training loop: discriminator
# loss/accuracy, then combined-model loss/accuracy.
disc_loss, disc_acc = [], []
gan_loss, gan_acc = [], []

### Discriminator pretrain

In [0]:
"""
To monitor D pretraining
"""
# Per-iteration loss and accuracy recorded while pretraining the discriminator.
disc_solo_loss, disc_solo_acc = [], []

In [0]:
# Pretrain the discriminator alone on batches of real rows plus the untrained
# generator's output, recording loss and accuracy after every step.
if(disc_pretrain):
    # layers[-2] and layers[-1] of main_model are the discriminator's LSTM and Dense
    # layers; make sure they are trainable before the discriminator is compiled.
    main_model.layers[-2].trainable = True
    main_model.layers[-1].trainable = True
    # NOTE(review): the string 'adam' asks Keras for a default-configured Adam, not
    # the `adam` / `adam_fast` objects defined above — confirm this is intended.
    discriminator.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
    for i in range(disc_cycles):
        # Fresh Gaussian noise, one (ln, 1) sequence per generated sample.
        noise = np.random.normal(0,1,[batch_size,ln,1])
        gen_arr = generator.predict(noise)
        # check_flag=False: no diagnostic plots during pretraining.
        dis_x,dis_y = datagen(inpt_arr,gen_arr,batch_size,False)
        print('Training Discriminator: ',i)
        discriminator.train_on_batch(dis_x,dis_y)
        # Evaluated on the same batch that was just trained on; evalt is [loss, acc].
        evalt = discriminator.evaluate(dis_x,dis_y,batch_size=batch_size)
        print(evalt)
        disc_solo_loss.append(evalt[0])
        disc_solo_acc.append(evalt[1])

In [0]:
# Show the pretraining loss and accuracy curves, one figure each.
if disc_pretrain:
    for series,label in ((disc_solo_loss,'loss'),(disc_solo_acc,'acc')):
        plt.figure()
        plt.plot(series)
        plt.legend([label])
    plt.show()

### GAN training

In [None]:
def _fooled_rate(y_true,y_pred):
    """Fraction of predictions above 0.5, i.e. generated samples scored as real.

    `y_true` is ignored; it is only present because Keras metrics take two arguments.
    """
    K = keras.backend
    return K.mean(K.cast(K.greater(y_pred,0.5),K.floatx()))


# The last two layers of main_model are the discriminator's LSTM and Dense layers.
shared_disc_layers = main_model.layers[-2:]

# Compile each model once, before the loop. Keras records which weights are trainable
# at compile time, so the discriminator is compiled with its layers trainable and the
# combined model with them frozen. Recompiling on every cycle (as was done before)
# rebuilds the training function each time and discards the Adam moment estimates.
# Keras may warn about a trainable-weights discrepancy when the discriminator trains;
# that is expected with this pattern.
for layer in shared_disc_layers:
    layer.trainable = True
discriminator.compile(optimizer=adam,loss='binary_crossentropy',metrics=['acc'])
for layer in shared_disc_layers:
    layer.trainable = False
main_model.compile(optimizer=adam,loss=w_loss,metrics=[_fooled_rate])

for i in range(cycles):
    # ---- Discriminator step: real rows (label 1) vs. generated rows (label 0) ----
    noise = np.random.normal(0,1,[batch_size,ln,1])
    gen_arr = generator.predict(noise)
    check_flag = False
    name = None
    if((i+1)%10==0):
        # Every 10th cycle: checkpoint both models, clear the cell output and let
        # datagen draw the diagnostic plots (saved under the cycle index).
        main_model.save_weights('main_model.h5')
        discriminator.save_weights('disc.h5')
        check_flag = True
        name = str(i)
        display.clear_output()
    dis_x,dis_y = datagen(inpt_arr,gen_arr,batch_size,check_flag,name)
    print('Training Discriminator: ',i)
    discriminator.train_on_batch(dis_x,dis_y)
    # Evaluated on the batch that was just trained on; evalt is [loss, acc].
    evalt = discriminator.evaluate(dis_x,dis_y,batch_size=batch_size*2)
    print(evalt)
    disc_loss.append(evalt[0])
    disc_acc.append(evalt[1])
    plt.figure()
    plt.plot(disc_loss)
    plt.figure()
    plt.plot(disc_acc)

    # ---- Generator step (discriminator layers frozen inside main_model) ----
    noise = np.random.normal(0,1,[2*batch_size,ln,1])
    # w_loss is mean(y_true*y_pred). A target of -1 makes minimising it *raise* the
    # discriminator's score on generated data; the previous target of +1 trained the
    # generator to be classified as fake, which is the opposite of the GAN objective.
    gen_y = -np.ones((2*batch_size,1))
    print('Training GAN: ',i)
    main_model.train_on_batch(noise,gen_y)
    main_model.save_weights('main_model.h5')
    # evalt is [loss, fraction of generated samples the discriminator scores > 0.5].
    evalt = main_model.evaluate(noise,gen_y,batch_size=batch_size)
    print(evalt)
    gan_loss.append(evalt[0])
    gan_acc.append(evalt[1])
    plt.figure()
    plt.plot(gan_loss)
    plt.figure()
    plt.plot(gan_acc)
    plt.show()
    # Four figures are opened per cycle; close them so they do not pile up under
    # backends where plt.show() leaves figures open.
    plt.close('all')

In [None]:
# Final training curves, one figure per history; only the combined-model loss
# figure carries a title.
for series,label,title in (
    (disc_loss,'loss',None),
    (disc_acc,'acc',None),
    (gan_loss,'loss','GAN'),
    (gan_acc,'acc',None),
):
    plt.figure()
    if title is not None:
        plt.title(title)
    plt.plot(series)
    plt.legend([label])
plt.show()

# Score

In [0]:
# Discriminator output for every real address history.
pred = discriminator.predict(inpt_arr)
# Offset added inside the log so it stays finite when a prediction equals the mean.
eps = 0.05
# Log-magnitude of each prediction's distance from the mean prediction ...
score = np.abs(np.log(np.abs(pred-np.mean(pred))+eps))
# ... turned into a score that is highest (1) when that quantity equals its own mean.
score = 1-np.abs(score-np.mean(score))
plt.figure()
# Plot the computed score; the previous code plotted `pred2`, a name that is never
# defined, so the cell failed with a NameError.
plt.plot(score)
plt.xlabel('Address Labels')
plt.ylabel('Score')
plt.show()