In [1]:
import os
import sys
import numpy as np
import pandas as pd

In [2]:
import findspark
findspark.init()

# Cargar Pyspark
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window
from pyspark import StorageLevel
import time

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import networkx as nx

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input

# verify that we're using the correct version of StellarGraph for this notebook
import stellargraph as sg
from stellargraph.layer import GCN_LSTM
from graphframes import GraphFrame

from ks_crypto.lib.spark_conn import create_yarn_connection
from ks_crypto.lib import constants as C

spark = create_yarn_connection()

spark.sparkContext.setCheckpointDir('/temp/')
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [3]:
def get_nodes_sample_list(n_nodes):
    
    node_list = \
    spark.read.format('bigquery') \
    .option('table', 'kschool-crypto.ks_crypto_dataset.nodes_ft') \
    .load()\
    .select(F.col('i_address').cast('string'))\
    .limit(n_nodes)\
    .toPandas()['i_address'].values
    
    return node_list

def sample_nodes(nodes_df, n_nodes):
    
    sample_nodes_df = \
        nodes_df \
        .select('i_address', F.lit(1).alias('is_sampled'))\
        .dropDuplicates()\
        .limit(n_nodes)
    
    output_df = \
        nodes_df \
        .join(sample_nodes_df,
              on='i_address',
              how='left')\
        .filter(F.col('is_sampled') == 1)\
        .drop('is_sampled')
        
    
    return output_df

def sample_most_illicit_communities(nodes_df, transactions_df, n_nodes, max_iter):
    
    graph = \
        GraphFrame(nodes_df.groupBy(F.col(C.I_ADDRESS_ID).alias(C.ID)).agg(F.max(C.CLASS).alias(C.CLASS)), 
                   transactions_df.select(F.col(C.I_INPUT_ADDRESS_ID).alias(C.SRC),
                                          F.col(C.I_OUTPUT_ADDRESS_ID).alias(C.DST)))
    
    w_ord = Window.orderBy(F.desc('pct'))
    
    lp = graph.labelPropagation(max_iter).persist()
    lp.count()
    
    sampled_communities = \
        lp\
        .groupBy('label')\
        .agg(F.count(F.lit(1)).alias('n'), 
             F.sum(C.CLASS).alias('n_class'))\
        .withColumn('pct', F.col('n_class') / F.col('n'))\
        .withColumn('cumsum', F.sum('n').over(w_ord))\
        .filter(F.col('cumsum') < n_nodes)\
        .select(F.col('label'),
                F.lit(1).alias('is_sampled'))
    
    sample_nodes_df = \
        lp \
        .join(sampled_communities, 
              on=['label'],
              how=C.LEFT)\
        .filter(F.col('is_sampled') == 1)\
        .select(F.col(C.ID).alias(C.I_ADDRESS_ID),
                F.lit(1).alias('is_sampled'))
    
    output_df = \
        nodes_df \
        .join(sample_nodes_df,
              on=[C.I_ADDRESS_ID],
              how=C.LEFT)\
        .filter(F.col('is_sampled') == 1)\
        .drop('is_sampled')
    
    return output_df
    

def sample_transactions(transactions_df, sampled_nodes_df):
    
    aux_cols = ['is_sampled_input', 'is_sampled_output']
    
    output_df = \
        transactions_df \
        .join(sampled_nodes_df\
              .select(F.col(C.I_ADDRESS_ID).alias(C.I_INPUT_ADDRESS_ID),
                      F.lit(1).alias('is_sampled_input')),
              how=C.LEFT,
              on=[C.I_INPUT_ADDRESS_ID])\
        .join(sampled_nodes_df\
              .select(F.col(C.I_ADDRESS_ID).alias(C.I_OUTPUT_ADDRESS_ID),
                      F.lit(1).alias('is_sampled_output')),
              how=C.LEFT,
              on=[C.I_OUTPUT_ADDRESS_ID])\
        .filter((F.col('is_sampled_input') == 1)|(F.col('is_sampled_input') == 1))\
        .drop(*aux_cols)
    
    return output_df

def train_test_split(data, data_y, train_portion):

    time_len = data.shape[0]
    train_size = int(time_len * train_portion)
    train_data = np.array(data.iloc[:train_size, :])
    train_data_y = np.array(data_y.iloc[:train_size, :])

    test_data = np.array(data.iloc[train_size:, :])
    test_data_y = np.array(data_y.iloc[train_size:, :])

    return train_data, train_data_y, test_data, test_data_y

def scale_data(train_data, test_data):
    max_var = train_data.max()
    min_var = train_data.min()
    train_scaled = (train_data - min_var) / (max_var - min_var)
    test_scaled = (test_data - min_var) / (max_var - min_var)
    return train_scaled, test_scaled
"""
def sequence_data_preparation(seq_len, pre_len, train_data, train_data_y, test_data, test_data_y):
    trainX, trainY, testX, testY = [], [], [], []

    for i in range(train_data.shape[1] - int(seq_len + pre_len - 1)):
        a = train_data[:, i : i + seq_len + pre_len]
        a_y = train_data_y[:, i : i + seq_len + pre_len]
        if ((i==0) | (i==1)):
            print(a)
        trainX.append(a[:, :seq_len])
        trainY.append(a_y[:, -1])

    for i in range(test_data.shape[1] - int(seq_len + pre_len - 1)):
        b = test_data[:, i : i + seq_len + pre_len]
        b_y = test_data_y[:, i : i + seq_len + pre_len]
        testX.append(b[:, :seq_len])
        testY.append(b_y[:, -1])

    trainX = np.array(trainX)
    trainY = np.array(trainY)
    testX = np.array(testX)
    testY = np.array(testY)

    return trainX, trainY, testX, testY
"""

def sequence_data_preparation(seq_len, pre_len, train_data, test_data):
    trainX, trainY, testX, testY = [], [], [], []

    for i in range(train_data.shape[1] - int(seq_len + pre_len - 1)):
        a = train_data[:, i : i + seq_len + pre_len]
        trainX.append(a[:, :seq_len])
        trainY.append(a[:, -1])

    for i in range(test_data.shape[1] - int(seq_len + pre_len - 1)):
        b = test_data[:, i : i + seq_len + pre_len]
        testX.append(b[:, :seq_len])
        testY.append(b[:, -1])

    trainX = np.array(trainX)
    trainY = np.array(trainY)
    testX = np.array(testX)
    testY = np.array(testY)

    return trainX, trainY, testX, testY



In [4]:
KS_CRYPTO_PROJ = 'kschool-crypto'
KS_CRYPTO_DS = 'ks_crypto_dataset'
T_TABLENAME = 'transactions_ft'
N_TABLENAME = 'nodes_ft'

T_FULL_TABLENAME = f"{KS_CRYPTO_PROJ}.{KS_CRYPTO_DS}.{T_TABLENAME}"
N_FULL_TABLENAME = f"{KS_CRYPTO_PROJ}.{KS_CRYPTO_DS}.{N_TABLENAME}"
MAX_N_SAMPLED_NODES = 50000
MAX_ITER_LB = 1

TRAIN_RATE = 0.8

N_LAG_SEQ = 10
N_LEAD_PRED = 12

In [5]:
nodes_df = \
    spark.read.format('bigquery') \
    .option('table', N_FULL_TABLENAME) \
    .load()\
    .withColumn('out_final_value', F.col('sum_out_sum_input_value') - F.col('sum_out_sum_output_value'))\
    .withColumn('in_final_value', F.col('sum_in_sum_input_value') - F.col('sum_in_sum_output_value'))\
    .withColumn('final_value', (F.col('in_final_value') - F.col('out_final_value')))\
    .select(C.I_ADDRESS_ID, C.PERIOD, 'final_value', C.CLASS)\
    .repartition(512, [C.I_ADDRESS_ID])\
    .persist(StorageLevel.DISK_ONLY)

nodes_df.count()

3827269

In [6]:
transactions_df = \
    spark.read.format('bigquery') \
    .option('table', T_FULL_TABLENAME) \
    .load()\
    .select(C.I_INPUT_ADDRESS_ID, C.I_OUTPUT_ADDRESS_ID)\
    .dropDuplicates()\
    .repartition(512)\
    .persist(StorageLevel.DISK_ONLY)

# .transform(lambda df: sample_transactions(df, nodes_df))\

transactions_df.count()

9064683

In [None]:
sampled_nodes_df = \
    nodes_df\
    .transform(lambda df: sample_most_illicit_communities(df, transactions_df, MAX_N_SAMPLED_NODES, MAX_ITER_LB))\
    .persist(StorageLevel.DISK_ONLY)

sampled_nodes_df.count()

58301

In [None]:
sampled_transactions_df = \
    transactions_df\
    .transform(lambda df: sample_transactions(df, sampled_nodes_df))\
    .persist(StorageLevel.DISK_ONLY)

sampled_transactions_df.count()

309174

In [None]:
all_nodes_pd = \
sampled_nodes_df\
.select(F.col(C.I_ADDRESS_ID).cast('string').alias(C.I_ADDRESS_ID))\
.dropDuplicates()\
.toPandas()

transactions_pd = \
    sampled_transactions_df\
    .toPandas()

In [None]:
all_nodes_comb_pd = \
    pd.DataFrame(data=np.full((len(set(all_nodes_pd[C.I_ADDRESS_ID].values)),
                               len(set(all_nodes_pd[C.I_ADDRESS_ID].values))), False), 
                 index=all_nodes_pd[C.I_ADDRESS_ID].values, 
                 columns=all_nodes_pd[C.I_ADDRESS_ID].values)

linked_nodes_comb_od = \
    pd.DataFrame(data=np.full((len(set(transactions_pd[C.I_INPUT_ADDRESS_ID].values)),
                               len(set(transactions_pd[C.I_OUTPUT_ADDRESS_ID].values))), True),
                 index=set(transactions_pd[C.I_INPUT_ADDRESS_ID].values),
                 columns=set(transactions_pd[C.I_OUTPUT_ADDRESS_ID].values))

In [None]:
i=1
total_i=len(transactions_pd)
for index, row in transactions_pd.iterrows():
    if i % 10000 == 0:
        print(round((i/total_i)*100, 4), '%')
    all_nodes_comb_pd[str(row[C.I_INPUT_ADDRESS_ID])][str(row[C.I_OUTPUT_ADDRESS_ID])] = True
    i = i + 1  

3.2344 %
6.4688 %
9.7033 %
12.9377 %
16.1721 %
19.4065 %
22.641 %
25.8754 %
29.1098 %
32.3442 %
35.5787 %
38.8131 %
42.0475 %
45.2819 %
48.5164 %
51.7508 %
54.9852 %
58.2196 %
61.4541 %
64.6885 %
67.9229 %
71.1573 %
74.3918 %
77.6262 %
80.8606 %
84.095 %
87.3295 %
90.5639 %
93.7983 %
97.0327 %


In [None]:
ts_nodes_X_df = \
    sampled_nodes_df\
    .groupBy(C.I_ADDRESS_ID)\
    .pivot(C.PERIOD)\
    .agg(F.sum('final_value').alias('final_value'))\
    .fillna(0)\
    .toPandas()\
    .set_index(C.I_ADDRESS_ID)

ts_nodes_Y_df = \
  sampled_nodes_df \
  .groupBy(C.I_ADDRESS_ID)\
  .pivot(C.PERIOD) \
  .agg(F.max(C.CLASS)) \
  .toPandas()\
  .set_index(C.I_ADDRESS_ID)

In [None]:
all_nodes_comb_pd = all_nodes_comb_pd.astype('float').values

In [None]:
data_dic = {
    'ts_nodes_X_df': ts_nodes_X_df,
    'ts_nodes_Y_df': ts_nodes_Y_df,
    'adj_matrix': all_nodes_comb_pd
}

In [None]:
ts_nodes_X_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,40,41,42,43,44,45,46,47,48,49
i_address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-670144763.0,-152576495.0
42923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-285576837.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
212328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
232598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-7914024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3031130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3094537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3106135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3115626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
ts_nodes_X_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,40,41,42,43,44,45,46,47,48,49
i_address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-670144763.0,-152576495.0
42923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-285576837.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
212328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
232598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-7914024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3031130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3094537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3106135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3115626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modeling

In [18]:
train_data, train_data_y, test_data, test_data_y = train_test_split(ts_nodes_X_df, ts_nodes_Y_df, TRAIN_RATE)
print("Train data: ", train_data.shape)
print("Test data: ", test_data.shape)

Train data:  (39692, 49)
Test data:  (9923, 49)


In [19]:
train_scaled, test_scaled = scale_data(train_data, test_data)

In [20]:
"""trainX, trainY, testX, testY = sequence_data_preparation(
    N_LAG_SEQ, N_LEAD_PRED, train_data, train_data_y, test_data, test_data_y
)"""
trainX, trainY, testX, testY = sequence_data_preparation(
    N_LAG_SEQ, N_LEAD_PRED, train_scaled, test_scaled
)

print(trainX.shape)
print(trainY.shape)
print(testX.shape)
print(testY.shape)

(28, 39692, 10)
(28, 39692)
(28, 9923, 10)
(28, 9923)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
gcn_lstm = GCN_LSTM(
    seq_len=N_LAG_SEQ,
    adj=all_nodes_comb_pd,
    gc_layer_sizes=[16, 10],
    gc_activations=["relu", "relu"],
    lstm_layer_sizes=[200, 200],
    lstm_activations=["tanh", "tanh"]
)

  import sys
  D = np.diag(np.ravel(adj.sum(axis=0)) ** (-0.5))


In [None]:
x_input, x_output = gcn_lstm.in_out_tensors()

In [71]:
model = Model(inputs=x_input, outputs=x_output)
model.compile(optimizer="adam", loss="mae", metrics=["mse"])

In [None]:
model.summary()

In [73]:
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 207, 10)]         0         
_________________________________________________________________
tf_op_layer_ExpandDims (Tens [(None, 207, 10, 1)]      0         
_________________________________________________________________
reshape (Reshape)            (None, 207, 10)           0         
_________________________________________________________________
fixed_adjacency_graph_convol (None, 207, 16)           43216     
_________________________________________________________________
fixed_adjacency_graph_convol (None, 207, 10)           43216     
_________________________________________________________________
reshape_1 (Reshape)          (None, 207, None, 1)      0         
_________________________________________________________________
permute (Permute)            (None, None, 207, 1)      0         
_________________________________________________________________
reshape_2 (Reshape)          (None, None, 207)         0         
_________________________________________________________________
lstm (LSTM)                  (None, None, 200)         326400    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 207)               41607     
=================================================================
Total params: 775,239
Trainable params: 689,541
Non-trainable params: 85,698
______________________________________

SyntaxError: invalid syntax (<ipython-input-73-5a1a7d52e4be>, line 3)

In [74]:
history = model.fit(
    trainX,
    trainY,
    epochs=100,
    batch_size=60,
    shuffle=True,
    verbose=1,
    validation_data=[testX, testY],
)

Epoch 1/100


ValueError: in user code:

    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:747 train_step
        y_pred = self(x, training=True)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:386 call
        inputs, training=training, mask=mask)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/layers/core.py:544 call
        result.set_shape(self.compute_output_shape(inputs.shape))
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/layers/core.py:535 compute_output_shape
        self.target_shape)
    /opt/conda/default/lib/python3.7/site-packages/tensorflow/python/keras/layers/core.py:523 _fix_unknown_dimension
        raise ValueError(msg)

    ValueError: total size of new array must be unchanged, input_shape = [7631, 10, 1], output_shape = [9539, 10]


In [None]:
model.summary()

In [None]:
print(
    "Train loss: ",
    history.history["loss"][-1],
    "\nTest loss:",
    history.history["val_loss"][-1],
)

In [None]:
sg.utils.plot_history(history)

In [None]:
ythat = model.predict(trainX)
yhat = model.predict(testX)

In [None]:
1