# Import Libraries and read data

In [1]:
import ujson as json
import pandas as pd

data_file = 'data/new.json'
# Parse the file one JSON document per line. Reading inside a ``with`` block
# closes the file handle deterministically; the previous lazy
# ``map(json.loads, open(...))`` never closed it explicitly.
with open(data_file) as json_lines:
    records = [json.loads(line) for line in json_lines]
df = pd.DataFrame.from_records(records)

# Data Preprocessing  

In [2]:
# Flatten the nested "node" column: every key/value pair found in the dicts of
# a row becomes one record carrying the owning node_id, the key (stored as
# "neighbor_nodes") and the value (stored as "direction").
neighbor_nodes = [
    {"node_id": owner_id, "neighbor_nodes": neighbor, "direction": direction}
    for owner_id, neighbor_maps in zip(df["node_id"], df["node"])
    for neighbor_map in neighbor_maps
    for neighbor, direction in neighbor_map.items()
]
df_1 = pd.DataFrame(neighbor_nodes)

In [3]:
# Flatten the per-node lists of transaction values and timestamps, in row
# order, into two flat lists.
transaction_list = [value for values in df["transaction_val"] for value in values]
transaction_time = [stamp for stamps in df["transaction_time"] for stamp in stamps]

# Attach them positionally to the neighbor table.
# NOTE(review): this assumes the flattened lists line up one-to-one with the
# rows of df_1 -- confirm against the input schema.
df_1["transaction_val"] = transaction_list
df_1["transaction_time"] = transaction_time

In [4]:
def convert_value(self, unit: str, factor: float) -> "float | None":
    """Scale ``factor`` by the fixed multiplier associated with ``unit``.

    Parameters
    ----------
    self
        Unused. Kept only so existing call sites keep working.
    unit : str
        ``"EU"`` or ``"USD"``; any other value is not converted.
    factor : float
        The amount to scale.

    Returns
    -------
    float or None
        ``factor * 1119.5`` for ``"EU"``, ``factor * 1182.42`` for ``"USD"``,
        and ``None`` for every other unit.
    """
    # NOTE(review): the multipliers look like hard-coded exchange rates --
    # confirm their source and date.
    if unit == "EU":
        return factor * 1119.5
    if unit == "USD":
        return factor * 1182.42
    # Unknown unit: make the fall-through result explicit.
    return None

# Scale every transaction value by 1182.42 (the same multiplier that
# convert_value applies for "USD"). Re-running this cell scales the column
# again, so run it exactly once per fresh df_1.
df_1["transaction_val"] = df_1["transaction_val"].mul(1182.42)

In [135]:
# Drop every transaction whose value is exactly 0.00.
is_nonzero = df_1["transaction_val"] != 0.00
df_1 = df_1[is_nonzero]

# Count transactions per (node_id, neighbor_nodes) pair and keep only the
# pairs with more than 20 of them.
pair_keys = ["node_id", "neighbor_nodes"]
trx_per_pair = df_1.groupby(pair_keys).count().direction
df_pairs = trx_per_pair.reset_index().rename(columns={"direction": "nr_trx"})
df_pairs = df_pairs[df_pairs["nr_trx"] > 20]

# Collapse each remaining pair to an unordered frozenset so that (a, b) and
# (b, a) count as the same pair, then take one member of every distinct pair.
unordered_pairs = df_pairs[pair_keys].apply(frozenset, axis=1)
out = unordered_pairs.value_counts().index.map(tuple)
# NOTE(review): a frozenset has no defined element order, so which member of
# the pair ends up at position 0 is not guaranteed to be stable between runs.
final_node_list = [pair[0] for pair in out]

# df_final: all transactions of the retained pairs.
df_final = pd.merge(df_pairs.drop(["nr_trx"], axis=1), df_1, on=pair_keys, how="left")
df_final


Unnamed: 0,node_id,neighbor_nodes,direction,transaction_val,transaction_time
0,0x00192fb10df37c9fb26829eb2cc623cd1bf599e8,0x95a5147bd3979d0947603a662c16c5d2eac40e17,in,0.050043,1628079892
1,0x00192fb10df37c9fb26829eb2cc623cd1bf599e8,0x95a5147bd3979d0947603a662c16c5d2eac40e17,in,0.050127,1626948176
2,0x00192fb10df37c9fb26829eb2cc623cd1bf599e8,0x95a5147bd3979d0947603a662c16c5d2eac40e17,in,0.050012,1623277117
3,0x00192fb10df37c9fb26829eb2cc623cd1bf599e8,0x95a5147bd3979d0947603a662c16c5d2eac40e17,in,0.050309,1622007064
4,0x00192fb10df37c9fb26829eb2cc623cd1bf599e8,0x95a5147bd3979d0947603a662c16c5d2eac40e17,in,0.050091,1621102734
...,...,...,...,...,...
235,0xfa52274dd61e1643d2205169732f29114bc240b3,0x6390fd17e63e1362d3a9f18eac3b2acf576deb19,in,0.177433,1610006695
236,0xfa52274dd61e1643d2205169732f29114bc240b3,0x6390fd17e63e1362d3a9f18eac3b2acf576deb19,in,0.121816,1609836579
237,0xfa52274dd61e1643d2205169732f29114bc240b3,0x6390fd17e63e1362d3a9f18eac3b2acf576deb19,in,0.186176,1609747924
238,0xfa52274dd61e1643d2205169732f29114bc240b3,0x6390fd17e63e1362d3a9f18eac3b2acf576deb19,in,7.979179,1609746284


In [79]:
# Display the candidate node ids picked from the retained pairs.
final_node_list

['0x95a5147bd3979d0947603a662c16c5d2eac40e17',
 '0x1ad91ee08f21be3de0ba2ba6918e714da6b45836',
 '0xfa52274dd61e1643d2205169732f29114bc240b3']

# FI_37 detection functions

In [137]:
def detection_FI_37(df_node, start_index, window_seconds=2592000, trx_threshold=20):
    """Return True if any sliding time window holds too many transactions.

    Starting at row ``start_index``, a window ``[t, t + window_seconds)`` is
    anchored at the timestamp ``t`` of that row. If any
    ``(node_id, neighbor_nodes)`` pair has more than ``trx_threshold`` rows
    (counted via non-null ``direction``) inside the window, the node is
    flagged. Otherwise the window is re-anchored at the next distinct
    timestamp and the check repeats.

    Parameters
    ----------
    df_node : pandas.DataFrame
        Transactions with the columns ``node_id``, ``neighbor_nodes``,
        ``direction`` and ``transaction_time``. Must be sorted ascending by
        ``transaction_time`` (``searchsorted`` relies on it).
    start_index : int
        Positional index of the first row to anchor a window at.
    window_seconds : int, optional
        Window length in the unit of ``transaction_time``. The default
        2592000 equals 30 * 86400, i.e. 30 days when timestamps are seconds.
    trx_threshold : int, optional
        A pair is flagged when it has strictly more rows than this.

    Returns
    -------
    bool
        True when some window exceeds the threshold, False otherwise. An
        empty frame, or a ``start_index`` past the last row, yields False.
    """
    end_index = df_node.shape[0]
    trx_times = df_node['transaction_time']
    pair_keys = ["node_id", "neighbor_nodes"]

    # Iterate instead of recursing: one stack frame per distinct timestamp
    # would hit Python's recursion limit on long transaction histories.
    while start_index < end_index:
        time_window_start = trx_times.values[start_index]
        time_window_end = time_window_start + window_seconds
        # Position of the last transaction strictly before the window end.
        index_latest_trx = trx_times.searchsorted(time_window_end, side='left') - 1

        window = df_node.iloc[start_index:index_latest_trx + 1]
        trx_per_pair = window.groupby(pair_keys).count().direction
        if (trx_per_pair > trx_threshold).any():
            return True

        # Early stop: the window already reaches the last row, so every later
        # window is a subset of this one and cannot exceed the threshold.
        # (The previous test against ``end_index`` could never be true because
        # ``searchsorted(...) - 1`` is at most ``end_index - 1``.)
        if index_latest_trx >= end_index - 1:
            return False

        # Jump past every row sharing the current timestamp. Using
        # ``searchsorted`` keeps this correct even when ``start_index`` is not
        # the first row of a run of duplicated timestamps.
        start_index = trx_times.searchsorted(time_window_start, side='right')

    return False

In [138]:
def flagger_FI_37(df_candidates):
    """Run ``detection_FI_37`` once per distinct ``node_id``.

    For every node id occurring in ``df_candidates`` (in order of first
    appearance) the node's rows are sorted by ``transaction_time``,
    re-indexed from zero, and scanned from the first row.

    Returns
    -------
    dict
        Maps each node id to the flag returned by ``detection_FI_37``.
    """
    node_ids = df_candidates['node_id']
    flags_by_node = {}
    # loop over all candidates
    for current_node in node_ids.unique().tolist():
        node_rows = df_candidates[node_ids == current_node]
        node_rows = node_rows.sort_values(by='transaction_time')
        node_rows = node_rows.reset_index(drop=True)
        flags_by_node[current_node] = detection_FI_37(node_rows, 0)
    return flags_by_node



# Apply functions on data

In [139]:
# Restrict the merged transactions to the candidate nodes, then flag each one.
is_candidate = df_final["node_id"].isin(final_node_list)
df_candidates = df_final[is_candidate]
flagger_FI_37(df_candidates)

{'0x1ad91ee08f21be3de0ba2ba6918e714da6b45836': False,
 '0x95a5147bd3979d0947603a662c16c5d2eac40e17': False,
 '0xfa52274dd61e1643d2205169732f29114bc240b3': False}

# Testing

In [140]:
def detection_FI_37(df_node, start_index, window_seconds=2592000, trx_threshold=20):
    """Tracing variant of the sliding-window check; prints every window.

    NOTE: this definition reuses the name ``detection_FI_37`` and therefore
    replaces any earlier definition of it for all cells executed afterwards.

    Starting at row ``start_index``, a window ``[t, t + window_seconds)`` is
    anchored at the timestamp ``t`` of that row. The per-pair row counts of
    the window are printed; if any ``(node_id, neighbor_nodes)`` pair has more
    than ``trx_threshold`` rows (counted via non-null ``direction``) the node
    is flagged. Otherwise the window is re-anchored at the next distinct
    timestamp.

    Parameters
    ----------
    df_node : pandas.DataFrame
        Transactions with the columns ``node_id``, ``neighbor_nodes``,
        ``direction`` and ``transaction_time``. Must be sorted ascending by
        ``transaction_time`` (``searchsorted`` relies on it).
    start_index : int
        Positional index of the first row to anchor a window at.
    window_seconds : int, optional
        Window length in the unit of ``transaction_time``. The default
        2592000 equals 30 * 86400, i.e. 30 days when timestamps are seconds.
    trx_threshold : int, optional
        A pair is flagged when it has strictly more rows than this.

    Returns
    -------
    bool
        True when some window exceeds the threshold, False otherwise. An
        empty frame, or a ``start_index`` past the last row, yields False.
    """
    end_index = df_node.shape[0]
    trx_times = df_node['transaction_time']
    pair_keys = ["node_id", "neighbor_nodes"]

    # Iterate instead of recursing: one stack frame per distinct timestamp
    # would hit Python's recursion limit on long transaction histories.
    while start_index < end_index:
        time_window_start = trx_times.values[start_index]
        time_window_end = time_window_start + window_seconds
        # Position of the last transaction strictly before the window end.
        index_latest_trx = trx_times.searchsorted(time_window_end, side='left') - 1

        window_counts = df_node.iloc[start_index:index_latest_trx + 1].groupby(pair_keys).count()
        print("____________")
        print(window_counts)

        # if the number of transactions of any pair in the window exceeds the threshold
        if (window_counts.direction > trx_threshold).any():
            return True

        # Early stop: the window already reaches the last row, so every later
        # window is a subset of this one and cannot exceed the threshold.
        # (The previous test against ``end_index`` could never be true because
        # ``searchsorted(...) - 1`` is at most ``end_index - 1``.)
        if index_latest_trx >= end_index - 1:
            return False

        print("________LOOP")
        # Jump past every row sharing the current timestamp. Using
        # ``searchsorted`` keeps this correct even when ``start_index`` is not
        # the first row of a run of duplicated timestamps.
        start_index = trx_times.searchsorted(time_window_start, side='right')

    return False

### Node_id == "0x1ad91ee08f21be3de0ba2ba6918e714da6b45836"

In [141]:
 # Select one node's transactions and order them chronologically.
 node_under_test = "0x1ad91ee08f21be3de0ba2ba6918e714da6b45836"
 df_test = df_final.loc[df_final["node_id"] == node_under_test].sort_values(["transaction_time"])

In [142]:
# Run the detector on the selected node, starting at its earliest transaction.
detection_FI_37(df_test, 0)

____________
                                                                                       direction  \
node_id                                    neighbor_nodes                                          
0x1ad91ee08f21be3de0ba2ba6918e714da6b45836 0x6390fd17e63e1362d3a9f18eac3b2acf576deb19         16   

                                                                                       transaction_val  \
node_id                                    neighbor_nodes                                                
0x1ad91ee08f21be3de0ba2ba6918e714da6b45836 0x6390fd17e63e1362d3a9f18eac3b2acf576deb19               16   

                                                                                       transaction_time  
node_id                                    neighbor_nodes                                                
0x1ad91ee08f21be3de0ba2ba6918e714da6b45836 0x6390fd17e63e1362d3a9f18eac3b2acf576deb19                16  
________LOOP
____________
                       

False

### Node_id == "0x95a5147bd3979d0947603a662c16c5d2eac40e17"

In [143]:
 # Select one node's transactions and order them chronologically.
 node_under_test = "0x95a5147bd3979d0947603a662c16c5d2eac40e17"
 df_test = df_final.loc[df_final["node_id"] == node_under_test].sort_values(["transaction_time"])

In [144]:
# Run the detector on the selected node, starting at its earliest transaction.
detection_FI_37(df_test, 0)

____________
                                                                                       direction  \
node_id                                    neighbor_nodes                                          
0x95a5147bd3979d0947603a662c16c5d2eac40e17 0x00192fb10df37c9fb26829eb2cc623cd1bf599e8          7   

                                                                                       transaction_val  \
node_id                                    neighbor_nodes                                                
0x95a5147bd3979d0947603a662c16c5d2eac40e17 0x00192fb10df37c9fb26829eb2cc623cd1bf599e8                7   

                                                                                       transaction_time  
node_id                                    neighbor_nodes                                                
0x95a5147bd3979d0947603a662c16c5d2eac40e17 0x00192fb10df37c9fb26829eb2cc623cd1bf599e8                 7  
________LOOP
____________
                       

False

### Node_id == "0xfa52274dd61e1643d2205169732f29114bc240b3"

In [145]:
 # Select one node's transactions and order them chronologically.
 node_under_test = "0xfa52274dd61e1643d2205169732f29114bc240b3"
 df_test = df_final.loc[df_final["node_id"] == node_under_test].sort_values(["transaction_time"])


In [146]:
# Run the detector on the selected node, starting at its earliest transaction.
detection_FI_37(df_test, 0)

____________
                                                                                       direction  \
node_id                                    neighbor_nodes                                          
0xfa52274dd61e1643d2205169732f29114bc240b3 0x6390fd17e63e1362d3a9f18eac3b2acf576deb19         17   

                                                                                       transaction_val  \
node_id                                    neighbor_nodes                                                
0xfa52274dd61e1643d2205169732f29114bc240b3 0x6390fd17e63e1362d3a9f18eac3b2acf576deb19               17   

                                                                                       transaction_time  
node_id                                    neighbor_nodes                                                
0xfa52274dd61e1643d2205169732f29114bc240b3 0x6390fd17e63e1362d3a9f18eac3b2acf576deb19                17  
________LOOP
____________
                       

False