# Notebook for data split and transform for dynamic graph model 
## This notebook consists of steps to 
1. process raw data into edge, node list data for the data loader for model training 
2. train/test data split 

In [1]:
import pandas as pd
import numpy as np

# Load raw data

In [2]:
raw_data_path = '../../data/01_raw/financial_fraud/bs140513_032310.csv'

In [3]:
raw_trans_data = pd.read_csv(raw_data_path)

In [4]:
raw_trans_data.shape

(594643, 10)

### sort the raw data by time step index

In [5]:
raw_trans_data.head(10) #seems the raw data is already sorted by time step

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0
5,0,'C1315400589','3','F','28007','M348934600','28007','es_transportation',25.81,0
6,0,'C765155274','1','F','28007','M348934600','28007','es_transportation',9.1,0
7,0,'C202531238','4','F','28007','M348934600','28007','es_transportation',21.17,0
8,0,'C105845174','3','M','28007','M348934600','28007','es_transportation',32.4,0
9,0,'C39858251','5','F','28007','M348934600','28007','es_transportation',35.4,0


In [6]:
raw_trans_data.tail(10)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
594633,179,'C1039390058','4','M','28007','M45060432','28007','es_hotelservices',190.31,0
594634,179,'C728039227','6','M','28007','M348934600','28007','es_transportation',27.93,0
594635,179,'C728039227','6','M','28007','M1823072687','28007','es_transportation',30.04,0
594636,179,'C1904086644','5','F','28007','M1823072687','28007','es_transportation',28.93,0
594637,179,'C748358246','2','M','28007','M1823072687','28007','es_transportation',51.17,0
594638,179,'C1753498738','3','F','28007','M1823072687','28007','es_transportation',20.53,0
594639,179,'C650108285','4','F','28007','M1823072687','28007','es_transportation',50.73,0
594640,179,'C123623130','2','F','28007','M349281107','28007','es_fashion',22.44,0
594641,179,'C1499363341','5','M','28007','M1823072687','28007','es_transportation',14.46,0
594642,179,'C616528518','4','F','28007','M1823072687','28007','es_transportation',26.93,0


In [7]:
raw_net_data_path = '../../data/01_raw/financial_fraud/bsNET140513_032310.csv'

In [8]:
raw_net_trans_data = pd.read_csv(raw_net_data_path)

In [9]:
raw_net_trans_data.shape

(594643, 5)

In [10]:
raw_net_trans_data.head(10)

Unnamed: 0,Source,Target,Weight,typeTrans,fraud
0,'C1093826151','M348934600',4.55,'es_transportation',0
1,'C352968107','M348934600',39.68,'es_transportation',0
2,'C2054744914','M1823072687',26.89,'es_transportation',0
3,'C1760612790','M348934600',17.25,'es_transportation',0
4,'C757503768','M348934600',35.72,'es_transportation',0
5,'C1315400589','M348934600',25.81,'es_transportation',0
6,'C765155274','M348934600',9.1,'es_transportation',0
7,'C202531238','M348934600',21.17,'es_transportation',0
8,'C105845174','M348934600',32.4,'es_transportation',0
9,'C39858251','M348934600',35.4,'es_transportation',0


# Process edge data for dynamic graph model 
## Customer can be treated as source node and merchant can be treated as target node 

In [24]:
#edges = raw_net_trans_data[['Source','Target']]
edges = raw_trans_data[['step','customer','merchant','category','amount','fraud']]

In [25]:
edges

Unnamed: 0,step,customer,merchant,category,amount,fraud
0,0,'C1093826151','M348934600','es_transportation',4.55,0
1,0,'C352968107','M348934600','es_transportation',39.68,0
2,0,'C2054744914','M1823072687','es_transportation',26.89,0
3,0,'C1760612790','M348934600','es_transportation',17.25,0
4,0,'C757503768','M348934600','es_transportation',35.72,0
...,...,...,...,...,...,...
594638,179,'C1753498738','M1823072687','es_transportation',20.53,0
594639,179,'C650108285','M1823072687','es_transportation',50.73,0
594640,179,'C123623130','M349281107','es_fashion',22.44,0
594641,179,'C1499363341','M1823072687','es_transportation',14.46,0


In [26]:
edges['category'].value_counts(normalize=True)

'es_transportation'        0.849449
'es_food'                  0.044151
'es_health'                0.027131
'es_wellnessandbeauty'     0.025370
'es_fashion'               0.010854
'es_barsandrestaurants'    0.010717
'es_hyper'                 0.010255
'es_sportsandtoys'         0.006730
'es_tech'                  0.003986
'es_home'                  0.003340
'es_hotelservices'         0.002933
'es_otherservices'         0.001534
'es_contents'              0.001488
'es_travel'                0.001224
'es_leisure'               0.000839
Name: category, dtype: float64

In [27]:
edges = edges.loc[edges.customer!=edges.merchant]
edges

Unnamed: 0,step,customer,merchant,category,amount,fraud
0,0,'C1093826151','M348934600','es_transportation',4.55,0
1,0,'C352968107','M348934600','es_transportation',39.68,0
2,0,'C2054744914','M1823072687','es_transportation',26.89,0
3,0,'C1760612790','M348934600','es_transportation',17.25,0
4,0,'C757503768','M348934600','es_transportation',35.72,0
...,...,...,...,...,...,...
594638,179,'C1753498738','M1823072687','es_transportation',20.53,0
594639,179,'C650108285','M1823072687','es_transportation',50.73,0
594640,179,'C123623130','M349281107','es_fashion',22.44,0
594641,179,'C1499363341','M1823072687','es_transportation',14.46,0


In [28]:
edges.shape

(594643, 6)

### check duplicated (customer, merchant) pairs 

In [52]:
customer_merchant_trans_count = edges.groupby(by=['customer','merchant']).agg({'step':'count'}) #there are 47132 unique pairs 
customer_merchant_trans_count

Unnamed: 0_level_0,Unnamed: 1_level_0,step
customer,merchant,Unnamed: 2_level_1
'C1000148617','M1053599405',1
'C1000148617','M1400236507',1
'C1000148617','M1741626453',1
'C1000148617','M1823072687',92
'C1000148617','M1842530320',1
...,...,...
'C999723254','M348934600',27
'C999723254','M349281107',1
'C999723254','M480139044',1
'C999723254','M855959430',2


In [32]:
customer_merchant_trans_fraud = edges.groupby(by=['customer','merchant']).agg({'fraud':'sum'})
customer_merchant_trans_fraud

Unnamed: 0_level_0,Unnamed: 1_level_0,fraud
customer,merchant,Unnamed: 2_level_1
'C1000148617','M1053599405',0
'C1000148617','M1400236507',0
'C1000148617','M1741626453',0
'C1000148617','M1823072687',0
'C1000148617','M1842530320',0
...,...,...
'C999723254','M348934600',0
'C999723254','M349281107',0
'C999723254','M480139044',1
'C999723254','M855959430',0


In [33]:
customer_merchant_trans_fraud.columns

Index(['fraud'], dtype='object')

### Oberservation: 1065 (customer, merchant) pairs had been flagged as fraud for more than 1 time

In [34]:
customer_merchant_trans_fraud.loc[customer_merchant_trans_fraud.fraud>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,fraud
customer,merchant,Unnamed: 2_level_1
'C1001065306','M17379832',2
'C1001065306','M480139044',3
'C1001065306','M980657600',2
'C1007572087','M732195782',2
'C1013313546','M980657600',2
...,...,...
'C974315171','M980657600',3
'C980181294','M480139044',2
'C980181294','M732195782',2
'C989137613','M732195782',2


### Oberservation: 1108 (customer, merchant) pairs had changing labels

In [35]:
customer_merchant_trans_fraud_consistency = edges.groupby(by=['customer','merchant']).agg({'fraud':'mean'})
customer_merchant_trans_fraud_consistency

Unnamed: 0_level_0,Unnamed: 1_level_0,fraud
customer,merchant,Unnamed: 2_level_1
'C1000148617','M1053599405',0.0
'C1000148617','M1400236507',0.0
'C1000148617','M1741626453',0.0
'C1000148617','M1823072687',0.0
'C1000148617','M1842530320',0.0
...,...,...
'C999723254','M348934600',0.0
'C999723254','M349281107',0.0
'C999723254','M480139044',1.0
'C999723254','M855959430',0.0


In [43]:
customer_merchant_trans_fraud_consistency.loc[(customer_merchant_trans_fraud_consistency.fraud!=1) & (customer_merchant_trans_fraud_consistency.fraud!=0) ]

Unnamed: 0_level_0,Unnamed: 1_level_0,fraud
customer,merchant,Unnamed: 2_level_1
'C100045114','M1198415165',0.250000
'C100045114','M2122776122',0.500000
'C1001065306','M480139044',0.500000
'C1001065306','M50039827',0.200000
'C1001065306','M980657600',0.666667
...,...,...
'C995844287','M1198415165',0.333333
'C995844287','M855959430',0.200000
'C997029022','M480139044',0.333333
'C998690782','M732195782',0.500000


# Dedupe (customer, merchant) pair, only keep the last transaction (the latest)

In [45]:
edges.shape

(594643, 6)

In [46]:
edges_deduped = edges.drop_duplicates(subset=['customer','merchant'], keep='last', )

In [47]:
edges_deduped.shape

(47132, 6)

In [50]:
edges_deduped.shape

(47132, 6)

In [53]:
edges_deduped.groupby(by=['customer','merchant']).agg({'step':'count'}) 

Unnamed: 0_level_0,Unnamed: 1_level_0,step
customer,merchant,Unnamed: 2_level_1
'C1000148617','M1053599405',1
'C1000148617','M1400236507',1
'C1000148617','M1741626453',1
'C1000148617','M1823072687',1
'C1000148617','M1842530320',1
...,...,...
'C999723254','M348934600',1
'C999723254','M349281107',1
'C999723254','M480139044',1
'C999723254','M855959430',1


In [54]:
edges_deduped, edges_deduped.index

(        step       customer       merchant             category  amount  fraud
 12         0   'C623601481'    'M50039827'          'es_health'   68.79      0
 148        0  'C2092526272'   'M840466850'            'es_tech'  163.56      0
 153        0   'C661876608'  'M1741626453'   'es_sportsandtoys'   11.83      0
 194        0  'C1436756684'   'M692898500'          'es_health'  187.62      0
 218        0  'C1960866892'   'M855959430'           'es_hyper'   22.65      0
 ...      ...            ...            ...                  ...     ...    ...
 594638   179  'C1753498738'  'M1823072687'  'es_transportation'   20.53      0
 594639   179   'C650108285'  'M1823072687'  'es_transportation'   50.73      0
 594640   179   'C123623130'   'M349281107'         'es_fashion'   22.44      0
 594641   179  'C1499363341'  'M1823072687'  'es_transportation'   14.46      0
 594642   179   'C616528518'  'M1823072687'  'es_transportation'   26.93      0
 
 [47132 rows x 6 columns],
 Int64Index

In [59]:
edges_array = np.array(edges_deduped[['customer','merchant']])

In [60]:
edges_array, edges_array.shape

(array([["'C623601481'", "'M50039827'"],
        ["'C2092526272'", "'M840466850'"],
        ["'C661876608'", "'M1741626453'"],
        ...,
        ["'C123623130'", "'M349281107'"],
        ["'C1499363341'", "'M1823072687'"],
        ["'C616528518'", "'M1823072687'"]], dtype=object),
 (47132, 2))

### convert str ids to int indexes 

In [61]:
vertexs, edges_1d = np.unique(edges_array, return_inverse=True)

In [62]:
vertexs, len(vertexs)

(array(["'C1000148617'", "'C100045114'", "'C1000699316'", ...,
        "'M933210764'", "'M97925176'", "'M980657600'"], dtype=object),
 4162)

### save str ids to int indexes mapping

In [63]:
vertex_to_id = {}
for i,vertex in enumerate(vertexs):
    vertex_to_id.setdefault(vertex,i)

In [66]:
vertex_to_id

{"'C1000148617'": 0,
 "'C100045114'": 1,
 "'C1000699316'": 2,
 "'C1001065306'": 3,
 "'C1002658784'": 4,
 "'C1002759277'": 5,
 "'C1004109477'": 6,
 "'C1004300450'": 7,
 "'C1004532392'": 8,
 "'C1005126300'": 9,
 "'C1005495267'": 10,
 "'C1005806982'": 11,
 "'C1006176917'": 12,
 "'C1007572087'": 13,
 "'C1007790716'": 14,
 "'C1008918174'": 15,
 "'C1009080922'": 16,
 "'C100992504'": 17,
 "'C10105795'": 18,
 "'C1010589026'": 19,
 "'C1010865894'": 20,
 "'C1010936270'": 21,
 "'C1011693897'": 22,
 "'C1012876398'": 23,
 "'C1013199152'": 24,
 "'C1013313546'": 25,
 "'C1014010834'": 26,
 "'C1014124305'": 27,
 "'C1014313887'": 28,
 "'C1014347664'": 29,
 "'C1014783484'": 30,
 "'C1014847580'": 31,
 "'C1017469037'": 32,
 "'C1017941602'": 33,
 "'C1018153896'": 34,
 "'C1018388493'": 35,
 "'C1018653381'": 36,
 "'C1018783569'": 37,
 "'C1018979501'": 38,
 "'C1019071188'": 39,
 "'C1019678929'": 40,
 "'C1020019215'": 41,
 "'C102010508'": 42,
 "'C1020192133'": 43,
 "'C1020355960'": 44,
 "'C1021409246'": 45,
 "'

In [74]:
vertex_to_id_df = pd.DataFrame.from_dict(vertex_to_id,  orient='index', columns=['idx']).reset_index().rename(columns={"index": "name"})

In [75]:
vertex_to_id_df

Unnamed: 0,name,idx
0,'C1000148617',0
1,'C100045114',1
2,'C1000699316',2
3,'C1001065306',3
4,'C1002658784',4
...,...,...
4157,'M85975013',4157
4158,'M923029380',4158
4159,'M933210764',4159
4160,'M97925176',4160


In [77]:
edges_idx = np.reshape(edges_1d, [-1, 2])

In [78]:
edges_idx, len(edges_idx)

(array([[3317, 4148],
        [2363, 4154],
        [3396, 4127],
        ...,
        [ 529, 4143],
        [1083, 4130],
        [3304, 4130]]),
 47132)

### check whether the node indexes for the top 3 edge list records are correct 

In [150]:
### manually checkingg the node id for the note indexes
(vertexs[3317], vertexs[4148]), (vertexs[2363], vertexs[4154]),(vertexs[3396], vertexs[4127]), (vertexs[3304], vertexs[4130])

(("'C623601481'", "'M50039827'"),
 ("'C2092526272'", "'M840466850'"),
 ("'C661876608'", "'M1741626453'"),
 ("'C616528518'", "'M1823072687'"))

In [89]:
### consistent with the raw data 
edges_deduped.head(3)

Unnamed: 0,step,customer,merchant,category,amount,fraud
12,0,'C623601481','M50039827','es_health',68.79,0
148,0,'C2092526272','M840466850','es_tech',163.56,0
153,0,'C661876608','M1741626453','es_sportsandtoys',11.83,0


In [87]:
print('vertex:', len(vertexs), 'edge:', len(edges_idx))

vertex: 4162 edge: 47132


# Find labels for the edge

In [91]:
from tqdm import tqdm

In [196]:
#edge_label = {}
edge_label_arr = np.zeros([edges_deduped.shape[0], 3], dtype=np.int32)
#for index, row in tqdm(raw_trans_data.iterrows(), total=raw_trans_data.shape[0]):
for idx, row in tqdm(edges_deduped.reset_index().iterrows(), total=edges_deduped.shape[0]): #using deduped trans 
    #edge_label.setdefault((vertex_to_id[row['customer']],vertex_to_id[row['merchant']]), []).append(row['fraud'])
    edge_label_arr[idx][0] = vertex_to_id[row['customer']]
    edge_label_arr[idx][1] = vertex_to_id[row['merchant']]
    edge_label_arr[idx][2] = row['fraud']

100%|██████████| 47132/47132 [00:04<00:00, 10222.60it/s]


In [198]:
edge_label_arr.shape

(47132, 3)

In [199]:
edge_label_arr

array([[3317, 4148,    0],
       [2363, 4154,    0],
       [3396, 4127,    0],
       ...,
       [ 529, 4143,    0],
       [1083, 4130,    0],
       [3304, 4130,    0]], dtype=int32)

In [205]:
#edge_label_postprocessed_df = pd.DataFrame.from_dict(edge_label,  orient='index', columns=['label']).reset_index().rename(columns={"index": "edge"})
edge_label_postprocessed_df = pd.DataFrame(edge_label_arr, columns=['source','target','label'])

In [206]:
edge_label_postprocessed_df

Unnamed: 0,source,target,label
0,3317,4148,0
1,2363,4154,0
2,3396,4127,0
3,948,4151,0
4,2086,4155,0
...,...,...,...
47127,1639,4130,0
47128,3369,4130,0
47129,529,4143,0
47130,1083,4130,0


In [305]:
edge_label_arr[0][0], edge_label_arr[0][1], edge_label_arr[0][2]

(3317, 4148, 0)

### check again the processed data are consistent with the raw data 

In [306]:
(vertexs[edge_label_arr[0][0]], vertexs[edge_label_arr[0][1]])

("'C623601481'", "'M50039827'")

In [307]:
edges_deduped.loc[(edges_deduped.customer ==vertexs[edge_label_arr[0][0]] )& (edges_deduped.merchant ==vertexs[edge_label_arr[0][1]])]

Unnamed: 0,step,customer,merchant,category,amount,fraud
12,0,'C623601481','M50039827','es_health',68.79,0


In [308]:
#check fraud ratio
edge_label_postprocessed_df['label'].value_counts(normalize=True)

0    0.912353
1    0.087647
Name: label, dtype: float64

# Split train/test data and generate data for graph dataloader 

In [161]:
edges_deduped.shape

(47132, 6)

In [162]:
edges_deduped.head(10)

Unnamed: 0,step,customer,merchant,category,amount,fraud
12,0,'C623601481','M50039827','es_health',68.79,0
148,0,'C2092526272','M840466850','es_tech',163.56,0
153,0,'C661876608','M1741626453','es_sportsandtoys',11.83,0
194,0,'C1436756684','M692898500','es_health',187.62,0
218,0,'C1960866892','M855959430','es_hyper',22.65,0
227,0,'C991774315','M349281107','es_fashion',79.71,0
253,0,'C1959067413','M209847108','es_wellnessandbeauty',86.09,0
256,0,'C1465698425','M2122776122','es_home',98.19,0
282,0,'C181787207','M1873032707','es_hotelservices',71.76,0
283,0,'C2063978670','M2011752106','es_hotelservices',194.59,0


In [175]:
edges_deduped.tail(10)

Unnamed: 0,step,customer,merchant,category,amount,fraud
594633,179,'C1039390058','M45060432','es_hotelservices',190.31,0
594634,179,'C728039227','M348934600','es_transportation',27.93,0
594635,179,'C728039227','M1823072687','es_transportation',30.04,0
594636,179,'C1904086644','M1823072687','es_transportation',28.93,0
594637,179,'C748358246','M1823072687','es_transportation',51.17,0
594638,179,'C1753498738','M1823072687','es_transportation',20.53,0
594639,179,'C650108285','M1823072687','es_transportation',50.73,0
594640,179,'C123623130','M349281107','es_fashion',22.44,0
594641,179,'C1499363341','M1823072687','es_transportation',14.46,0
594642,179,'C616528518','M1823072687','es_transportation',26.93,0


In [219]:
len(edge_label_arr), len(vertexs)

(47132, 4162)

In [273]:
vertex_to_id_df.shape, edge_label_arr.shape

((4162, 2), (47132, 3))

In [274]:
m = len(edge_label_arr) #edge number 
n = len(vertex_to_id_df) #node number 

In [275]:
m,n

(47132, 4162)

In [223]:
train_per = 0.5 #split in half 

In [224]:
train_num = int(np.floor(train_per * m))
train_num

23566

In [233]:
train = edge_label_arr[0:train_num, :] #first half being training samples
test = edge_label_arr[train_num:, :] #second half being test samples 

In [227]:
train.shape, test.shape

((23566, 3), (23566, 3))

In [229]:
train[:10,:]

array([[3317, 4148,    0],
       [2363, 4154,    0],
       [3396, 4127,    0],
       [ 948, 4151,    0],
       [2086, 4155,    0],
       [4094, 4143,    0],
       [2079, 4139,    0],
       [1008, 4140,    0],
       [1784, 4133,    0],
       [2306, 4137,    0]], dtype=int32)

In [230]:
edge_label_postprocessed_df.head(10)

Unnamed: 0,source,target,label
0,3317,4148,0
1,2363,4154,0
2,3396,4127,0
3,948,4151,0
4,2086,4155,0
5,4094,4143,0
6,2079,4139,0
7,1008,4140,0
8,1784,4133,0
9,2306,4137,0


In [231]:
test[-10:,:]

array([[  86, 4145,    0],
       [3520, 4142,    0],
       [3520, 4130,    0],
       [1963, 4130,    0],
       [3564, 4130,    0],
       [1639, 4130,    0],
       [3369, 4130,    0],
       [ 529, 4143,    0],
       [1083, 4130,    0],
       [3304, 4130,    0]], dtype=int32)

In [232]:
edge_label_postprocessed_df.tail(10)

Unnamed: 0,source,target,label
47122,86,4145,0
47123,3520,4142,0
47124,3520,4130,0
47125,1963,4130,0
47126,3564,4130,0
47127,1639,4130,0
47128,3369,4130,0
47129,529,4143,0
47130,1083,4130,0
47131,3304,4130,0


# Save edge list into sparse matrix 

In [245]:
from scipy.sparse import csr_matrix,coo_matrix,eye

In [237]:
train_mat = csr_matrix(
    (np.ones([np.size(train, 0)], dtype=np.int32), 
     (train[:, 0], train[:, 1])),
    shape=(n, n))

In [239]:
train_mat.shape

(4162, 4162)

In [240]:
train_mat = train_mat + train_mat.transpose() #enforce symmetry 

In [242]:
train_mat[3317,4148], train_mat[4148,3317]

(1, 1)

In [244]:
train_mat[86,4145], train_mat[4145,86] #being 0 because this edge is in the test set 

(0, 0)

In [246]:
train_mat = (train_mat + train_mat.transpose() + eye(n)).tolil()

In [248]:
train_mat[3317,4148], train_mat[4148,3317]

(2.0, 2.0)

In [249]:
train_mat[86,4145], train_mat[4145,86]

(0.0, 0.0)

In [250]:
train_mat[0,0], train_mat[1,1]

(1.0, 1.0)

In [251]:
headtail = train_mat.rows #store the indexes of edges

In [252]:
headtail

array([list([0, 4119, 4127, 4134, 4136, 4146, 4154]),
       list([1, 4121, 4122, 4133, 4140, 4146, 4149, 4151, 4152, 4155]),
       list([2, 4112, 4130, 4134, 4136, 4142, 4155, 4157]), ...,
       list([46, 147, 769, 1100, 1453, 1478, 1539, 1554, 1632, 1654, 1660, 1782, 1917, 1993, 2187, 2387, 2413, 2473, 2594, 2624, 2626, 2772, 2816, 2844, 2903, 2981, 3036, 3104, 3303, 3326, 3412, 3438, 3442, 3488, 3685, 3871, 4159]),
       list([25, 48, 50, 61, 94, 108, 111, 123, 126, 137, 144, 149, 173, 188, 210, 224, 275, 289, 291, 295, 297, 302, 323, 330, 355, 365, 409, 415, 416, 497, 498, 509, 510, 529, 530, 552, 556, 571, 588, 593, 595, 613, 617, 629, 676, 706, 707, 751, 773, 793, 801, 808, 818, 821, 832, 848, 867, 868, 870, 889, 890, 893, 919, 961, 964, 997, 1000, 1007, 1017, 1050, 1064, 1076, 1079, 1080, 1086, 1138, 1141, 1143, 1151, 1156, 1241, 1270, 1281, 1287, 1374, 1385, 1398, 1413, 1416, 1425, 1452, 1507, 1521, 1530, 1534, 1548, 1553, 1557, 1584, 1586, 1607, 1611, 1616, 1617, 1654, 1661

In [254]:
#check degrees of each source node 
degrees = np.array([len(x) for x in headtail])

# Creating snapshots of graphs for the dataloader of TADDY model

In [257]:
snap_size=5000

In [256]:
len(train), len(test)

(23566, 23566)

In [258]:
train_size = int(len(train) / snap_size + 0.5) #making slices of snapshots
test_size = int(len(test) / snap_size + 0.5)

In [259]:
train_size, test_size

(5, 5)

In [260]:
rows = []
cols = []
weis = []
labs = []
for ii in range(train_size):
    start_loc = ii * snap_size
    end_loc = (ii + 1) * snap_size

    row = np.array(train[start_loc:end_loc, 0], dtype=np.int32)
    col = np.array(train[start_loc:end_loc, 1], dtype=np.int32)
    lab = np.array(train[start_loc:end_loc, 2], dtype=np.int32)
    wei = np.ones_like(row, dtype=np.int32)

    rows.append(row)
    cols.append(col)
    weis.append(wei) #weights
    labs.append(lab) #label

In [261]:
for i in range(test_size):
    start_loc = i * snap_size
    end_loc = (i + 1) * snap_size

    row = np.array(test[start_loc:end_loc, 0], dtype=np.int32)
    col = np.array(test[start_loc:end_loc, 1], dtype=np.int32)
    lab = np.array(test[start_loc:end_loc, 2], dtype=np.int32)
    wei = np.ones_like(row, dtype=np.int32)

    rows.append(row)
    cols.append(col)
    weis.append(wei)
    labs.append(lab)

In [264]:
len(rows), rows[0].shape

(10, (5000,))

In [265]:
rows[0]

array([3317, 2363, 3396, ..., 1738, 2754, 2754], dtype=int32)

In [266]:
len(cols), cols[0].shape

(10, (5000,))

In [267]:
cols[0]

array([4148, 4154, 4127, ..., 4132, 4146, 4148], dtype=int32)

In [269]:
len(labs), labs[0].shape

(10, (5000,))

In [270]:
labs[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

### check intermediate processed data 

In [284]:
edge_data_loaded = np.load("../../data/02_intermediate/edge_list.npz")

In [287]:
type(edge_data_loaded)

numpy.lib.npyio.NpzFile

In [288]:
edge_data_loaded['data']

array([[3317, 4148,    0],
       [2363, 4154,    0],
       [3396, 4127,    0],
       ...,
       [ 529, 4143,    0],
       [1083, 4130,    0],
       [3304, 4130,    0]], dtype=int32)

# References

Edgar Alonso Lopez-Rojas and Stefan Axelsson. 2014. BANKSIM: A BANK PAYMENTS SIMULATOR FOR FRAUD DETECTION RESEARCH.

Yixin Liu, Shirui Pan, Yu Guang Wang, Fei Xiong, Liang Wang, Qingfeng Chen, and Vincent CS Lee. 2015. Anomaly Detection in Dynamic Graphs via Transformer.