In [24]:
import time
from typing import Any, Dict, List

import dgl
import dgl.function as fn
import numpy as np
import pandas as pd
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

pd.options.mode.use_inf_as_na = True

In [25]:
# Read the two raw tables from the local dataset folder: the transaction records
# and the identity records (they are joined on TransactionID in the next cell).
train_transaction, train_identity = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train_transaction.csv', 'dataset/train_identity.csv')
)

In [26]:
# Inner-join the two tables on TransactionID (only transactions present in both
# frames survive), then order the rows by ascending TransactionDT so that the
# positional train/test split below follows that ordering.
merged_df = (
    train_transaction
    .merge(train_identity, on='TransactionID', how='inner')
    .sort_values(by='TransactionDT', ascending=True)
)

In [27]:
# Positional split of the sorted frame: the first 80% of the rows become the
# train set and the remaining rows (about 20%) become the test set.
tot_size = len(merged_df)
train_size = int(tot_size * 0.8)
test_size = tot_size - train_size
train_df = merged_df.iloc[:train_size]
test_df = merged_df.iloc[train_size:]

In [28]:
# The graph handle starts out empty; the construct_graph call further down assigns it.
train_g = None

# Training configuration; every value can be changed as desired.
# Column groups are stored as comma-separated strings and split by the code that uses them.
parameters = dict(
    n_layers=2,  # number of graph layers
    n_epochs=150,  # number of training epochs
    n_hidden=16,  # number of hidden units
    dropout=0.2,  # dropout rate (NOTE(review): no code visible in this notebook reads this key)
    weight_decay=5e-05,  # L2 penalization term
    lr=0.01,  # learning rate
    target_col='TransactionID',  # target (transaction-id) column
    # columns to create nodes: card1..card6 followed by the remaining entity columns
    node_cols=','.join(
        [f'card{i}' for i in range(1, 7)]
        + ['ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
    ),
    label_col='isFraud',  # label column
    # categorical feature columns: M1..M9, the two device columns, id_12..id_38
    cat_cols=','.join(
        [f'M{i}' for i in range(1, 10)]
        + ['DeviceType', 'DeviceInfo']
        + [f'id_{i:02d}' for i in range(12, 39)]
    ),
    # numerical feature columns: amount and distances, id_01..id_11, C1..C14, D1..D15, V1..V339
    num_cols=','.join(
        ['TransactionAmt', 'dist1', 'dist2']
        + [f'id_{i:02d}' for i in range(1, 12)]
        + [f'C{i}' for i in range(1, 15)]
        + [f'D{i}' for i in range(1, 16)]
        + [f'V{i}' for i in range(1, 340)]
    ),
    class_weight=1.,  # class weight. Can take any value other than 1 to give importance to one class over the other
)

In [29]:
#Function to map numerical features to log scale to decrease their range and the huge differences between them (feature smoothing)
def scale_numerical_features(data):
    """Return ``log10(data + 1e-9)`` element-wise.

    The small offset keeps an input of exactly zero finite (it maps to -9).
    Inputs below ``-1e-9`` have no real logarithm, so NumPy returns NaN for
    them (and emits a runtime warning).
    """
    offset = 1e-9
    shifted = data + offset
    return np.log10(shifted)

In [30]:
#Function to create a lookup dictionary mapping nodes of a certain type to integer ids
def nodes_to_ids(vals, lookup, new_node_id=0):
    """Translate raw values into integer node ids, registering unseen values.

    Args:
        vals: iterable of raw (hashable) values, one per row.
        lookup: dict mapping value -> node id. It is updated IN PLACE with
            every value that was not present yet.
        new_node_id: id handed to the first unseen value; later unseen values
            receive consecutive ids.

    Returns:
        A 4-tuple ``(ids, lookup, new_ids, new_values)``: the id of every
        element of ``vals`` in order, the same (mutated) ``lookup`` object,
        the ids created by this call, and the values they were created for.

    Note:
        Membership is decided by ordinary dict lookup, so two float NaN values
        that are distinct objects are treated as two different values.
    """
    next_id = new_node_id
    ids = []
    created_ids = []
    created_vals = []
    for value in vals:
        already_known = value in lookup
        if not already_known:
            lookup[value] = next_id
            created_ids.append(next_id)
            created_vals.append(value)
            next_id += 1
        ids.append(lookup[value])

    return ids, lookup, created_ids, created_vals

In [31]:
def normalize_train_data(data):
    """Standardize each column of ``data`` and return the statistics used.

    The mean and the population standard deviation (divisor is the number of
    rows) are computed along dimension 0. Columns whose standard deviation is
    numerically zero get a divisor of 1 so the division stays finite.

    Returns:
        ``(mean, std, normalized)`` where ``normalized = (data - mean) / std``.
    """
    n_rows = data.shape[0]
    mean = th.mean(data, dim=0)
    centered = data - mean
    std = th.sqrt(th.sum(centered ** 2, dim=0) / n_rows)
    # Constant columns: swap the (near-)zero deviation for 1.
    flat_columns = th.isclose(std, th.zeros_like(std))
    std = th.where(flat_columns, th.ones_like(std), std)
    return mean, std, (data - mean) / std

In [32]:
def normalize_test_data(data, mean, std):
    """Standardize ``data`` with previously computed statistics: ``(data - mean) / std``."""
    centered = data - mean
    return centered / std

In [33]:
#Function to construct the graph using DGL library (dgl.heterograph)
def construct_graph(train_df, target_col, node_cols, cat_cols, num_cols):
    """Build the training heterograph and fit the feature pre-processing.

    Args:
        train_df: training DataFrame containing all referenced columns.
        target_col: column holding the target-node identifier.
        node_cols: comma-separated names of the columns that become node types.
        cat_cols: comma-separated names of the categorical feature columns
            (one-hot encoded).
        num_cols: comma-separated names of the numerical feature columns
            (passed through ``scale_numerical_features``).

    Returns:
        ``(g, nodes_lookup, feature_processor, train_mean, train_stdev)``:
        the DGL heterograph whose ``'target'`` nodes carry the normalized
        features under ``'features'``, the per-node-type value -> id lookups,
        the fitted column transformer, and the mean / standard deviation used
        for normalization.
    """
    # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
    # and newer releases no longer accept the old name; try the new spelling first
    # and fall back for older installations.
    try:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # Numerical columns are log-scaled, categorical columns are one-hot encoded,
    # every other column is dropped.
    feature_processor = make_column_transformer(
        (FunctionTransformer(scale_numerical_features), num_cols.split(',')),
        (one_hot, cat_cols.split(',')),
        remainder='drop'
    )
    feature_processor.fit(train_df)

    # fill NaN values with 0
    features = np.nan_to_num(feature_processor.transform(train_df), nan=0.)

    # map target column values to integer ids
    nodes_lookup = {'target': {}}
    target_nodes, target_lookup, _, _ = nodes_to_ids(train_df[target_col], nodes_lookup['target'], 0)
    nodes_lookup['target'] = target_lookup

    # edge lists keyed by canonical edge type; start with the self-relation edges
    edgelists = {('target', 'self_relation', 'target'): [(t, t) for t in target_nodes]}

    for nc in node_cols.split(','):
        # map nodes of type nc to integer ids
        nodes, lookup, _, _ = nodes_to_ids(train_df[nc], {}, 0)
        nodes_lookup[nc] = lookup

        # bidirectional edges between target nodes and nodes of type nc
        edgelists[('target', f'target<>{nc}', nc)] = list(zip(target_nodes, nodes))
        edgelists[(nc, f'{nc}<>target', 'target')] = list(zip(nodes, target_nodes))

    # create the heterograph object from edge lists using DGL library
    g = dgl.heterograph(edgelists)
    print(
        "The heterograph was created successfully: \n Node types {} \n Edge types{}".format(
            g.ntypes, g.canonical_etypes))

    # Normalize the training features and attach them to the target nodes.
    train_mean, train_stdev, features = normalize_train_data(th.from_numpy(features.astype('float32')))
    g.nodes['target'].data['features'] = features

    return g, nodes_lookup, feature_processor, train_mean, train_stdev


In [34]:
# Build the training graph and keep the fitted pre-processing artefacts for inference.
(train_g, nodes_lookup, feature_processor,
 train_mean, train_stdev) = construct_graph(
    train_df,
    target_col=parameters['target_col'],
    node_cols=parameters['node_cols'],
    cat_cols=parameters['cat_cols'],
    num_cols=parameters['num_cols'],
)

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


The heterograph was created successfully: 
 Node types ['P_emaildomain', 'ProductCD', 'R_emaildomain', 'addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'target'] 
 Edge types[('P_emaildomain', 'P_emaildomain<>target', 'target'), ('ProductCD', 'ProductCD<>target', 'target'), ('R_emaildomain', 'R_emaildomain<>target', 'target'), ('addr1', 'addr1<>target', 'target'), ('addr2', 'addr2<>target', 'target'), ('card1', 'card1<>target', 'target'), ('card2', 'card2<>target', 'target'), ('card3', 'card3<>target', 'target'), ('card4', 'card4<>target', 'target'), ('card5', 'card5<>target', 'target'), ('card6', 'card6<>target', 'target'), ('target', 'self_relation', 'target'), ('target', 'target<>P_emaildomain', 'P_emaildomain'), ('target', 'target<>ProductCD', 'ProductCD'), ('target', 'target<>R_emaildomain', 'R_emaildomain'), ('target', 'target<>addr1', 'addr1'), ('target', 'target<>addr2', 'addr2'), ('target', 'target<>card1', 'card1'), ('target', 'target<>card2', 'card2'),

In [35]:
class HeteroRGCNLayer(nn.Module):
    """One relational graph-convolution layer over a heterograph.

    Every relation (edge type) owns a linear map from ``in_size`` to
    ``out_size``. In ``forward`` the mapped source features are averaged per
    relation and the per-relation results are summed.
    """

    def __init__(self, in_size, out_size, etypes):
        """Create one ``nn.Linear(in_size, out_size)`` per edge-type name in ``etypes``."""
        super().__init__()
        per_relation = {}
        for name in etypes:
            per_relation[name] = nn.Linear(in_size, out_size)
        self.weight = nn.ModuleDict(per_relation)

    def forward(self, G, feat_dict):
        """Run one round of message passing.

        Args:
            G: the heterograph; scratch tensors are stored in its node data.
            feat_dict: dict mapping node type -> feature tensor. Relations
                whose source type is missing from the dict are skipped.

        Returns:
            Dict mapping node type -> updated features (the ``'h'`` node data)
            for every node type that has an ``'h'`` entry after the update.
        """
        funcs = {}
        for srctype, etype, dsttype in G.canonical_etypes:
            if srctype not in feat_dict:
                continue
            # Project the source features with this relation's weights (W * h) and
            # park the result on the graph so DGL can send it along the edges.
            message_key = 'Wh_%s' % etype
            G.nodes[srctype].data[message_key] = self.weight[etype](feat_dict[srctype])
            # Per relation: copy the projected source feature as the message, then mean-reduce.
            funcs[etype] = (fn.copy_u(message_key, 'm'), fn.mean('m', 'h'))
        # Sum the per-relation means so every relation contributes to the destination node.
        G.multi_update_all(funcs, 'sum')

        updated = {}
        for ntype in G.ntypes:
            node_data = G.nodes[ntype].data
            if 'h' in node_data:
                updated[ntype] = node_data['h']
        return updated

In [36]:
# Class defining the RGCN model using multiple HeteroRGCNLayers
class HeteroRGCN(nn.Module):
    """Stack of ``HeteroRGCNLayer`` modules followed by a linear output layer.

    Node types other than ``'target'`` have no input features, so each gets a
    trainable embedding table; ``'target'`` nodes use the features passed to
    ``forward``.
    """

    def __init__(self, ntype_dict, etypes, in_size, hidden_size, out_size, n_layers, embedding_size):
        """Create the embeddings and the layer stack.

        Args:
            ntype_dict: dict mapping node type -> number of nodes of that type.
            etypes: edge-type names handed to every ``HeteroRGCNLayer``.
            in_size: width of the embedding tables created here.
            hidden_size: width of the hidden representations.
            out_size: width of the final linear output.
            n_layers: one first graph layer plus ``n_layers - 1`` hidden graph
                layers are created, followed by the output layer.
            embedding_size: input width of the first graph layer. The embedding
                tables are ``in_size`` wide, so the two values are presumably
                expected to be equal (the caller in this notebook passes the
                same number for both).
        """
        super().__init__()
        # Use trainable node embeddings as featureless inputs.
        embed_dict = {}
        for ntype, num_nodes in ntype_dict.items():
            if ntype == 'target':
                continue
            table = nn.Parameter(th.Tensor(num_nodes, in_size))
            nn.init.xavier_uniform_(table)
            embed_dict[ntype] = table
        self.embed = nn.ParameterDict(embed_dict)

        # First graph layer, then the hidden graph layers, then the output layer.
        stack = [HeteroRGCNLayer(embedding_size, hidden_size, etypes)]
        for _ in range(n_layers - 1):
            stack.append(HeteroRGCNLayer(hidden_size, hidden_size, etypes))
        stack.append(nn.Linear(hidden_size, out_size))
        self.layers = nn.ModuleList(stack)

    def forward(self, g, features):
        """Return the output-layer scores for the ``'target'`` nodes of ``g``.

        ``features`` are the input features of the ``'target'`` nodes; all other
        node types start from their embedding table.
        """
        h_dict = dict(self.embed.items())
        h_dict['target'] = features

        *graph_layers, output_layer = self.layers
        for depth, layer in enumerate(graph_layers):
            # Leaky-ReLU sits between consecutive graph layers, not before the first one.
            if depth > 0:
                h_dict = {ntype: F.leaky_relu(h) for ntype, h in h_dict.items()}
            h_dict = layer(g, h_dict)

        # get final target predictions
        return output_layer(h_dict['target'])

In [37]:
def trainer(model, optimizer, loss, features, labels, train_g, device, n_epochs, test_mask):
    """Train ``model`` full-batch for ``n_epochs`` epochs and return it.

    Each epoch runs one forward/backward/optimizer step on the rows that are
    NOT flagged in ``test_mask``, then re-evaluates the model on the same rows
    and prints the mean step time so far, the loss and the F1 score of class 1.

    Args:
        model: callable as ``model(train_g, features)`` returning per-row class scores.
        optimizer: optimizer over the model parameters.
        loss: callable ``loss(scores, labels)`` returning a scalar tensor.
        features: input feature tensor (moved to ``device`` once).
        labels: integer class labels, one per row; metrics assume classes 0 and 1.
        train_g: graph object forwarded to the model unchanged.
        device: device the features are moved to.
        n_epochs: number of epochs to run.
        test_mask: boolean tensor; True marks rows excluded from training.

    Returns:
        The same ``model`` object after training.
    """
    train_idx = th.nonzero(th.logical_not(test_mask), as_tuple=True)[0]

    # Loop-invariant work is done once instead of every epoch.
    features = features.to(device)
    train_labels = th.index_select(labels, 0, train_idx)
    positive_true = train_labels == 1
    eps = 10e-5

    duration = []
    for epoch in range(n_epochs):
        tic = time.time()

        # forward pass and loss on the training rows
        pred = model(train_g, features)
        l = loss(th.index_select(pred, 0, train_idx), train_labels)

        # backward propagation and parameter update
        optimizer.zero_grad()
        l.backward()
        optimizer.step()

        # Keep a plain float so no autograd graph is retained by the running value.
        loss_val = l.item()

        duration.append(time.time() - tic)

        # Re-evaluate after the update; no gradients are needed for metrics.
        with th.no_grad():
            preds = th.argmax(model(train_g, features), dim=1)
        positive_pred = th.index_select(preds, 0, train_idx) == 1

        # Count the confusion-matrix cells directly so the metric also works when
        # only one class shows up among the labels or the predictions.
        true_pos = int((positive_pred & positive_true).sum())
        false_pos = int((positive_pred & ~positive_true).sum())
        false_neg = int((~positive_pred & positive_true).sum())

        #get the model metrics. Only the f1 score is displayed (could be changed as desired)
        precision = true_pos / (true_pos + false_pos + eps)
        n_positive = true_pos + false_neg
        recall = true_pos / n_positive if n_positive else 0.0
        f1 = 2 * (precision * recall) / (precision + recall + eps)

        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | f1 {:.4f} ".format(epoch, np.mean(duration), loss_val, f1))

    return model

In [38]:
def train_graph(train_df: pd.DataFrame, params: Dict[str, Any] = None, test_mask: List[bool] = None):
    """Configure a ``HeteroRGCN`` for the notebook-level graph ``train_g`` and train it.

    Args:
        train_df: DataFrame whose ``params['label_col']`` column holds the labels.
        params: configuration dict; the keys read here are ``n_hidden``,
            ``n_layers``, ``class_weight``, ``label_col``, ``lr``,
            ``weight_decay`` and ``n_epochs``.
        test_mask: optional boolean sequence marking rows to exclude from
            training; when omitted no row is excluded.

    Returns:
        The trained model.

    Note:
        The graph is not a parameter: the function reads the global ``train_g``.
    """
    device = th.device('cpu')

    target_features = train_g.nodes['target'].data['features']
    in_feats = target_features.shape[1]
    n_classes = 2

    ntype_dict = {}
    for n_type in train_g.ntypes:
        ntype_dict[n_type] = train_g.number_of_nodes(n_type)

    # in_feats is passed twice: as the embedding width and as the first layer's input width.
    model = HeteroRGCN(ntype_dict, train_g.etypes, in_feats, params['n_hidden'], n_classes,
                       params['n_layers'], in_feats).to(device)

    print("Configured Model")

    # Weight for class 0 is the reciprocal of the weight for class 1.
    weight_for_positive = params['class_weight']
    class_weights = [1. / weight_for_positive, weight_for_positive]

    label_array = train_df[params['label_col']].values
    mask_array = np.zeros_like(label_array, dtype='bool') if test_mask is None else np.asarray(test_mask)

    train_features = train_g.nodes['target'].data['features'].to(device)
    train_labels = th.from_numpy(label_array).long().to(device)
    test_mask = th.from_numpy(mask_array).to(device)

    loss = th.nn.CrossEntropyLoss(weight=th.tensor(class_weights).float())
    optimizer = th.optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])

    print("Training Started")
    model = trainer(model, optimizer, loss, train_features, train_labels, train_g, device,
                    params['n_epochs'], test_mask)
    print("Training Finished")

    return model

In [39]:
# Train on every row of the train split (no test mask is supplied).
trained_model = train_graph(train_df=train_df, params=parameters)

Configured Model
Training Started
Epoch 00000 | Time(s) 6.5014 | Loss 0.4219 | f1 0.2273 
Epoch 00001 | Time(s) 6.2087 | Loss 0.3224 | f1 0.2223 
Epoch 00002 | Time(s) 6.0875 | Loss 1.2985 | f1 0.2677 
Epoch 00003 | Time(s) 5.9608 | Loss 0.2699 | f1 0.0467 
Epoch 00004 | Time(s) 5.9134 | Loss 0.5356 | f1 0.0982 
Epoch 00005 | Time(s) 5.8552 | Loss 0.5974 | f1 0.2690 
Epoch 00006 | Time(s) 5.8356 | Loss 0.6131 | f1 0.1830 
Epoch 00007 | Time(s) 5.9256 | Loss 0.5301 | f1 0.2736 
Epoch 00008 | Time(s) 5.9842 | Loss 0.4593 | f1 0.4053 
Epoch 00009 | Time(s) 5.9646 | Loss 0.3579 | f1 0.5025 
Epoch 00010 | Time(s) 5.9556 | Loss 0.2724 | f1 0.5691 
Epoch 00011 | Time(s) 5.9455 | Loss 0.2641 | f1 0.5656 
Epoch 00012 | Time(s) 5.9385 | Loss 0.2664 | f1 0.5407 
Epoch 00013 | Time(s) 5.9519 | Loss 0.2546 | f1 0.5360 
Epoch 00014 | Time(s) 5.9813 | Loss 0.2495 | f1 0.5479 
Epoch 00015 | Time(s) 5.9687 | Loss 0.2239 | f1 0.5731 
Epoch 00016 | Time(s) 5.9620 | Loss 0.1867 | f1 0.6094 
Epoch 00017 | 

Epoch 00146 | Time(s) 6.2716 | Loss 0.0123 | f1 0.9818 
Epoch 00147 | Time(s) 6.2722 | Loss 0.0121 | f1 0.9832 
Epoch 00148 | Time(s) 6.2715 | Loss 0.0119 | f1 0.9831 
Epoch 00149 | Time(s) 6.2705 | Loss 0.0117 | f1 0.9829 
Training Finished


In [40]:
def extend_graph(test_df, train_g, target_col, node_cols, nodes_lookup, feature_processor, train_mean, train_stdev):
    """Add the rows of ``test_df`` to the graph as new nodes and edges.

    Target values not yet in ``nodes_lookup['target']`` become new ``'target'``
    nodes carrying features normalized with the training statistics, each
    with a self-relation edge. For every column in ``node_cols`` unseen values
    become new nodes of that type, and bidirectional edges are added between
    the new target nodes and their column-value nodes.

    Args:
        test_df: DataFrame with the rows to add.
        train_g: graph to extend; the extended graph is returned.
        target_col: column holding the target-node identifier.
        node_cols: comma-separated names of the node-type columns.
        nodes_lookup: per-node-type value -> id dicts; they are updated in
            place by ``nodes_to_ids`` with every newly seen value.
        feature_processor: fitted transformer producing the feature matrix.
        train_mean, train_stdev: statistics used to normalize the new features.

    Returns:
        ``(graph, target_nodes, added_nodes)``: the extended graph, the target
        node id of every row of ``test_df``, and a dict mapping node type ->
        ``(new node ids, new raw values)`` for the types that gained nodes.
    """
    #Pre-process the features of test nodes
    features = np.nan_to_num(feature_processor.transform(test_df), nan=0.)

    added_nodes = {}

    target_nodes, target_lookup, target_new_nodes, target_new_vals = nodes_to_ids(
        test_df[target_col], nodes_lookup['target'], train_g.number_of_nodes('target'))
    target_new_nodes = set(target_new_nodes)

    # Row-wise flag: does this row correspond to a target node created just now?
    feature_sel = [t in target_new_nodes for t in target_nodes]
    target_nodes_to_add = [t for t, is_new in zip(target_nodes, feature_sel) if is_new]

    new_features = normalize_test_data(
        th.from_numpy(np.compress(feature_sel, features, axis=0)), train_mean, train_stdev)

    if target_new_nodes:
        train_g = dgl.add_nodes(train_g, len(target_new_nodes), ntype='target')
        # The feature rows of the freshly added nodes sit at the end of the tensor.
        train_g.nodes['target'].data['features'][-len(new_features):, :] = new_features
        added_nodes['target'] = (list(target_new_nodes), target_new_vals)

    if target_nodes_to_add:
        train_g = dgl.add_edges(train_g, target_nodes_to_add, target_nodes_to_add,
                                etype=('target', 'self_relation', 'target'))

    for nc in node_cols.split(','):
        nodes, lookup, new_nodes, new_vals = nodes_to_ids(
            test_df[nc], nodes_lookup[nc], train_g.number_of_nodes(nc))

        if new_nodes:
            train_g = dgl.add_nodes(train_g, len(new_nodes), ntype=nc)
            added_nodes[nc] = (new_nodes, new_vals)

        # Only rows belonging to newly added target nodes contribute edges.
        pairs = [(t, s) for s, t in zip(nodes, target_nodes) if t in target_new_nodes]
        if pairs:
            target_side = [t for t, _ in pairs]
            column_side = [s for _, s in pairs]
            train_g = dgl.add_edges(train_g, target_side, column_side, etype=('target', f'target<>{nc}', nc))
            train_g = dgl.add_edges(train_g, column_side, target_side, etype=(nc, f'{nc}<>target', 'target'))

    return train_g, target_nodes, added_nodes

In [73]:
def predict(train_g, trained_model, test_df: pd.DataFrame, nodes_lookup, feature_processor, train_mean, train_stdev, k: int = 2):
    """Score the rows of ``test_df`` with the trained model.

    The graph is extended with the test rows, the ``k``-hop out-subgraph
    around the test target nodes is extracted, and the model is run on that
    subgraph. Subgraph nodes that existed during training reuse their learned
    embedding; nodes added afterwards get the mean embedding of their type.

    Args:
        train_g: training graph; must not be None.
        trained_model: trained model exposing an ``embed`` ParameterDict.
        test_df: rows to score.
        nodes_lookup: per-node-type value -> id dicts. Entries added for the
            test rows are removed again before returning.
        feature_processor: fitted feature transformer.
        train_mean, train_stdev: training normalization statistics.
        k: number of hops of the extracted subgraph.

    Returns:
        NumPy array with the softmax probability of class 1, selected with the
        inverse indices that ``dgl.khop_out_subgraph`` reports for the
        ``'target'`` seed nodes (presumably one value per row of ``test_df``,
        in row order - confirm against the DGL documentation).

    Raises:
        RuntimeError: if ``train_g`` is None.

    Note:
        The target and node column names are read from the notebook-level
        ``parameters`` dict, not from an argument.
    """
    if train_g is None:
        raise RuntimeError("Model must be trained first!")

    device = th.device('cpu')

    #Extend the training graph using the new nodes and get the test subgraph for inference
    # Fix: forward the `feature_processor` argument; the previous code referenced the
    # undefined name `cat_transformer`, which raises NameError on a fresh kernel.
    train_g, target_nodes, added_nodes = extend_graph(test_df, train_g, parameters['target_col'],
                                                      parameters['node_cols'], nodes_lookup,
                                                      feature_processor, train_mean, train_stdev)

    try:
        test_g, inverse_target_nodes = dgl.khop_out_subgraph(train_g, {'target': target_nodes}, k=k)

        test_features = test_g.nodes['target'].data['features'].to(device)

        test_n_nodes = th.sum(th.tensor([test_g.number_of_nodes(n_type) for n_type in test_g.ntypes]))
        test_n_edges = th.sum(th.tensor([test_g.number_of_edges(e_type) for e_type in test_g.etypes]))

        print("""----Test subgraph extracted------'
                    No. of Nodes: {}
                    No. of Edges: {}""".format(test_n_nodes,test_n_edges,))

        model = trained_model
        # Snapshot of the learned embeddings; they are swapped for subgraph-sized
        # tables during inference and put back afterwards.
        embed_copy = dict(model.embed)

        print("Started model inference")
        try:
            with th.no_grad():
                for ntype, emb_ in embed_copy.items():
                    test_num = test_g.number_of_nodes(ntype)

                    # start with the mean embedding for every subgraph node of this type
                    new_emb = th.mean(emb_, dim=0).repeat(test_num, 1).detach().numpy()

                    ### for nodes in subgraph, get their node-ids in train_g (full graph)
                    train_g_ids = test_g.ndata[dgl.NID][ntype].numpy()

                    ### filter out subgraph nodes that were added to train_g after training,
                    ### Remaining nodes are the ones that will have learned embeddings
                    emb_train_g_ids = np.where(train_g_ids < emb_.shape[0])[0]

                    print(f"Number of nodes of type {ntype} in subgraph={test_num}")

                    ### copy embeddings of the subgraph nodes that were "seen" during training
                    ### and leave the mean embedding for the other nodes in the subgraph
                    seen_ids = th.from_numpy(train_g_ids[emb_train_g_ids])
                    new_emb[emb_train_g_ids, :] = th.index_select(emb_, 0, seen_ids).detach().numpy()

                    model.embed[ntype] = th.nn.Parameter(th.from_numpy(new_emb))

                raw_preds = model(test_g, test_features)
                pred_prob = th.softmax(raw_preds, dim=-1)
                fraud_prob = pred_prob[:, 1].detach().numpy()
        finally:
            # Always restore the learned embeddings, even when inference fails.
            model.embed = th.nn.ParameterDict(embed_copy)
    finally:
        ### clean-up: remove newly added nodes from the node-id lookups and from the graph
        for ntype, (new_node_ids, new_node_vals) in added_nodes.items():
            for new_val in new_node_vals:
                del nodes_lookup[ntype][new_val]
            # NOTE(review): `train_g` here is the graph returned by extend_graph; whether
            # the caller's graph object is affected depends on dgl.add_nodes returning a
            # new graph - confirm against the DGL documentation.
            train_g.remove_nodes(new_node_ids, ntype=ntype)

    return fraud_prob[inverse_target_nodes['target'].numpy()]

In [74]:
#Model evaluation on test set
n_hops = 3
fraud_prob = predict(
    train_g,
    trained_model,
    test_df,
    nodes_lookup,
    feature_processor,
    train_mean,
    train_stdev,
    k=n_hops,
)
# Area under the ROC curve of the predicted fraud probabilities against the true labels.
auc = roc_auc_score(test_df['isFraud'], fraud_prob)

  result = func(self.values, **kwargs)


----Test subgraph extracted------'
                    No. of Nodes: 276808
                    No. of Edges: 3317359
Started model inference
Number of nodes of type P_emaildomain in subgraph=60
Number of nodes of type ProductCD in subgraph=4
Number of nodes of type R_emaildomain in subgraph=61
Number of nodes of type addr1 in subgraph=60702
Number of nodes of type addr2 in subgraph=60519
Number of nodes of type card1 in subgraph=8499
Number of nodes of type card2 in subgraph=1384
Number of nodes of type card3 in subgraph=275
Number of nodes of type card4 in subgraph=5
Number of nodes of type card5 in subgraph=1062
Number of nodes of type card6 in subgraph=4


In [76]:
# Report the AUC already computed in the evaluation cell (`auc`) and the accuracy obtained
# by rounding the predicted fraud probabilities to 0/1. The metric functions are imported
# in the notebook's import cell.
print(f"AUC Score = {auc}")
print(f"Accuracy Score = {accuracy_score(test_df.isFraud, np.round(fraud_prob))}")

AUC Score = 0.8696365077867189
Accuracy Score = 0.931604672929594
