In [69]:
import dgl
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.dataloading import GraphDataLoader
from torch.utils.data.sampler import SubsetRandomSampler, WeightedRandomSampler, RandomSampler
from torch.utils.data import random_split
import dgl.data
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import numpy as np
import dgl.data
import pickle
import glob
import tqdm
import networkx as nx
import matplotlib
import numpy

In [70]:

# Read the serialized graphs; index 0 is used as the list of graphs and
# index 1 as the dict holding the 'glabels' tensor.
graphs = dgl.load_graphs('./fraud_homographs.dgl')
graph_list = graphs[0]
label_dict = graphs[1]

# Swap every graph for its copy on the first GPU, keeping the same list object.
graph_list[:] = [graph.to('cuda:0') for graph in graph_list]

# Pair each graph with the argmax of its 'glabels' row (the class index).
graph_labels = label_dict['glabels'].argmax(1)
dataset = tuple(zip(graph_list, graph_labels))


In [71]:
from sklearn.model_selection import train_test_split

device = torch.device("cuda:0")

# Split the dataset into training and validation subsets.
# Sizes are derived from the dataset length instead of being hard-coded, so the
# cell keeps working if the dataset changes; for 4459 + 1115 = 5574 graphs this
# reproduces the previous sizes exactly. The split uses its own seeded
# generator so train/validation membership is identical on every run.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 0
n_train = int(len(dataset) * TRAIN_FRACTION)
n_val = len(dataset) - n_train
train_dataset, val_dataset = random_split(
    dataset,
    (n_train, n_val),
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


# Obtain per-class weights of the training dataset and apply WeightedRandomSampler.
# One integer label per training sample, gathered into a single tensor.
labels = torch.as_tensor([int(label) for _, label in train_dataset], dtype=torch.long)

target_list = torch.tensor([0, 1, 2, 3, 4])

# Number of training samples per class; minlength keeps one slot per class
# listed in target_list even when a class is absent from the training split.
class_count = torch.bincount(labels, minlength=len(target_list)).tolist()

# Inverse-frequency weight per class. A class with zero training samples gets
# an infinite weight here, but no training sample ever selects that entry.
class_weights = 1./torch.tensor(class_count, dtype=torch.float).to(device)

print(class_weights)

# Weight of every individual training sample = weight of its class.
class_weights_all = class_weights[labels.to(device)]

weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)

print(class_weights_all)

tensor([0.0007, 0.0007, 0.0063, 0.0011, 0.0015], device='cuda:0')
tensor([0.0007, 0.0007, 0.0011,  ..., 0.0007, 0.0007, 0.0007], device='cuda:0')


In [79]:
from dgl.nn import GraphConv
import dgl.nn.pytorch as dglnn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import sys

class GCN(nn.Module):
    """Stack of ``GraphConv`` layers followed by a graph-level readout.

    The stack holds ``n_layer + 1`` GraphConv layers in total: one input layer
    (``in_feats -> h_feats``), ``n_layer - 1`` hidden layers
    (``h_feats -> h_feats``) and one output layer (``h_feats -> n_classes``).
    ``activation`` is applied by the input and hidden layers only.

    Args:
        in_feats: Size of the input node features.
        h_feats: Size of the hidden node features.
        n_classes: Size of the output produced per graph.
        n_layer: Controls the depth of the stack (see above).
        pooling: Readout operator handed to ``dgl.readout_nodes`` as ``op``.
        dropout: Accepted for interface compatibility; this implementation
            does not apply dropout anywhere.
        activation: Callable passed to the input and hidden GraphConv layers.
    """

    def __init__(self, in_feats, h_feats, n_classes, n_layer, pooling='mean', dropout=0.5, activation=F.relu):
        super(GCN, self).__init__()
        self.gcnlayer = torch.nn.ModuleList()

        self.n_layer = n_layer
        self.pooling = pooling

        # Input Layer
        self.gcnlayer.append(GraphConv(in_feats, h_feats, activation=activation))
        # Hidden Layer
        for _ in range(n_layer - 1):
            self.gcnlayer.append(GraphConv(h_feats, h_feats, activation=activation))
        # Output Layer (no activation: its output is pooled and returned as-is)
        self.gcnlayer.append(GraphConv(h_feats, n_classes))

    def forward(self, g, in_feat):
        """Run every GraphConv layer on ``g`` and pool node outputs per graph.

        Args:
            g: Graph (or batched graph) the layers operate on.
            in_feat: Node features fed to the first layer.

        Returns:
            The result of ``dgl.readout_nodes`` over the final node outputs,
            using the pooling operator chosen at construction time.
        """
        h = in_feat
        for layer in self.gcnlayer:
            h = layer(g, h)

        # Readout inside a local scope so the temporary 'h' node field does not
        # stay attached to the caller's graph after the forward pass.
        with g.local_scope():
            g.ndata['h'] = h
            return dgl.readout_nodes(g, 'h', op=self.pooling)

def train(model, batch_size, n_epoch, optimizer):
    """Train ``model`` on the notebook-level ``train_dataset``.

    Batches are drawn through the notebook-level ``weighted_sampler`` and the
    loss is cross-entropy weighted by the notebook-level ``class_weights``.

    Args:
        model: Module called as ``model(batched_graph, node_features)``.
        batch_size: Number of graphs per batch.
        n_epoch: Number of passes over the sampler.
        optimizer: Optimizer already bound to ``model``'s parameters.

    Returns:
        A list with one ``(mean_batch_loss, train_accuracy)`` tuple per epoch.
        Both values are ``nan`` for an epoch in which the loader yields nothing.
    """
    train_dataloader = GraphDataLoader(
        train_dataset, sampler=weighted_sampler, batch_size=batch_size, drop_last=False)

    history = []
    model.train()
    progress = tqdm.tqdm(range(n_epoch))
    for epoch in progress:
        loss_sum = 0.0
        n_correct = 0
        nb_data = 0
        n_batches = 0
        for batched_graph, labels in train_dataloader:
            labels = labels.to(device)
            logits = model(batched_graph, batched_graph.ndata['feature'].float())
            loss = F.cross_entropy(logits, labels, weight=class_weights)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # .item() stores a plain float, so the running total does not keep
            # every batch's autograd graph alive.
            loss_sum += loss.item()
            # Count correct predictions (not per-batch accuracy fractions) so
            # that dividing by the number of samples gives a real accuracy.
            n_correct += (logits.detach().argmax(1) == labels).sum().item()
            nb_data += len(labels)
            n_batches += 1

        epoch_loss = loss_sum / n_batches if n_batches else float('nan')
        epoch_train_acc = n_correct / nb_data if nb_data else float('nan')
        history.append((epoch_loss, epoch_train_acc))
        progress.set_postfix(loss=epoch_loss, acc=epoch_train_acc)

    return history



In [74]:
def pred(model, average):
    """Evaluate ``model`` on the notebook-level ``val_dataset`` and print metrics.

    Runs in evaluation mode with gradient tracking disabled, then restores the
    training/eval mode the model had on entry.

    Args:
        model: Module called as ``model(batched_graph, node_features)``.
        average: Averaging strategy forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``.

    Returns:
        ``(y_label, y_pred)``: the concatenated labels and the concatenated raw
        model outputs (one row per validation graph).

    Raises:
        ValueError: If the validation loader yields no batches.
    """
    test_dataloader = GraphDataLoader(
        val_dataset, drop_last=False)

    logits_per_batch = []
    labels_per_batch = []

    was_training = model.training
    model.eval()
    try:
        # no_grad: the stored outputs must not keep autograd graphs alive.
        with torch.no_grad():
            for batched_graph, labels in test_dataloader:
                logits_per_batch.append(
                    model(batched_graph, batched_graph.ndata['feature'].float()))
                labels_per_batch.append(labels.to(device))
    finally:
        model.train(was_training)

    if not logits_per_batch:
        raise ValueError('validation dataset produced no batches')

    # Concatenate once instead of growing a tensor on every iteration.
    y_pred = torch.cat(logits_per_batch, 0)
    y_label = torch.cat(labels_per_batch, 0)

    # Move to CPU and take the predicted class once; reuse for every metric.
    y_true_cpu = y_label.cpu()
    y_hat_cpu = y_pred.cpu().argmax(1)

    print('Test accuracy:', accuracy_score(y_true_cpu, y_hat_cpu))

    print('Precision: {}, Recall: {}, F1-score: {}'.format(
        precision_score(y_true_cpu, y_hat_cpu, average=average),
        recall_score(y_true_cpu, y_hat_cpu, average=average),
        f1_score(y_true_cpu, y_hat_cpu, average=average)))
    return y_label, y_pred

In [80]:
import warnings
from functools import partial

# Silences every UserWarning for the rest of the session.
warnings.simplefilter("ignore", UserWarning)

# Hyperparameters of this run, in the order GCN and train() take them.
IN_FEATS = 15
HIDDEN_FEATS = 32
N_CLASSES = 5
N_LAYER = 4
LEARNING_RATE = 0.05
BATCH_SIZE = 2
N_EPOCH = 20

# Softmax needs an explicit dimension; calling it without one relies on a
# deprecated implicit choice. dim=1 is what that implicit choice resolves to
# for 2-D (node, feature) activations -- assumes node features are 2-D.
hidden_activation = partial(F.softmax, dim=1)

model = GCN(IN_FEATS, HIDDEN_FEATS, N_CLASSES, N_LAYER, activation=hidden_activation).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

train(model, batch_size=BATCH_SIZE, n_epoch=N_EPOCH, optimizer=optimizer)

y_label, y_pred = pred(model, 'weighted')

 25%|██▌       | 5/20 [02:14<06:42, 26.83s/it]

In [None]:
#### Applying t-SNE to the graph representations extracted by FDGNN

from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Per-graph model outputs on the CPU. detach() makes the NumPy conversion below
# valid whether or not the outputs still carry autograd history.
data = y_pred.detach().cpu()

n_components = 2

# Fixed random_state so the 2-D embedding is the same on every run.
model1 = TSNE(n_components=n_components, random_state=0)

# One row of 2-D coordinates per row of `data`.
dot = model1.fit_transform(data.numpy())

In [None]:
# 2-D embedding coordinates plus the label of every point. The column keeps its
# original name 'y_hat' although it is filled from y_label.
df = pd.DataFrame(dict(X=dot[:,0], Y=dot[:,1], y_hat=y_label.cpu().numpy()))

fig, ax = plt.subplots()

colors = {0:'red', 1:'green', 2:'blue', 3:'skyblue', 4:'gray'}

# Draw one labelled scatter per class so the legend has entries to show;
# a single unlabelled scatter leaves the legend empty.
for class_id, color in colors.items():
    members = df[df['y_hat'] == class_id]
    if members.empty:
        continue
    ax.scatter(members['X'], members['Y'], c=color, label='class {}'.format(class_id))

ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
ax.set_title('t-SNE of model outputs, coloured by label')
ax.legend()

plt.show()