In [None]:
import kagglehub

# Fetch the most recent published version of the dataset and report where
# kagglehub placed the files.
DATASET_HANDLE = "ellipticco/elliptic-data-set"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
import pandas as pd

import os

# Resolve the file relative to the directory returned by
# kagglehub.dataset_download (`path`, set in the download cell) instead of a
# hard-coded cache location and version number, so the read keeps working when
# the cache directory differs or a newer dataset version is downloaded.
# One row per edge, with columns 'txId1' and 'txId2' (used by the cells below).
df_edges = pd.read_csv(os.path.join(path, 'elliptic_bitcoin_dataset', 'elliptic_txs_edgelist.csv'))

In [None]:
# Node indices need to be integers i with 0 <= i < num_nodes, so every
# transaction id is given the position at which it is first encountered while
# scanning the edge list row by row.

node_dic = {}

for edge in df_edges.values:
    for tx_id in edge:
        if tx_id not in node_dic:
            # The next free index is simply the number of ids seen so far.
            node_dic[tx_id] = len(node_dic)

num_seen = len(node_dic)

# Reverse lookup: contiguous node index -> original transaction id.
node_dic_inv = {idx: tx_id for tx_id, idx in node_dic.items()}

sorted_keys = sorted(node_dic_inv)

In [None]:
import torch
import numpy as np

# Translate both endpoint columns of the edge list into contiguous node
# indices, one tensor per column.
sources, targets = (
    torch.from_numpy(np.array([node_dic[tx_id] for tx_id in df_edges[column].values]))
    for column in ('txId1', 'txId2')
)

# Row 0 holds the source index of every edge, row 1 the target index.
edge_index = torch.stack((sources, targets))

In [None]:
import os

# Resolve the file relative to the directory returned by
# kagglehub.dataset_download (`path`, set in the download cell) instead of a
# hard-coded cache location and version number.
# NOTE: the default header handling is kept on purpose. The following cells
# recover the first node's id and features from the column names.
df_features = pd.read_csv(os.path.join(path, 'elliptic_bitcoin_dataset', 'elliptic_txs_features.csv'))

In [None]:
# The first node has no data row of its own (its values were consumed as the
# column names), hence the extra 1 on top of the row count.
num_nodes = 1 + len(df_features.index)

In [None]:
# The column names hold the first node's record: the first label is its id,
# the remaining labels are its feature values (parsed from text to float64).

header = df_features.columns

first_node = np.float64(header[0])

first_node_features = np.array(list(map(np.float64, header[1:])))

In [None]:
# Node features need to be ordered by node index.

# Transaction id -> feature vector, seeded with the node recovered from the
# column names.
features_dic = {first_node: first_node_features}

for record in df_features.values:
    # setdefault keeps the first feature vector seen for a transaction id.
    features_dic.setdefault(record[0], record[1:])

# Row k of `features` belongs to the node whose contiguous index is k.
features = np.array([features_dic[node_dic_inv[idx]] for idx in sorted_keys])

x = torch.tensor(features, dtype = torch.float32)

In [None]:
import os

# Resolve the file relative to the directory returned by
# kagglehub.dataset_download (`path`, set in the download cell) instead of a
# hard-coded cache location and version number.
df_classes = pd.read_csv(os.path.join(path, 'elliptic_bitcoin_dataset', 'elliptic_txs_classes.csv'))


In [None]:
# Class labels need to be in range(num_classes); the model will not be trained
# to predict the 'unknown' label, which gets its own sentinel value.

class_map = dict([('1', 1), ('2', 0), ('unknown', 3)])

In [None]:
# Transaction id (first column) -> numeric label, mapped from the raw class
# string in the second column.
classes_dic = {record[0]: class_map[record[1]] for record in df_classes.values}

# Labels arranged so that position k belongs to the node with index k.
labels_in_node_order = [classes_dic[node_dic_inv[idx]] for idx in sorted_keys]

y = torch.tensor(np.array(labels_in_node_order), dtype = torch.long)

In [None]:
import math

# Start from the 75% mark and move the split to the nearest edge of the time
# block (column 0 of `features`) it falls in, so the block is not cut in two.
val_split = math.floor(num_nodes*.75)

time_block = features[val_split][0]

lo = hi = val_split

# Walk outwards in both directions until one side leaves the time block.
while True:
    lo_in_block = features[lo][0] == time_block
    hi_in_block = features[hi][0] == time_block
    if not (lo_in_block and hi_in_block):
        break
    lo -= 1
    hi += 1

# If the lower cursor left the block, the block starts at lo + 1; otherwise the
# upper cursor sits on the first row after the block.
val_split = lo + 1 if features[lo][0] != time_block else hi

val_time_block = features[val_split][0]

In [None]:
# Start from the 85% mark and move the split to the nearest edge of the time
# block (column 0 of `features`) it falls in, so the block is not cut in two.
test_split = math.floor(num_nodes*.85)

time_block = features[test_split][0]

lo = hi = test_split

# Walk outwards in both directions until one side leaves the time block.
while True:
    lo_in_block = features[lo][0] == time_block
    hi_in_block = features[hi][0] == time_block
    if not (lo_in_block and hi_in_block):
        break
    lo -= 1
    hi += 1

# If the lower cursor left the block, the block starts at lo + 1; otherwise the
# upper cursor sits on the first row after the block.
test_split = lo + 1 if features[lo][0] != time_block else hi

test_time_block = features[test_split][0]

In [None]:
# Training nodes: every node index strictly below the validation split.
train_mask = torch.arange(num_nodes) < val_split

In [None]:
# Validation nodes: indices in the half-open range [val_split, test_split).
val_mask = (torch.arange(num_nodes) >= val_split) & (torch.arange(num_nodes) < test_split)

In [None]:
# Test nodes: every node index at or above the test split.
test_mask = torch.arange(num_nodes) >= test_split

In [None]:
from torch_geometric.data import Data

# Bundle features, connectivity, labels and the three split masks into one
# graph object.
data = Data(
    x = x,
    edge_index = edge_index,
    y = y,
    train_mask = train_mask,
    val_mask = val_mask,
    test_mask = test_mask,
)

In [None]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network.

    The forward pass applies GCNConv -> ReLU -> dropout -> GCNConv and returns
    the raw output of the second convolution (no final activation).

    Args:
        in_channels: Number of input features per node. When left as ``None``
            it is read from the notebook-level ``data.num_node_features``,
            which is what the original zero-argument constructor did.
        hidden_channels: Width of the hidden layer (default 16).
        out_channels: Number of outputs per node (default 1).
        dropout: Dropout probability applied after the hidden layer while the
            module is in training mode (default 0.5, the ``F.dropout`` default).
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=1, dropout=0.5):
        super().__init__()
        if in_channels is None:
            # Backward-compatible default: `GCN()` keeps sizing the first layer
            # from the graph object defined earlier in the notebook.
            in_channels = data.num_node_features
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return one row of outputs per node for a graph exposing ``x`` and ``edge_index``."""
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return x

In [None]:
# True for nodes whose label is not the 'unknown' sentinel (3), so nodes of
# uncertain licitness can be filtered out before the loss is calculated.
known_mask = torch.ne(y, 3)

In [None]:
# Training nodes that also carry a known label.
train_known_mask = train_mask & known_mask

In [None]:
# Test nodes that also carry a known label. This must be built from
# `test_mask`: using `train_mask` here would make every "test" metric below a
# measurement on the training nodes.
test_known_mask = torch.logical_and(test_mask, known_mask)

In [None]:
# Number of nodes selected by test_known_mask (kept as a 0-d tensor).
num_test_nodes = torch.sum(test_known_mask)

In [None]:
class WeightedFocalLoss(torch.nn.Module):
    """Class-weighted binary focal loss computed from raw logits.

    Each element contributes ``w * (1 - p_t) ** gamma * BCE`` where ``BCE`` is
    the element-wise binary cross-entropy with logits, ``p_t = exp(-BCE)``, and
    ``w`` is ``alpha`` for target 0 and ``1 - alpha`` for target 1. The mean
    over all elements is returned.

    Args:
        alpha: Weight given to elements whose target is 0; elements whose
            target is 1 get ``1 - alpha``.
        gamma: Focusing exponent applied to ``(1 - p_t)``.
    """

    def __init__(self, alpha=.25, gamma=2):
        super().__init__()
        # Registered as a buffer so it follows the module through .to(device)
        # and state_dict(); index 0 -> target 0, index 1 -> target 1.
        self.register_buffer('alpha', torch.tensor([alpha, 1-alpha]))
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss.

        Args:
            inputs: Raw logits.
            targets: Float tensor of 0/1 labels with the same shape as ``inputs``.
        """
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        class_idx = targets.long().reshape(-1)
        # Look the weights up on the logits' device (the module itself may not
        # have been moved) and give them the loss' shape so that e.g. (N, 1)
        # inputs are weighted element-wise instead of broadcasting to (N, N).
        at = self.alpha.to(inputs.device).gather(0, class_idx).reshape(BCE_loss.shape)
        pt = torch.exp(-BCE_loss)
        F_loss = at*(1-pt)**self.gamma * BCE_loss
        return F_loss.mean()

In [None]:
# Loss instance used by the training loop.
focal_loss = WeightedFocalLoss(gamma = 3, alpha = .05)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
# The model and everything it consumes must live on the same device; without
# this the forward pass fails as soon as CUDA is available.
data = data.to(device)
focal_loss.alpha = focal_loss.alpha.to(device)
train_known_mask_dev = train_known_mask.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data).squeeze(1)

    # Only nodes with a known label contribute to the loss.
    loss = focal_loss(out[train_known_mask_dev], data.y[train_known_mask_dev].float())
    loss.backward()
    optimizer.step()
    # Report after the loss of this epoch exists; printing at the top of the
    # loop raised NameError on the first epoch of a fresh kernel.
    print('EPOCH: ', epoch, 'LOSS: ', loss.item())

# The evaluation cells index with CPU masks and convert predictions to NumPy,
# so hand the model and the graph back on the CPU.
model = model.to('cpu')
data = data.to('cpu')

In [None]:
# Switch the model out of training mode (disables dropout in forward).
model.train(False)

In [None]:
# Inference only: skip autograd bookkeeping, and bring the result to the CPU
# so it lines up with the CPU masks and labels used below.
with torch.no_grad():
    pred = torch.sigmoid(model(data).squeeze(1)).cpu()

# `pred` keeps the per-node probabilities, because the threshold sweep at the
# end of the notebook compares `pred` against many thresholds. The hard 0/1
# decisions at the 0.65 cut-off get their own name instead of overwriting it.
pred_labels = (pred >= .65).long()
test_pred = pred_labels[test_known_mask]
# Fraction of evaluated nodes predicted positive.
print(test_pred.sum()/num_test_nodes)
test_y = y[test_known_mask]
print(test_pred[:10])
print(test_y[:10])
correct = (test_pred == test_y).sum()
acc = int(correct) / num_test_nodes
print(f'Accuracy: {acc:.4f}')

In [None]:
# Labels of the evaluated nodes, and how many of them are positive (label 1).
test_y = torch.masked_select(y, test_known_mask)
true_pos = torch.sum(test_y)

In [None]:
# Boolean view of the labels (non-zero -> True) for use with bitwise operators.
test_y_bool = test_y.to(torch.bool)

In [None]:
# Sweep the decision threshold over [0, 1) and record precision, recall and
# accuracy on the evaluated nodes for every value.
num_threshs = 100
threshs = [i*(1/num_threshs) for i in range(num_threshs)]

precisions = {}

recalls = {}

accs = {}

best_acc = 0

best_precision = 0

# Recompute the probabilities here rather than relying on `pred`: the
# evaluation cell may have replaced it with already-thresholded 0/1 values,
# which would make every threshold in (0, 1) give the same result.
model.eval()
with torch.no_grad():
    probs = torch.sigmoid(model(data).squeeze(1)).cpu()

for thresh in threshs:

    # Vectorised comparison instead of a Python loop over every node.
    thresh_pred = probs >= thresh
    test_pred = thresh_pred[test_known_mask]
    pred_pos = test_pred.sum()
    correct_pos = (test_pred & test_y_bool).sum()
    correct = (test_pred == test_y).sum()
    # nan when nothing is predicted positive; the `>` check below then
    # simply does not update best_precision.
    precision = correct_pos/pred_pos
    recall = correct_pos/true_pos
    acc = int(correct)/num_test_nodes

    # Printed after the metrics are computed; printing first raised NameError
    # on the first iteration because precision/recall did not exist yet.
    print('THRESH: ', thresh, 'ACC :', acc, 'PRECISION: ', precision, 'RECALL: ', recall)

    if acc > best_acc:

        best_acc = acc

    if precision > best_precision:

        best_precision = precision

    precisions[thresh] = precision
    recalls[thresh] = recall
    accs[thresh] = acc