<a href="https://colab.research.google.com/github/baicheto/AML_Bitcoin/blob/Kri/AML_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DeepWalk

**The section `I. Centralities` only needs to be run once: each centrality is cached to a `.pkl` file and reloaded on later runs.**

### I. Centralities

In [None]:
# Pickle cache file for each centrality measure computed in this section.
CENT_FILES = dict(
    degree='degree_centrality.pkl',
    betweenness='betweenness_centrality.pkl',
    closeness='closeness_centrality.pkl',
    eigenvector='eigen_centrality.pkl',
)

# Filled cell by cell below: measure name -> {txId: score}.
centralities = dict()

In [None]:
# Degree centrality: compute and cache on the first run, reload afterwards.
fname = CENT_FILES['degree']
if not os.path.exists(fname):
    deg = nx.degree_centrality(G_int_sub)
    with open(fname, 'wb') as f:
        pickle.dump(deg, f)
    print(f"Saved PKL {fname}")
else:
    with open(fname, 'rb') as f:
        deg = pickle.load(f)

# Invert tx_idx_sub so the scores can be re-keyed by its original keys.
inv_map = {node: tx for tx, node in tx_idx_sub.items()}

ser_deg = pd.Series(
    {inv_map[node]: score for node, score in deg.items()},
    name='degree',
)

# Align to all_nodes_sub; entries without a score become 0.
df_deg = ser_deg.reindex(all_nodes_sub).fillna(0).to_frame()
df_deg.index.name = 'txId'

# Export as CSV and hand the file to files.download.
csv_file = 'degree_centrality.csv'
df_deg.to_csv(csv_file)
print(f"Saved CSV {csv_file}")
files.download(csv_file)

centralities['degree'] = df_deg['degree'].to_dict()

In [None]:
# Betweenness centrality: compute and cache on the first run, reload afterwards.
fname = CENT_FILES['betweenness']
if not os.path.exists(fname):
    btw = nx.betweenness_centrality(G_int_sub)
    with open(fname, 'wb') as f:
        pickle.dump(btw, f)
    print(f"Saved PKL {fname}")
else:
    with open(fname, 'rb') as f:
        btw = pickle.load(f)

# Invert tx_idx_sub so the scores can be re-keyed by its original keys.
inv_map = {node: tx for tx, node in tx_idx_sub.items()}

ser_btw = pd.Series(
    {inv_map[node]: score for node, score in btw.items()},
    name='betweenness',
)

# Align to all_nodes_sub; entries without a score become 0.
df_btw = ser_btw.reindex(all_nodes_sub).fillna(0).to_frame()
df_btw.index.name = 'txId'

# Export as CSV and hand the file to files.download.
csv_file = 'betweenness_centrality.csv'
df_btw.to_csv(csv_file)
print(f"Saved CSV {csv_file}")
files.download(csv_file)

centralities['betweenness'] = df_btw['betweenness'].to_dict()

In [None]:
# Closeness centrality: compute and cache on the first run, reload afterwards.
fname = CENT_FILES['closeness']
if not os.path.exists(fname):
    clo = nx.closeness_centrality(G_int_sub)
    with open(fname, 'wb') as f:
        pickle.dump(clo, f)
    print(f"Saved PKL {fname}")
else:
    with open(fname, 'rb') as f:
        clo = pickle.load(f)

# Invert tx_idx_sub so the scores can be re-keyed by its original keys.
inv_map = {node: tx for tx, node in tx_idx_sub.items()}

ser_clo = pd.Series(
    {inv_map[node]: score for node, score in clo.items()},
    name='closeness',
)

# Align to all_nodes_sub; entries without a score become 0.
df_clo = ser_clo.reindex(all_nodes_sub).fillna(0).to_frame()
df_clo.index.name = 'txId'

# Export as CSV and hand the file to files.download.
csv_file = 'closeness_centrality.csv'
df_clo.to_csv(csv_file)
print(f"Saved CSV {csv_file}")
files.download(csv_file)

centralities['closeness'] = df_clo['closeness'].to_dict()

In [None]:
# Eigenvector centrality: reload the cached pickle when present, otherwise
# compute it on G_int_sub and cache it.
fname = CENT_FILES['eigenvector']
if os.path.exists(fname):
    with open(fname, 'rb') as f:
        # Fix: the cached values must land in `eig`. The earlier code assigned
        # them to `deg`, leaving `eig` undefined (or stale) on a cache hit and
        # clobbering the degree scores.
        eig = pickle.load(f)
else:
    eig = nx.eigenvector_centrality(G_int_sub)

    with open(fname, 'wb') as f:
        pickle.dump(eig, f)
    print(f"Saved PKL {fname}")

# Invert tx_idx_sub so the scores can be re-keyed by its original keys.
inv_map = { i: tx for tx, i in tx_idx_sub.items() }

ser_eig = pd.Series(
    { inv_map[i]: eig[i] for i in eig },
    name='eigenvector'
)

# Align to all_nodes_sub; entries without a score become 0.
df_eig = ser_eig.reindex(all_nodes_sub).fillna(0).to_frame()
df_eig.index.name = 'txId'

# Export as CSV and hand the file to files.download.
csv_file = 'eigenvector_centrality.csv'
df_eig.to_csv(csv_file)
print(f"Saved CSV {csv_file}")
files.download(csv_file)

centralities['eigenvector'] = df_eig['eigenvector'].to_dict()

### II.  Manual Features

In [None]:
# Reload the centrality dictionaries cached by section I.
# NOTE: pickle must only be used on files this notebook produced itself.
with open('degree_centrality.pkl', 'rb') as f:
    degree = pickle.load(f)
with open('betweenness_centrality.pkl', 'rb') as f:
    betweenness = pickle.load(f)
with open('closeness_centrality.pkl', 'rb') as f:
    closeness = pickle.load(f)
# Fix: section I caches eigenvector centrality under CENT_FILES['eigenvector'],
# i.e. 'eigen_centrality.pkl'; 'eigenvector_centrality.pkl' is never written.
with open('eigen_centrality.pkl', 'rb') as f:
    eigen = pickle.load(f)

In [None]:
# One column per centrality measure. The pickled dictionaries are keyed by
# graph node index (the keys of the networkx results on G_int_sub).
df_cent = pd.DataFrame({
    'degree'     : pd.Series(degree),
    'betweenness': pd.Series(betweenness),
    'closeness'  : pd.Series(closeness),
    'eigenvector': pd.Series(eigen),
})

# Fix: section I translates node indices through the inverse of tx_idx_sub
# before reindexing by all_nodes_sub. Do the same here; otherwise the labels
# do not match and every score is silently replaced by the fillna(0) default.
# NOTE(review): assumes all_nodes_sub holds the keys of tx_idx_sub (txIds),
# as the section I cells imply; confirm against where it is built.
inv_map = {i: tx for tx, i in tx_idx_sub.items()}
df_cent = df_cent.rename(index=inv_map)

df_cent = df_cent.reindex(all_nodes_sub).fillna(0)
df_cent.index.name = 'txId'

In [None]:
# Centrality columns as a float32 tensor on the working device.
feat_cent = torch.tensor(df_cent.to_numpy(), dtype=torch.float32, device=device)

In [None]:
# Append the centrality columns to the right of feat_intr.
feat_tensor = torch.cat((feat_intr, feat_cent), dim=1)

In [None]:
# Sanity check (displayed as the cell result): one row per node and
# len(trans_features) columns plus the 4 centrality columns.
N_sub = len(all_nodes_sub)
tuple(feat_tensor.shape) == (N_sub, 4 + len(trans_features))

True

In [None]:
# Keep only nodes whose label is non-negative.
# NOTE(review): presumably negative labels mark unlabeled nodes; confirm upstream.
mask_known = labels.ge(0)
idx_known = torch.nonzero(mask_known, as_tuple=False).view(-1)
ts_known = ts_tensor[mask_known]

# Split on the ts_tensor value: <= 30 train, 31..40 validation, >= 41 test.
train_idx = idx_known[ts_known.le(30)]
val_idx = idx_known[ts_known.ge(31) & ts_known.le(40)]
test_idx = idx_known[ts_known.ge(41)]

### DeepWalk Machinery

In [None]:
def random_walk(graph, start, walk_length, rng=None):
    """Return one uniform random walk starting at ``start``.

    ``graph[node]`` must be iterable and yield the neighbours of ``node``.
    The walk holds at most ``walk_length`` nodes and ends early at a node
    with no neighbours. ``rng`` is any object with a ``choice`` method; a
    fresh ``random.Random()`` is created when it is omitted.
    """
    picker = random.Random() if rng is None else rng
    path = [start]
    for _ in range(1, walk_length):
        neighbours = list(graph[path[-1]])
        if len(neighbours) == 0:
            # Dead end: return the walk collected so far.
            return path
        path.append(picker.choice(neighbours))
    return path

In [None]:
def _walk_worker(args):
    """Generate the walks for one batch of start nodes.

    ``args`` is a ``(graph, batch, walk_length, walks_per_node)`` tuple so the
    function can be used with ``Pool.map``. One ``random.Random()`` instance
    is shared by all walks of the batch.
    """
    graph, batch, wl, wpn = args
    rng = random.Random()
    return [
        random_walk(graph, node, wl, rng)
        for node in batch
        for _ in range(wpn)
    ]

In [None]:
def build_corpus(graph, walk_length=3, walks_per_node=2, workers=None):
    """Build the random-walk corpus for ``graph`` using a process pool.

    The nodes are shuffled, split into ``workers`` batches and each batch is
    handed to ``_walk_worker``. ``workers`` defaults to ``mp.cpu_count()``.
    Returns a flat list of walks (each walk is a list of nodes).
    """
    n_proc = mp.cpu_count() if workers is None else workers

    shuffled = list(graph.nodes())
    np.random.shuffle(shuffled)

    jobs = [
        (graph, chunk, walk_length, walks_per_node)
        for chunk in np.array_split(shuffled, n_proc)
    ]
    with mp.Pool(n_proc) as pool:
        per_batch = pool.map(_walk_worker, jobs)

    # Flatten the per-batch lists into one corpus, keeping batch order.
    corpus = []
    for walks in per_batch:
        corpus.extend(walks)
    return corpus

In [None]:
def train_word2vec(sentences, dim=5, window=2, epochs=176, neg=1):
    """Fit a gensim ``Word2Vec`` model on the walk corpus and return it.

    ``dim``, ``window``, ``epochs`` and ``neg`` are forwarded as
    ``vector_size``, ``window``, ``epochs`` and ``negative``. The remaining
    settings are fixed: ``min_count=0``, ``sg=1``, ``hs=0``, ``workers=1``
    and ``seed=42``.
    """
    fixed = dict(min_count=0, sg=1, hs=0, workers=1, seed=42)
    model = Word2Vec(
        sentences=sentences,
        vector_size=dim,
        window=window,
        negative=neg,
        epochs=epochs,
        **fixed,
    )
    return model

In [None]:
def save_embeddings(w2v, path_pt='dwS.pt', path_txt='dwS.txt'):
    """Save the learned vectors as a tensor file and as a text file.

    The tensor written to ``path_pt`` has one row per vocabulary key, ordered
    by sorted key. gensim stores ``wv.vectors`` in its own vocabulary order
    (by default sorted by token frequency), so saving it unchanged would make
    row ``i`` belong to an arbitrary node. With keys ``0..N-1`` the sorted
    order makes row ``i`` the embedding of node ``i``, which the downstream
    ``nn.Embedding`` lookup by node index needs.
    NOTE(review): the alignment holds only if the keys are exactly
    ``0..N-1``; confirm for the graph being embedded.

    ``path_txt`` receives a ``"N D"`` header followed by one
    ``"<key> <v1> ... <vD>"`` line per key, in ``key_to_index`` order.
    """
    wv = w2v.wv
    try:
        ordered_keys = sorted(wv.key_to_index)
    except TypeError:
        # Keys that cannot be compared: keep gensim's own order.
        ordered_keys = list(wv.key_to_index)

    row_order = [wv.key_to_index[key] for key in ordered_keys]
    vecs = torch.tensor(wv.vectors[row_order])
    torch.save(vecs, path_pt)

    with open(path_txt, 'w') as f:
        N, D = vecs.shape
        f.write(f"{N} {D}\n")
        for node in wv.key_to_index:
            coords = " ".join(map(str, wv[node]))
            f.write(f"{node} {coords}\n")

In [None]:
def run_deepwalk(graph, out_pt='dwS.pt', out_txt='dwS.txt'):
    """Run the full pipeline on ``graph``: walks -> Word2Vec -> saved files.

    The hyper-parameters are fixed inside the function. The embeddings are
    written to ``out_pt`` and ``out_txt``; ``out_pt`` is returned.
    """
    # Fixed hyper-parameters for corpus generation and Word2Vec training.
    walk_length, walks_per_node = 3, 2
    dim, window, neg, epochs = 5, 2, 1, 176

    corpus = build_corpus(
        graph,
        walk_length=walk_length,
        walks_per_node=walks_per_node,
    )
    w2v = train_word2vec(corpus, dim=dim, window=window, epochs=epochs, neg=neg)
    save_embeddings(w2v, out_pt, out_txt)
    return out_pt

### Prepare train+validation subgraph, then full‐graph embeddings

In [None]:
# Subgraph induced by the train and validation nodes, copied so it is
# independent of G_int_sub.
tv_nodes = torch.cat((train_idx, val_idx)).cpu().numpy().tolist()
G_tv = G_int_sub.subgraph(tv_nodes).copy()

# Embeddings of the train+val subgraph are written to dwS_tv.pt; the
# returned path is not used.
_ = run_deepwalk(G_tv, 'dwS_tv.pt', 'ignore.txt')

In [None]:
# Embed the whole graph, then load the saved tensor onto the working device.
emb_pt = run_deepwalk(G_int_sub, 'dwS_full.pt', 'ignore.txt')
Z_full = torch.load(emb_pt)
Z_full = Z_full.to(device)

### Downstream “decoder” (2‐hidden‐layer NN)

In [None]:
class DeepWalkDecoder(nn.Module):
    """Two-hidden-layer classifier over node embeddings plus node features.

    Args:
        Z: 2-D tensor of pretrained embeddings, one row per node. It
            initialises a trainable ``nn.Embedding`` (``freeze=False``).
        feat: 2-D tensor of node features, indexed by the same node indices.
        hidden: width of both hidden layers.

    ``forward(idx)`` returns logits of shape ``(len(idx), 2)``.
    """

    def __init__(self, Z, feat, hidden=10):
        super().__init__()
        _, D = Z.shape
        F = feat.size(1)
        self.emb  = nn.Embedding.from_pretrained(Z, freeze=False)
        # Fix: keep the features that were passed in, instead of reading the
        # global `feat_tensor` in forward(). Registered as a non-persistent
        # buffer so it follows .to(device) but stays out of state_dict(),
        # which keeps saved checkpoints unchanged.
        self.register_buffer('feat', feat, persistent=False)
        self.fc1  = nn.Linear(D + F, hidden)
        self.act1 = nn.ReLU()
        self.fc2  = nn.Linear(hidden, hidden)
        self.act2 = nn.ReLU()
        self.drop = nn.Dropout(0.5)
        self.out  = nn.Linear(hidden, 2)

    def forward(self, idx):
        x_emb  = self.emb(idx)
        x_feat = self.feat[idx]
        # Concatenate embedding and feature columns for each requested node.
        x      = torch.cat([x_emb, x_feat], dim=1)
        x      = self.drop(self.act1(self.fc1(x)))
        x      = self.drop(self.act2(self.fc2(x)))
        return self.out(x)

In [None]:
# Build the classifier from the full-graph embeddings and the node features,
# then move it to the working device.
decoder = DeepWalkDecoder(Z_full, feat_tensor)
decoder = decoder.to(device)

In [None]:
# Adam over every decoder parameter (embedding table included, since it is
# created with freeze=False), and cross-entropy over the two output logits.
opt = optim.Adam(params=decoder.parameters(), lr=0.0554)
crit = nn.CrossEntropyLoss()

In [None]:
# Full-batch training with early stopping on validation average precision.
best_val, wait = -1, 0
val_pr_history = []

for epoch in range(1, 81):
    # One gradient step on all training nodes.
    decoder.train()
    opt.zero_grad()
    logits = decoder(train_idx)
    loss   = crit(logits, labels[train_idx])
    loss.backward()
    opt.step()

    # Score the validation nodes with dropout off and no gradient tracking.
    decoder.eval()
    with torch.no_grad():
        val_logits = decoder(val_idx)
        val_probs  = torch.softmax(val_logits, dim=1)[:,1].cpu().numpy()
        y_val      = labels[val_idx].cpu().numpy()
    val_pr = average_precision_score(y_val, val_probs)
    val_pr_history.append(val_pr)
    print(f"Epoch {epoch:03d}  Val PR: {val_pr:.4f}")

    if val_pr > best_val:
        # New best: checkpoint it and reset the patience counter.
        best_val, wait = val_pr, 0
        torch.save(decoder.state_dict(), "dwS_decoder_best.pt")
    else:
        wait += 1
        if wait >= 15:
            break

# Fix: the loop ends on the last-epoch weights, which are not the best ones
# once patience has run out. Restore the checkpoint so later evaluation uses
# the model that achieved `best_val`. Skipped if nothing was ever saved.
if best_val > -1:
    decoder.load_state_dict(
        torch.load("dwS_decoder_best.pt", map_location=device)
    )

Epoch 001  Val PR: 0.1460
Epoch 002  Val PR: 0.1070
Epoch 003  Val PR: 0.1472
Epoch 004  Val PR: 0.2669
Epoch 005  Val PR: 0.4503
Epoch 006  Val PR: 0.5651
Epoch 007  Val PR: 0.6015
Epoch 008  Val PR: 0.4995
Epoch 009  Val PR: 0.4820
Epoch 010  Val PR: 0.4850
Epoch 011  Val PR: 0.4733
Epoch 012  Val PR: 0.4453
Epoch 013  Val PR: 0.4266
Epoch 014  Val PR: 0.4212
Epoch 015  Val PR: 0.4345
Epoch 016  Val PR: 0.4566
Epoch 017  Val PR: 0.4876
Epoch 018  Val PR: 0.5187
Epoch 019  Val PR: 0.5444
Epoch 020  Val PR: 0.5705
Epoch 021  Val PR: 0.5722
Epoch 022  Val PR: 0.5587


### Final test‐set evaluation

In [None]:
# Score the test nodes in eval mode, without gradient tracking.
decoder.eval()
with torch.no_grad():
    test_logits = decoder(test_idx)
    test_probs = test_logits.softmax(dim=1)[:, 1].cpu().numpy()

# Flag the highest-scoring 1% of test nodes (at least one); scores tied with
# the threshold are flagged too.
M = len(test_probs)
k = max(int(0.01 * M), 1)
th = np.sort(test_probs)[-k]
top = np.greater_equal(test_probs, th).astype(int)

In [None]:
# Test-set scores (column 1 of the softmax output) and the matching labels.
decoder.eval()
with torch.no_grad():
    test_logits = decoder(test_idx)
    test_probs = test_logits.softmax(dim=1)[:, 1].cpu().numpy()
    test_true = labels[test_idx].cpu().numpy()

M_test = len(test_probs)

# Mean training label, used to size the "Prevalence" cutoff on the test set.
train_true = labels[train_idx].cpu().numpy()
prevalence = train_true.mean()
k_prev = max(1, int(prevalence * M_test))

# Number of top-scored test nodes flagged at each cutoff (never below 1).
cutoffs = {
    label: max(1, int(frac * M_test))
    for label, frac in (("Top 0.1%", 0.001), ("Top 1%", 0.01), ("Top 10%", 0.10))
}
cutoffs["Prevalence"] = k_prev

In [None]:
# Bootstrap the test-set metrics: 100 resamples drawn with replacement.
n_runs = 100
metrics = {"roc_auc": [], "pr_auc": []}
metrics.update({
    f"{name}_{suffix}": []
    for suffix in ("P", "R", "F1")
    for name in cutoffs
})

rng = np.random.RandomState(42)
for _ in range(n_runs):
    idxs = rng.choice(M_test, size=M_test, replace=True)
    y_true, y_score = test_true[idxs], test_probs[idxs]

    # Threshold-free metrics on the resample.
    metrics["roc_auc"].append(roc_auc_score(y_true, y_score))
    metrics["pr_auc"].append(average_precision_score(y_true, y_score))

    # Ascending order of scores: the last k entries are the top-k predictions.
    sorted_idx = np.argsort(y_score)
    for name, k in cutoffs.items():
        topk = sorted_idx[-k:]
        y_pred = np.zeros_like(y_score, dtype=int)
        y_pred[topk] = 1

        for suffix, scorer in (
            ("P", precision_score),
            ("R", recall_score),
            ("F1", f1_score),
        ):
            metrics[f"{name}_{suffix}"].append(scorer(y_true, y_pred))

In [None]:
def fmt(name):
    """Format the bootstrap values stored under ``metrics[name]`` as 'mean ± std'."""
    samples = metrics[name]
    return f"{np.mean(samples):.3f} ± {np.std(samples):.3f}"


# Report: the two AUC metrics, then precision / recall / F1 for each cutoff.
print("=== Bootstrap Test‐Set Results (n=100) ===")
print(f"ROC AUC : {fmt('roc_auc')}")
print(f"PR  AUC : {fmt('pr_auc')}")
for name in cutoffs:
    for label, suffix in (("Precision", "_P"), ("Recall", "_R"), ("F1-Score", "_F1")):
        print(f"{name:12} {label:<11}: {fmt(name + suffix)}")

=== Bootstrap Test‐Set Results (n=100) ===
ROC AUC : 0.787 ± 0.013
PR  AUC : 0.466 ± 0.027
Top 0.1%     Precision  : 0.942 ± 0.113
Top 0.1%     Recall     : 0.016 ± 0.002
Top 0.1%     F1-Score   : 0.032 ± 0.004
Top 1%       Precision  : 0.873 ± 0.033
Top 1%       Recall     : 0.165 ± 0.009
Top 1%       F1-Score   : 0.278 ± 0.014
Top 10%      Precision  : 0.302 ± 0.019
Top 10%      Recall     : 0.575 ± 0.022
Top 10%      F1-Score   : 0.396 ± 0.020
Prevalence   Precision  : 0.279 ± 0.018
Prevalence   Recall     : 0.583 ± 0.023
Prevalence   F1-Score   : 0.378 ± 0.020
