## Setup & Imports

In [1]:
import sys
import os
import torch
import numpy as np
import optuna
from tqdm.notebook import tqdm

# Add 'src' to path so we can import FastRP
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from loaders import load_dataset
from fastrp_layer import FastRP

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Running on: {device}")

  from pkg_resources import parse_version


Running on: cuda


## Load Data

In [2]:
# Datasets to run sequentially
dataset_names = ['blogcatalog', 'flickr']
data_root = '../data'

def load_data_for(dataset_name, data_root):
    if dataset_name == 'flickr':
        flickr_path = os.path.join(data_root, 'flickr.mat')
        if not os.path.exists(flickr_path):
            raise FileNotFoundError(
                "Missing flickr.mat. Run: python ../src/get_flickr.py to download and convert it. "
                "If you already have a .mat file, place it at ../data/flickr.mat."
            )

    print(f"Loading {dataset_name}...")
    adj, features, labels = load_dataset(dataset_name, root_dir=data_root)

    # --- FIX START: Convert Labels to Dense Array ---
    # BlogCatalog labels are often a sparse matrix. We must densify them for slicing.
    if hasattr(labels, 'toarray'):
        labels = labels.toarray()
        
    # If labels are single-column (N, 1), flatten them for easier use
    # (Optional: keeps multi-label datasets like BlogCatalog as-is)
    if labels.shape[1] == 1:
        labels = labels.flatten()
    # --- FIX END ---

    print(f"   Nodes: {adj.shape[0]}, Edges: {adj.nnz}")
    print(f"   Labels Shape: {labels.shape}")

    # Convert Adj to PyTorch Sparse Tensor
    # Coalesce is critical for performance on GPU
    adj_coo = adj.tocoo()
    indices = torch.from_numpy(np.vstack((adj_coo.row, adj_coo.col))).long()
    values = torch.from_numpy(adj_coo.data).float()
    shape = torch.Size(adj_coo.shape)
    adj_tensor = torch.sparse_coo_tensor(indices, values, shape).to(device).coalesce()

    # Handle Features (Hybrid Variant)
    feat_tensor = None
    if features is not None:
        if hasattr(features, 'todense'):
            features = features.todense()
        feat_tensor = torch.FloatTensor(features).to(device)
        print(f"   Features loaded on GPU: {feat_tensor.shape}")

    print(f"Data preparation complete on {device}")
    return adj, features, labels, adj_tensor, feat_tensor

# Load the first dataset by default for interactive work
dataset_name = dataset_names[0]
adj, features, labels, adj_tensor, feat_tensor = load_data_for(dataset_name, data_root)

Loading blogcatalog...
Loading blogcatalog from .mat file...


   Nodes: 10312, Edges: 667966
   Labels Shape: (10312, 39)
Data preparation complete on cuda


## The Optuna Objective Function

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score

def objective(trial):
    # 1. Suggest Hyperparameters
    dim = trial.suggest_categorical('dim', [64, 128, 256, 512])
    window_size = trial.suggest_int('window_size', 1, 5)
    normalization = trial.suggest_categorical('normalization', [True, False])
    # Tuning your research variable!
    g = trial.suggest_int('g', 2, 10) 
    
    # 2. Initialize Model
    model = FastRP(
        embedding_dim=512,
        window_size=4,
        normalization=True,
        group_size=3,
        input_matrix='trans',
        alpha=-0.6,
        weights=[1.0, 1.0, 7.81, 45.28],
        projection_type='gaussian' # Try 'gaussian' first, then 'striped'
    ).to(device)
    
    # 3. Generate Embeddings (Fast!)
    with torch.no_grad():
        embeddings = model(adj_tensor, features=None) # Use feat_tensor for Hybrid
        
    # 4. Evaluate (Downstream Task)
    X = embeddings.cpu().numpy()
    Y = labels
    
    # Quick Train/Test Split (80/20) for tuning
    # Note: For speed, we just use a fixed random seed here
    indices = np.arange(X.shape[0])
    np.random.seed(42) 
    np.random.shuffle(indices)
    split = int(0.8 * X.shape[0])
    
    clf = OneVsRestClassifier(LogisticRegression(solver='liblinear', max_iter=100))
    clf.fit(X[indices[:split]], Y[indices[:split]])
    y_pred = clf.predict(X[indices[split:]])
    
    macro_f1 = f1_score(Y[indices[split:]], y_pred, average='macro')
    
    return macro_f1

## Run the Optimization

In [4]:
from pathlib import Path
import json

# Persistent Optuna storage so you can resume without rerunning everything
OPTUNA_DB = Path("optuna_studies.db")
RESULTS_JSON = Path("optuna_results.json")
N_TRIALS = 50

all_results = {}
all_studies = {}

for name in dataset_names:
    dataset_name = name
    adj, features, labels, adj_tensor, feat_tensor = load_data_for(dataset_name, data_root)
    
    print(f"\n=== Optuna: {dataset_name} ===")
    study_name = f"fastrp_{dataset_name}"
    storage = f"sqlite:///{OPTUNA_DB.resolve()}"
    study = optuna.create_study(
        direction='maximize',
        study_name=study_name,
        storage=storage,
        load_if_exists=True,
    )
    
    # Resume if trials already exist
    remaining = max(0, N_TRIALS - len(study.trials))
    if remaining > 0:
        study.optimize(objective, n_trials=remaining, show_progress_bar=True)
    else:
        print(f"Skipping {dataset_name}: already has {len(study.trials)} trials.")
    
    all_studies[dataset_name] = study
    all_results[dataset_name] = {
        'best_params': study.best_params,
        'best_macro_f1': study.best_value,
        'n_trials': len(study.trials),
    }
    print("Best Hyperparameters:", study.best_params)
    print("Best Macro-F1:", study.best_value)

RESULTS_JSON.write_text(json.dumps(all_results, indent=2))

print("\n=== Summary ===")
for name, result in all_results.items():
    print(f"{name}: {result['best_macro_f1']:.4f} | params: {result['best_params']}")

Loading blogcatalog...
Loading blogcatalog from .mat file...
   Nodes: 10312, Edges: 667966
   Labels Shape: (10312, 39)
Data preparation complete on cuda

=== Optuna: blogcatalog ===


[I 2026-02-13 10:33:14,765] A new study created in RDB with name: fastrp_blogcatalog


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-02-13 10:35:04,414] Trial 0 finished with value: 0.2311658922370656 and parameters: {'dim': 64, 'window_size': 3, 'normalization': True, 'g': 4}. Best is trial 0 with value: 0.2311658922370656.
[I 2026-02-13 10:36:48,808] Trial 1 finished with value: 0.23988424803990901 and parameters: {'dim': 256, 'window_size': 5, 'normalization': True, 'g': 4}. Best is trial 1 with value: 0.23988424803990901.
[I 2026-02-13 10:38:31,477] Trial 2 finished with value: 0.23028394114253692 and parameters: {'dim': 128, 'window_size': 4, 'normalization': True, 'g': 4}. Best is trial 1 with value: 0.23988424803990901.
[I 2026-02-13 10:40:13,229] Trial 3 finished with value: 0.2247793471532376 and parameters: {'dim': 256, 'window_size': 2, 'normalization': True, 'g': 9}. Best is trial 1 with value: 0.23988424803990901.
[I 2026-02-13 10:41:55,088] Trial 4 finished with value: 0.24139291388232825 and parameters: {'dim': 64, 'window_size': 2, 'normalization': False, 'g': 10}. Best is trial 4 with value:

[I 2026-02-13 12:00:15,850] A new study created in RDB with name: fastrp_flickr


   Nodes: 89250, Edges: 899756
   Labels Shape: (89250,)
   Features loaded on GPU: torch.Size([89250, 500])
Data preparation complete on cuda

=== Optuna: flickr ===


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-02-13 12:00:49,299] Trial 0 finished with value: 0.1964746600072515 and parameters: {'dim': 64, 'window_size': 4, 'normalization': False, 'g': 9}. Best is trial 0 with value: 0.1964746600072515.
[I 2026-02-13 12:01:24,201] Trial 1 finished with value: 0.19835858937941492 and parameters: {'dim': 128, 'window_size': 2, 'normalization': True, 'g': 4}. Best is trial 1 with value: 0.19835858937941492.
[I 2026-02-13 12:01:58,641] Trial 2 finished with value: 0.19542684811361524 and parameters: {'dim': 128, 'window_size': 5, 'normalization': False, 'g': 9}. Best is trial 1 with value: 0.19835858937941492.
[I 2026-02-13 12:02:32,282] Trial 3 finished with value: 0.1941765273103003 and parameters: {'dim': 128, 'window_size': 4, 'normalization': False, 'g': 8}. Best is trial 1 with value: 0.19835858937941492.
[I 2026-02-13 12:03:06,700] Trial 4 finished with value: 0.1959102681669343 and parameters: {'dim': 512, 'window_size': 2, 'normalization': True, 'g': 5}. Best is trial 1 with value

## Visualize the Research Insight

In [5]:
import importlib
import sys
import subprocess
import plotly

In [9]:
from IPython.display import display

if 'all_studies' not in globals() or not all_studies:
    raise RuntimeError("Run the optimization cell first to create studies.")

for name, study in all_studies.items():
    print(f"\n{str(name).upper()} - Slice: g vs dim")
    display(optuna.visualization.plot_slice(study, params=['g', 'dim']))
    print(f"\n{str(name).upper()} - Slice: window_size vs normalization")
    display(optuna.visualization.plot_slice(study, params=['window_size', 'normalization']))


BLOGCATALOG - Slice: g vs dim



BLOGCATALOG - Slice: window_size vs normalization



FLICKR - Slice: g vs dim



FLICKR - Slice: window_size vs normalization


In [10]:
if 'all_studies' not in globals() or not all_studies:
    raise RuntimeError("Run the optimization cell first to create studies.")

plots = {}
for name, study in all_studies.items():
    print(f"Rendering plots for {name}...")
    plots[name] = {
        'slice_g_dim': optuna.visualization.plot_slice(study, params=['g', 'dim']),
        'slice_window_norm': optuna.visualization.plot_slice(study, params=['window_size', 'normalization']),
    }

plots

Rendering plots for blogcatalog...
Rendering plots for flickr...


{'blogcatalog': {'slice_g_dim': Figure({
      'data': [{'marker': {'color': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                                     26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                     38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
                           'colorbar': {'title': {'text': 'Trial'}, 'x': 1.0, 'xpad': 40},
                           'colorscale': [[0.0, 'rgb(247,251,255)'], [0.125,
                                          'rgb(222,235,247)'], [0.25,
                                          'rgb(198,219,239)'], [0.375,
                                          'rgb(158,202,225)'], [0.5,
                                          'rgb(107,174,214)'], [0.625,
                                          'rgb(66,146,198)'], [0.75,
                                          'rgb(33,113,181)'], [0.875,
                                

## HRP vs FastRP

In [None]:
import torch
import numpy as np
import pandas as pd
from fastrp_layer import FastRP # Your fixed module

# 1. Setup Fixed Seeds (Fair Comparison)
def set_seeds(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# 2. Define the Common Parameters (The Control Variables)
# We use dim=512 to give the model maximum capacity
params = {
    'embedding_dim': 512, 
    'window_size': 4,
    'normalization': True,
    'group_size': 10,         # Your research variable
    'input_matrix': 'trans',
    'alpha': -0.6,           # The value that fixed the signal
    'weights': [1.0, 1.0, 7.81, 45.28]
}

results = []

# --- RUN 1: BASELINE (Gaussian) ---
print("Round 1: Running Baseline (Gaussian)...")
set_seeds(42)
model_base = FastRP(**params, projection_type='gaussian').to(device)

with torch.no_grad():
    emb_base = model_base(adj_tensor, features=None)
    
# Evaluate
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def quick_eval(embeddings, labels, name):
    X = embeddings.cpu().numpy()
    Y = labels
    idx_train, idx_test = train_test_split(np.arange(X.shape[0]), test_size=0.2, random_state=42)
    
    clf = OneVsRestClassifier(LogisticRegression(solver='liblinear', C=1.0))
    clf.fit(X[idx_train], Y[idx_train])
    y_pred = clf.predict(X[idx_test])
    
    macro = f1_score(Y[idx_test], y_pred, average='macro')
    print(f"{name} Macro-F1: {macro:.4f}")
    return macro

score_base = quick_eval(emb_base, labels, "Gaussian (Baseline)")


# --- RUN 2: Striped Sparse ---
print("\nRound 2: Running Innovation (Striped Sparse)...")
set_seeds(42)
model_striped = FastRP(**params, projection_type='striped').to(device)

with torch.no_grad():
    # Measure time for the efficiency claim
    import time
    t0 = time.time()
    emb_striped = model_striped(adj_tensor, features=None)
    t1 = time.time()
    print(f"Striped Time: {t1-t0:.4f}s")

score_striped = quick_eval(emb_striped, labels, "Striped (Ours)")

# --- CONCLUSION ---
print("\nFinal Result:")
diff = score_striped - score_base
print(f"   Improvement: {diff:+.4f}")
if diff > 0:
    print("SUCCESS: Striped Sparse outperformed the Baseline!")
else:
    print("NOTE: Striped is lower. We may need to tune 'group_size' (try g=2 or g=10).")

Round 1: Running Baseline (Gaussian)...



F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.



Gaussian (Baseline) Macro-F1: 0.2440

Round 2: Running Innovation (Striped Sparse)...
Striped Time: 0.0147s
Striped (Ours) Macro-F1: 0.2497

Final Result:
   Improvement: +0.0056
SUCCESS: Striped Sparse outperformed the Baseline!



F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.



## Loading CORA

In [None]:
# 1. Force Reload Data (Size 2708)
dataset_name = 'cora' 
adj, features, labels = load_dataset(dataset_name, root_dir='../data')
print(f"Data Loaded. Nodes: {adj.shape[0]}") # Should be 2708

# 2. Update Tensors
# Important: We must recreate the tensor to match the new 2708 size
adj_coo = adj.tocoo()
indices = torch.from_numpy(np.vstack((adj_coo.row, adj_coo.col))).long()
values = torch.from_numpy(adj_coo.data).float()
adj_tensor = torch.sparse_coo_tensor(indices, values, torch.Size(adj_coo.shape)).to(device).coalesce()

if features is not None:
    if hasattr(features, 'todense'): features = features.todense()
    feat_tensor = torch.FloatTensor(features).to(device)
    print(f"   Features Loaded. Shape: {feat_tensor.shape}")

# 3. Re-Run the Model (To update 'emb_hybrid' to size 2708)
print("\nRe-generating Embeddings for CORA...")

params = {
    'embedding_dim': 512, 
    'window_size': 4,
    'normalization': True,
    'group_size': 10,       
    'input_matrix': 'trans',
    'alpha': -0.6,           
    'weights': [1.0, 1.0, 7.81, 45.28]
}

# Re-initialize models with new graph size
model_striped = FastRP(**params, projection_type='striped').to(device)

# --- CRITICAL STEP: THIS UPDATES THE EMBEDDING VARIABLE ---
with torch.no_grad():
    # Hybrid Run
    emb_hybrid = model_striped(adj_tensor, features=feat_tensor)
    
print(f"New Embeddings Shape: {emb_hybrid.shape}") # MUST be (2708, 512)

# 4. Evaluate (Now indices will match)
score_hybrid = quick_eval(emb_hybrid, labels, "Hybrid (Cora)")

Loading cora from .mat file...
Data Loaded. Nodes: 2708
   Features Loaded. Shape: torch.Size([2708, 1433])

Re-generating Embeddings for CORA...
New Embeddings Shape: torch.Size([2708, 512])


Hybrid (Cora) Macro-F1: 0.7682


## HRP Experiment

In [None]:
# Common Params
params = {
    'embedding_dim': 512, 
    'window_size': 4,
    'normalization': True,
    'group_size': 10,        # <--- The Winner!
    'input_matrix': 'trans',
    'alpha': -0.6,           
    'weights': [1.0, 1.0, 7.81, 45.28]
}

# 1. Baseline (Gaussian)
print("Running Gaussian...")
model_base = FastRP(**params, projection_type='gaussian').to(device)
with torch.no_grad():
    # Note: Baseline FastRP ignores features usually, but let's see pure structure baseline first
    emb_base = model_base(adj_tensor, features=None)
score_base = quick_eval(emb_base, labels, "Gaussian (Baseline)")

# 2. Structured (Ours, g=10)
print("\nRunning Striped (Structure Only)...")
model_striped = FastRP(**params, projection_type='striped').to(device)
with torch.no_grad():
    emb_striped = model_striped(adj_tensor, features=None)
score_striped = quick_eval(emb_striped, labels, "Striped (Ours)")

# 3. Hybrid (Ours, g=10 + Features)
print("\nRunning Hybrid (Structure + Features)...")
# We reuse the striped model class but pass features this time
with torch.no_grad():
    # The model handles the fusion internally if features are passed
    emb_hybrid = model_striped(adj_tensor, features=feat_tensor) 
score_hybrid = quick_eval(emb_hybrid, labels, "Hybrid (Ours)")

print("\nFinal Standings:")
print(f"1. Hybrid: {score_hybrid:.4f}")
print(f"2. Striped: {score_striped:.4f}")
print(f"3. Gaussian: {score_base:.4f}")

Running Gaussian...
Gaussian (Baseline) Macro-F1: 0.7232

Running Striped (Structure Only)...
Striped (Ours) Macro-F1: 0.7356

Running Hybrid (Structure + Features)...
Hybrid (Ours) Macro-F1: 0.7541

Final Standings:
1. Hybrid: 0.7541
2. Striped: 0.7356
3. Gaussian: 0.7232
