## Setup & Imports

In [1]:
import sys
import os
import torch
import numpy as np
import optuna
from tqdm.notebook import tqdm

# Add 'src' to path so we can import FastRP
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from loaders import load_dataset
from fastrp_layer import FastRP

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Running on: {device}")

  from pkg_resources import parse_version


Running on: cuda


## Load Data

In [2]:
# Change this variable to switch datasets
dataset_name = 'blogcatalog' 

print(f"Loading {dataset_name}...")
adj, features, labels = load_dataset(dataset_name, root_dir='../data')

# --- FIX START: Convert Labels to Dense Array ---
# BlogCatalog labels are often a sparse matrix. We must densify them for slicing.
if hasattr(labels, 'toarray'):
    labels = labels.toarray()
    
# If labels are single-column (N, 1), flatten them for easier use
# (Optional: keeps multi-label datasets like BlogCatalog as-is)
if labels.shape[1] == 1:
    labels = labels.flatten()
# --- FIX END ---

print(f"   Nodes: {adj.shape[0]}, Edges: {adj.nnz}")
print(f"   Labels Shape: {labels.shape}")

# Convert Adj to PyTorch Sparse Tensor
# Coalesce is critical for performance on GPU
adj_coo = adj.tocoo()
indices = torch.from_numpy(np.vstack((adj_coo.row, adj_coo.col))).long()
values = torch.from_numpy(adj_coo.data).float()
shape = torch.Size(adj_coo.shape)
adj_tensor = torch.sparse_coo_tensor(indices, values, shape).to(device).coalesce()

# Handle Features (Hybrid Variant)
feat_tensor = None
if features is not None:
    if hasattr(features, 'todense'):
        features = features.todense()
    feat_tensor = torch.FloatTensor(features).to(device)
    print(f"   Features loaded on GPU: {feat_tensor.shape}")

print(f"Data preparation complete on {device}")

Loading blogcatalog...
Loading blogcatalog from .mat file...
   Nodes: 10312, Edges: 667966
   Labels Shape: (10312, 39)
Data preparation complete on cuda


## The Optuna Objective Function

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score

def objective(trial):
    # 1. Suggest Hyperparameters
    dim = trial.suggest_categorical('dim', [64, 128, 256, 512])
    window_size = trial.suggest_int('window_size', 1, 5)
    normalization = trial.suggest_categorical('normalization', [True, False])
    # Tuning your research variable!
    g = trial.suggest_int('g', 2, 10) 
    
    # 2. Initialize Model
    model = FastRP(
        embedding_dim=512,
        window_size=4,
        normalization=True,
        group_size=3,
        input_matrix='trans',
        alpha=-0.6,
        weights=[1.0, 1.0, 7.81, 45.28],
        projection_type='gaussian' # Try 'gaussian' first, then 'striped'
    ).to(device)
    
    # 3. Generate Embeddings (Fast!)
    with torch.no_grad():
        embeddings = model(adj_tensor, features=None) # Use feat_tensor for Hybrid
        
    # 4. Evaluate (Downstream Task)
    X = embeddings.cpu().numpy()
    Y = labels
    
    # Quick Train/Test Split (80/20) for tuning
    # Note: For speed, we just use a fixed random seed here
    indices = np.arange(X.shape[0])
    np.random.seed(42) 
    np.random.shuffle(indices)
    split = int(0.8 * X.shape[0])
    
    clf = OneVsRestClassifier(LogisticRegression(solver='liblinear', max_iter=100))
    clf.fit(X[indices[:split]], Y[indices[:split]])
    y_pred = clf.predict(X[indices[split:]])
    
    macro_f1 = f1_score(Y[indices[split:]], y_pred, average='macro')
    
    return macro_f1

## Run the Optimization

In [4]:
study = optuna.create_study(direction='maximize')
# Run 50 trials (experiments)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best Hyperparameters:", study.best_params)
print("Best Macro-F1:", study.best_value)

[I 2026-01-15 05:07:52,145] A new study created in memory with name: no-name-25341dc5-0fd4-4ee2-baee-ea6ccf678a9c


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-01-15 05:09:05,233] Trial 0 finished with value: 0.2387362414799981 and parameters: {'dim': 512, 'window_size': 4, 'normalization': False, 'g': 10}. Best is trial 0 with value: 0.2387362414799981.
[I 2026-01-15 05:10:20,214] Trial 1 finished with value: 0.23612416702690717 and parameters: {'dim': 64, 'window_size': 4, 'normalization': False, 'g': 10}. Best is trial 0 with value: 0.2387362414799981.
[I 2026-01-15 05:11:35,264] Trial 2 finished with value: 0.2332408812995404 and parameters: {'dim': 128, 'window_size': 2, 'normalization': True, 'g': 10}. Best is trial 0 with value: 0.2387362414799981.
[I 2026-01-15 05:12:49,549] Trial 3 finished with value: 0.23317248464028814 and parameters: {'dim': 512, 'window_size': 3, 'normalization': False, 'g': 2}. Best is trial 0 with value: 0.2387362414799981.
[I 2026-01-15 05:14:04,087] Trial 4 finished with value: 0.21964489042109378 and parameters: {'dim': 128, 'window_size': 5, 'normalization': False, 'g': 5}. Best is trial 0 with val

## Visualize the Research Insight

In [5]:
import importlib
import sys
import subprocess
import plotly

In [6]:
optuna.visualization.plot_slice(study, params=['g', 'dim'])

## HRP vs FastRP

In [7]:
import torch
import numpy as np
import pandas as pd
from fastrp_layer import FastRP # Your fixed module

# 1. Setup Fixed Seeds (Fair Comparison)
def set_seeds(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# 2. Define the Common Parameters (The Control Variables)
# We use dim=512 to give the model maximum capacity
params = {
    'embedding_dim': 512, 
    'window_size': 4,
    'normalization': True,
    'group_size': 10,         # Your research variable
    'input_matrix': 'trans',
    'alpha': -0.6,           # The value that fixed the signal
    'weights': [1.0, 1.0, 7.81, 45.28]
}

results = []

# --- RUN 1: BASELINE (Gaussian) ---
print("Round 1: Running Baseline (Gaussian)...")
set_seeds(42)
model_base = FastRP(**params, projection_type='gaussian').to(device)

with torch.no_grad():
    emb_base = model_base(adj_tensor, features=None)
    
# Evaluate
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def quick_eval(embeddings, labels, name):
    X = embeddings.cpu().numpy()
    Y = labels
    idx_train, idx_test = train_test_split(np.arange(X.shape[0]), test_size=0.2, random_state=42)
    
    clf = OneVsRestClassifier(LogisticRegression(solver='liblinear', C=1.0))
    clf.fit(X[idx_train], Y[idx_train])
    y_pred = clf.predict(X[idx_test])
    
    macro = f1_score(Y[idx_test], y_pred, average='macro')
    print(f"{name} Macro-F1: {macro:.4f}")
    return macro

score_base = quick_eval(emb_base, labels, "Gaussian (Baseline)")


# --- RUN 2: CHALLENGER (Striped Sparse) ---
print("\nRound 2: Running Innovation (Striped Sparse)...")
set_seeds(42)
model_striped = FastRP(**params, projection_type='striped').to(device)

with torch.no_grad():
    # Measure time for the efficiency claim
    import time
    t0 = time.time()
    emb_striped = model_striped(adj_tensor, features=None)
    t1 = time.time()
    print(f"Striped Time: {t1-t0:.4f}s")

score_striped = quick_eval(emb_striped, labels, "Striped (Ours)")

# --- CONCLUSION ---
print("\nFinal Result:")
diff = score_striped - score_base
print(f"   Improvement: {diff:+.4f}")
if diff > 0:
    print("SUCCESS: Striped Sparse outperformed the Baseline!")
else:
    print("NOTE: Striped is lower. We may need to tune 'group_size' (try g=2 or g=10).")

Round 1: Running Baseline (Gaussian)...



F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.



Gaussian (Baseline) Macro-F1: 0.2440

Round 2: Running Innovation (Striped Sparse)...
Striped Time: 0.0147s
Striped (Ours) Macro-F1: 0.2497

Final Result:
   Improvement: +0.0056
SUCCESS: Striped Sparse outperformed the Baseline!



F-score is ill-defined and being set to 0.0 in labels with no true nor predicted samples. Use `zero_division` parameter to control this behavior.



## Loading CORA

In [8]:
# 1. Force Reload Data (Size 2708)
dataset_name = 'cora' 
adj, features, labels = load_dataset(dataset_name, root_dir='../data')
print(f"Data Loaded. Nodes: {adj.shape[0]}") # Should be 2708

# 2. Update Tensors
# Important: We must recreate the tensor to match the new 2708 size
adj_coo = adj.tocoo()
indices = torch.from_numpy(np.vstack((adj_coo.row, adj_coo.col))).long()
values = torch.from_numpy(adj_coo.data).float()
adj_tensor = torch.sparse_coo_tensor(indices, values, torch.Size(adj_coo.shape)).to(device).coalesce()

if features is not None:
    if hasattr(features, 'todense'): features = features.todense()
    feat_tensor = torch.FloatTensor(features).to(device)
    print(f"   Features Loaded. Shape: {feat_tensor.shape}")

# 3. Re-Run the Model (To update 'emb_hybrid' to size 2708)
print("\nRe-generating Embeddings for CORA...")

params = {
    'embedding_dim': 512, 
    'window_size': 4,
    'normalization': True,
    'group_size': 10,       
    'input_matrix': 'trans',
    'alpha': -0.6,           
    'weights': [1.0, 1.0, 7.81, 45.28]
}

# Re-initialize models with new graph size
model_striped = FastRP(**params, projection_type='striped').to(device)

# --- CRITICAL STEP: THIS UPDATES THE EMBEDDING VARIABLE ---
with torch.no_grad():
    # Hybrid Run
    emb_hybrid = model_striped(adj_tensor, features=feat_tensor)
    
print(f"New Embeddings Shape: {emb_hybrid.shape}") # MUST be (2708, 512)

# 4. Evaluate (Now indices will match)
score_hybrid = quick_eval(emb_hybrid, labels, "Hybrid (Cora)")

Loading cora from .mat file...
Data Loaded. Nodes: 2708
   Features Loaded. Shape: torch.Size([2708, 1433])

Re-generating Embeddings for CORA...
New Embeddings Shape: torch.Size([2708, 512])


Hybrid (Cora) Macro-F1: 0.7682


## HRP Experiment

In [10]:
# Common Params
params = {
    'embedding_dim': 512, 
    'window_size': 4,
    'normalization': True,
    'group_size': 10,        # <--- The Winner!
    'input_matrix': 'trans',
    'alpha': -0.6,           
    'weights': [1.0, 1.0, 7.81, 45.28]
}

# 1. Baseline (Gaussian)
print("Running Gaussian...")
model_base = FastRP(**params, projection_type='gaussian').to(device)
with torch.no_grad():
    # Note: Baseline FastRP ignores features usually, but let's see pure structure baseline first
    emb_base = model_base(adj_tensor, features=None)
score_base = quick_eval(emb_base, labels, "Gaussian (Baseline)")

# 2. Structured (Ours, g=10)
print("\nRunning Striped (Structure Only)...")
model_striped = FastRP(**params, projection_type='striped').to(device)
with torch.no_grad():
    emb_striped = model_striped(adj_tensor, features=None)
score_striped = quick_eval(emb_striped, labels, "Striped (Ours)")

# 3. Hybrid (Ours, g=10 + Features)
print("\nRunning Hybrid (Structure + Features)...")
# We reuse the striped model class but pass features this time
with torch.no_grad():
    # The model handles the fusion internally if features are passed
    emb_hybrid = model_striped(adj_tensor, features=feat_tensor) 
score_hybrid = quick_eval(emb_hybrid, labels, "Hybrid (Ours)")

print("\nFinal Standings:")
print(f"1. Hybrid: {score_hybrid:.4f}")
print(f"2. Striped: {score_striped:.4f}")
print(f"3. Gaussian: {score_base:.4f}")

Running Gaussian...
Gaussian (Baseline) Macro-F1: 0.7232

Running Striped (Structure Only)...
Striped (Ours) Macro-F1: 0.7356

Running Hybrid (Structure + Features)...
Hybrid (Ours) Macro-F1: 0.7541

Final Standings:
1. Hybrid: 0.7541
2. Striped: 0.7356
3. Gaussian: 0.7232
