In [11]:
import torch
from torch_geometric.utils import dropout_edge

# Build a small undirected 4-cycle (0-1-2-3-0). Each undirected edge is stored
# twice, once per direction, so edge_index has 8 columns.
edge_index = torch.tensor([
    [0, 1, 1, 2, 2, 3, 3, 0],  # sources
    [1, 0, 2, 1, 3, 2, 0, 3]   # targets
], dtype=torch.long)

print("Original edge_index shape:", edge_index.shape)
print(edge_index)

# Seed torch's global RNG once, before the loop: the three rounds below still
# differ from each other, but the whole cell now reproduces on a fresh kernel.
torch.manual_seed(42)

# Drop 50% of undirected edges (both directions together), three times over.
for _round in range(3):
    # force_undirected=True makes dropout_edge keep or drop both directions of
    # an edge as a unit; training=True enables the dropout.
    edge_index_dropped, edge_id = dropout_edge(
        edge_index, p=0.5, force_undirected=True, training=True
    )

    print("\nAfter dropout:")
    print("Dropped edge_index shape:", edge_index_dropped.shape)
    print(edge_index_dropped)

    # edge_id holds column indices into the original edge_index for the kept
    # edges, repeated once per direction.
    print("\nedge_id (which original undirected pairs were kept):")
    print(edge_id)


Original edge_index shape: torch.Size([2, 8])
tensor([[0, 1, 1, 2, 2, 3, 3, 0],
        [1, 0, 2, 1, 3, 2, 0, 3]])

After dropout:
Dropped edge_index shape: torch.Size([2, 4])
tensor([[0, 0, 1, 3],
        [1, 3, 0, 0]])

edge_id (which original undirected pairs were kept):
tensor([0, 7, 0, 7])

After dropout:
Dropped edge_index shape: torch.Size([2, 6])
tensor([[0, 1, 2, 1, 2, 3],
        [1, 2, 3, 0, 1, 2]])

edge_id (which original undirected pairs were kept):
tensor([0, 2, 4, 0, 2, 4])

After dropout:
Dropped edge_index shape: torch.Size([2, 6])
tensor([[0, 1, 0, 1, 2, 3],
        [1, 2, 3, 0, 1, 0]])

edge_id (which original undirected pairs were kept):
tensor([0, 2, 7, 0, 2, 7])


In [6]:
# Display the current value of edge_index as this cell's output.
edge_index

tensor([[0, 1, 1, 2, 2, 3, 3, 0],
        [1, 0, 2, 1, 3, 2, 0, 3]])

In [1]:
# =========================
# JUPYTER NOTEBOOK: DATASET SIMULATION & EXPORT (PICKLE)
# =========================

# --- Imports and setup ---
import copy
import os
import pickle
import sys

import numpy as np
import torch

# keep this exact sys.path append (mandatory)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(''), os.path.pardir)))

from utils.genlink import DataProcessor, NullSimulator  # datasets only, no neural nets

# --- Paths ---
# NOTE(review): both locations are absolute, machine-specific paths; adjust
# them before running this notebook on another host.
base_dir = '/disk/10tb/home/shmelev/simulated_dataset_for_workshop'
seed_csv = '/disk/10tb/home/shmelev/cr_only/CR.csv'

print("=== CONFIG ===")
print(f"base_dir: {base_dir}")
print(f"seed_csv: {seed_csv}")
print("================\n")

# Ensure the output directory exists. exist_ok=True closes the window between
# the existence check and the creation: if something else creates the
# directory in between, makedirs no longer raises FileExistsError.
if os.path.exists(base_dir):
    print(f"[INFO] base_dir already exists: {base_dir}")
else:
    print(f"[INFO] Creating base_dir: {base_dir}")
    os.makedirs(base_dir, exist_ok=True)

# --- 1) Load reference dataset (CR) and compute simulation parameters ---
print("\n[1/5] Loading CR dataset and computing simulation parameters...")
dp = DataProcessor(seed_csv)
dp.compute_simulation_params()
print("[OK] Computed parameters for CR dataset.")

# Report the shapes of the two parameter matrices read off the processor.
print("Shapes:")
for shape_label, param_matrix in (
    ("  edge_probs:", dp.edge_probs),
    ("  mean_weight:", dp.mean_weight),
):
    print(shape_label, param_matrix.shape)

print("Applying np.nan_to_num on edge_probs (no assignment, same style as original).")
# NOTE(review): np.nan_to_num returns a new array by default and the result is
# discarded here, so dp.edge_probs is left unchanged by this statement. The
# sanitized matrix that is actually used is rebuilt where NullSimulator is
# constructed further down.
np.nan_to_num(dp.edge_probs)

# Per-class membership counts, taken from the second column of
# node_classes_sorted.
print("Class counts (CR):")
class_id_column = dp.node_classes_sorted.iloc[:, 1]
print(class_id_column.value_counts())
print("Classes (dp):", dp.classes, "\n")

# --- 2) Prepare population sizes (NO modifications to probabilities/weights) ---
print("[2/5] Preparing population sizes to resemble the real dataset (no modifications).")
# Count the members of each class once instead of recomputing value_counts()
# on every loop iteration. The second column of node_classes_sorted carries
# the class ids.
class_counts = dp.node_classes_sorted.iloc[:, 1].value_counts()
# Order the sizes by class id 0..len(dp.classes)-1. .loc raises KeyError if a
# class id has no members, which matches the previous behaviour.
pop_sizes = [class_counts.loc[class_id] for class_id in range(len(dp.classes))]
print("pop_sizes:", pop_sizes)
print("Total individuals (base):", sum(pop_sizes))

# --- 3) Simulate synthetic dataset that resembles the original (no tweaks, no multipliers) ---
print("\n[3/5] Simulating synthetic dataset with NullSimulator (unchanged probs/weights, CR-like).")
# NaNs in the edge-probability matrix are replaced (np.nan_to_num) before the
# matrix is handed to the simulator; mean weights are passed through as-is.
sanitized_edge_probs = np.nan_to_num(dp.edge_probs)
ns = NullSimulator(len(dp.classes), sanitized_edge_probs, dp.mean_weight)
print(f"[OK] NullSimulator initialized with {len(dp.classes)} classes.")

# Fixed-seed generator so the generated matrices are the same on every run.
simulation_rng = np.random.default_rng(42)
population_sizes = np.array(pop_sizes)
means, counts, pop_index = ns.generate_matrices(population_sizes, simulation_rng)
print("[OK] Generated matrices from NullSimulator.")
print("  means shape:", means.shape)
print("  counts shape:", counts.shape)
print("  pop_index sample (first 20):", pop_index[:20])

# Write the simulated graph to a CSV inside the output directory.
simulated_csv = os.path.join(base_dir, 'simulated_dataset_cr.csv')
ns.simulate_graph(means, counts, pop_index, simulated_csv)
print(f"[OK] Simulated dataset saved to: {simulated_csv}")

# --- 4) Build train/valid/test graphs (ONE-HOT features), save pickles, mask test labels, write CSV map ---
print("\n[4/5] Building train/valid/test graphs with GENLINK DataProcessor (one_hot features)...")
dataset = DataProcessor(simulated_csv)
print("[OK] DataProcessor created for simulated CR-like dataset.")

# 60/20/20 node split with a fixed random_state; the remaining options are
# passed explicitly as None.
dataset.generate_random_train_valid_test_nodes(train_size=0.6,
                                               valid_size=0.2,
                                               test_size=0.2,
                                               random_state=42,
                                               save_dir=None,
                                               mask_size=None,
                                               sub_train_size=None,
                                               keep_train_nodes=None,
                                               mask_random_state=None)
print("[OK] Generated random train/valid/test node splits.")

# Use ONE-HOT features (changed from graph_based).
# The first four positional arguments are kept in one tuple so that the log
# line below is built from the values actually passed. The previous message
# hard-coded "one/multiple" although the call passes 'multiple', 'multiple'.
graph_build_args = ('one_hot', 'homogeneous', 'multiple', 'multiple')
dataset.make_train_valid_test_datasets_with_numba(*graph_build_args,
                                                  'simulated_dataset',
                                                  masking=False,
                                                  no_mask_class_in_df=True)
print(f"[OK] Constructed train/valid/test datasets "
      f"({graph_build_args[0]}, {graph_build_args[1]}, "
      f"{graph_build_args[2]}/{graph_build_args[3]}).")

def _print_graph_stats(title, graph):
    """Print ``title`` followed by the shapes of the graph's x, y, edge_index and weight."""
    print(title)
    print("  x:", tuple(graph.x.shape), "y:", tuple(graph.y.shape),
          "edge_index:", tuple(graph.edge_index.shape), "weight:", tuple(graph.weight.shape))


train_graphs = dataset.array_of_graphs_for_training
valid_graphs = dataset.array_of_graphs_for_validation
test_graphs = dataset.array_of_graphs_for_testing

print("Counts summary:")
print("  Training graphs:", len(train_graphs))
print("  Validation graphs:", len(valid_graphs))
print("  Test graphs:", len(test_graphs))

# Peek at the first graph of each split, skipping splits that are empty.
if len(train_graphs) > 0:
    g0 = train_graphs[0]
    _print_graph_stats("First training graph stats:", g0)
if len(valid_graphs) > 0:
    g1 = valid_graphs[0]
    _print_graph_stats("First validation graph stats:", g1)
if len(test_graphs) > 0:
    g2 = test_graphs[0]
    _print_graph_stats("First test graph stats (before masking):", g2)
    # The last entry of y is the label that gets hidden later in this cell.
    print("  last-node true label (will be hidden):", int(g2.y[-1]))

def _dump_pickle(payload, path):
    """Serialize ``payload`` to ``path`` using the highest available pickle protocol."""
    with open(path, 'wb') as fh:
        pickle.dump(payload, fh, protocol=pickle.HIGHEST_PROTOCOL)


# Save pickles
print("\n[5/5] Saving training/validation graphs, masking test graphs, writing CSV mapping and sample submission...")
# All three pickle paths live directly under base_dir. test_pkl is only
# defined here; the test graphs are written after their labels are masked.
train_pkl = os.path.join(base_dir, 'train_graphs.pickle')
val_pkl = os.path.join(base_dir, 'validation_graphs.pickle')
test_pkl = os.path.join(base_dir, 'test_graphs.pickle')

_dump_pickle(dataset.array_of_graphs_for_training, train_pkl)
print(f"[OK] Saved training graphs to: {train_pkl}")

_dump_pickle(dataset.array_of_graphs_for_validation, val_pkl)
print(f"[OK] Saved validation graphs to: {val_pkl}")

# Mask the last node's label in every test graph and write the mapping CSV.
#
# Each graph is deep-copied before masking, so the graphs held by
# dataset.array_of_graphs_for_testing keep their true labels. Masking them in
# place made this cell non-idempotent: a second run would read back -1 as the
# "true" label and overwrite test_labels.csv with it.
masked_test_graphs = []
lines = ["sample,label\n"]  # CSV header; one "graph_<i>,<label>" row per test graph

for i, original_graph in enumerate(dataset.array_of_graphs_for_testing):
    true_label = int(original_graph.y[-1])
    sample_name = f"graph_{i}"
    lines.append(f"{sample_name},{true_label}\n")
    g = copy.deepcopy(original_graph)
    g.y[-1] = -1  # hide the label on the copy only
    masked_test_graphs.append(g)
    if i < 5:
        # Only the first five are echoed, to keep the cell output short.
        print(f"  [mask] {sample_name}: true_label={true_label} -> y[-1] set to -1")

with open(test_pkl, 'wb') as handle:
    pickle.dump(masked_test_graphs, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"[OK] Saved masked test graphs to: {test_pkl}")

# Ground-truth labels go into a separate file from the masked graphs.
test_csv = os.path.join(base_dir, 'test_labels.csv')
with open(test_csv, 'w') as f:
    f.writelines(lines)
print(f"[OK] Wrote test labels CSV to: {test_csv}")

# Generate a sample submission file (tab-separated, as discussed).
# A submission carries one prediction per TEST graph, so the row count comes
# from the test split. It was previously taken from the validation split,
# which gives the wrong number of rows whenever the two splits differ in size.
# NOTE(review): this file uses integer IDs and a tab-separated
# "ID/PREDICTED_CLASS" header, while test_labels.csv uses comma-separated
# "sample,label" rows keyed by "graph_<i>" -- confirm the scorer expects this.
sample_sub_csv = os.path.join(base_dir, 'sample_submission.csv')
num_test_graphs = len(dataset.array_of_graphs_for_testing)
with open(sample_sub_csv, 'w') as f:
    f.write("ID\tPREDICTED_CLASS\n")
    for i in range(num_test_graphs):
        f.write(f"{i}\t0\n")  # dummy class 0 for all, demonstrates required format
print(f"[OK] Wrote sample submission file to: {sample_sub_csv}")

# Final recap: graph counts per split with their output paths, then short
# previews of the two text files written by this cell.
print("\n=== SUMMARY ===")
print(f"Train graphs: {len(dataset.array_of_graphs_for_training)}  -> {train_pkl}")
print(f"Valid graphs: {len(dataset.array_of_graphs_for_validation)} -> {val_pkl}")
print(f"Test  graphs: {len(masked_test_graphs)}            -> {test_pkl}")
print(f"Test labels CSV preview (first 6 lines):\n{''.join(lines[:6])}")
print("Sample submission preview (first 6 lines):")
with open(sample_sub_csv, 'r') as f:
    # Read at most six lines; an empty string from readline() means EOF.
    for _line_no in range(6):
        preview_line = f.readline()
        if not preview_line:
            break
        print(preview_line.rstrip())
print("All done (CR dataset; one_hot features; no modifications to edge/weight probabilities).")


=== CONFIG ===
base_dir: /disk/10tb/home/shmelev/simulated_dataset_for_workshop
seed_csv: /disk/10tb/home/shmelev/cr_only/CR.csv

[INFO] base_dir already exists: /disk/10tb/home/shmelev/simulated_dataset_for_workshop

[1/5] Loading CR dataset and computing simulation parameters...
[OK] Computed parameters for CR dataset.
Shapes:
  edge_probs: (4, 4)
  mean_weight: (4, 4)
Applying np.nan_to_num on edge_probs (no assignment, same style as original).
Class counts (CR):
0    1586
2    1582
1     792
3     673
Name: class_id, dtype: int64
Classes (dp): ['Southern Russians', 'Ukranians', 'Northen Russians', 'Belarusians'] 

[2/5] Preparing population sizes to resemble the real dataset (no modifications).
pop_sizes: [1586, 792, 1582, 673]
Total individuals (base): 4633

[3/5] Simulating synthetic dataset with NullSimulator (unchanged probs/weights, CR-like).
[OK] NullSimulator initialized with 4 classes.
[OK] Generated matrices from NullSimulator.
  means shape: (4633, 4633)
  counts shape: (