# Data Manipulation

#### This code reads the data with `dgl` and saves it in PyTorch's native `.pt` format, so the saved files no longer depend on `dgl`.

This way, the rest of the code can use PyTorch only and does not have to rely on the old PyTorch and CUDA versions that `dgl` requires.

In [1]:
# # Uncomment to download data
# !if [ ! -s "../data/dblp.bin" ] || [ ! -s "../data/pokec_n.bin" ] || [ ! -s "../data/pokec_z.bin" ]; then sh ../data/download_datasets.sh && unzip -o ../data/data.zip -d ../data && rm ../data/data.zip; else echo "Skipping download. Files already exist."; fi

In [2]:
import torch
import dgl
from src.paths import DATA_DIR

**Read using `dgl` and save using PyTorch.**

In [3]:
# Convert every DGL binary graph file into a plain dictionary of tensors and
# store it next to the original with a ".pt" extension.
split_keys = ('train_index', 'val_index', 'test_index')

for dataset_name in ('dblp', 'pokec_n', 'pokec_z'):
    data_file = dataset_name + '.bin'
    path = "/".join([DATA_DIR, data_file])

    # load_graphs returns a pair; only the list of graphs is needed here.
    graph_list, _ = dgl.load_graphs(path)
    print(data_file)
    print(graph_list)
    g = graph_list[0]  # only the first graph stored in the file is converted

    # torch.where(t)[0] gives the positions of the non-zero entries of t.
    # NOTE(review): assumes each of these ndata fields is a 1-D per-node
    # mask - confirm against the dataset.
    idx_train, idx_val, idx_test = (
        torch.where(g.ndata[key])[0] for key in split_keys
    )
    index_split = dict(zip(split_keys, (idx_train, idx_val, idx_test)))

    # Row 0 of edge_index is the first tensor returned by g.edges(),
    # row 1 is the second.
    edge_src, edge_dst = g.edges()
    data = {
        'num_nodes': g.num_nodes(),
        'edge_index': torch.stack((edge_src, edge_dst)),
        'x': g.ndata['feature'],
        'y': g.ndata['label'],
        'sensitive': g.ndata['sensitive'],
        'split': index_split
    }

    new_path = "/".join([DATA_DIR, dataset_name + ".pt"])
    torch.save(data, new_path)
    print("saved", new_path, "\n")

In [5]:
# !zip -j ../data/data_pt.zip ../data/dblp.pt ../data/pokec_n.pt ../data/pokec_z.pt -x "*.DS_Store" -x "**/.*" -x "__MACOSX"

#### This code downloads the credit dataset in `.csv` and `.txt` format and converts it into the same format as the other datasets.

The original dataset files are downloaded from the PyGDebias data repository and follow the format used by the PyG-Debias library (https://github.com/yushundong/PyGDebias/).

In [1]:
# Download the two raw credit dataset files (credit.csv and credit_edges.txt)
# from the PyGDebias-Team data repository on GitHub into ../data/.
# `-O` writes each download to the given path, replacing any existing file.
!wget -O ../data/credit.csv https://raw.githubusercontent.com/PyGDebias-Team/data/main/2023-7-26/credit/credit.csv
!wget -O ../data/credit_edges.txt https://raw.githubusercontent.com/PyGDebias-Team/data/main/2023-7-26/credit/credit_edges.txt

In [2]:
import torch
import numpy as np
import pandas as pd
from pathlib import Path

# Locations of the raw credit files and the file name of the converted output.
DATA_DIR = "../data/"
CSV_PATH = f"{DATA_DIR}credit.csv"
EDGE_PATH = f"{DATA_DIR}credit_edges.txt"
OUTPUT_NAME = "credit_no_self_loops.pt"

# Seed both global random generators so that re-running the notebook
# reproduces the same train, val, test indexes.
torch.manual_seed(0)
np.random.seed(0)

# Read the credit table; every CSV row is counted as one node below.
df = pd.read_csv(CSV_PATH)

label_col, sensitive_col = "NoDefaultNextMonth", "Age"

# Prediction target and sensitive attribute, both stored as int64 tensors.
y = torch.tensor(df[label_col].to_numpy(), dtype=torch.long)
sensitive = torch.tensor(df[sensitive_col].to_numpy(), dtype=torch.long)

# All remaining columns (original order kept) become float32 node features.
feature_cols = df.columns[~df.columns.isin([label_col, sensitive_col])]
x = torch.tensor(df[feature_cols].to_numpy(), dtype=torch.float32)

num_nodes = df.shape[0]

# ----------------------------
# Load edges
# ----------------------------
# loadtxt parses the values as floats (per the original note the file uses
# scientific notation), so the node ids are cast to int64 afterwards.
edges = np.loadtxt(EDGE_PATH)

# Column 0 becomes the source row and column 1 the destination row.
src, dst = (torch.tensor(edges[:, col], dtype=torch.long) for col in (0, 1))

# NOTE(review): edges are stored exactly as listed in the file; nothing here
# adds reverse edges - confirm whether downstream code expects both directions.
edge_index = torch.stack((src, dst))


# Build the train/val/test split separately for label 0 and label 1.
labels_np = y.numpy()
label_idx_0, label_idx_1 = (np.where(labels_np == cls)[0] for cls in (0, 1))

# The shuffle order (class 0 first, then class 1) is part of what makes the
# split reproducible under the fixed NumPy seed.
for class_indices in (label_idx_0, label_idx_1):
    np.random.shuffle(class_indices)

label_number = 2 * min(len(label_idx_0), len(label_idx_1))
train_cap = label_number // 2  # per-class upper bound on training nodes

train_parts, val_parts, test_parts = [], [], []
for class_indices in (label_idx_0, label_idx_1):
    half = int(0.5 * len(class_indices))
    three_quarters = int(0.75 * len(class_indices))

    # Train: first half of the class, capped at train_cap. When the cap is
    # smaller than half, the nodes in [train_cap, half) land in no split.
    train_parts.append(class_indices[: min(half, train_cap)])
    # Val: third quarter of the class; Test: last quarter.
    val_parts.append(class_indices[half:three_quarters])
    test_parts.append(class_indices[three_quarters:])

idx_train = torch.tensor(np.concatenate(train_parts), dtype=torch.long)
idx_val = torch.tensor(np.concatenate(val_parts), dtype=torch.long)
idx_test = torch.tensor(np.concatenate(test_parts), dtype=torch.long)

index_split = dict(zip(
    ("train_index", "val_index", "test_index"),
    (idx_train, idx_val, idx_test),
))

# ----------------------------
# Unified data object
# ----------------------------
# Same key layout as the dictionaries saved for the dblp / pokec files.
data = dict(
    num_nodes=num_nodes,
    edge_index=edge_index,
    x=x,
    y=y,
    sensitive=sensitive,
    split=index_split,
)

# ----------------------------
# Save
# ----------------------------
output_dir = Path(DATA_DIR)
output_dir.mkdir(parents=True, exist_ok=True)  # no-op when the folder exists
save_path = output_dir / OUTPUT_NAME
torch.save(data, save_path)

# Short summary of what was written.
print(f"Saved unified dataset to {save_path}")
print("Nodes:", num_nodes)
print("Edges:", edge_index.size(1))
print(
    "Train / Val / Test:",
    *(len(index_split[name]) for name in ("train_index", "val_index", "test_index")),
)


#### Adding self-loops to the credit dataset since it does not have any

In [3]:
import torch

# Re-open the converted credit graph and append one (i, i) edge per node.
data = torch.load("../data/credit_no_self_loops.pt")
num_nodes = data["num_nodes"]

# Both rows are 0..num_nodes-1, so column i is the edge (i, i).
self_loops = torch.arange(num_nodes, dtype=torch.long)
self_loop_edges = torch.stack((self_loops, self_loops))

# Existing edges stay first; the self-loops are appended after them.
# NOTE(review): loops are appended unconditionally - if the stored edge list
# already contained self-loops they would be duplicated; confirm it has none.
edge_index = torch.cat((data["edge_index"], self_loop_edges), dim=1)
data["edge_index"] = edge_index

torch.save(data, "../data/credit.pt")

print(f"Added {self_loop_edges.size(1)} self-loops.")
print("Total edges:", edge_index.size(1))