In [1]:
# ===== PyTorch (CUDA 12.1) =====
!pip install -q torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0+cu121 \
  --index-url https://download.pytorch.org/whl/cu121

# ===== PyTorch Geometric (prebuilt wheels, no compilation) =====
!pip install -q torch-geometric

# ===== Fairness + utilities =====
!pip install -q fairlearn scikit-learn pandas numpy



[31mERROR: Could not find a version that satisfies the requirement torch==2.1.0+cu121 (from versions: 2.2.0+cu121, 2.2.1+cu121, 2.2.2+cu121, 2.3.0+cu121, 2.3.1+cu121, 2.4.0+cu121, 2.4.1+cu121, 2.5.0+cu121, 2.5.1+cu121)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.1.0+cu121[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [3]:
!git clone https://github.com/ananyabatra04/fairness-graph-gnn.git

Cloning into 'fairness-graph-gnn'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 25 (delta 0), reused 2 (delta 0), pack-reused 22 (from 1)[K
Receiving objects: 100% (25/25), 52.29 MiB | 13.45 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Updating files: 100% (10/10), done.


In [4]:
nodes = pd.read_csv("fairness-graph-gnn/datasets/region_job.csv")
feature_cols = [
    c for c in nodes.columns
    if c not in ["user_id", "region", "I_am_working_in_field"]
]
nodes["I_am_working_in_field"].value_counts().sort_index()

# Drop unknown labels
nodes = nodes[nodes["I_am_working_in_field"] != -1]

# Binarize: 0 -> 0, 1–4 -> 1
nodes["label"] = (nodes["I_am_working_in_field"] > 0).astype(int)

nodes["label"].value_counts()
nodes["label"].unique()


array([1, 0])

In [5]:
import torch

X = torch.tensor(
    nodes[feature_cols].values,
    dtype=torch.float
)

X = (X - X.mean(dim=0)) / (X.std(dim=0) + 1e-6) #normalization


y = torch.tensor(
    nodes["label"].values,
    dtype=torch.long
)

sensitive = torch.tensor(
    (nodes["region"] == "Bratislava").astype(int).values,
    dtype=torch.long
)
X.shape, y.shape, sensitive.shape


(torch.Size([10262, 276]), torch.Size([10262]), torch.Size([10262]))

In [6]:
scaler = StandardScaler()
X = torch.tensor(
    scaler.fit_transform(X),
    dtype=torch.float
)

In [7]:
edges = pd.read_csv(
    "fairness-graph-gnn/datasets/region_job_relationship.txt",
    sep="\t",
    names=["src", "dst"]
)



In [8]:
#GCNs require node indices from 0 … N-1. Pokec user IDs are arbitrary.

id_map = {
    uid: i for i, uid in enumerate(nodes["user_id"].values)
}

src_list = []
dst_list = []

for src, dst in zip(edges["src"], edges["dst"]):
    if src in id_map and dst in id_map:
        src_list.append(id_map[src])
        dst_list.append(id_map[dst])

edge_index = torch.tensor(
    [src_list, dst_list],
    dtype=torch.long
)

# make undirected
edge_index = torch.cat(
    [edge_index, edge_index.flip(0)],
    dim=1
)


In [9]:
edge_index.min(), edge_index.max(), X.size(0)


(tensor(1), tensor(10260), 10262)

In [10]:
num_nodes = nodes.shape[0]

perm = torch.randperm(num_nodes)

train_size = int(0.6 * num_nodes)
val_size   = int(0.2 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask   = torch.zeros(num_nodes, dtype=torch.bool)
test_mask  = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size+val_size]] = True
test_mask[perm[train_size+val_size:]] = True


In [11]:
from torch_geometric.data import Data

data = Data(
    x=X,
    edge_index=edge_index,
    y=y
)

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask
data.sensitive = sensitive


In [12]:
class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x


class SupervisedGCN(torch.nn.Module):
    def __init__(self, encoder, hidden_dim, num_classes):
        super().__init__()
        self.encoder = encoder
        self.classifier = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index):
        z = self.encoder(x, edge_index)
        out = self.classifier(z)
        return out


In [13]:
device = torch.device("cpu")
encoder = GCNEncoder(
    in_channels=data.num_features,
    hidden_channels=64
)

model = SupervisedGCN(
    encoder=encoder,
    hidden_dim=64,
    num_classes=2
).to(device)


data = data.to(device)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4
)

criterion = torch.nn.CrossEntropyLoss()


In [14]:
def train():
    model.train()
    optimizer.zero_grad()

    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])

    loss.backward()
    optimizer.step()

    return loss.item()

@torch.no_grad()
def test(mask):
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    correct = (pred[mask] == data.y[mask]).sum()
    acc = int(correct) / int(mask.sum())
    return acc



In [15]:
@torch.no_grad()
def fairness_metrics(y_true, y_pred, sensitive):
    """
    y_true: (N,) ground truth labels {0,1}
    y_pred: (N,) predicted labels {0,1}
    sensitive: (N,) sensitive attribute {0,1}
    """

    metrics = {}

    for s_val in [0, 1]:
        mask_s = (sensitive == s_val)

        # P(ŷ = 1 | s)
        if mask_s.sum() > 0:
            metrics[f"P_yhat1_s{s_val}"] = y_pred[mask_s].float().mean()
        else:
            metrics[f"P_yhat1_s{s_val}"] = torch.tensor(0.0)

        # P(ŷ = 1 | y = 1, s)
        mask_y1_s = mask_s & (y_true == 1)
        if mask_y1_s.sum() > 0:
            metrics[f"TPR_s{s_val}"] = y_pred[mask_y1_s].float().mean()
        else:
            metrics[f"TPR_s{s_val}"] = torch.tensor(0.0)

    # Fairness gaps
    delta_sp = torch.abs(
        metrics["P_yhat1_s0"] - metrics["P_yhat1_s1"]
    )

    delta_eo = torch.abs(
        metrics["TPR_s0"] - metrics["TPR_s1"]
    )

    return delta_sp.item(), delta_eo.item()

@torch.no_grad()
def evaluate(mask):
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)

    acc = (pred[mask] == data.y[mask]).float().mean().item()

    delta_sp, delta_eo = fairness_metrics(
        y_true=data.y[mask],
        y_pred=pred[mask],
        sensitive=data.sensitive[mask]
    )

    return acc, delta_sp, delta_eo


In [16]:
device = torch.device("cpu")
data = data.to(device)
model = model.to(device)

In [17]:
for epoch in range(1, 201):
    loss = train()

    if epoch % 20 == 0:
        train_acc, _, _ = evaluate(data.train_mask)
        val_acc, val_sp, val_eo = evaluate(data.val_mask)
        test_acc, test_sp, test_eo = evaluate(data.test_mask)

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss:.4f} | "
            f"Train {train_acc:.3f} | "
            f"Val {val_acc:.3f} | "
            f"Test {test_acc:.3f} | "
            f"ΔSP {test_sp:.3f} | "
            f"ΔEO {test_eo:.3f}"
        )


Epoch 020 | Loss 0.4891 | Train 0.756 | Val 0.687 | Test 0.693 | ΔSP 0.462 | ΔEO 0.642
Epoch 040 | Loss 0.3506 | Train 0.839 | Val 0.672 | Test 0.691 | ΔSP 0.526 | ΔEO 0.698
Epoch 060 | Loss 0.2260 | Train 0.896 | Val 0.656 | Test 0.683 | ΔSP 0.599 | ΔEO 0.758
Epoch 080 | Loss 0.1439 | Train 0.944 | Val 0.659 | Test 0.675 | ΔSP 0.504 | ΔEO 0.663
Epoch 100 | Loss 0.1961 | Train 0.905 | Val 0.674 | Test 0.687 | ΔSP 0.495 | ΔEO 0.667
Epoch 120 | Loss 0.1545 | Train 0.947 | Val 0.665 | Test 0.686 | ΔSP 0.544 | ΔEO 0.710
Epoch 140 | Loss 0.0962 | Train 0.973 | Val 0.657 | Test 0.676 | ΔSP 0.534 | ΔEO 0.693
Epoch 160 | Loss 0.0607 | Train 0.989 | Val 0.651 | Test 0.670 | ΔSP 0.535 | ΔEO 0.688
Epoch 180 | Loss 0.0389 | Train 0.997 | Val 0.652 | Test 0.667 | ΔSP 0.537 | ΔEO 0.686
Epoch 200 | Loss 0.1076 | Train 0.902 | Val 0.628 | Test 0.634 | ΔSP 0.633 | ΔEO 0.744
