<a href="https://colab.research.google.com/github/ananyabatra04/fairness-graph-gnn/blob/main/Baseline_NBA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GCN Baseline on NBA Dataset

In [1]:
%pip install -q torch torchvision torchaudio
%pip install -q torch-geometric
%pip install -q fairlearn scikit-learn pandas numpy

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
# Paths
csv_path = "nba.csv"
edge_path = "nba_relationship.txt"

nodes = pd.read_csv(csv_path)

# Drop unknown labels
nodes["SALARY"] = pd.to_numeric(nodes["SALARY"], errors="coerce")
nodes = nodes[nodes["SALARY"].notna() & (nodes["SALARY"] != -1)].copy()

# Binarize label: salary > 0
nodes["label"] = (nodes["SALARY"] > 0).astype(int)

# Sensitive attribute: country (binary)
country = pd.to_numeric(nodes["country"], errors="coerce").fillna(0)
nodes["sensitive"] = (country > 0).astype(int)

feature_cols = [
    c for c in nodes.columns
    if c not in ["user_id", "SALARY", "country", "label", "sensitive"]
]

nodes["label"].value_counts().sort_index()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,154
1,159


In [4]:
scaler = StandardScaler()
X = torch.tensor(
    scaler.fit_transform(nodes[feature_cols].values),
    dtype=torch.float
)

y = torch.tensor(nodes["label"].values, dtype=torch.long)
sensitive = torch.tensor(nodes["sensitive"].values, dtype=torch.long)

X.shape, y.shape, sensitive.shape

(torch.Size([313, 95]), torch.Size([313]), torch.Size([313]))

In [5]:
edges = pd.read_csv(
    edge_path,
    sep="\t",
    names=["src", "dst"]
)

# Map arbitrary user_id to 0..N-1
id_map = {
    uid: i for i, uid in enumerate(nodes["user_id"].values)
}

src_list = []
dst_list = []

for src, dst in zip(edges["src"], edges["dst"]):
    if src in id_map and dst in id_map:
        src_list.append(id_map[src])
        dst_list.append(id_map[dst])

edge_index = torch.tensor(
    [src_list, dst_list],
    dtype=torch.long
)

# Make undirected
edge_index = torch.cat(
    [edge_index, edge_index.flip(0)],
    dim=1
)

edge_index.min(), edge_index.max(), X.size(0)

(tensor(0), tensor(312), 313)

In [6]:
num_nodes = nodes.shape[0]

perm = torch.randperm(num_nodes)

train_size = int(0.6 * num_nodes)
val_size = int(0.2 * num_nodes)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[perm[:train_size]] = True
val_mask[perm[train_size:train_size + val_size]] = True
test_mask[perm[train_size + val_size:]] = True

In [7]:
data = Data(
    x=X,
    edge_index=edge_index,
    y=y
)

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask
data.sensitive = sensitive

In [8]:
class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x


class SupervisedGCN(torch.nn.Module):
    def __init__(self, encoder, hidden_dim, num_classes):
        super().__init__()
        self.encoder = encoder
        self.classifier = torch.nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index):
        z = self.encoder(x, edge_index)
        out = self.classifier(z)
        return out

In [9]:
device = torch.device("cpu")
encoder = GCNEncoder(
    in_channels=data.num_features,
    hidden_channels=64
)

model = SupervisedGCN(
    encoder=encoder,
    hidden_dim=64,
    num_classes=2
).to(device)

data = data.to(device)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4
)

criterion = torch.nn.CrossEntropyLoss()

In [10]:
def train():
    model.train()
    optimizer.zero_grad()

    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])

    loss.backward()
    optimizer.step()

    return loss.item()


@torch.no_grad()
def fairness_metrics(y_true, y_pred, sensitive):
    metrics = {}

    for s_val in [0, 1]:
        mask_s = (sensitive == s_val)

        if mask_s.sum() > 0:
            metrics[f"P_yhat1_s{s_val}"] = y_pred[mask_s].float().mean()
        else:
            metrics[f"P_yhat1_s{s_val}"] = torch.tensor(0.0)

        mask_y1_s = mask_s & (y_true == 1)
        if mask_y1_s.sum() > 0:
            metrics[f"TPR_s{s_val}"] = y_pred[mask_y1_s].float().mean()
        else:
            metrics[f"TPR_s{s_val}"] = torch.tensor(0.0)

    delta_sp = torch.abs(
        metrics["P_yhat1_s0"] - metrics["P_yhat1_s1"]
    )

    delta_eo = torch.abs(
        metrics["TPR_s0"] - metrics["TPR_s1"]
    )

    return delta_sp.item(), delta_eo.item()


@torch.no_grad()
def evaluate(mask):
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)

    acc = (pred[mask] == data.y[mask]).float().mean().item()

    delta_sp, delta_eo = fairness_metrics(
        y_true=data.y[mask],
        y_pred=pred[mask],
        sensitive=data.sensitive[mask]
    )

    return acc, delta_sp, delta_eo

In [11]:
for epoch in range(1, 201):
    loss = train()

    if epoch % 20 == 0:
        train_acc, _, _ = evaluate(data.train_mask)
        val_acc, val_sp, val_eo = evaluate(data.val_mask)
        test_acc, test_sp, test_eo = evaluate(data.test_mask)

        print(
            f"Epoch {epoch:03d} | "
            f"Loss {loss:.4f} | "
            f"Train {train_acc:.3f} | "
            f"Val {val_acc:.3f} | "
            f"Test {test_acc:.3f} | "
            f"ΔSP {test_sp:.3f} | "
            f"ΔEO {test_eo:.3f}"
        )

Epoch 020 | Loss 0.3892 | Train 0.856 | Val 0.694 | Test 0.672 | ΔSP 0.158 | ΔEO 0.167
Epoch 040 | Loss 0.2486 | Train 0.925 | Val 0.694 | Test 0.672 | ΔSP 0.201 | ΔEO 0.208
Epoch 060 | Loss 0.1067 | Train 0.989 | Val 0.694 | Test 0.609 | ΔSP 0.154 | ΔEO 0.000
Epoch 080 | Loss 0.0258 | Train 1.000 | Val 0.694 | Test 0.562 | ΔSP 0.091 | ΔEO 0.000
Epoch 100 | Loss 0.0120 | Train 1.000 | Val 0.677 | Test 0.578 | ΔSP 0.089 | ΔEO 0.042
Epoch 120 | Loss 0.0098 | Train 1.000 | Val 0.677 | Test 0.562 | ΔSP 0.043 | ΔEO 0.042
Epoch 140 | Loss 0.0097 | Train 1.000 | Val 0.694 | Test 0.562 | ΔSP 0.043 | ΔEO 0.042
Epoch 160 | Loss 0.0097 | Train 1.000 | Val 0.694 | Test 0.562 | ΔSP 0.043 | ΔEO 0.042
Epoch 180 | Loss 0.0178 | Train 0.989 | Val 0.661 | Test 0.500 | ΔSP 0.074 | ΔEO 0.125
Epoch 200 | Loss 0.0775 | Train 0.947 | Val 0.710 | Test 0.531 | ΔSP 0.234 | ΔEO 0.167
