In [1]:
%cd ..
from tqdm import tqdm
from utils.data import StackDataset
import numpy as np
import torch
import pickle
import os

from torch_geometric.data import HeteroData


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

/home/lingze/embedding_fusion


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = StackDataset(cache_dir="/home/lingze/.cache/relbench/stack")
db = dataset.get_db()

Loading Database object from /home/lingze/.cache/relbench/stack/db...
Done in 11.37 seconds.


In [3]:
for table_name, table in db.table_dict.items():
    n = len(table.df)
    print(f"Table {table_name} has {n} rows")

Table tags has 1597 rows
Table postHistory has 1175368 rows
Table comments has 623967 rows
Table badges has 463463 rows
Table postTag has 648577 rows
Table users has 255360 rows
Table postLinks has 77337 rows
Table votes has 1317876 rows
Table posts has 333893 rows


In [4]:
cache_path = "./data/stack-tensor-frame/"

In [5]:
# [NOTE]: the dataset has been materialized

# get infer_type in cache
type_path = os.path.join(cache_path,"col_type_dict.pkl")
col_type_dict = pickle.load(open(type_path, "rb"))
len(col_type_dict)

# add "compress_text" in each table in case 
for table_name, table in db.table_dict.items():
    table.df["text_compress"] = np.nan

In [6]:
from utils.resource import get_text_embedder_cfg
text_embedder_cfg = get_text_embedder_cfg(
    model_name = "sentence-transformers/average_word_embeddings_glove.6B.300d", 
    device = device)

  return self.fget.__get__(instance, owner)()


In [7]:
from utils.builder import build_pyg_hetero_graph
data, col_stats_dict = build_pyg_hetero_graph(
    db,
    col_type_dict,
    text_embedder_cfg,
    cache_path,
    True,
)

-----> Materialize tags Tensor Frame
-----> Materialize postHistory Tensor Frame
-----> Materialize comments Tensor Frame
-----> Materialize badges Tensor Frame
-----> Build edge between posts and tags
-----> Materialize users Tensor Frame
-----> Materialize postLinks Tensor Frame
-----> Materialize votes Tensor Frame
-----> Materialize posts Tensor Frame


In [8]:
# add new edges:
from utils.util import load_np_dict
from torch_geometric.utils import sort_edge_index
edge_dict = load_np_dict("./edges/rel-stack-edges.npz")

for edge_name, edge_np in edge_dict.items():
    src_table, dst_table = edge_name.split('-')[0], edge_name.split('-')[1]
    edge_index = torch.from_numpy(edge_np.astype(int)).t()
    # [2, edge_num]
    edge_type = (src_table, f"appendix", dst_table)
    data[edge_type].edge_index = sort_edge_index(edge_index)
data.validate()

True

In [9]:
# get the relbench tasks
from relbench.tasks import get_task
from relbench.modeling.graph import get_node_train_table_input
from torch_geometric.loader import NeighborLoader
from relbench.base import BaseTask
from model.base import CompositeModel, FeatureEncodingPart, NodeRepresentationPart
from relbench.modeling.nn import HeteroTemporalEncoder
# start to fine-train on the task a
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import math
import copy

In [10]:
task_a = get_task("rel-stack", "user-badge", download = True)
entity_table = task_a.entity_table

In [11]:
def generate_loader_dict(task: BaseTask, data:HeteroData) -> dict:
    loader_dict = {}
    for split, table in [
        ("train", task.get_table("train")),
        ("val",task.get_table("val")),
        ("test", task.get_table("test")),
    ]:
        table_input = get_node_train_table_input(
            table=table,
            task=task,
        )
        loader_dict[split] = NeighborLoader(
            data,
            num_neighbors=[
                128, 64
            ],  # we sample subgraphs of depth 2, 128 neighbors per node.
            time_attr="time",
            input_nodes=table_input.nodes,
            input_time=table_input.time,
            transform=table_input.transform,
            batch_size=512,
            temporal_strategy="uniform",
            shuffle=split == "train",
            num_workers=0,
            persistent_workers=False,
        )
    return loader_dict

In [12]:
@torch.no_grad()
def test(loader: NeighborLoader, model: torch.nn.Module, task: BaseTask)-> np.ndarray:
    model.eval()
    pred_list = []
    for batch in loader:
        batch = batch.to(device)
        pred = model(
            batch,
            task.entity_table,
        )
        pred = pred.view(-1) if pred.size(1) == 1 else pred
        pred_list.append(pred.detach().cpu())
    return torch.cat(pred_list, dim=0)

In [13]:
# construct bottom model
channels = 128

temporal_encoder = HeteroTemporalEncoder(
    node_types=[
                node_type for node_type in data.node_types if "time" in data[node_type]
            ],
    channels=channels,
)

feat_encoder = FeatureEncodingPart(
    data=data,
    node_to_col_stats=col_stats_dict,
    channels=channels,
)

node_encoder = NodeRepresentationPart(
    data=data,
    channels=channels,
    num_layers=1,
    normalization="layer_norm",
    dropout_prob=0.2
)

net = CompositeModel(
    data=data,
    channels=channels,
    out_channels=1,
    dropout=0.2,
    aggr="sum",
    norm="batch_norm",
    num_layer=2,
    feature_encoder=feat_encoder,
    node_encoder=node_encoder,
    temporal_encoder=temporal_encoder
)

In [14]:
# training
task_loader_dict = generate_loader_dict(task_a,data)
lr = 0.005
epoches = 40
loss_fn = BCEWithLogitsLoss()
tune_metric = "auroc"
higher_is_better = True
early_stop = 5
max_round_epoch = 50
# optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr = lr)
optimizer = torch.optim.Adam(net.parameters(), lr = lr)

In [15]:
best_val_metric = -math.inf if higher_is_better else math.inf
net.to(device)
best_epoch = 0
patience = 0
for epoch in range(1, epoches + 1):
    net.train()
    cnt = 0
    loss_accum = count_accum = 0
    for batch in tqdm(task_loader_dict["train"], leave=False):
        cnt += 1
        if cnt > max_round_epoch:
            break
        
        batch = batch.to(device)
        optimizer.zero_grad()
        pred = net(
            batch,
            entity_table,
        )
        pred = pred.view(-1) if pred.size(1) == 1 else pred
        loss = loss_fn(pred, batch[entity_table].y.float())
        
        loss.backward()
        optimizer.step()
        
        loss_accum += loss.detach().item() * pred.size(0)
        count_accum += pred.size(0)
    
    train_loss = loss_accum / count_accum
    val_logits = test(task_loader_dict["val"], net, task_a)
    val_logits = torch.sigmoid(val_logits).numpy()
    
    val_pred = (val_logits > 0.5).astype(int)
    val_pred_hat = task_a.get_table("val").df[task_a.target_col].to_numpy()
    val_metrics = {
            "auroc": roc_auc_score(val_pred_hat, val_logits),
        # "accuracy": accuracy_score(val_pred_hat, val_pred),
        # "precision": precision_score(val_pred_hat, val_pred),
        # "recall": recall_score(val_pred_hat, val_pred),
        # "f1": f1_score(val_pred_hat, val_pred),
    }
    
    test_logits = test(task_loader_dict["test"], net, task_a)
    test_logits =  torch.sigmoid(test_logits).numpy()
    print("*"*30 + f"<Epoch: {epoch:02d}>" + "*"*30)
    print(f", Train loss: {train_loss}, Val metrics: {val_metrics}")
    
    test_pred = (test_logits > 0.5).astype(int)
    test_pred_hat = task_a.get_table("test", mask_input_cols = False).df[task_a.target_col].to_numpy()
    test_metrics = {
        "auroc": roc_auc_score(test_pred_hat, test_logits),
        # "accuracy": accuracy_score(test_pred_hat, test_pred),
        # "precision": precision_score(test_pred_hat, test_pred),
        # "recall": recall_score(test_pred_hat, test_pred),
        # "f1score": f1_score(test_pred_hat, test_pred),
    }

    print(f"Test metrics: {test_metrics}")

    
    if (higher_is_better and val_metrics[tune_metric] > best_val_metric) or (
        not higher_is_better and val_metrics[tune_metric] < best_val_metric
    ):
        patience = 0
        best_epoch = epoch
        best_val_metric = val_metrics[tune_metric]
        state_dict = copy.deepcopy(net.state_dict())
    else:
        patience += 1
    
    if patience > early_stop:
        break

# print the best epoch
best_epoch

                                                  

******************************<Epoch: 01>******************************
, Train loss: 0.16720126315951347, Val metrics: {'auroc': 0.850543419395753}
Test metrics: {'auroc': 0.8311490931377215}


                                                 

******************************<Epoch: 02>******************************
, Train loss: 0.16418032839894295, Val metrics: {'auroc': 0.8578007596421857}
Test metrics: {'auroc': 0.83630041247649}


                                                 

******************************<Epoch: 03>******************************
, Train loss: 0.1595326642692089, Val metrics: {'auroc': 0.8718988938838561}
Test metrics: {'auroc': 0.8515498681501039}


                                                 

******************************<Epoch: 04>******************************
, Train loss: 0.15555403023958206, Val metrics: {'auroc': 0.8817653434626853}
Test metrics: {'auroc': 0.8633104405335507}


                                                 

******************************<Epoch: 05>******************************
, Train loss: 0.1545327265560627, Val metrics: {'auroc': 0.8864634392273488}
Test metrics: {'auroc': 0.8669738698700313}


                                                 

******************************<Epoch: 06>******************************
, Train loss: 0.1551961813867092, Val metrics: {'auroc': 0.8906459575770339}
Test metrics: {'auroc': 0.8729412788898706}


                                                 

******************************<Epoch: 07>******************************
, Train loss: 0.14918058395385742, Val metrics: {'auroc': 0.8901570324043065}
Test metrics: {'auroc': 0.872993693674527}


                                                 

******************************<Epoch: 08>******************************
, Train loss: 0.14431664004921912, Val metrics: {'auroc': 0.88231664612049}
Test metrics: {'auroc': 0.8670441154635574}


                                                 

******************************<Epoch: 09>******************************
, Train loss: 0.14620534867048263, Val metrics: {'auroc': 0.8919580374228253}
Test metrics: {'auroc': 0.8746455811342487}


                                                 

******************************<Epoch: 10>******************************
, Train loss: 0.14805956110358237, Val metrics: {'auroc': 0.8934695983488895}
Test metrics: {'auroc': 0.877525991169696}


                                                 

******************************<Epoch: 11>******************************
, Train loss: 0.15033212974667548, Val metrics: {'auroc': 0.8945934860960411}
Test metrics: {'auroc': 0.8774497777867357}


                                                 

******************************<Epoch: 12>******************************
, Train loss: 0.1432385416328907, Val metrics: {'auroc': 0.895037077356371}
Test metrics: {'auroc': 0.8772120659171098}


                                                 

******************************<Epoch: 13>******************************
, Train loss: 0.14894546791911126, Val metrics: {'auroc': 0.8973817510364227}
Test metrics: {'auroc': 0.8808261593179711}


                                                 

******************************<Epoch: 14>******************************
, Train loss: 0.1445087245106697, Val metrics: {'auroc': 0.8949183345433452}
Test metrics: {'auroc': 0.8798205130444618}


                                                 

******************************<Epoch: 15>******************************
, Train loss: 0.13874147355556488, Val metrics: {'auroc': 0.8966692787556461}
Test metrics: {'auroc': 0.8812358827299216}


                                                 

******************************<Epoch: 16>******************************
, Train loss: 0.1456520874798298, Val metrics: {'auroc': 0.8969507542726318}
Test metrics: {'auroc': 0.8804005692780225}


                                                 

******************************<Epoch: 17>******************************
, Train loss: 0.13997410103678704, Val metrics: {'auroc': 0.8953687343334539}
Test metrics: {'auroc': 0.8794192975993055}


                                                 

******************************<Epoch: 18>******************************
, Train loss: 0.1507315741479397, Val metrics: {'auroc': 0.8942069418723386}
Test metrics: {'auroc': 0.8791345272047486}


                                                 

******************************<Epoch: 19>******************************
, Train loss: 0.14626726642251014, Val metrics: {'auroc': 0.893650064605988}
Test metrics: {'auroc': 0.8790294548748631}


13

In [16]:
net.load_state_dict(state_dict)
test_logits = test(task_loader_dict["test"], net, task_a)
test_logits =  torch.sigmoid(test_logits).numpy()

test_pred = (test_logits > 0.5).astype(int)
test_pred_hat = task_a.get_table("test", mask_input_cols = False).df[task_a.target_col].to_numpy()
test_metrics = {
    "auroc": roc_auc_score(test_pred_hat, test_logits),
    "accuracy": accuracy_score(test_pred_hat, test_pred),
    "precision": precision_score(test_pred_hat, test_pred),
    "recall": recall_score(test_pred_hat, test_pred),
    "f1score": f1_score(test_pred_hat, test_pred),
}
test_metrics

{'auroc': 0.8808259234251636,
 'accuracy': 0.9743969298245614,
 'precision': 0.5545706371191136,
 'recall': 0.148626577579807,
 'f1score': 0.23442622950819672}