Implemented hits@k metric for link-prediction #675

Open · wants to merge 4 commits into base: main
6 changes: 6 additions & 0 deletions inference_scripts/lp_infer/ml_lp_none_train_etype_infer.yaml
@@ -23,3 +23,9 @@ gsf:
lp_decoder_type: "dot_product"
sparse_optimizer_lr: 1e-2
use_node_embeddings: false
link_prediction:
num_negative_edges: 4
num_negative_edges_eval: 100
train_negative_sampler: joint
exclude_training_targets: false
reverse_edge_types_map: []
10 changes: 10 additions & 0 deletions python/graphstorm/config/argument.py
@@ -2076,6 +2076,14 @@ def eval_metric(self):

return eval_metric

@property
def eval_k(self):
Contributor
Is it possible to have a list of ks instead of just 1 k?

Collaborator Author
I think most of the LP/retrieval datasets only select one specific k.

Collaborator Author
changed to eval_hit_k

""" k used for link prediction evaluation metrics including hits@k
"""
if hasattr(self, "_eval_k"):
return self._eval_k
return 100

@property
def model_select_etype(self):
""" Canonical etype used for selecting the best model
@@ -2494,6 +2502,8 @@ def _add_task_general_args(parser):
help="Whether report evaluation metrics per node type or edge type."
"If True, report evaluation results for each node type/edge type."
"If False, report an average evaluation result.")
group.add_argument('--eval-k', type=int, default=argparse.SUPPRESS,
help="k used for link prediction evaluation metrics including hits@k.")
return parser

def _add_inference_args(parser):
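The `--eval-k` option uses `default=argparse.SUPPRESS`, so the attribute only lands on the parsed namespace when the user actually passes the flag, and the `hasattr` check in the `eval_k` property then supplies 100 as the fallback. A minimal standalone sketch of that pattern (the `DemoConfig` class below is illustrative only, not GraphStorm's real config plumbing):

```python
import argparse

class DemoConfig:
    """ Illustrative stand-in for a config object populated from parsed args. """
    def __init__(self, args):
        # Copy only the attributes that exist on the namespace; with SUPPRESS,
        # options the user did not pass never show up here.
        for name, value in vars(args).items():
            setattr(self, "_" + name, value)

    @property
    def eval_k(self):
        # Fall back to 100 when --eval-k was not given on the command line.
        if hasattr(self, "_eval_k"):
            return self._eval_k
        return 100

parser = argparse.ArgumentParser()
parser.add_argument("--eval-k", type=int, default=argparse.SUPPRESS)

print(DemoConfig(parser.parse_args([])).eval_k)                   # 100 (fallback)
print(DemoConfig(parser.parse_args(["--eval-k", "10"])).eval_k)   # 10
```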
41 changes: 39 additions & 2 deletions python/graphstorm/eval/eval_func.py
@@ -27,7 +27,7 @@
SUPPORTED_CLASSIFICATION_METRICS = {'accuracy', 'precision_recall', \
'roc_auc', 'f1_score', 'per_class_f1_score', 'per_class_roc_auc'}
SUPPORTED_REGRESSION_METRICS = {'rmse', 'mse', 'mae'}
SUPPORTED_LINK_PREDICTION_METRICS = {"mrr"}
SUPPORTED_LINK_PREDICTION_METRICS = {"mrr", "hits"}

class ClassificationMetrics:
""" object that compute metrics for classification tasks.
@@ -127,12 +127,19 @@ def init_best_metric(self, metric):
class LinkPredictionMetrics:
""" object that compute metrics for LP tasks.
"""
def __init__(self):
def __init__(self, k=100):
self.supported_metrics = SUPPORTED_LINK_PREDICTION_METRICS
self.k = k

# This is the operator used to compare whether current value is better than the current best
self.metric_comparator = {}
self.metric_comparator["mrr"] = operator.le
self.metric_comparator["hits"] = operator.le

# This is the function used to compute each metric
self.metric_function = {}
self.metric_function["mrr"] = gen_mrr_score
self.metric_function["hits"] = partial(hits_at_k, k=k)

def assert_supported_metric(self, metric):
""" check if the given metric is supported.
@@ -414,3 +421,33 @@ def compute_mae(pred, labels):

diff = th.abs(pred.cpu() - labels.cpu())
return th.mean(diff).cpu().item()

def gen_mrr_score(ranking):
""" Get link prediction mrr metrics

Parameters
----------
ranking:
ranking of each positive edge

Returns
-------
link prediction eval metric: tensor
"""
logs = th.div(1.0, ranking)
return th.tensor(th.div(th.sum(logs),len(logs)))

def hits_at_k(rankings, k):
""" Hits@k metric for link prediction
Parameters
----------
ranking:
ranking of each positive edge
k: int
the maximum rank considered to accept a positive
Returns
-------
link prediction eval metric: tensor
"""
score = th.tensor(th.div(th.sum(rankings < k), len(rankings)))
return score
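As a quick, hypothetical sanity check of the two metric functions (not part of the PR's tests; the import path assumes the module layout in this diff): given the 1-based ranks of the positive edges, `gen_mrr_score` averages the reciprocal ranks, while `hits_at_k` returns the fraction of ranks that fall under the cutoff.

```python
import torch as th

from graphstorm.eval.eval_func import LinkPredictionMetrics, gen_mrr_score, hits_at_k

# 1-based ranks of four positive edges among their sampled negatives.
ranks = th.tensor([1, 2, 5, 200])

print(gen_mrr_score(ranks))     # mean of 1/rank: (1 + 0.5 + 0.2 + 0.005) / 4 ≈ 0.4263
print(hits_at_k(ranks, k=100))  # fraction of ranks below k (the comparison is strict): 3/4 = 0.75

# The evaluator reaches hits@k through LinkPredictionMetrics, which binds k via partial.
metrics_obj = LinkPredictionMetrics(k=100)
print(metrics_obj.metric_function["hits"](ranks))  # same 0.75 as above
```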
90 changes: 46 additions & 44 deletions python/graphstorm/eval/evaluator.py
@@ -17,6 +17,7 @@
"""
import logging
import abc
from collections import defaultdict
from statistics import mean
import torch as th

@@ -26,7 +27,6 @@
EARLY_STOP_CONSECUTIVE_INCREASE_STRATEGY,
LINK_PREDICTION_MAJOR_EVAL_ETYPE_ALL)
from ..utils import get_rank, get_world_size, barrier
from .utils import gen_mrr_score

def early_stop_avg_increase_judge(val_score, val_perf_list, comparator):
"""
@@ -781,12 +781,14 @@ def val_perf_rank_list(self):


class GSgnnMrrLPEvaluator(GSgnnLPEvaluator):
""" The class for link prediction evaluation using Mrr metric.
""" The class for link prediction evaluation using Mrr or Hits metrics.

Parameters
----------
eval_frequency: int
The frequency (# of iterations) of doing evaluation.
eval_metric: list of string
Evaluation metric used during evaluation.
data: GSgnnEdgeData
The processed dataset
num_negative_edges_eval: int
@@ -802,14 +804,16 @@ class GSgnnMrrLPEvaluator(GSgnnLPEvaluator):
early_stop_strategy: str
The early stop strategy. GraphStorm supports two strategies:
1) consecutive_increase and 2) average_increase.
k: int
k used for computing metrics such as hits@k.
"""
def __init__(self, eval_frequency, data,
def __init__(self, eval_frequency, eval_metric, data,
num_negative_edges_eval, lp_decoder_type,
use_early_stop=False,
early_stop_burnin_rounds=0,
early_stop_rounds=3,
early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY):
eval_metric = ["mrr"]
early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY,
k=100):
Contributor
It is better to define a hit@k evaluator.

Collaborator Author
I think that will lead to a lot of duplicated code. I saw GSgnnRegressionEvaluator allows multiple metrics. What's the reasoning for having evaluator vs. metric objects?

Contributor
We have two choices:

1. Define a GSgnnRankingLPEvaluator class, and have both GSgnnMrrLPEvaluator and GSgnnHitKLPEvaluator inherit from GSgnnRankingLPEvaluator.
2. Define a single GSgnnRankingLPEvaluator class that implements both mrr() and hit(), i.e., rename the existing GSgnnMrrLPEvaluator to GSgnnRankingLPEvaluator.

I would prefer the first one.
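For illustration, a minimal sketch of the first option: the names GSgnnRankingLPEvaluator and GSgnnHitKLPEvaluator are hypothetical and come from this comment, the dataset/decoder arguments, distributed all_reduce, and best-score bookkeeping of the real evaluators are omitted, and the imports and GSgnnLPEvaluator base class already present in evaluator.py are assumed.

```python
class GSgnnRankingLPEvaluator(GSgnnLPEvaluator):
    """ Hypothetical shared base for ranking-based LP evaluators. """
    def __init__(self, eval_frequency, eval_metric, k=100, **early_stop_kwargs):
        super().__init__(eval_frequency, eval_metric, **early_stop_kwargs)
        self.metrics_obj = LinkPredictionMetrics(k)

    def compute_score(self, rankings, train=False):  # pylint:disable=unused-argument
        # Merge per-etype rankings, then apply every configured metric.
        ranking = th.cat(list(rankings.values()), dim=0) \
            if isinstance(rankings, dict) else rankings
        return {metric: self.metrics_obj.metric_function[metric](ranking)
                for metric in self.metric}

class GSgnnMrrLPEvaluator(GSgnnRankingLPEvaluator):
    """ Reports MRR only. """
    def __init__(self, eval_frequency, **kwargs):
        super().__init__(eval_frequency, eval_metric=["mrr"], **kwargs)

class GSgnnHitKLPEvaluator(GSgnnRankingLPEvaluator):
    """ Reports hits@k only. """
    def __init__(self, eval_frequency, k=100, **kwargs):
        super().__init__(eval_frequency, eval_metric=["hits"], k=k, **kwargs)
```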

super(GSgnnMrrLPEvaluator, self).__init__(eval_frequency,
eval_metric, use_early_stop, early_stop_burnin_rounds,
early_stop_rounds, early_stop_strategy)
@@ -819,7 +823,7 @@ def __init__(self, eval_frequency, data,
self.num_negative_edges_eval = num_negative_edges_eval
self.lp_decoder_type = lp_decoder_type

self.metrics_obj = LinkPredictionMetrics()
self.metrics_obj = LinkPredictionMetrics(k)

self._best_val_score = {}
self._best_test_score = {}
@@ -834,7 +838,7 @@ def compute_score(self, rankings, train=False): # pylint:disable=unused-argument

Parameters
----------
rankings: dict of tensors
rankings: dict of tensors or tensor
Rankings of positive scores in format of {etype: ranking}
train: bool
TODO: Reversed for future use cases when we want to use different
@@ -845,25 +849,30 @@ def compute_score(self, rankings, train=False): # pylint:disable=unused-argument
-------
Evaluation metric values: dict
"""
# We calculate global mrr, etype is ignored.
# User can develop its own per etype MRR evaluator
ranking = []
for _, rank in rankings.items():
ranking.append(rank)
ranking = th.cat(ranking, dim=0)
# We calculate global lp metrics, etype is ignored.
# User can develop its own per etype LP evaluator
if isinstance(rankings, dict):
ranking = []
for _, rank in rankings.items():
ranking.append(rank)
ranking = th.cat(ranking, dim=0)
else:
ranking = rankings

metrics = gen_mrr_score(ranking)
scores = {}
for metric in self._metric:
scores[metric] = self.metrics_obj.metric_function[metric](ranking)

return_scores = {}
# When world size == 1, we do not need the barrier
if get_world_size() > 1:
barrier()
for _, metric_val in metrics.items():
for metric, metric_val in scores.items():
th.distributed.all_reduce(metric_val)
return_metrics = {}
for metric, metric_val in metrics.items():
for metric, metric_val in scores.items():
return_metric = metric_val / get_world_size()
return_metrics[metric] = return_metric.item()
return return_metrics
return_scores[metric] = return_metric.item()
return return_scores

def evaluate(self, val_scores, test_scores, total_iters):
""" GSgnnLinkPredictionModel.fit() will call this function to do user defined evalution.
@@ -888,7 +897,7 @@ def evaluate(self, val_scores, test_scores, total_iters):
if test_scores is not None:
test_score = self.compute_score(test_scores)
else:
test_score = {"mrr": "N/A"} # Dummy
test_score = {metric: "N/A" for metric in self.metric} # Dummy

if val_scores is not None:
val_score = self.compute_score(val_scores)
@@ -902,7 +911,7 @@ def evaluate(self, val_scores, test_scores, total_iters):
self._best_test_score[metric] = test_score[metric]
self._best_iter[metric] = total_iters
else:
val_score = {"mrr": "N/A"} # Dummy
val_score = {metric: "N/A" for metric in self.metric} # Dummy

return val_score, test_score

@@ -915,6 +924,8 @@ class GSgnnPerEtypeMrrLPEvaluator(GSgnnMrrLPEvaluator):
----------
eval_frequency: int
The frequency (# of iterations) of doing evaluation.
eval_metric: list of string
Evaluation metric used during evaluation.
data: GSgnnEdgeData
The processed dataset
num_negative_edges_eval: int
@@ -932,16 +943,19 @@ class GSgnnPerEtypeMrrLPEvaluator(GSgnnMrrLPEvaluator):
early_stop_strategy: str
The early stop strategy. GraphStorm supports two strategies:
1) consecutive_increase and 2) average_increase.
k: int
k used for computing metrics such as hits@k.
"""
def __init__(self, eval_frequency, data,
def __init__(self, eval_frequency, eval_metric, data,
num_negative_edges_eval, lp_decoder_type,
major_etype = LINK_PREDICTION_MAJOR_EVAL_ETYPE_ALL,
use_early_stop=False,
early_stop_burnin_rounds=0,
early_stop_rounds=3,
early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY):
early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY,
k=100):
self.major_etype = major_etype
super(GSgnnPerEtypeMrrLPEvaluator, self).__init__(eval_frequency,
super(GSgnnPerEtypeMrrLPEvaluator, self).__init__(eval_frequency, eval_metric,
data, num_negative_edges_eval, lp_decoder_type,
use_early_stop, early_stop_burnin_rounds,
early_stop_rounds, early_stop_strategy)
@@ -961,28 +975,16 @@ def compute_score(self, rankings, train=False): # pylint:disable=unused-argument

Returns
-------
Evaluation metric values: dict
Evaluation metric values: dict of dict
"""
# We calculate global mrr, etype is ignored.
# User can develop its own per etype MRR evaluator
metrics = {}
scores = {}
for etype, rank in rankings.items():
metrics[etype] = gen_mrr_score(rank)

# When world size == 1, we do not need the barrier
if get_world_size() > 1:
barrier()
for _, metric in metrics.items():
for _, metric_val in metric.items():
th.distributed.all_reduce(metric_val)

return_metrics = {}
for etype, metric in metrics.items():
for metric_key, metric_val in metric.items():
return_metric = metric_val / get_world_size()
if metric_key not in return_metrics:
return_metrics[metric_key] = {}
return_metrics[metric_key][etype] = return_metric.item()
scores[etype] = super().compute_score(rank)
# reorganize the nested dict to be keyed by metric, then by etype:
return_metrics = defaultdict(dict)
for metric in self.metric:
for etype in rankings.keys():
return_metrics[metric][etype] = scores[etype][metric]
return return_metrics

def _get_major_score(self, score):
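The per-etype change above now reuses the parent's compute_score for each etype and then flips the nesting so results are keyed by metric first, then by etype. A small, self-contained illustration of just that reshaping step (the etype keys and score values are made up):

```python
from collections import defaultdict

# Per-etype scores as returned by the parent class's compute_score().
scores = {
    ("user", "rates", "movie"):   {"mrr": 0.41, "hits": 0.73},
    ("user", "watches", "movie"): {"mrr": 0.37, "hits": 0.69},
}

# Reorganize to be keyed by metric first, then by etype.
return_metrics = defaultdict(dict)
for etype, per_etype in scores.items():
    for metric, value in per_etype.items():
        return_metrics[metric][etype] = value

print(dict(return_metrics))
# {'mrr': {...}, 'hits': {...}} -- one inner dict per metric, keyed by etype
```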
16 changes: 0 additions & 16 deletions python/graphstorm/eval/utils.py
@@ -266,22 +266,6 @@ def gen_lp_score(ranking):
metrics[metric] = th.tensor(sum(log[metric] for log in logs) / len(logs))
return metrics

def gen_mrr_score(ranking):
""" Get link prediction mrr metrics

Parameters
----------
ranking:
ranking of each positive edge

Returns
-------
link prediction eval metrics: list of dict
"""
logs = th.div(1.0, ranking)
metrics = {"mrr": th.tensor(th.div(th.sum(logs),len(logs)))}
return metrics


def broadcast_data(rank, world_size, data_tensor):
""" Broadcast local data to all trainers in the cluster using all2all
10 changes: 6 additions & 4 deletions python/graphstorm/run/gsgnn_lp/gsgnn_lm_lp.py
@@ -54,27 +54,29 @@ def get_evaluator(config, train_data):
train_data: GSgnnEdgeData
Training data
"""
assert len(config.eval_metric) == 1, \
"GraphStorm doees not support computing multiple metrics at the same time."
if config.report_eval_per_type:
return GSgnnPerEtypeMrrLPEvaluator(config.eval_frequency,
config.eval_metric,
train_data,
config.num_negative_edges_eval,
config.lp_decoder_type,
config.model_select_etype,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
config.early_stop_strategy,
config.eval_k)
else:
return GSgnnMrrLPEvaluator(config.eval_frequency,
config.eval_metric,
train_data,
config.num_negative_edges_eval,
config.lp_decoder_type,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
config.early_stop_strategy,
config.eval_k)

def main(config_args):
""" main function
10 changes: 6 additions & 4 deletions python/graphstorm/run/gsgnn_lp/gsgnn_lp.py
@@ -63,27 +63,29 @@ def get_evaluator(config, train_data):
train_data: GSgnnEdgeData
Training data
"""
assert len(config.eval_metric) == 1, \
"GraphStorm doees not support computing multiple metrics at the same time."
if config.report_eval_per_type:
return GSgnnPerEtypeMrrLPEvaluator(config.eval_frequency,
config.eval_metric,
train_data,
config.num_negative_edges_eval,
config.lp_decoder_type,
config.model_select_etype,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
config.early_stop_strategy,
config.eval_k)
else:
return GSgnnMrrLPEvaluator(config.eval_frequency,
config.eval_metric,
train_data,
config.num_negative_edges_eval,
config.lp_decoder_type,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
config.early_stop_strategy,
config.eval_k)

def main(config_args):
""" main function