Implemented hits@k metric for link-prediction #675

Open · wants to merge 4 commits into base: main
6 changes: 6 additions & 0 deletions inference_scripts/lp_infer/ml_lp_none_train_etype_infer.yaml
@@ -23,3 +23,9 @@ gsf:
lp_decoder_type: "dot_product"
sparse_optimizer_lr: 1e-2
use_node_embeddings: false
link_prediction:
num_negative_edges: 4
num_negative_edges_eval: 100
train_negative_sampler: joint
exclude_training_targets: false
reverse_edge_types_map: []
10 changes: 10 additions & 0 deletions python/graphstorm/config/argument.py
@@ -2076,6 +2076,14 @@ def eval_metric(self):

return eval_metric

@property
def eval_k(self):
Contributor
Is it possible to have a list of ks instead of just 1 k?

Collaborator Author
I think most of the LP/retrieval datasets only select one specific k.

Collaborator Author
changed to eval_hit_k

""" k used for link prediction evaluation metrics including hits@k
"""
if hasattr(self, "_eval_k"):
return self._eval_k
return 100

@property
def model_select_etype(self):
""" Canonical etype used for selecting the best model
@@ -2494,6 +2502,8 @@ def _add_task_general_args(parser):
help="Whether report evaluation metrics per node type or edge type."
"If True, report evaluation results for each node type/edge type."
"If False, report an average evaluation result.")
group.add_argument('--eval-k', type=int, default=argparse.SUPPRESS,
help="k used for link prediction evaluation metrics including hits@k.")
return parser

def _add_inference_args(parser):
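The `--eval-k` option uses `default=argparse.SUPPRESS`, so the attribute only lands on the parsed namespace when the user actually passes the flag, and the `hasattr` check in the `eval_k` property then supplies 100 as the fallback. A minimal standalone sketch of that pattern (the `DemoConfig` class below is illustrative only, not GraphStorm's real config plumbing):

```python
import argparse

class DemoConfig:
    """ Illustrative stand-in for a config object populated from parsed args. """
    def __init__(self, args):
        # Copy only the attributes that exist on the namespace; with SUPPRESS,
        # options the user did not pass never show up here.
        for name, value in vars(args).items():
            setattr(self, "_" + name, value)

    @property
    def eval_k(self):
        # Fall back to 100 when --eval-k was not given on the command line.
        if hasattr(self, "_eval_k"):
            return self._eval_k
        return 100

parser = argparse.ArgumentParser()
parser.add_argument("--eval-k", type=int, default=argparse.SUPPRESS)

print(DemoConfig(parser.parse_args([])).eval_k)                   # 100 (fallback)
print(DemoConfig(parser.parse_args(["--eval-k", "10"])).eval_k)   # 10
```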
41 changes: 39 additions & 2 deletions python/graphstorm/eval/eval_func.py
@@ -27,7 +27,7 @@
SUPPORTED_CLASSIFICATION_METRICS = {'accuracy', 'precision_recall', \
'roc_auc', 'f1_score', 'per_class_f1_score', 'per_class_roc_auc'}
SUPPORTED_REGRESSION_METRICS = {'rmse', 'mse', 'mae'}
SUPPORTED_LINK_PREDICTION_METRICS = {"mrr"}
SUPPORTED_LINK_PREDICTION_METRICS = {"mrr", "hits"}

class ClassificationMetrics:
""" object that compute metrics for classification tasks.
@@ -127,12 +127,19 @@ def init_best_metric(self, metric):
class LinkPredictionMetrics:
""" object that compute metrics for LP tasks.
"""
def __init__(self):
def __init__(self, k=100):
self.supported_metrics = SUPPORTED_LINK_PREDICTION_METRICS
self.k = k

# This is the operator used to compare whether current value is better than the current best
self.metric_comparator = {}
self.metric_comparator["mrr"] = operator.le
self.metric_comparator["hits"] = operator.le

# This is the function used to compute each metric
self.metric_function = {}
self.metric_function["mrr"] = gen_mrr_score
self.metric_function["hits"] = partial(hits_at_k, k=k)

def assert_supported_metric(self, metric):
""" check if the given metric is supported.
@@ -414,3 +421,33 @@ def compute_mae(pred, labels):

diff = th.abs(pred.cpu() - labels.cpu())
return th.mean(diff).cpu().item()

def gen_mrr_score(ranking):
""" Get link prediction mrr metrics

Parameters
----------
ranking:
ranking of each positive edge

Returns
-------
link prediction eval metric: tensor
"""
logs = th.div(1.0, ranking)
return th.tensor(th.div(th.sum(logs),len(logs)))

def hits_at_k(rankings, k):
""" Hits@k metric for link prediction
Parameters
----------
ranking:
ranking of each positive edge
k: int
the maximum rank considered to accept a positive
Returns
-------
link prediction eval metric: tensor
"""
score = th.tensor(th.div(th.sum(rankings < k), len(rankings)))
return score
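As a quick, hypothetical sanity check of the two metric functions (not part of the PR's tests; the import path assumes the module layout in this diff): given the 1-based ranks of the positive edges, `gen_mrr_score` averages the reciprocal ranks, while `hits_at_k` returns the fraction of ranks that fall under the cutoff.

```python
import torch as th

from graphstorm.eval.eval_func import LinkPredictionMetrics, gen_mrr_score, hits_at_k

# 1-based ranks of four positive edges among their sampled negatives.
ranks = th.tensor([1, 2, 5, 200])

print(gen_mrr_score(ranks))     # mean of 1/rank: (1 + 0.5 + 0.2 + 0.005) / 4 ≈ 0.4263
print(hits_at_k(ranks, k=100))  # fraction of ranks below k (the comparison is strict): 3/4 = 0.75

# The evaluator reaches hits@k through LinkPredictionMetrics, which binds k via partial.
metrics_obj = LinkPredictionMetrics(k=100)
print(metrics_obj.metric_function["hits"](ranks))  # same 0.75 as above
```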
90 changes: 46 additions & 44 deletions python/graphstorm/eval/evaluator.py
@@ -17,6 +17,7 @@
"""
import logging
import abc
from collections import defaultdict
from statistics import mean
import torch as th

@@ -26,7 +27,6 @@
EARLY_STOP_CONSECUTIVE_INCREASE_STRATEGY,
LINK_PREDICTION_MAJOR_EVAL_ETYPE_ALL)
from ..utils import get_rank, get_world_size, barrier
from .utils import gen_mrr_score

def early_stop_avg_increase_judge(val_score, val_perf_list, comparator):
"""
@@ -781,12 +781,14 @@ def val_perf_rank_list(self):


class GSgnnMrrLPEvaluator(GSgnnLPEvaluator):
""" The class for link prediction evaluation using Mrr metric.
""" The class for link prediction evaluation using Mrr or Hits metrics.

Parameters
----------
eval_frequency: int
The frequency (# of iterations) of doing evaluation.
eval_metric: list of string
Evaluation metric used during evaluation.
data: GSgnnEdgeData
The processed dataset
num_negative_edges_eval: int
@@ -802,14 +804,16 @@ class GSgnnMrrLPEvaluator(GSgnnLPEvaluator):
early_stop_strategy: str
The early stop strategy. GraphStorm supports two strategies:
1) consecutive_increase and 2) average_increase.
k: int
k used for computing metrics such as hits@k.
"""
def __init__(self, eval_frequency, data,
def __init__(self, eval_frequency, eval_metric, data,
num_negative_edges_eval, lp_decoder_type,
use_early_stop=False,
early_stop_burnin_rounds=0,
early_stop_rounds=3,
early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY):
eval_metric = ["mrr"]
early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY,
k=100):
Contributor
It is better to define a hit@k evaluator.

Collaborator Author
I think that will lead to a lot of duplicated code. I saw GSgnnRegressionEvaluator allows multiple metrics. What's the reasoning for having evaluator vs. metric objects?

Contributor
We have two choices:

1. Define a GSgnnRankingLPEvaluator class, and have both GSgnnMrrLPEvaluator and GSgnnHitKLPEvaluator inherit from GSgnnRankingLPEvaluator.
2. Define a single GSgnnRankingLPEvaluator class that implements both mrr() and hit(), i.e., rename the existing GSgnnMrrLPEvaluator to GSgnnRankingLPEvaluator.

I would prefer the first one.
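For illustration, a minimal sketch of the first option: the names GSgnnRankingLPEvaluator and GSgnnHitKLPEvaluator are hypothetical and come from this comment, the dataset/decoder arguments, distributed all_reduce, and best-score bookkeeping of the real evaluators are omitted, and the imports and GSgnnLPEvaluator base class already present in evaluator.py are assumed.

```python
class GSgnnRankingLPEvaluator(GSgnnLPEvaluator):
    """ Hypothetical shared base for ranking-based LP evaluators. """
    def __init__(self, eval_frequency, eval_metric, k=100, **early_stop_kwargs):
        super().__init__(eval_frequency, eval_metric, **early_stop_kwargs)
        self.metrics_obj = LinkPredictionMetrics(k)

    def compute_score(self, rankings, train=False):  # pylint:disable=unused-argument
        # Merge per-etype rankings, then apply every configured metric.
        ranking = th.cat(list(rankings.values()), dim=0) \
            if isinstance(rankings, dict) else rankings
        return {metric: self.metrics_obj.metric_function[metric](ranking)
                for metric in self.metric}

class GSgnnMrrLPEvaluator(GSgnnRankingLPEvaluator):
    """ Reports MRR only. """
    def __init__(self, eval_frequency, **kwargs):
        super().__init__(eval_frequency, eval_metric=["mrr"], **kwargs)

class GSgnnHitKLPEvaluator(GSgnnRankingLPEvaluator):
    """ Reports hits@k only. """
    def __init__(self, eval_frequency, k=100, **kwargs):
        super().__init__(eval_frequency, eval_metric=["hits"], k=k, **kwargs)
```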

super(GSgnnMrrLPEvaluator, self).__init__(eval_frequency,
eval_metric, use_early_stop, early_stop_burnin_rounds,
early_stop_rounds, early_stop_strategy)
@@ -819,7 +823,7 @@ def __init__(self, eval_frequency, data,
self.num_negative_edges_eval = num_negative_edges_eval
self.lp_decoder_type = lp_decoder_type

self.metrics_obj = LinkPredictionMetrics()
self.metrics_obj = LinkPredictionMetrics(k)

self._best_val_score = {}
self._best_test_score = {}
@@ -834,7 +838,7 @@ def compute_score(self, rankings, train=False): # pylint:disable=unused-argument

Parameters
----------
rankings: dict of tensors
rankings: dict of tensors or tensor
Rankings of positive scores in format of {etype: ranking}
train: bool
TODO: Reversed for future use cases when we want to use different
@@ -845,25 +849,30 @@ def compute_score(self, rankings, train=False): # pylint:disable=unused-argument
-------
Evaluation metric values: dict
"""
# We calculate global mrr, etype is ignored.
# User can develop its own per etype MRR evaluator
ranking = []
for _, rank in rankings.items():
ranking.append(rank)
ranking = th.cat(ranking, dim=0)
# We calculate global lp metrics, etype is ignored.
# User can develop its own per etype LP evaluator
if isinstance(rankings, dict):
ranking = []
for _, rank in rankings.items():
ranking.append(rank)
ranking = th.cat(ranking, dim=0)
else:
ranking = rankings

metrics = gen_mrr_score(ranking)
scores = {}
for metric in self._metric:
scores[metric] = self.metrics_obj.metric_function[metric](ranking)

return_scores = {}
# When world size == 1, we do not need the barrier
if get_world_size() > 1:
barrier()
for _, metric_val in metrics.items():
for metric, metric_val in scores.items():
th.distributed.all_reduce(metric_val)
return_metrics = {}
for metric, metric_val in metrics.items():
for metric, metric_val in scores.items():
return_metric = metric_val / get_world_size()
return_metrics[metric] = return_metric.item()
return return_metrics
return_scores[metric] = return_metric.item()
return return_scores

def evaluate(self, val_scores, test_scores, total_iters):
""" GSgnnLinkPredictionModel.fit() will call this function to do user defined evalution.
@@ -888,7 +897,7 @@ def evaluate(self, val_scores, test_scores, total_iters):
if test_scores is not None:
test_score = self.compute_score(test_scores)
else:
test_score = {"mrr": "N/A"} # Dummy
test_score = {metric: "N/A" for metric in self.metric} # Dummy

if val_scores is not None:
val_score = self.compute_score(val_scores)
@@ -902,7 +911,7 @@ def evaluate(self, val_scores, test_scores, total_iters):
self._best_test_score[metric] = test_score[metric]
self._best_iter[metric] = total_iters
else:
val_score = {"mrr": "N/A"} # Dummy
val_score = {metric: "N/A" for metric in self.metric} # Dummy

return val_score, test_score

@@ -915,6 +924,8 @@ class GSgnnPerEtypeMrrLPEvaluator(GSgnnMrrLPEvaluator):
----------
eval_frequency: int
The frequency (# of iterations) of doing evaluation.
eval_metric: list of string
Evaluation metric used during evaluation.
data: GSgnnEdgeData
The processed dataset
num_negative_edges_eval: int
@@ -932,16 +943,19 @@ class GSgnnPerEtypeMrrLPEvaluator(GSgnnMrrLPEvaluator):
early_stop_strategy: str
The early stop strategy. GraphStorm supports two strategies:
1) consecutive_increase and 2) average_increase.
k: int
k used for computing metrics such as hits@k.
"""
def __init__(self, eval_frequency, data,
def __init__(self, eval_frequency, eval_metric, data,
num_negative_edges_eval, lp_decoder_type,
major_etype = LINK_PREDICTION_MAJOR_EVAL_ETYPE_ALL,
use_early_stop=False,
early_stop_burnin_rounds=0,
early_stop_rounds=3,
early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY):
early_stop_strategy=EARLY_STOP_AVERAGE_INCREASE_STRATEGY,
k=100):
self.major_etype = major_etype
super(GSgnnPerEtypeMrrLPEvaluator, self).__init__(eval_frequency,
super(GSgnnPerEtypeMrrLPEvaluator, self).__init__(eval_frequency, eval_metric,
data, num_negative_edges_eval, lp_decoder_type,
use_early_stop, early_stop_burnin_rounds,
early_stop_rounds, early_stop_strategy)
@@ -961,28 +975,16 @@ def compute_score(self, rankings, train=False): # pylint:disable=unused-argument

Returns
-------
Evaluation metric values: dict
Evaluation metric values: dict of dict
"""
# We calculate global mrr, etype is ignored.
# User can develop its own per etype MRR evaluator
metrics = {}
scores = {}
for etype, rank in rankings.items():
metrics[etype] = gen_mrr_score(rank)

# When world size == 1, we do not need the barrier
if get_world_size() > 1:
barrier()
for _, metric in metrics.items():
for _, metric_val in metric.items():
th.distributed.all_reduce(metric_val)

return_metrics = {}
for etype, metric in metrics.items():
for metric_key, metric_val in metric.items():
return_metric = metric_val / get_world_size()
if metric_key not in return_metrics:
return_metrics[metric_key] = {}
return_metrics[metric_key][etype] = return_metric.item()
scores[etype] = super().compute_score(rank)
# reorganize the nested dict to be keyed by metric, then by etype:
return_metrics = defaultdict(dict)
for metric in self.metric:
for etype in rankings.keys():
return_metrics[metric][etype] = scores[etype][metric]
return return_metrics

def _get_major_score(self, score):
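The per-etype change above now reuses the parent's compute_score for each etype and then flips the nesting so results are keyed by metric first, then by etype. A small, self-contained illustration of just that reshaping step (the etype keys and score values are made up):

```python
from collections import defaultdict

# Per-etype scores as returned by the parent class's compute_score().
scores = {
    ("user", "rates", "movie"):   {"mrr": 0.41, "hits": 0.73},
    ("user", "watches", "movie"): {"mrr": 0.37, "hits": 0.69},
}

# Reorganize to be keyed by metric first, then by etype.
return_metrics = defaultdict(dict)
for etype, per_etype in scores.items():
    for metric, value in per_etype.items():
        return_metrics[metric][etype] = value

print(dict(return_metrics))
# {'mrr': {...}, 'hits': {...}} -- one inner dict per metric, keyed by etype
```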
16 changes: 0 additions & 16 deletions python/graphstorm/eval/utils.py
@@ -266,22 +266,6 @@ def gen_lp_score(ranking):
metrics[metric] = th.tensor(sum(log[metric] for log in logs) / len(logs))
return metrics

def gen_mrr_score(ranking):
""" Get link prediction mrr metrics

Parameters
----------
ranking:
ranking of each positive edge

Returns
-------
link prediction eval metrics: list of dict
"""
logs = th.div(1.0, ranking)
metrics = {"mrr": th.tensor(th.div(th.sum(logs),len(logs)))}
return metrics


def broadcast_data(rank, world_size, data_tensor):
""" Broadcast local data to all trainers in the cluster using all2all
10 changes: 6 additions & 4 deletions python/graphstorm/run/gsgnn_lp/gsgnn_lm_lp.py
@@ -54,27 +54,29 @@ def get_evaluator(config, train_data):
train_data: GSgnnEdgeData
Training data
"""
assert len(config.eval_metric) == 1, \
"GraphStorm doees not support computing multiple metrics at the same time."
if config.report_eval_per_type:
return GSgnnPerEtypeMrrLPEvaluator(config.eval_frequency,
config.eval_metric,
train_data,
config.num_negative_edges_eval,
config.lp_decoder_type,
config.model_select_etype,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
config.early_stop_strategy,
config.eval_k)
else:
return GSgnnMrrLPEvaluator(config.eval_frequency,
config.eval_metric,
train_data,
config.num_negative_edges_eval,
config.lp_decoder_type,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
config.early_stop_strategy,
config.eval_k)

def main(config_args):
""" main function
10 changes: 6 additions & 4 deletions python/graphstorm/run/gsgnn_lp/gsgnn_lp.py
@@ -63,27 +63,29 @@ def get_evaluator(config, train_data):
train_data: GSgnnEdgeData
Training data
"""
assert len(config.eval_metric) == 1, \
"GraphStorm doees not support computing multiple metrics at the same time."
if config.report_eval_per_type:
return GSgnnPerEtypeMrrLPEvaluator(config.eval_frequency,
config.eval_metric,
train_data,
config.num_negative_edges_eval,
config.lp_decoder_type,
config.model_select_etype,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
config.early_stop_strategy,
config.eval_k)
else:
return GSgnnMrrLPEvaluator(config.eval_frequency,
config.eval_metric,
train_data,
config.num_negative_edges_eval,
config.lp_decoder_type,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
config.early_stop_strategy,
config.eval_k)

def main(config_args):
""" main function