evaluation refactoring #23

Merged · merged 20 commits · Jun 17, 2021
Changes from 11 commits
2 changes: 1 addition & 1 deletion __init__.py
@@ -1,3 +1,3 @@
 import SummerTime.model
 import SummerTime.dataset.stdatasets as data
-import SummerTime.eval
+import SummerTime.evaluation
176 changes: 83 additions & 93 deletions demo.ipynb

Large diffs are not rendered by default.

11 changes: 0 additions & 11 deletions eval/Metric.py

This file was deleted.

4 changes: 0 additions & 4 deletions eval/__init__.py

This file was deleted.

12 changes: 0 additions & 12 deletions eval/bertscore.py

This file was deleted.

12 changes: 0 additions & 12 deletions eval/bleu.py

This file was deleted.

17 changes: 0 additions & 17 deletions eval/rouge.py

This file was deleted.

15 changes: 0 additions & 15 deletions eval/rougewe.py

This file was deleted.

6 changes: 6 additions & 0 deletions evaluation/__init__.py
@@ -0,0 +1,6 @@
from .rouge_metric import Rouge
from .bertscore_metric import BertScore
from .rougewe_metric import RougeWe
from .bleu_metric import Bleu

SUPPORTED_EVALUATION_METRICS = [BertScore, RougeWe, Bleu]
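For context, a minimal usage sketch (not part of the diff) of the registry defined above; it assumes the package is importable as evaluation, as in the tests added later in this PR, and that summ_eval is installed so the metric modules import cleanly.

from evaluation import SUPPORTED_EVALUATION_METRICS

# Print the metadata each registered metric class declares on itself.
for metric_class in SUPPORTED_EVALUATION_METRICS:
    print(metric_class.metric_name, metric_class.range, metric_class.higher_is_better)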
22 changes: 22 additions & 0 deletions evaluation/base_metric.py
@@ -0,0 +1,22 @@
from typing import List, Tuple, Dict

class SummMetric():
metric_name: str = None
range: Tuple[float, float] = None
higher_is_better: bool = None
low_resource: bool = None

def evaluate(self,
## TODO zhangir: integrate with dataset api
inputs: List[str],
targets: List[str],
keys: List[str]) -> Dict[str, float]:
"""
All metrics must implement this method.
:param inputs: A list of generated summaries.
:param targets: A list of target summaries corresponding to each entry of inputs.
:param keys: Which scores to return,
e.g. ['rouge_1_f_score', 'rouge_2_f_score']
:return: A dictionary mapping each requested key to its score.
"""
raise NotImplementedError("the base class for metrics shouldn't be instantiated!")
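For orientation, a minimal sketch (not part of this PR) of what a concrete subclass of SummMetric could look like; ExactMatch and its key name are hypothetical, chosen only to illustrate the contract without an external backend.

from typing import List, Dict
from SummerTime.evaluation.base_metric import SummMetric

class ExactMatch(SummMetric):
    metric_name = "exact match"  # hypothetical metric, for illustration only
    range = (0, 1)               # inclusive on both ends
    higher_is_better = True
    low_resource = True

    def evaluate(self,
                 inputs: List[str],
                 targets: List[str],
                 keys: List[str] = ["exact_match"]) -> Dict[str, float]:
        # Fraction of generated summaries identical to their targets.
        score = sum(i == t for i, t in zip(inputs, targets)) / len(inputs)
        return {key: score for key in keys}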
20 changes: 20 additions & 0 deletions evaluation/bertscore_metric.py
@@ -0,0 +1,20 @@
from summ_eval.bert_score_metric import BertScoreMetric
from SummerTime.evaluation.summeval_metric import SummEvalMetric
from typing import List, Dict

class BertScore(SummEvalMetric):
metric_name = 'bert score'
range = (0, 1)
Reviewer comment: This is a nice design. I would also add a comment on top of the range variable noting whether it's inclusive or exclusive on the boundaries.

higher_is_better = True
low_resource = False

def __init__(self):
se_metric = BertScoreMetric()
super(BertScore, self).__init__(se_metric)

def evaluate(self,
inputs: List[str],
targets: List[str],
keys: List[str] = ['bert_score_f1']) -> Dict[str, float]:
#TODO zhangir: update when datasets api is merged
return super(BertScore, self).evaluate(inputs, targets, keys)
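A usage sketch for the class above (illustrative, not part of the diff); it assumes summ_eval and its bert-score dependency are installed, and the default key 'bert_score_f1' comes from the signature above.

from SummerTime.evaluation import BertScore

metric = BertScore()
scores = metric.evaluate(
    inputs=["The lights from Yankee Stadium failed to sell at auction."],
    targets=["An auction for the Yankee Stadium lights produced no bids."],
)
print(scores)  # e.g. {'bert_score_f1': 0.87}; the exact value depends on the model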
20 changes: 20 additions & 0 deletions evaluation/bleu_metric.py
@@ -0,0 +1,20 @@
from summ_eval.bleu_metric import BleuMetric
from SummerTime.evaluation.summeval_metric import SummEvalMetric
from typing import List, Dict

class Bleu(SummEvalMetric):
metric_name = 'bleu'
range = (0, 10)
higher_is_better = True
low_resource = True

def __init__(self):
se_metric = BleuMetric()
super(Bleu, self).__init__(se_metric)

def evaluate(self,
inputs: List[str],
targets: List[str],
keys: List[str] = ['bleu']) -> Dict[str, float]:
# TODO zhangir: potentially update when dataset api is merged.
return super(Bleu, self).evaluate(inputs, targets, keys)
19 changes: 19 additions & 0 deletions evaluation/rouge_metric.py
@@ -0,0 +1,19 @@
from summ_eval.rouge_metric import RougeMetric
from SummerTime.evaluation.summeval_metric import SummEvalMetric
from typing import List, Dict

class Rouge(SummEvalMetric):
metric_name = 'rouge'
range = (0, 1)
higher_is_better = True
low_resource = True

def __init__(self):
se_metric = RougeMetric()
super(Rouge, self).__init__(se_metric)

def evaluate(self,
inputs: List[str],
targets: List[str],
keys: List[str] = ['rouge_3_f_score']) -> Dict[str, float]:
return super(Rouge, self).evaluate(inputs, targets, keys)
22 changes: 22 additions & 0 deletions evaluation/rougewe_metric.py
@@ -0,0 +1,22 @@
from summ_eval.rouge_we_metric import RougeWeMetric
from SummerTime.evaluation.summeval_metric import SummEvalMetric
from typing import List, Dict
import nltk

class RougeWe(SummEvalMetric):
metric_name = 'rougeWE'
range = (0, 1)
higher_is_better = True
low_resource = False

def __init__(self):
nltk.download('stopwords')
se_metric = RougeWeMetric()
super(RougeWe, self).__init__(se_metric)

def evaluate(self,
inputs: List[str],
targets: List[str],
keys: List[str] = ['rouge_we_3_f']) -> Dict[str, float]:
#TODO zhangir: update when dataset api is merged.
return super(RougeWe, self).evaluate(inputs, targets, keys)
19 changes: 19 additions & 0 deletions evaluation/summeval_metric.py
@@ -0,0 +1,19 @@
from typing import List, Dict

from .base_metric import SummMetric
from summ_eval.metric import Metric as SEMetric

class SummEvalMetric(SummMetric):
"""
Generic class for a summarization metric whose backend is SummEval.
"""

def __init__(self,
se_metric: SEMetric):
self.se_metric = se_metric

def evaluate(self,
inputs: List[str],
targets: List[str],
keys: List[str]) -> Dict[str, float]:
score_dict = self.se_metric.evaluate_batch(
inputs, targets)
return {key: score_dict[key] for key in keys}
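An illustrative sketch (not part of the PR) of the key-filtering behaviour implemented above; FakeMetric is a hypothetical stand-in for a real summ_eval metric so the example avoids running a heavy backend.

from SummerTime.evaluation.summeval_metric import SummEvalMetric

class FakeMetric:
    # Mimics summ_eval's evaluate_batch interface with canned scores.
    def evaluate_batch(self, inputs, targets):
        return {"score_a": 0.5, "score_b": 0.9}

wrapper = SummEvalMetric(FakeMetric())
# Only the requested keys survive the dict comprehension in evaluate().
print(wrapper.evaluate(["hypothesis"], ["reference"], keys=["score_a"]))  # {'score_a': 0.5}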
1 change: 0 additions & 1 deletion summertime_pkg/README.md

This file was deleted.

41 changes: 41 additions & 0 deletions tests/evaluation_test.py
@@ -0,0 +1,41 @@
import unittest
from typing import Tuple, List, Dict

from evaluation import SUPPORTED_EVALUATION_METRICS

class TestEvaluationMetrics(unittest.TestCase):
def get_summary_pair(self, size: int = 1) -> Tuple[List[str], List[str]]:
test_output = [ """
Glowing letters that had been hanging above
the Yankee stadium from 1976 to 2008 were placed for auction at
Sotheby’s on Wednesday, but were not sold. The current owner
of the sign is Reggie Jackson, a Yankee hall-of-famer."""]
test_target = ["""
An auction for the lights from Yankee Stadium failed to
produce any bids on Wednesday at Sotheby’s. The lights,
currently owned by former Yankees player Reggie Jackson,
lit the stadium from 1976 until 2008."""]

return test_output, test_target


def test_evaluate(self):
print(f"{'#'*10} test_evaluate STARTS {'#'*10}")

for metric_class in SUPPORTED_EVALUATION_METRICS:
print(f"Test on {metric_class}")
metric = metric_class()

test_output, test_target = self.get_summary_pair()
score_dict = metric.evaluate(test_output, test_target)
print(f"{metric_class} output dictionary")
print(score_dict)
self.assertIsInstance(score_dict, dict)
self.assertNotEqual(score_dict, {})
for key in score_dict:
self.assertTrue(metric.range[0] <= score_dict[key])
self.assertTrue(score_dict[key] <= metric.range[1])


if __name__ == '__main__':
unittest.main()