Switch to argparse (#3)
* switch to argparse for more standard CLI arguments
* new measures: Pearson rank, Spearman, and KL divergence
thongnt99 committed May 27, 2021
1 parent bf26ec6 commit e6152b3
Showing 13 changed files with 256 additions and 183 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,24 @@
name: test
on: [push]
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest
pip install -r requirements.txt
- name: Test with pytest
run: |
export PYTHONPATH=${PYTHONPATH}:/home/runner/work/capreolus/diffir
pytest
4 changes: 4 additions & 0 deletions diffir/__init__.py
@@ -1,4 +1,8 @@
__version__ = "0.1.0"

from diffir.weight import Weight
from diffir.weight.custom import CustomWeight
from diffir.weight.unsupervised import ExactMatchWeight
from diffir.measure import Measure
from diffir.measure.qrels import QrelMeasure
from diffir.measure.unsupervised import TopkMeasure
10 changes: 7 additions & 3 deletions diffir/batchrun.py
@@ -29,9 +29,13 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("directory")
parser.add_argument("-o", "--output", dest="output_dir")
parser.add_argument("--config", dest="config", nargs="*")

parser.add_argument("--dataset", dest="dataset", type=str, help="dataset from ir_datasets")
parser.add_argument("--measure", dest="measure", type=str, default="tauap", help="measure for ranking difference (qrel, tauap,weightedtau)")
parser.add_argument("--metric", dest="metric", type=str, default="MAP", help="metric used with qrel measure")
parser.add_argument("--topk", dest="topk", type=int, default=10)
args = parser.parse_args()
config = {"dataset": args.dataset, "measure": args.measure, "metric": args.metric, "topk": args.topk,
"weight": {"weights_1": None, "weights_2": None}}
indir = Path(args.directory)
output = Path(args.output_dir) if args.output_dir else indir / "diffir"
output.mkdir(exist_ok=True)
@@ -50,7 +54,7 @@ def main():

single_runs = sorted(single_runs) # sorted needed for itertools ordering
queue = [(fn,) for fn in single_runs] + list(itertools.combinations(single_runs, 2))
f = partial(process_runs, config=args.config, output=output)
f = partial(process_runs, config=config, output=output)
with multiprocessing.Pool(8) as p:
outdirs = p.map(f, queue)

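For reference, a minimal sketch of how the new flags behave: it rebuilds the parser defined above and feeds it a made-up command line (the runs directory and dataset id are hypothetical).

import argparse

# Rebuild the parser exactly as defined in the diff above.
parser = argparse.ArgumentParser()
parser.add_argument("directory")
parser.add_argument("-o", "--output", dest="output_dir")
parser.add_argument("--dataset", dest="dataset", type=str, help="dataset from ir_datasets")
parser.add_argument("--measure", dest="measure", type=str, default="tauap")
parser.add_argument("--metric", dest="metric", type=str, default="MAP")
parser.add_argument("--topk", dest="topk", type=int, default=10)

# Hypothetical invocation: <batch script> runs/ --dataset msmarco-passage/dev --measure qrel --metric nDCG@10
args = parser.parse_args(["runs/", "--dataset", "msmarco-passage/dev", "--measure", "qrel", "--metric", "nDCG@10"])
config = {"dataset": args.dataset, "measure": args.measure, "metric": args.metric, "topk": args.topk,
          "weight": {"weights_1": None, "weights_2": None}}
print(config["measure"], config["topk"])  # qrel 10

With no --measure or --topk given, the defaults (tauap, 10) flow straight into the config dict that replaces the old --config list.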
33 changes: 17 additions & 16 deletions diffir/measure/__init__.py
@@ -1,26 +1,27 @@
from profane import ModuleBase, import_all_modules, ConfigOption
class Measure:
def __init__(self, metric="ndcg_20", topk=5):
'''
Measure construction
:param metric: The metric used for selecting queries.
:param topk: How many queries to retrieve
'''
self.metric = metric
self.topk = topk


class Measure(ModuleBase):
module_type = "measure"
config_spec = [
ConfigOption(key="metric", default_value="ndcg_20", description="The metric to use for selecting queries"),
ConfigOption(key="topk", default_value=5, description="How many queries to retrieve"),
]

# TODO finalize API
def query_differences(self, run1, run2, *args, **kwargs):
'''
:param run1: the first run
:param run2: the second run
:param args:
:param kwargs:
        :return: the most-differing query ids, a dict mapping qid to difference, and the metric name
'''
if run1 and run2:
return self._query_differences(run1, run2, *args, **kwargs)
elif run1 and run2 is None:
qids = sorted(list(run1.keys()))[: self.config["topk"]]
qids = sorted(list(run1.keys()))[: self.topk]
id2diff = {qid: 0 for qid in qids}
return qids, id2diff, "singlerun"

def _query_differences(self, run1, run2, *args, **kwargs):
raise NotImplementedError


# TODO this is going to break once we introduce optional modules. need a way for them to fail gracefully.
# or to enumerate/register them without importing the py file?
import_all_modules(__file__, __package__)
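A small illustration of the single-run branch in query_differences above: when run2 is None, the first topk qids are returned with zero differences and the label "singlerun". The run contents are made up, and this assumes the package imports cleanly at this commit.

from diffir.measure import Measure

run1 = {"q3": {"d1": 2.0}, "q1": {"d1": 1.0}, "q2": {"d2": 0.5}}  # hypothetical single run

m = Measure(metric="ndcg_20", topk=2)
qids, id2diff, label = m.query_differences(run1, None)
print(qids, id2diff, label)  # ['q1', 'q2'] {'q1': 0, 'q2': 0} singlerun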
44 changes: 28 additions & 16 deletions diffir/measure/qrels.py
@@ -1,17 +1,10 @@
import pytrec_eval
from profane import ModuleBase, Dependency, ConfigOption
from ir_measures import iter_calc, parse_measure
import sys
from diffir.measure import Measure


@Measure.register
class QrelMeasure(Measure):
module_name = "qrel"

config_spec = [
ConfigOption(key="topk", default_value=10, description="The number of differing queries to return"),
ConfigOption(key="metric", default_value="ndcg_cut_20", description="TODO"),
]

def _query_differences(self, run1, run2, *args, **kwargs):
"""
:param run1: TREC run. Has the format {qid: {docid: score}, ...}
@@ -28,13 +21,32 @@ def _query_differences(self, run1, run2, *args, **kwargs):
run2 = {qid: doc_id_to_score for qid, doc_id_to_score in run2.items() if qid in overlapping_keys}

qrels = dataset.qrels_dict()
metric = self.config["metric"]
topk = self.config["topk"]
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {metric})
eval_run_1 = evaluator.evaluate(run1)
eval_run_2 = evaluator.evaluate(run2)
try:
metric = parse_measure(self.metric)
except NameError:
print("Unknown measure: {}. Please provide a measure supported by https://ir-measur.es/".format(self.metric))
sys.exit(1)

topk = self.topk
eval_run_1 = self.convert_to_nested_dict(iter_calc([metric], qrels, run1))
eval_run_2 = self.convert_to_nested_dict(iter_calc([metric], qrels, run2))

query_ids = eval_run_1.keys() & eval_run_2.keys()
query_ids = sorted(query_ids, key=lambda x: abs(eval_run_1[x][metric] - eval_run_2[x][metric]), reverse=True)
query_ids = query_ids[:topk]
id2diff = {x:abs(eval_run_1[x][metric] - eval_run_2[x][metric]) for x in query_ids}
return query_ids, id2diff, metric
id2diff = {x: abs(eval_run_1[x][metric] - eval_run_2[x][metric]) for x in query_ids}
id2qrelscores = {x: [eval_run_1[x][metric], eval_run_2[x][metric]] for x in query_ids}
return query_ids, id2diff, self.metric, id2qrelscores

def convert_to_nested_dict(self, ir_measures_iterator):
"""
Util method to convert the results from ir_measures.iter_calc to a dict.
TODO: We can probably refactor so that this method won't be needed
"""
eval_dict = {}

for x in ir_measures_iterator:
# TODO: This assumes that there would be only one measure/metric to handle.
eval_dict[x.query_id] = {x.measure: x.value}

return eval_dict
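The ir_measures calls used above behave roughly as sketched below; the qrels, run, and the nDCG@10 measure are illustrative only.

from ir_measures import iter_calc, parse_measure

# Toy qrels and run in the same {qid: {docid: ...}} shape QrelMeasure works with.
qrels = {"q1": {"d1": 1, "d2": 0}, "q2": {"d3": 1}}
run = {"q1": {"d1": 2.0, "d2": 1.0}, "q2": {"d3": 0.5, "d4": 0.4}}

metric = parse_measure("nDCG@10")  # unknown names raise NameError, as handled above
# iter_calc yields one record per query with query_id, measure, and value fields,
# which is exactly the nesting convert_to_nested_dict builds.
per_query = {m.query_id: {m.measure: m.value} for m in iter_calc([metric], qrels, run)}
print(per_query)  # e.g. {'q1': {nDCG@10: 1.0}, 'q2': {nDCG@10: 1.0}}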
62 changes: 28 additions & 34 deletions diffir/measure/unsupervised.py
@@ -1,17 +1,10 @@
from profane import ModuleBase, Dependency, ConfigOption
from diffir.measure import Measure
from scipy import stats
import numpy as np
import math


@Measure.register
class TopkMeasure(Measure):
module_name = "topk"
config_spec = [
ConfigOption(key="topk", default_value=3, description="TODO"),
ConfigOption(key="metric", default_value="weightedtau", description="Metric to measure the rank correaltion"),
]

def tauap(self, x, y, decreasing=True):
"""
@@ -47,8 +40,8 @@ def tauap_fast(self, x, y):
n = len(ry)
if n == 1:
return 1
ordered_idx = sorted(list(range(n)), key=lambda i: rx[i])
ry_ordered_by_rx = [(ry[idx], i) for i, idx in enumerate(ordered_idx)]
ordered_idx = sorted(list(range(n)), key=lambda i: ry[i])
rx_ordered_by_ry = [(rx[idx], i) for i, idx in enumerate(ordered_idx)]

def merge_sort(arr):
if len(arr) <= 1:
@@ -82,38 +75,37 @@ def merge_sort(arr):
j += 1
k += 1
return tauAP

res = (2 - 2 * merge_sort(ry_ordered_by_rx) / (n - 1)) - 1
res = (2 - 2 * merge_sort(rx_ordered_by_ry) / (n - 1)) - 1
return res
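For readability, a plain O(n²) sketch of the AP rank correlation (tau_AP) that the merge-sort routine above computes; tie handling and which list acts as the reference may differ slightly from the optimized version.

def tauap_naive(x, y):
    # AP rank correlation between two score lists: rank items by y, then ask how
    # often x agrees with y about what belongs above each position.
    n = len(x)
    if n == 1:
        return 1
    order = sorted(range(n), key=lambda i: y[i], reverse=True)
    total = 0.0
    for i in range(1, n):
        above = order[:i]
        total += sum(1 for j in above if x[j] > x[order[i]]) / i
    return 2 * total / (n - 1) - 1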

def pearson_rank(self, x, y):
x = np.interp(x, (min(x), max(x)), (0,1))
y = np.interp(y, (min(y), max(y)), (0,1))
indices = sorted(list(range(len(x))), key=lambda idx : x[idx], reverse=True)
x = np.interp(x, (min(x), max(x)), (0, 1))
y = np.interp(y, (min(y), max(y)), (0, 1))
indices = sorted(list(range(len(x))), key=lambda idx: x[idx], reverse=True)
x = x[indices]
y = y[indices]
x_diff = x.reshape(1,-1) - x.reshape(-1,1)
y_diff = y.reshape(1,-1) - y.reshape(-1,1)
x_diff = x.reshape(1, -1) - x.reshape(-1, 1)
y_diff = y.reshape(1, -1) - y.reshape(-1, 1)
den = x[1:].sum()
pr = 0
mask = np.tril(np.ones((len(x),len(x))),k=-1)
xy = x_diff*y_diff*mask
xx = x_diff*x_diff*mask
yy = y_diff*y_diff*mask
mask = np.tril(np.ones((len(x), len(x))), k=-1)
xy = x_diff * y_diff * mask
xx = x_diff * x_diff * mask
yy = y_diff * y_diff * mask
xy = xy.sum(axis=1)[1:]
xx = xx.sum(axis=1)[1:]
yy = yy.sum(axis=1)[1:]
den_i = np.sqrt(xx)*np.sqrt(yy)
den_i[den_i==0]=1e-5
res = (xy*x[1:]/den_i).sum()/den
den_i = np.sqrt(xx) * np.sqrt(yy)
den_i[den_i == 0] = 1e-5
res = (xy * x[1:] / den_i).sum() / den
return res
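A loop-based reading of the vectorized pearson_rank above (a sketch of the same arithmetic, not a reference implementation): each item contributes a Pearson-style correlation of its score differences against everything ranked above it, weighted by its normalized score.

import numpy as np

def pearson_rank_naive(x, y):
    x = np.interp(x, (min(x), max(x)), (0, 1))
    y = np.interp(y, (min(y), max(y)), (0, 1))
    order = sorted(range(len(x)), key=lambda i: x[i], reverse=True)
    x, y = x[order], y[order]
    num = 0.0
    for i in range(1, len(x)):
        dx, dy = x[:i] - x[i], y[:i] - y[i]  # differences to higher-ranked items
        den_i = np.sqrt((dx * dx).sum()) * np.sqrt((dy * dy).sum())
        num += x[i] * (dx * dy).sum() / (den_i if den_i != 0 else 1e-5)
    return num / x[1:].sum()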

def kl_div(self, x, y):
x = np.array(x) - min(x) + 1e-5
y = np.array(y) - min(y) + 1e-5
x = x/x.sum()
y = y/y.sum()
return -(stats.entropy(x,y)+stats.entropy(y,x))/2
x = x / x.sum()
y = y / y.sum()
return -(stats.entropy(x, y) + stats.entropy(y, x)) / 2
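kl_div above returns the negated symmetric KL divergence of the shifted and renormalized score distributions, so it behaves like the correlation metrics: near 0 when the two runs score a query identically, increasingly negative as they diverge. A quick check with made-up scores:

from scipy import stats
import numpy as np

def sym_kl(x, y):  # same arithmetic as kl_div above
    x = np.array(x) - min(x) + 1e-5
    y = np.array(y) - min(y) + 1e-5
    return -(stats.entropy(x / x.sum(), y / y.sum()) + stats.entropy(y / y.sum(), x / x.sum())) / 2

print(sym_kl([3.0, 2.0, 1.0], [3.0, 2.0, 1.0]))  # ~0: identical score profiles
print(sym_kl([3.0, 2.0, 1.0], [1.0, 2.0, 3.0]))  # negative: reversed score profiles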

def _query_differences(self, run1, run2, *args, **kwargs):
"""
@@ -124,22 +116,23 @@ def _query_differences(self, run1, run2, *args, **kwargs):
:return: The union of top k qids in both runs, sorted by the order in which the queries appear in run 1
^ This is because run 1 appears on the left hand side in the web ui
"""
topk = self.config["topk"]
metric = self.config["metric"]
topk = self.topk
metric = self.metric
qids = run1.keys() & run2.keys()
if not qids:
raise ValueError("run1 and run2 have no shared qids")

id2measure = {}
for qid in qids:
from collections import defaultdict
min_value = min(min(run1[qid].values()), min(run2[qid].values()))-1e-5
min_value = min(min(run1[qid].values()), min(run2[qid].values())) - 1e-5
doc_score_1 = defaultdict(lambda: min_value, run1[qid])
doc_score_2 = defaultdict(lambda: min_value, run2[qid])
doc_ids_1 = doc_score_1.keys()
doc_ids_2 = doc_score_2.keys()
doc_ids_union = set(doc_ids_1).union(set(doc_ids_2))
doc_ids_union = sorted(list(doc_ids_union), key=lambda id: (doc_score_1[id] + doc_score_2[id]), reverse=True)
doc_ids_union = sorted(list(doc_ids_union), key=lambda id: (doc_score_1[id] + doc_score_2[id]),
reverse=True)
union_score1 = [doc_score_1[doc_id] for doc_id in doc_ids_union]
union_score2 = [doc_score_2[doc_id] for doc_id in doc_ids_union]
if metric == "weightedtau":
@@ -148,14 +141,15 @@ def _query_differences(self, run1, run2, *args, **kwargs):
tau = self.tauap_fast(union_score1, union_score2)
elif metric == "spearmanr":
tau, p_value = stats.spearmanr(union_score1, union_score2)
elif metric == "pearsonr":
tau = (self.pearson_rank(union_score1, union_score2)+self.pearson_rank(union_score2, union_score1))/2
elif metric == "pearsonrank":
tau = (self.pearson_rank(union_score1, union_score2) + self.pearson_rank(union_score2,
union_score1)) / 2
elif metric == "kldiv":
tau = self.kl_div(union_score1, union_score2)
else:
raise ValueError("Metric {} not supported for the measure {}".format(self.config["metric"], self.module_name))
raise ValueError("Metric {} not supported for the measure {}".format(self.metric, "metric"))
id2measure[qid] = tau
qids = sorted(qids, key=lambda x: id2measure[x])
qids = qids[:topk]
id2measure = {idx: id2measure[idx] for idx in qids}
return qids, id2measure, metric
return qids, id2measure, metric, None
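The "weightedtau" and new "spearmanr" branches above call scipy directly on the two union-aligned score lists; a quick illustration with made-up scores (the "tauap", "pearsonrank", and "kldiv" branches go through the helper methods defined earlier in this file).

from scipy import stats

# Scores for the union of documents retrieved for one query, aligned by doc id;
# documents missing from a run are padded with a score just below the query's minimum.
union_score1 = [3.2, 2.9, 1.0, 0.9999]
union_score2 = [1.1, 0.0999, 3.0, 2.2]

tau, _ = stats.weightedtau(union_score1, union_score2)  # metric == "weightedtau"
rho, _ = stats.spearmanr(union_score1, union_score2)    # metric == "spearmanr"
print(tau, rho)  # lower correlation => bigger difference, so the qid sorts earlier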
