Add 2CR for Contriever on BEIR (#1446)
Cathrineee committed Mar 6, 2023
1 parent fa3181d commit dfae4bb
Showing 9 changed files with 1,585 additions and 37 deletions.
1,018 changes: 989 additions & 29 deletions docs/2cr/beir.html

Large diffs are not rendered by default.

379 changes: 379 additions & 0 deletions pyserini/prebuilt_index_info.py

Large diffs are not rendered by default.

148 changes: 148 additions & 0 deletions pyserini/resources/beir.yaml
@@ -443,3 +443,151 @@ conditions:
          - nDCG@10: 0.6992
            R@100: 0.9270
            R@1000: 0.9767
  - name: contriever
    command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/contriever --index beir-v1.0.0-${dataset}.contriever --topics beir-v1.0.0-${dataset}-test --output $output --batch 128 --threads 16 --hits 1000 --remove-query
    datasets:
      - dataset: trec-covid
        scores:
          - nDCG@10: 0.2732
            R@100: 0.0368
            R@1000: 0.1675
      - dataset: bioasq
        scores:
          - nDCG@10: 0.3016
            R@100: 0.5412
            R@1000: 0.7396
      - dataset: nfcorpus
        scores:
          - nDCG@10: 0.3173
            R@100: 0.2943
            R@1000: 0.6232
      - dataset: nq
        scores:
          - nDCG@10: 0.2536
            R@100: 0.7712
            R@1000: 0.9286
      - dataset: hotpotqa
        scores:
          - nDCG@10: 0.4807
            R@100: 0.7046
            R@1000: 0.8294
      - dataset: fiqa
        scores:
          - nDCG@10: 0.2449
            R@100: 0.5619
            R@1000: 0.8215
      - dataset: signal1m
        scores:
          - nDCG@10: 0.2338
            R@100: 0.2568
            R@1000: 0.4757
      - dataset: trec-news
        scores:
          - nDCG@10: 0.3484
            R@100: 0.4234
            R@1000: 0.7389
      - dataset: robust04
        scores:
          - nDCG@10: 0.3155
            R@100: 0.2757
            R@1000: 0.5097
      - dataset: arguana
        scores:
          - nDCG@10: 0.3791
            R@100: 0.9011
            R@1000: 0.9851
      - dataset: webis-touche2020
        scores:
          - nDCG@10: 0.1668
            R@100: 0.3736
            R@1000: 0.7144
      - dataset: cqadupstack-android
        scores:
          - nDCG@10: 0.3771
            R@100: 0.7436
            R@1000: 0.9173
      - dataset: cqadupstack-english
        scores:
          - nDCG@10: 0.3571
            R@100: 0.6442
            R@1000: 0.8042
      - dataset: cqadupstack-gaming
        scores:
          - nDCG@10: 0.4597
            R@100: 0.8092
            R@1000: 0.9354
      - dataset: cqadupstack-gis
        scores:
          - nDCG@10: 0.2411
            R@100: 0.5792
            R@1000: 0.8018
      - dataset: cqadupstack-mathematica
        scores:
          - nDCG@10: 0.1841
            R@100: 0.5127
            R@1000: 0.7757
      - dataset: cqadupstack-physics
        scores:
          - nDCG@10: 0.3430
            R@100: 0.7013
            R@1000: 0.8980
      - dataset: cqadupstack-programmers
        scores:
          - nDCG@10: 0.3029
            R@100: 0.6402
            R@1000: 0.8434
      - dataset: cqadupstack-stats
        scores:
          - nDCG@10: 0.2483
            R@100: 0.5269
            R@1000: 0.7417
      - dataset: cqadupstack-tex
        scores:
          - nDCG@10: 0.1540
            R@100: 0.4333
            R@1000: 0.6870
      - dataset: cqadupstack-unix
        scores:
          - nDCG@10: 0.2636
            R@100: 0.5879
            R@1000: 0.8212
      - dataset: cqadupstack-webmasters
        scores:
          - nDCG@10: 0.2878
            R@100: 0.6485
            R@1000: 0.8800
      - dataset: cqadupstack-wordpress
        scores:
          - nDCG@10: 0.1914
            R@100: 0.5364
            R@1000: 0.7551
      - dataset: quora
        scores:
          - nDCG@10: 0.8349
            R@100: 0.9871
            R@1000: 0.9981
      - dataset: dbpedia-entity
        scores:
          - nDCG@10: 0.2916
            R@100: 0.4529
            R@1000: 0.7142
      - dataset: scidocs
        scores:
          - nDCG@10: 0.1491
            R@100: 0.3601
            R@1000: 0.6105
      - dataset: fever
        scores:
          - nDCG@10: 0.6821
            R@100: 0.9356
            R@1000: 0.9655
      - dataset: climate-fever
        scores:
          - nDCG@10: 0.1550
            R@100: 0.4422
            R@1000: 0.7232
      - dataset: scifact
        scores:
          - nDCG@10: 0.6493
            R@100: 0.9260
            R@1000: 0.9967
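
For orientation, the ${dataset} and $output placeholders in the command above are filled in per dataset by the reproduction scripts. Below is a minimal sketch of that expansion, assuming PyYAML is installed; the loop and the run-file naming are illustrative, not the exact logic of the 2CR driver.

# Illustrative expansion of the contriever condition's command template per dataset.
# Assumes PyYAML; the output path below is a placeholder, not the 2CR driver's convention.
import yaml

with open('pyserini/resources/beir.yaml') as f:
    conditions = yaml.safe_load(f)['conditions']

contriever = next(c for c in conditions if c['name'] == 'contriever')
for entry in contriever['datasets']:
    dataset = entry['dataset']
    output = f'runs/run.beir-v1.0.0-{dataset}.contriever.txt'  # placeholder run-file name
    cmd = contriever['command'].replace('${dataset}', dataset).replace('$output', output)
    print(cmd)
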
20 changes: 20 additions & 0 deletions scripts/beir/gather_beir_index_stats.py
@@ -74,3 +74,23 @@
    print(f' "unique_terms": {stats["unique_terms"]},')
    print(f' "downloaded": False')
    print(f' }},')

# Stats for "contriever" indexes
for key in beir_keys:
    index_reader = IndexReader(f'indexes/faiss.beir-v1.0.0-{key}.contriever.{date}.{commitid}')
    stats = index_reader.stats()
    md5 = compute_md5(f'indexes/faiss.beir-v1.0.0-{key}.contriever.{date}.{commitid}.tar.gz')
    size = os.path.getsize(f'indexes/faiss.beir-v1.0.0-{key}.contriever.{date}.{commitid}.tar.gz')
    print(f' "beir-v1.0.0-{key}.contriever": {{')
    print(f' "description": "Faiss index for BEIR v1.0.0 ({beir_keys[key]}) corpus encoded by Contriever encoder.",')
    print(f' "filename": "faiss.beir-v1.0.0-{key}.contriever.{date}.{commitid}.tar.gz",')
    print(f' "readme": "faiss.beir-v1.0.0-{key}.contriever.{date}.{commitid}.README.md",')
    print(f' "urls": [')
    print(f' "https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/faiss.beir-v1.0.0-{key}.contriever.{date}.{commitid}.tar.gz"')
    print(f' ],')
    print(f' "md5": "{md5}",')
    print(f' "size compressed (bytes)": {size},')
    print(f' "documents": {stats["documents"]},')
    print(f' "downloaded": False,')
    print(f' "texts": "beir-v1.0.0-{key}.flat"')
    print(f' }},')
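
Each loop iteration prints an entry meant to be pasted into pyserini/prebuilt_index_info.py (the 379-line addition above). The fragment below sketches the shape of one such entry; <date>, <commitid>, <md5>, <size>, <num-docs>, and the dataset description are placeholders for values computed at generation time, not real metadata.

# Illustrative shape of one printed entry (placeholder values only):
"beir-v1.0.0-scifact.contriever": {
    "description": "Faiss index for BEIR v1.0.0 (<dataset description>) corpus encoded by Contriever encoder.",
    "filename": "faiss.beir-v1.0.0-scifact.contriever.<date>.<commitid>.tar.gz",
    "readme": "faiss.beir-v1.0.0-scifact.contriever.<date>.<commitid>.README.md",
    "urls": [
        "https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/faiss.beir-v1.0.0-scifact.contriever.<date>.<commitid>.tar.gz"
    ],
    "md5": "<md5>",
    "size compressed (bytes)": <size>,
    "documents": <num-docs>,
    "downloaded": False,
    "texts": "beir-v1.0.0-scifact.flat"
},
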
13 changes: 13 additions & 0 deletions scripts/beir/run_beir_baselines.py
@@ -92,3 +92,16 @@
    os.system(cmd)
    cmd = f'python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -m recall.100,1000 beir-v1.0.0-{key}-test runs/run.beir-v1.0.0-{key}-splade_distil_cocodenser_medium.trec'
    os.system(cmd)

# Runs on Contriever index
for key in beir_keys:
    cmd = f'python -m pyserini.search.faiss \
            --encoder-class contriever --encoder facebook/contriever \
            --index beir-v1.0.0-{key}.contriever \
            --topics beir-v1.0.0-{key}-test \
            --output runs/run.beir.contriever.{key}.txt \
            --batch 128 --threads 16 \
            --remove-query --hits 1000'
    os.system(cmd)
    cmd = f'python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -m recall.100,1000 beir-v1.0.0-{key}-test runs/run.beir.contriever.{key}.txt'
    os.system(cmd)
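
Besides printing the evaluation output, it can be handy to capture the scores programmatically, for example to compare them with the values recorded in beir.yaml. Below is a minimal sketch, assuming the standard trec_eval output format of one "metric all value" line per requested metric; the helper name and regex are illustrative.

# Sketch: capture nDCG@10 / R@100 / R@1000 from trec_eval instead of only printing them.
# Assumes standard trec_eval output lines such as "ndcg_cut_10   all   0.6493".
import re
import subprocess

def collect_metrics(qrels, run):
    out = subprocess.run(
        ['python', '-m', 'pyserini.eval.trec_eval', '-c',
         '-m', 'ndcg_cut.10', '-m', 'recall.100,1000', qrels, run],
        capture_output=True, text=True).stdout
    return {metric: float(value)
            for metric, value in re.findall(r'^(\S+)\s+all\s+(\S+)\s*$', out, re.MULTILINE)}
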
3 changes: 3 additions & 0 deletions scripts/repro_matrix/beir_html.template
@@ -149,6 +149,7 @@ pre[class*="prettyprint"] {
<th class="headertop" colspan="3"><b>BM25 Flat</b></th>
<th class="headertop" colspan="3"><b>BM25 Multifield</b></th>
<th class="headertop" colspan="2"><b>SPLADE</b></th>
<th class="headertop" colspan="2"><b>Contriever</b></th>
</tr>
<tr>
<th class="headerbottom" scope="col"></th>
@@ -161,6 +162,8 @@
<th class="headerbottom" scope="col"></th>
<th class="headerbottom" scope="col">nDCG@10</th>
<th class="headerbottom" scope="col">R@100</th>
<th class="headerbottom" scope="col">nDCG@10</th>
<th class="headerbottom" scope="col">R@100</th>
</tr>
</thead>
<tbody>
20 changes: 19 additions & 1 deletion scripts/repro_matrix/beir_html_row.template
@@ -10,10 +10,12 @@
<td></td>
<td>$s5</td>
<td>$s6</td>
<td>$s7</td>
<td>$s8</td>
</tr>
<tr class="hide-table-padding">
<td></td>
-<td colspan="9">
+<td colspan="11">
<div id="collapse${row_cnt}" class="collapse in p-3">

<!-- Tabs navs -->
@@ -27,6 +29,9 @@
<li class="nav-item" role="presentation">
<a class="nav-link" id="row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab3" role="tab" aria-controls="row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">SPLADE</a>
</li>
<li class="nav-item" role="presentation">
<a class="nav-link" id="row${row_cnt}-tab4-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab4" role="tab" aria-controls="row${row_cnt}-tab4" aria-selected="false" style="text-transform:none">Contriever</a>
</li>
</ul>
<!-- Tabs navs -->

@@ -70,6 +75,19 @@ Evaluation commands:
<pre><code>${eval_cmd3}</code></pre>
</blockquote>

</div>
<div class="tab-pane fade" id="row${row_cnt}-tab4" role="tabpanel" aria-labelledby="row${row_cnt}-tab4">
Command to generate run:

<blockquote class="mycode">
<pre><code>$cmd4
</code></pre></blockquote>
Evaluation commands:

<blockquote class="mycode">
<pre><code>${eval_cmd4}</code></pre>
</blockquote>

</div>
</div>
<!-- Tabs content -->
5 changes: 5 additions & 0 deletions scripts/repro_matrix/generate_html_beir.py
100644 → 100755
@@ -25,6 +25,7 @@
def format_run_command(raw):
    return raw.replace('--topics', '\\\n --topics')\
        .replace('--index', '\\\n --index')\
        .replace('--encoder-class', '\\\n --encoder-class')\
        .replace('--output ', '\\\n --output ')\
        .replace('--output-format trec', '\\\n --output-format trec \\\n ') \
        .replace('--hits ', '\\\n --hits ')
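
As a quick illustration of what format_run_command produces (hypothetical input; the exact continuation spacing comes from the replacement strings above):

# Hypothetical usage of format_run_command on a one-line command string.
cmd = ('python -m pyserini.search.faiss --encoder-class contriever '
       '--encoder facebook/contriever --index beir-v1.0.0-scifact.contriever '
       '--topics beir-v1.0.0-scifact-test --output run.txt --hits 1000')
print(format_run_command(cmd))
# Each matched flag now starts a new, backslash-continued line, roughly:
#   python -m pyserini.search.faiss \
#    --encoder-class contriever --encoder facebook/contriever \
#    --index beir-v1.0.0-scifact.contriever \
#    --topics beir-v1.0.0-scifact-test \
#    --output run.txt \
#    --hits 1000
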
@@ -84,12 +85,16 @@ def read_file(f):
            s4=f'{table[dataset]["multifield"]["R@100"]:8.4f}',
            s5=f'{table[dataset]["splade-distil-cocodenser-medium"]["nDCG@10"]:8.4f}',
            s6=f'{table[dataset]["splade-distil-cocodenser-medium"]["R@100"]:8.4f}',
            s7=f'{table[dataset]["contriever"]["nDCG@10"]:8.4f}',
            s8=f'{table[dataset]["contriever"]["R@100"]:8.4f}',
            cmd1=commands[dataset]["flat"],
            cmd2=commands[dataset]["multifield"],
            cmd3=commands[dataset]["splade-distil-cocodenser-medium"],
            cmd4=commands[dataset]["contriever"],
            eval_cmd1=eval_commands[dataset]["flat"].rstrip(),
            eval_cmd2=eval_commands[dataset]["multifield"].rstrip(),
            eval_cmd3=eval_commands[dataset]["splade-distil-cocodenser-medium"].rstrip(),
            eval_cmd4=eval_commands[dataset]["contriever"].rstrip(),
        )

        html_rows.append(s)
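
The keyword arguments above fill the $s7/$s8 score cells and the $cmd4/${eval_cmd4} blocks added to beir_html_row.template, presumably via Python's string.Template (an assumption based on the $-style placeholders). A toy sketch of that substitution:

# Toy stand-in for the row template; the real one is scripts/repro_matrix/beir_html_row.template.
from string import Template

row_template = '<td>$s7</td>\n<td>$s8</td>\n<pre><code>$cmd4</code></pre>'
s = Template(row_template).substitute(
    s7=f'{0.6493:8.4f}',   # Contriever nDCG@10 on scifact, from beir.yaml above
    s8=f'{0.9260:8.4f}',   # Contriever R@100 on scifact
    cmd4='python -m pyserini.search.faiss ...',
)
print(s)
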
16 changes: 9 additions & 7 deletions scripts/repro_matrix/run_all_beir.py
@@ -96,18 +96,20 @@
         final_score = (top_level_sums[model][metric] + cqa_score) / 18
         final_scores[model][metric] = final_score
 
-    print(' ' * 30 + 'BM25-flat' + ' ' * 10 + 'BM25-mf' + ' ' * 11 + 'SPLADE')
+    print(' ' * 30 + 'BM25-flat' + ' ' * 10 + 'BM25-mf' + ' ' * 11 + 'SPLADE' + ' ' * 11 + 'Contriever')
     print(' ' * 26 + 'nDCG@10 R@100 ' * 3)
-    print(' ' * 27 + '-' * 14 + ' ' + '-' * 14 + ' ' + '-' * 14)
+    print(' ' * 27 + '-' * 14 + ' ' + '-' * 14 + ' ' + '-' * 14 + ' ' + '-' * 14)
     for dataset in beir_keys:
         print(f'{dataset:25}' +
               f'{table[dataset]["bm25-flat"]["nDCG@10"]:8.4f}{table[dataset]["bm25-flat"]["R@100"]:8.4f} ' +
               f'{table[dataset]["bm25-multifield"]["nDCG@10"]:8.4f}{table[dataset]["bm25-multifield"]["R@100"]:8.4f} ' +
-              f'{table[dataset]["splade-distil-cocodenser-medium"]["nDCG@10"]:8.4f}{table[dataset]["splade-distil-cocodenser-medium"]["R@100"]:8.4f}')
-    print(' ' * 27 + '-' * 14 + ' ' + '-' * 14 + ' ' + '-' * 14)
-    print('avg' + ' ' * 22 + f'{final_scores["bm25-flat"]["nDCG@10"]:8.4f}{final_scores["bm25-flat"]["R@100"]:8.4f} ' +
-          f'{final_scores["bm25-multifield"]["nDCG@10"]:8.4f}{final_scores["bm25-multifield"]["R@100"]:8.4f} ' +
-          f'{final_scores["splade-distil-cocodenser-medium"]["nDCG@10"]:8.4f}{final_scores["splade-distil-cocodenser-medium"]["R@100"]:8.4f} ')
+              f'{table[dataset]["splade-distil-cocodenser-medium"]["nDCG@10"]:8.4f}{table[dataset]["splade-distil-cocodenser-medium"]["R@100"]:8.4f}' +
+              f'{table[dataset]["contriever"]["nDCG@10"]:8.4f}{table[dataset]["contriever"]["R@100"]:8.4f} ')
+    print(' ' * 27 + '-' * 14 + ' ' + '-' * 14 + ' ' + '-' * 14 + ' ' + '-' * 14)
+    print('avg' + ' ' * 22 + f'{final_scores["bm25-flat"]["nDCG@10"]:8.4f}{final_scores["bm25-flat"]["R@100"]:8.4f} ' +
+          f'{final_scores["bm25-multifield"]["nDCG@10"]:8.4f}{final_scores["bm25-multifield"]["R@100"]:8.4f} ' +
+          f'{final_scores["splade-distil-cocodenser-medium"]["nDCG@10"]:8.4f}{final_scores["splade-distil-cocodenser-medium"]["R@100"]:8.4f} ' +
+          f'{final_scores["contriever"]["nDCG@10"]:8.4f}{final_scores["contriever"]["R@100"]:8.4f} ')

end = time.time()

