ColBERT & DeepCT

In [1]:
# Environment setup: install every dependency of this notebook via shell escapes.
# TensorFlow is pinned to a 1.x release; the setup cell below asserts that
# TF 1 is required by DeepCT.
!pip install tensorflow==1.15
# PyTerrier itself (upgraded to the latest available release) ...
!pip install -U python-terrier
# ... and its ColBERT / DeepCT plugins, installed straight from the default
# branch of their GitHub repositories (no tag or commit is pinned).
!pip install -U git+https://github.com/terrierteam/pyterrier_colbert
!pip install -U git+https://github.com/terrierteam/pyterrier_deepct.git
# NOTE(review): presumably pinned for compatibility with the ColBERT code
# installed above -- confirm before changing this version.
!pip install transformers==3.0.2
# GPU build of FAISS, pinned to a fixed version.
!pip install faiss-gpu==1.6.3

Collecting git+https://github.com/terrierteam/pyterrier_colbert
  Cloning https://github.com/terrierteam/pyterrier_colbert to /tmp/pip-req-build-p81kpdc1
  Running command git clone -q https://github.com/terrierteam/pyterrier_colbert /tmp/pip-req-build-p81kpdc1
Collecting ColBERT@ git+https://github.com/cmacdonald/ColBERT.git@v0.2#egg=ColBERT
  Cloning https://github.com/cmacdonald/ColBERT.git (to revision v0.2) to /tmp/pip-install-9lpporff/colbert_83a0da2e3e534af993244a03b79f76f2
  Running command git clone -q https://github.com/cmacdonald/ColBERT.git /tmp/pip-install-9lpporff/colbert_83a0da2e3e534af993244a03b79f76f2
  Running command git checkout -b v0.2 --track origin/v0.2
  Switched to a new branch 'v0.2'
  Branch 'v0.2' set up to track remote branch 'v0.2' from 'origin'.
Collecting git+https://github.com/terrierteam/pyterrier_deepct.git
  Cloning https://github.com/terrierteam/pyterrier_deepct.git to /tmp/pip-req-build-tt2ndb_4
  Running command git clone -q https://github.com/ter

In [2]:
import pyterrier as pt
import os
import pandas as pd
import pyterrier_colbert.ranking
import pyterrier_colbert.indexing
import pyterrier_deepct
import tensorflow as tf
import sys
import faiss

# Initialise PyTerrier only if it has not been started yet, so this cell can be
# re-run without calling pt.init() a second time.
if not pt.started():
    pt.init()

# True when the google.colab module has been loaded into this interpreter.
COLAB = 'google.colab' in sys.modules

# Check the TensorFlow major version *before* touching any TF1-only API.
# tf.logging (used just below) does not exist in TensorFlow 2, so asserting
# afterwards would surface as an AttributeError and this hint would never show.
assert tf.__version__.startswith('1'), 'TF 1 is required by DeepCT; on Colab, use %tensorflow_version 1.x'

# Reduce TensorFlow log noise: native-side threshold via the environment
# variable, Python-side logger restricted to ERROR and above.
# NOTE(review): the variable is set after `import tensorflow`; it is presumably
# more reliable when set before the import -- confirm if native logs still appear.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.logging.set_verbosity(tf.logging.ERROR)

# Optional GPU sanity check for FAISS, intentionally left disabled.
# assert faiss.get_num_gpus() > 0

terrier-assemblies 5.6  jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.6  jar not found, downloading to /root/.pyterrier...
Done
PyTerrier 0.7.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


In [3]:
"""
!rm -rf /content/sample_data
!rm -rf /content/index_vaswani
!rm -rf /content/index_colbert
"""

'\n!rm -rf /content/sample_data\n!rm -rf /content/index_vaswani\n!rm -rf /content/index_colbert\n'

In [4]:
# Dataset and location of the baseline Terrier index.
dataset_name = 'irds:vaswani'
path_index_vaswani = '/content/index_vaswani'
# Appended to an index directory by plain string concatenation, hence the
# leading slash (do not pass it to os.path.join: an absolute second component
# would discard the directory).
props_path = '/data.properties'
dataset = pt.datasets.get_dataset(dataset_name)

# Build the index once and reuse it on later runs. The reuse test looks for the
# data.properties file that is actually loaded, not merely the directory, so a
# directory left behind by an interrupted build triggers a rebuild instead of a
# failing load.
index_props_vaswani = path_index_vaswani + props_path
if os.path.exists(index_props_vaswani):
    index_ref = pt.IndexRef.of(index_props_vaswani)
else:
    indexer = pt.index.IterDictIndexer(path_index_vaswani)
    # Keep both the document number and the raw text as index metadata.
    index_ref = indexer.index(dataset.get_corpus_iter(), meta=('docno', 'text'))

index = pt.IndexFactory.of(index_ref)

# Shared evaluation inputs used by experiment_br() in the cells below.
topics = dataset.get_topics()
qrels = dataset.get_qrels()
evals = ['map', 'ndcg_cut.10', 'P.10', 'mrt']

# BM25 baseline retriever over the index above.
br = pt.BatchRetrieve(index, wmodel='BM25')

[INFO] [starting] building docstore
[INFO] [starting] http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz
docs_iter:   0%|                                     | 0/11429 [00:00<?, ?doc/s]
http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: 0.0%| 0.00/2.13M [00:00<?, ?B/s][A
http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: 0.8%| 16.4k/2.13M [00:00<00:31, 67.1kB/s][A
http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: 2.3%| 49.2k/2.13M [00:00<00:21, 97.2kB/s][A
http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: 3.9%| 81.9k/2.13M [00:00<00:19, 107kB/s] [A
http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: 6.9%| 147k/2.13M [00:01<00:13, 143kB/s] [A
http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: 14.3%| 303k/2.13M [00:01<00:07, 235kB/s][A
http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: 29.3%| 623k/2.13M [00:01<00:03, 401kB/s][A

[A[INFO] [finished] http:

vaswani documents:   0%|          | 0/11429 [00:00<?, ?it/s]

In [5]:
def experiment_br(retrs, labels):
  """Evaluate several retrieval systems with the notebook-wide settings.

  Args:
    retrs: the systems to compare, forwarded to pt.Experiment as-is.
    labels: display names for the systems, passed as ``names``.

  Returns:
    Whatever pt.Experiment returns for the module-level ``topics`` and
    ``qrels``, measured with the metrics listed in ``evals``.
  """
  experiment_options = {'names': labels, 'eval_metrics': evals}
  return pt.Experiment(retrs, topics, qrels, **experiment_options)

ColBERT

In [6]:
# ColBERT checkpoint (downloaded on first use) and location of the ColBERT index.
checkpoint_colbert = 'http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip'
path_index_colbert = '/content/index_colbert'

# ColBERTIndexer takes the index location as a (root, name) pair. Derive both
# from path_index_colbert so that the existence check and the build always
# refer to the same directory.
index_root_colbert, index_name_colbert = os.path.split(path_index_colbert)

# Build the ColBERT index only when it is not already on disk.
if not os.path.exists(path_index_colbert):
    indexer_colbert = pyterrier_colbert.indexing.ColBERTIndexer(
        checkpoint_colbert, index_root_colbert, index_name_colbert, chunksize=3)
    indexer_colbert.index(dataset.get_corpus_iter())

# The factory is created without an index (root and name are None) because only
# the text scorer is used below; it emits the warning
# "No index_root and index_name specified - no index ranking possible".
# NOTE(review): the index built above is therefore not consumed by this
# pipeline -- confirm whether index-based ranking was intended.
colbert_factory = pyterrier_colbert.ranking.ColBERTFactory(checkpoint_colbert, None, None)
colbert = colbert_factory.text_scorer(doc_attr='text')

# BM25 candidates -> attach each document's 'text' -> score with ColBERT.
pipeline_colbert = br >> pt.text.get_text(dataset, 'text') >> colbert

expr_colbert = experiment_br([br, pipeline_colbert], ['BM25', 'BM25 >> ColBERT'])
print(pd.DataFrame(expr_colbert))

vaswani documents:   0%|          | 0/11429 [00:00<?, ?it/s]

[Sep 20, 14:29:07] [0] 		 #> Local args.bsize = 128
[Sep 20, 14:29:07] [0] 		 #> args.index_root = /content
[Sep 20, 14:29:07] [0] 		 #> self.possible_subset_sizes = [69905]


[INFO] Lock 139828560213904 acquired on /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517.lock
[INFO] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpflwq7kth


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

[INFO] storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json in cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
[INFO] creating metadata file for /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
[INFO] Lock 139828560213904 released on /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517.lock
[INFO] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
[INFO] Model config BertConfig {
 

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

[INFO] storing https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin in cache at /root/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
[INFO] creating metadata file for /root/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
[INFO] Lock 139828564579856 released on /root/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157.lock
[INFO] loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at /root/.cache/torch/transformers/f2ee78bdd635b758cc0a12352586868bef80e47401abe4c4fcc3832421e7338b.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
- This IS expected if you are initializing ColBERT from the checkpoint of a mo

[Sep 20, 14:29:35] #> Loading model checkpoint.
[Sep 20, 14:29:35] #> Loading checkpoint http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip


Downloading: "http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip" to /root/.cache/torch/hub/checkpoints/colbert.dnn.zip


  0%|          | 0.00/1.11G [00:00<?, ?B/s]



[Sep 20, 14:31:35] #> checkpoint['epoch'] = 0
[Sep 20, 14:31:35] #> checkpoint['batch'] = 44500




[INFO] Lock 139828558130512 acquired on /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
[INFO] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpzvyx6ona


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

[INFO] storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt in cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
[INFO] creating metadata file for /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
[INFO] Lock 139828558130512 released on /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
[INFO] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084




[Sep 20, 14:31:38] #> Note: Output directory /content already exists




[Sep 20, 14:31:38] #> Creating directory /content/index_colbert 




[INFO] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


[Sep 20, 14:35:43] [0] 		 #> Completed batch #0 (starting at passage #0) 		Passages/min: 2.8k (overall),  2.8k (this encoding),  13193.6M (this saving)
[Sep 20, 14:35:44] [0] 		 [NOTE] Done with local share.
[Sep 20, 14:35:44] [0] 		 #> Joining saver thread.
[Sep 20, 14:35:44] [0] 		 #> Saved batch #0 to /content/index_colbert/0.pt 		 Saving Throughput = 1.7M passages per minute.

#> num_embeddings = 581496
[Sep 20, 14:35:44] #> Starting..
[Sep 20, 14:35:44] #> Processing slice #1 of 1 (range 0..1).
[Sep 20, 14:35:44] #> Will write to /content/index_colbert/ivfpq.100.faiss.
[Sep 20, 14:35:44] #> Loading /content/index_colbert/0.sample ...
#> Sample has shape (29074, 128)
[Sep 20, 14:35:44] Preparing resources for 1 GPUs.
[Sep 20, 14:35:44] #> Training with the vectors...
[Sep 20, 14:35:44] #> Training now (using 1 GPUs)...
39.14304494857788
33.1834614276886
0.00032901763916015625
[Sep 20, 14:36:56] Done training!

[Sep 20, 14:36:56] #> Indexing the vectors...
[Sep 20, 14:36:56] #> Load

  warn("No index_root and index_name specified - no index ranking possible")
[INFO] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
[INFO] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

[INFO] loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at /root/.cache/torch/transfo

[Sep 20, 14:37:18] #> Loading model checkpoint.
[Sep 20, 14:37:18] #> Loading checkpoint http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip


- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Sep 20, 14:37:38] #> checkpoint['epoch'] = 0
[Sep 20, 14:37:38] #> checkpoint['batch'] = 44500


[INFO] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
[INFO] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


              name       map  ndcg_cut.10      P.10           mrt
0             BM25  0.296517     0.446609  0.352688     91.004017
1  BM25 >> ColBERT  0.284444     0.430657  0.356989  41215.058192


DeepCT

In [7]:
if not os.path.exists('marco.zip'):
  !wget http://boston.lti.cs.cmu.edu/appendices/arXiv2019-DeepCT-Zhuyun-Dai/outputs/marco.zip
  !unzip marco.zip

if not os.path.exists('uncased_L-12_H-768_A-12.zip'):
  !wget https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip
  !unzip uncased_L-12_H-768_A-12.zip
  !mkdir -p bert-base-uncased
  !mv vocab.txt bert_* bert-base-uncased/

deepct = pyterrier_deepct.DeepCTTransformer('bert-base-uncased/bert_config.json', 'marco/model.ckpt-65816')
index_loc_deepct = './index_deepct'
indexer_deepct = deepct >> pt.IterDictIndexer(index_loc_deepct)
index_ref_deepct = indexer_deepct.index(dataset.get_corpus_iter())
br_deepct = pt.BatchRetrieve(index_ref_deepct, wmodel="BM25")

expr_deepct = experiment_br([br, br_deepct], ['BM25', 'BM25 >> DeepCT'])
print(pd.DataFrame(expr_deepct))

--2021-09-20 15:41:42--  http://boston.lti.cs.cmu.edu/appendices/arXiv2019-DeepCT-Zhuyun-Dai/outputs/marco.zip
Resolving boston.lti.cs.cmu.edu (boston.lti.cs.cmu.edu)... 128.2.207.32, 128.2.207.139
Connecting to boston.lti.cs.cmu.edu (boston.lti.cs.cmu.edu)|128.2.207.32|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1237248594 (1.2G) [application/zip]
Saving to: ‘marco.zip’


2021-09-20 15:43:07 (14.1 MB/s) - ‘marco.zip’ saved [1237248594/1237248594]

Archive:  marco.zip
   creating: marco/
  inflating: marco/checkpoint        
  inflating: marco/graph.pbtxt       
  inflating: marco/model.ckpt-65816.data-00000-of-00001  
  inflating: marco/model.ckpt-65816.index  
  inflating: marco/model.ckpt-65816.meta  
  inflating: marco/train.tf_record   
--2021-09-20 15:43:25--  https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.125.128, 142.251.8.128, 74.125.23.128, ...


[INFO] NumExpr defaulting to 2 threads.


vaswani documents:   0%|          | 0/11429 [00:00<?, ?it/s]

             name       map  ndcg_cut.10      P.10
0            BM25  0.296517     0.446609  0.352688
1  BM25 >> DeepCT  0.296275     0.443258  0.359140


end of fun.