All the setup

In [28]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs231n/assignments/assignment1/'
FOLDERNAME = 'MasterCourses/compsci685/Project'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd /content/drive/My\ Drive/$FOLDERNAME/
!pip install transformers
!pip install faiss-cpu

import torch

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/MasterCourses/compsci685/Project
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Found device: NVIDIA A100-SXM4-40GB, n_gpu: 1


Initializing variables used throughout

In [29]:
import pickle

# Load the passage collection from its pickle file.
with open("./dataset/passages", "rb") as passages_file:
    passages = pickle.load(passages_file)

# Load the test records; each record is indexed by 'query' below.
with open("./dataset/test_query_passages_answer_list", "rb") as test_file:
    test_query_passages_answer_list = pickle.load(test_file)

# Keep only the query of every test record.
test_queries = [entry['query'] for entry in test_query_passages_answer_list]

Obtain passage embeddings using the model facebook/dpr-ctx_encoder-multiset-base. Save the embeddings once they have been computed and reuse the saved values afterwards; once they are saved, comment out this cell so that it is not re-run.

In [None]:
from transformers import DPRContextEncoderTokenizerFast, DPRContextEncoder

# DPR context (passage) encoder and its matching tokenizer.
model_name = "facebook/dpr-ctx_encoder-multiset-base"
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(model_name)
ctx_encoder = DPRContextEncoder.from_pretrained(model_name).to(device)

# Number of passages encoded per forward pass.
batch_size = 5
passages_embeddings = []

# Inference only: no_grad avoids building the autograd graph for every batch.
with torch.no_grad():
    # Step through the whole list in strides of batch_size so that a final,
    # shorter batch is encoded too. The previous range(int(len(passages)/5))
    # silently skipped up to 4 trailing passages.
    for batch_number, start in enumerate(range(0, len(passages), batch_size)):
        print(batch_number)
        model_input = ctx_tokenizer(passages[start:start + batch_size], truncation=True, padding="longest", return_tensors="pt")
        # Pass the tokenizer's attention mask explicitly so padded positions are masked out.
        outputs = ctx_encoder(
            input_ids=model_input["input_ids"].to(device),
            attention_mask=model_input["attention_mask"].to(device),
            return_dict=True,
        )
        embs = outputs["pooler_output"].cpu().numpy()
        # extend() adds one vector per passage, keeping embeddings aligned with `passages`.
        passages_embeddings.extend(embs)

with open("retriever/dpr/embeddings/passages_embeddings_and_indexes/dpr-ctx_encoder-multiset-base/passages_embeddings", "wb") as fp:   # Pickling
    pickle.dump(passages_embeddings, fp)

Obtain test query embeddings using facebook/dpr-question_encoder-multiset-base

In [None]:
from transformers import DPRQuestionEncoderTokenizerFast, DPRQuestionEncoder

# DPR question encoder and its matching tokenizer.
q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained("facebook/dpr-question_encoder-multiset-base")
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-multiset-base").to(device)

# Number of queries encoded per forward pass.
batch_size = 5
test_queries_embeddings = []
print(len(test_queries))

# Inference only: no_grad avoids building the autograd graph for every batch.
with torch.no_grad():
    # Step through the whole list in strides of batch_size so that a final,
    # shorter batch is encoded too. The previous range(int(len(test_queries)/5))
    # silently skipped up to 4 trailing queries.
    for batch_number, start in enumerate(range(0, len(test_queries), batch_size)):
        print(batch_number)
        model_input = q_tokenizer(test_queries[start:start + batch_size], truncation=True, padding="longest", return_tensors="pt")
        # Pass the tokenizer's attention mask explicitly so padded positions are masked out.
        outputs = q_encoder(
            input_ids=model_input["input_ids"].to(device),
            attention_mask=model_input["attention_mask"].to(device),
            return_dict=True,
        )
        embs = outputs["pooler_output"].cpu().numpy()
        # extend() adds one vector per query, keeping embeddings aligned with `test_queries`.
        test_queries_embeddings.extend(embs)

with open("retriever/dpr/embeddings/test_queries_embeddings/dpr-question_encoder-multiset-base/test_queries_embeddings", "wb") as fp:   # Pickling
    pickle.dump(test_queries_embeddings, fp)

Build and save index

In [22]:
import numpy as np
import faiss


def build_faiss_index(embeddings, filename):
  """Build an exact inner-product FAISS index over `embeddings` and save it.

  Args:
    embeddings: non-empty sequence of equal-length 1-D vectors (or a 2-D
      array), one row per indexed item.
    filename: path the serialized index is written to.

  Raises:
    ValueError: if `embeddings` is empty or does not stack into a 2-D matrix.
  """
  if len(embeddings) == 0:
    raise ValueError("embeddings must contain at least one vector")

  # FAISS consumes C-contiguous float32 matrices; convert up front so that
  # float64 (or non-contiguous) inputs are accepted as well.
  matrix = np.ascontiguousarray(np.stack(embeddings), dtype=np.float32)
  if matrix.ndim != 2:
    raise ValueError(
        "embeddings must stack into a 2-D matrix, got shape {}".format(matrix.shape))

  # Take the dimensionality from the data instead of hard-coding 768, so
  # encoders with a different output width can be indexed too.
  index = faiss.IndexFlatIP(matrix.shape[1])

  # Train only when the index reports that it needs it, which keeps this
  # function usable if the index type is later swapped for a trainable one.
  if not index.is_trained:
    index.train(matrix)
  index.add(matrix)

  faiss.write_index(index, filename)

Build passages faiss index for embeddings obtained from encoder - facebook/dpr-ctx_encoder-multiset-base

In [None]:
# Index the vectors currently held in `passages_embeddings` (expected to be the
# DPR context-encoder ones) and save the index next to the pickled embeddings.
build_faiss_index(
    embeddings=passages_embeddings,
    filename="retriever/dpr/embeddings/passages_embeddings_and_indexes/dpr-ctx_encoder-multiset-base/passages_faiss_index",
)

Trying another query encoder

In [4]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pickle

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Sentence-Transformers query encoder.
model_name = "sentence-transformers/msmarco-distilbert-base-tas-b"
model = SentenceTransformer(model_name).to(device)

# Number of queries handed to model.encode() per call.
batch_size = 5
test_queries_embeddings = []

# Step through the whole list in strides of batch_size so that a final,
# shorter batch is encoded too. The previous range(int(len(test_queries)/5))
# silently skipped up to 4 trailing queries.
for batch_number, start in enumerate(range(0, len(test_queries), batch_size)):
    print(batch_number)
    # extend() adds one vector per query, keeping embeddings aligned with `test_queries`.
    test_queries_embeddings.extend(model.encode(test_queries[start:start + batch_size]))

with open("retriever/dpr/embeddings/test_queries_embeddings/msmarco-distilbert-base-tas-b/test_queries_embeddings", "wb") as fp:   # Pickling
    pickle.dump(test_queries_embeddings, fp)

Using the model - sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco to build embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel

# TAS-B DistilBERT encoder and its matching tokenizer.
model_name = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name).to(device)

# Number of passages encoded per forward pass.
batch_size = 5
passages_embeddings = []

# Inference only: no_grad avoids building the autograd graph for every batch.
with torch.no_grad():
    # Step through the whole list in strides of batch_size so that a final,
    # shorter batch is encoded too. The previous range(int(len(passages)/5))
    # silently skipped up to 4 trailing passages.
    for batch_number, start in enumerate(range(0, len(passages), batch_size)):
        print(batch_number)
        model_input = tokenizer(passages[start:start + batch_size], truncation=True, padding="longest", return_tensors="pt")
        # [0] selects the first model output; [:, 0, :] keeps the first-token
        # (CLS) vector of every sequence in the batch.
        embs = bert_model(**model_input.to(device))[0][:, 0, :].cpu().numpy()
        # extend() adds one vector per passage, keeping embeddings aligned with `passages`.
        passages_embeddings.extend(embs)

with open("retriever/dpr/embeddings/passages_embeddings_and_indexes/distilbert-dot-tas_b-b256-msmarco/passages_embeddings", "wb") as fp:   # Pickling
    pickle.dump(passages_embeddings, fp)

In [24]:
# Index the vectors currently held in `passages_embeddings` (expected to be the
# TAS-B ones) and save the index next to the pickled embeddings.
build_faiss_index(
    embeddings=passages_embeddings,
    filename="retriever/dpr/embeddings/passages_embeddings_and_indexes/distilbert-dot-tas_b-b256-msmarco/passages_faiss_index",
)

Saving query embeddings using the same model

In [None]:
from transformers import AutoTokenizer, AutoModel

# TAS-B DistilBERT encoder and its matching tokenizer (same model as used for the passages).
model_name = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name).to(device)

# Number of queries encoded per forward pass.
batch_size = 5
test_queries_embeddings = []

# Inference only: no_grad avoids building the autograd graph for every batch.
with torch.no_grad():
    # Step through the whole list in strides of batch_size so that a final,
    # shorter batch is encoded too. The previous range(int(len(test_queries)/5))
    # silently skipped up to 4 trailing queries.
    for batch_number, start in enumerate(range(0, len(test_queries), batch_size)):
        print(batch_number)
        model_input = tokenizer(test_queries[start:start + batch_size], truncation=True, padding="longest", return_tensors="pt")
        # [0] selects the first model output; [:, 0, :] keeps the first-token
        # (CLS) vector of every sequence in the batch.
        embs = bert_model(**model_input.to(device))[0][:, 0, :].cpu().numpy()
        # extend() adds one vector per query, keeping embeddings aligned with `test_queries`.
        test_queries_embeddings.extend(embs)

with open("retriever/dpr/embeddings/test_queries_embeddings/distilbert-dot-tas_b-b256-msmarco/test_queries_embeddings", "wb") as fp:   # Pickling
    pickle.dump(test_queries_embeddings, fp)