In [5]:
from sentence_transformers import SentenceTransformer, util
import torch

# Sanity check: embed one query and a few candidate passages with the
# (fine-tuned) bi-encoder and print the passages ranked by dot-product score.
query = "How many people live in London?"
docs = [
    "Around 9 Million people live in London",
    "London is known for its financial district",
    "I think there are 5 million people in london today.",
    "No, there are 10 million people in New York.",
]

# Use the first GPU when one is visible; otherwise fall back to CPU so the cell
# still runs on a machine without CUDA instead of raising.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

#Load the model
# Alternative base encoders (swap the active line to compare):
# model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')    # TAS-B
# model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v2')     # SBERT
model = SentenceTransformer('sentence-transformers/gtr-t5-xl', device=device)      # gtr-t5-xl
# model = SentenceTransformer('BAAI/bge-large-en-v1.5')       # BGE
# model = SentenceTransformer('BAAI/llm-embedder')

# Replace the pretrained weights with a locally saved checkpoint. The state
# dict must match the architecture chosen above.
# Earlier checkpoint:
# ckpt_path = '/data/richard/taggerv2/test/test6/beir/outputs/ckpts/2025_05_27_17h55m37s/model_step_440075.pth'
ckpt_path = '/data/richard/taggerv2/test/test6/beir/outputs/ckpts/2025_05_30_21h52m36s/model_step_251471.pth'
# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state dict needs. The saved output of this cell echoes the original
# torch.load line as a warning -- presumably torch's FutureWarning about the
# weights_only default (TODO confirm). map_location keeps the load independent
# of whichever device the checkpoint was saved from.
state_dict = torch.load(ckpt_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)

#Encode query and documents
query_emb = model.encode(query)
doc_emb = model.encode(docs)

#Compute dot score between query and all document embeddings
# dot_score returns a (num_queries x num_docs) matrix; row 0 is our single query.
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

#Combine docs & scores, sorted by decreasing score
doc_score_pairs = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

#Output passages & scores
for doc, score in doc_score_pairs:
    print(score, doc)



  model.load_state_dict(torch.load('/data/richard/taggerv2/test/test6/beir/outputs/ckpts/2025_05_30_21h52m36s/model_step_251471.pth'))


0.9264308214187622 Around 9 Million people live in London
0.869159460067749 I think there are 5 million people in london today.
0.7612994909286499 No, there are 10 million people in New York.
0.3208233416080475 London is known for its financial district


In [1]:
from datasets import load_dataset


# Fetch the MMLU benchmark from the Hugging Face Hub using its "all"
# configuration; the result is indexed by split name (e.g. 'test') below.
mmlu_dataset = load_dataset(path="cais/mmlu", name="all")


In [2]:
# Display the test split's summary (feature names and row count).
mmlu_dataset['test']

Dataset({
    features: ['question', 'subject', 'choices', 'answer'],
    num_rows: 14042
})

In [7]:
# Lookup tables for the MMLU test split, all keyed by the example's position
# in the split: question text, answer options, subject, and gold answer text.
mmlu_queries, mmlu_choices, mmlu_answers, mmlu_subjects = {}, {}, {}, {}

for i, data in enumerate(mmlu_dataset['test']):
    options = data['choices']
    mmlu_queries[i], mmlu_subjects[i] = data['question'], data['subject']
    mmlu_choices[i] = options
    # 'answer' is used as an index into 'choices'; store the answer text itself.
    mmlu_answers[i] = options[data['answer']]

In [8]:
# Sanity check: number of collected questions (should equal the test split's row count).
len(mmlu_queries)

14042

In [4]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from sentence_transformers import SentenceTransformer

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#### Dataset: MS MARCO in BEIR format (already downloaded and unzipped locally)
dataset = "MSMARCO"
# BEIR publishes its archives under lowercase dataset names, so lowercase the
# name when building the download URL. `url` is not used in this cell; it is
# kept for a manual re-download.
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset.lower()}.zip"
# out_dir = os.path.join('/data/richard/taggerv2/test/test6/beir/outputs', "datasets")
data_path = '/data/richard/taggerv2/test/test6/beir/outputs/datasets/msmarco'

#### Load corpus, queries and relevance judgments for the test split from data_path
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

2025-06-01 19:04:10 - Loading Corpus...


  0%|          | 0/8841823 [00:00<?, ?it/s]

2025-06-01 19:04:33 - Loaded 8841823 TEST Documents.
2025-06-01 19:04:34 - Doc Example: {'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.', 'title': ''}
2025-06-01 19:04:34 - Loading Queries...
2025-06-01 19:04:34 - Loaded 43 TEST Queries.
2025-06-01 19:04:34 - Query Example: anthropological definition of environment


In [5]:
# Check which container type the loader returned for the corpus.
type(corpus)

dict

In [6]:
# Peek at the first (doc_id, document) pair. Pulling one item from the dict
# iterator avoids building two full-length lists of keys and values just to
# read element 0. (On an empty corpus this raises StopIteration rather than
# IndexError.)
next(iter(corpus.items()))

('0',
 {'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
  'title': ''})

In [7]:
# Flatten the corpus to a plain list of passage texts, in the dict's iteration
# order; the 'title' field is dropped. Iterate the values view directly --
# wrapping it in list() first would copy every entry for no benefit.
real_corpus = [doc['text'] for doc in corpus.values()]
len(real_corpus)

8841823