In [1]:
# Name of the dataset directory under ./datasets/ that the cells below index and query.
# NOTE(review): some saved outputs in this notebook mention "nfcorpus", i.e. those cells
# were last executed with a different value; re-run them after changing this setting.
dataset = 'trec-covid'

## GTE

### indexing

In [33]:
import os
import json
import pickle
import torch
import faiss
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# --- Filesystem layout for the GTE index ---
dataset_folder = f"./datasets/{dataset}"
index_folder = os.path.join(dataset_folder, "gte", "indexes", f"{dataset}-index")
corpus_file = os.path.join(dataset_folder, "corpus.json")
selected_indices_file = os.path.join(dataset_folder, "selected_indices.json")
os.makedirs(index_folder, exist_ok=True)

# --- GTE encoder: half-precision weights, placed on the GPU, inference mode ---
model_name = "Alibaba-NLP/gte-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
model = model.to("cuda")
model.eval()

# Row-wise L2 normalisation helper
def normalize_embeddings(embeddings):
    """Return `embeddings` with every row (dim=1) rescaled to unit L2 norm."""
    unit_rows = F.normalize(embeddings, p=2, dim=1)
    return unit_rows

# Document encoding function
def encode_documents(corpus, tokenizer, model, batch_size=32):
    """
    Encode documents using GTE with normalization.

    Each document is rendered as "<title> <text>", tokenized (truncated to
    512 tokens), run through `model`, CLS-pooled (hidden state of the first
    token) and L2-normalized.

    Args:
        corpus: Mapping of doc_id -> dict providing 'title' and 'text'.
        tokenizer: Tokenizer called on a list of strings; its result must
            support `.to(device)` and `**` unpacking into `model`.
        model: Encoder whose output exposes `last_hidden_state`; inputs are
            moved to `model.device`.
        batch_size: Number of documents encoded per forward pass.

    Returns:
        (doc_ids, embeddings): `doc_ids` in corpus iteration order and a
        float32 NumPy array whose row i is the embedding of `doc_ids[i]`.
    """
    doc_ids = list(corpus.keys())
    texts = [f"{corpus[doc_id]['title']} {corpus[doc_id]['text']}" for doc_id in doc_ids]
    # Follow the model's own placement instead of assuming a CUDA device.
    device = model.device
    embeddings = []

    print("Encoding documents...")
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding batches", unit="batch"):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # The model may run in float16; upcast before normalizing so the
            # vectors are unit-length in float32, the dtype FAISS works with.
            cls_embeddings = outputs.last_hidden_state[:, 0].float()  # CLS pooling
            pooled_embeddings = normalize_embeddings(cls_embeddings)

        embeddings.append(pooled_embeddings.cpu())

    embeddings = torch.cat(embeddings, dim=0).numpy()  # Combine all batches into a single float32 array
    return doc_ids, embeddings

# Read the corpus JSON (doc_id -> document record)
print("Loading corpus...")
with open(corpus_file, "r") as f:
    corpus = json.loads(f.read())

# Read the member / non-member split
print("Loading selected indices...")
with open(selected_indices_file, "r") as f:
    selected_indices = json.loads(f.read())

# Drop every document whose id is listed under "non_mem_indices"
non_mem_indices = set(selected_indices["non_mem_indices"])
filtered_corpus = dict(
    (doc_id, content)
    for doc_id, content in corpus.items()
    if doc_id not in non_mem_indices
)
print(f"Filtered corpus size: {len(filtered_corpus)} documents")

# Embed the retained documents
doc_ids, doc_embeddings = encode_documents(filtered_corpus, tokenizer, model)

# Build a flat inner-product FAISS index over the embeddings
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(doc_embeddings)

# Persist the index together with the row -> doc_id mapping
doc_ids_path = os.path.join(index_folder, "doc_ids.pkl")
faiss_index_path = os.path.join(index_folder, "corpus_index.faiss")

faiss.write_index(index, faiss_index_path)
with open(doc_ids_path, "wb") as f:
    pickle.dump(doc_ids, f)

print(f"Indexing completed. Index saved to: {index_folder}")


Loading corpus...
Loading selected indices...
Filtered corpus size: 2633 documents
Encoding documents...


Encoding batches: 100%|██████████| 83/83 [00:14<00:00,  5.73batch/s]

Indexing completed. Index saved to: ./datasets/nfcorpus/gte/indexes/nfcorpus-index





### test

In [31]:
import os
import json
import pickle
import torch
import faiss
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Paths
# Define dataset_folder here so this cell does not depend on the indexing
# cell having been executed earlier in the same kernel session.
dataset_folder = f"./datasets/{dataset}"
index_folder = os.path.join(dataset_folder, "gte", "indexes", f"{dataset}-index")
faiss_index_path = os.path.join(index_folder, "corpus_index.faiss")
doc_ids_path = os.path.join(index_folder, "doc_ids.pkl")
corpus_file = os.path.join(dataset_folder, "corpus.json")

# Model setup: same checkpoint and precision as the GTE indexing cell
model_name = "Alibaba-NLP/gte-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16).to("cuda")
model.eval()

# Unit-length scaling of embedding rows
def normalize_embeddings(embeddings):
    """L2-normalise `embeddings` along dim=1 and return the result."""
    normalized = F.normalize(embeddings, p=2, dim=1)
    return normalized

# Batch Query Encoding Function
def encode_queries(queries, tokenizer, model, batch_size=32):
    """
    Encode a batch of queries using GTE.

    Queries are tokenized (truncated to 512 tokens), run through `model`,
    CLS-pooled (hidden state of the first token) and L2-normalized.

    Args:
        queries: List of query strings.
        tokenizer: Tokenizer called on a list of strings; its result must
            support `.to(device)` and `**` unpacking into `model`.
        model: Encoder whose output exposes `last_hidden_state`; inputs are
            moved to `model.device`.
        batch_size: Number of queries encoded per forward pass.

    Returns:
        float32 NumPy array with one row per query, in input order.
    """
    # Follow the model's own placement instead of assuming a CUDA device.
    device = model.device
    embeddings = []
    for i in range(0, len(queries), batch_size):
        batch_queries = queries[i:i + batch_size]
        inputs = tokenizer(batch_queries, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            # The model may run in float16; upcast before normalizing so the
            # vectors are unit-length in float32, the dtype FAISS works with.
            cls_embeddings = outputs.last_hidden_state[:, 0].float()  # CLS pooling
            batch_embeddings = normalize_embeddings(cls_embeddings)
            embeddings.append(batch_embeddings.cpu())
    return torch.cat(embeddings, dim=0).numpy()

# Restore the FAISS index written by the GTE indexing cell
print("Loading FAISS index...")
faiss_index = faiss.read_index(faiss_index_path)

# Restore the row -> doc_id mapping.
# pickle.load can execute code embedded in the file; only load the doc_ids.pkl
# produced by the indexing cell of this notebook.
print("Loading document IDs...")
with open(doc_ids_path, "rb") as f:
    doc_ids = pickle.load(f)

# The full corpus is needed to show titles / text for the retrieved ids
print("Loading corpus...")
with open(corpus_file, "r") as f:
    corpus = json.loads(f.read())

# Test Queries
queries = [
    "Is dexmedetomidine used as part of enhanced recovery after surgery (ERAS) protocols to reduce opioid consumption in the Post anesthesia care unit (PACU)?",
    "Does the 2016 American Society of Anesthesiologists (ASA) guidelines recommend a multimodal opioid-sparing approach for postoperative pain management?",
    "Does persistent postsurgical pain (PPP) have an incidence rate of up to 50%?",
    "Can dexmedetomidine be used as an adjuvant in epidurals to enhance local anesthetic sparing effects?",
    "Is local infiltration of IV dexmedetomidine associated with earlier discharge from the Post anesthesia care unit (PACU)?",
    "Is dexmedetomidine known to provide analgesic effects through a different mechanism of action compared to opioids?",
    "Does dexmedetomidine improve hemodynamic stability when used at the end of a surgical procedure?",
    "Is dexmedetomidine associated with better pain response than ropivacaine when initiated at the end of surgery?",
    "Can dexmedetomidine be used during nerve blocks to reduce postoperative pain?",
    "Is the perioperative use of dexmedetomidine known to significantly improve postoperative outcomes in ERAS protocols?",
    "Does the incorporation of dexmedetomidine into ERAS protocols contribute to reduced opioid consumption in the Post anesthesia care unit (PACU)?",
    "Is dexmedetomidine classified as a selective alpha(2) agonist with unique analgesic properties compared to opioids?",
    "Are regional nerve blocks combined with dexmedetomidine used to improve postoperative outcomes in ERAS protocols?",
    "Does local infiltration of IV dexmedetomidine correlate with improved discharge times from the PACU?",
    "Is the use of dexmedetomidine as an adjuvant in epidurals intended to enhance local anesthetic sparing effects?"
]

# Embed all queries at once
print("Encoding queries...")
query_embeddings = encode_queries(queries, tokenizer, model)

# Retrieve the k best-scoring documents for every query
k = 3
print(f"Performing search for top-{k} results for each query...")
distances, indices = faiss_index.search(query_embeddings, k)

# Translate index rows back into corpus entries and print them
print("\nSearch Results:")
for query_idx, query in enumerate(queries):
    query_number = query_idx + 1
    print(f"Query {query_number}: {query}")
    print(f"Top-{k} results:")
    for rank, idx in enumerate(indices[query_idx], 1):
        doc_id = doc_ids[idx]
        doc_content = corpus[doc_id]
        title = doc_content.get('title', '')
        snippet = doc_content.get('text', '')[:200]
        print(f"  Rank {rank}:")
        print(f"    Doc ID: {doc_id}")
        print(f"    Title: {title}")
        print(f"    Text Snippet: {snippet}...")
    print()


Loading FAISS index...
Loading document IDs...
Loading corpus...
Encoding queries...
Performing search for top-3 results for each query...

Search Results:
Query 1: Is dexmedetomidine used as part of enhanced recovery after surgery (ERAS) protocols to reduce opioid consumption in the Post anesthesia care unit (PACU)?
Top-3 results:
  Rank 1:
    Doc ID: yzv43e8l
    Title: Dexmedetomidine in Enhanced Recovery After Surgery (ERAS) Protocols for Postoperative Pain
    Text Snippet: PURPOSE OF REVIEW: Effective acute pain management has evolved considerably in recent years and is a primary area of focus in attempts to defend against the opioid epidemic. Persistent postsurgical pa...
  Rank 2:
    Doc ID: 96w3ygrv
    Title: Role of dexmedetomidine infusion after coronary artery bypass grafting
    Text Snippet: BACKGROUND: Postoperative pain has negative consequences on patients’ outcomes after cardiac surgery. Routine management with opioid and or non-steroidal anti-inflammatory medicati

## ColBERT

### indexing

In [None]:
import pandas as pd
import json
import os
from ragatouille import RAGPretrainedModel

# Dataset paths
corpus_file = f'./datasets/{dataset}/corpus.json'
selected_indices_file = f'./datasets/{dataset}/selected_indices.json'
output_dir = f'./datasets/{dataset}'

# Step 1: Load the corpus
with open(corpus_file, 'r') as file:
    corpus = json.load(file)

# Step 2: Load selected indices (non-members to exclude)
with open(selected_indices_file, "r") as f:
    selected_indices = json.load(f)  # Assuming it contains "mem_indices" and "non_mem_indices"

# Step 3: Filter the corpus to exclude non-members
non_mem_indices = set(selected_indices["non_mem_indices"])
filtered_corpus = {doc_id: content for doc_id, content in corpus.items() if doc_id not in non_mem_indices}

# Step 4: Transform the filtered corpus into a pandas DataFrame
records = []
for doc_id, content in filtered_corpus.items():
    records.append({
        'id': doc_id,
        'title': content.get('title', ''),
        'text': content.get('text', '')
    })

df = pd.DataFrame(records)

# Ensure the DataFrame contains the required fields
print("Filtered DataFrame:")
print(df.head())

# Step 5: Prepare the data for indexing.
# Only the 'text' field is indexed; the title travels along as metadata.
document_ids = df['id'].tolist()
my_documents = df['text'].tolist()
document_metadatas = [{"title": title} for title in df['title']]

# Step 6: Load the pretrained RAG model
model_name = "colbert-ir/colbertv2.0"  # Replace with your actual model
RAG = RAGPretrainedModel.from_pretrained(model_name, index_root=output_dir)

# Step 7: Index the documents
index_name = f"{dataset}-index"
# Ensure the parent directory of the index exists. The index is written to
# <output_dir>/colbert/indexes/<index_name> (see the indexer log and the test
# cell), so the path must be rooted at output_dir itself. The previous
# os.path.dirname(output_dir) created a stray ./datasets/colbert/... tree.
# Only the parent is created so that no empty index directory is left behind
# before the indexer runs.
os.makedirs(os.path.join(output_dir, 'colbert', 'indexes'), exist_ok=True)

index_path = RAG.index(
    index_name=index_name,
    collection=my_documents,
    document_ids=document_ids,
    document_metadatas=document_metadatas,
    max_document_length = 512
)
print(f"Indexing completed. Index saved at: {index_path}")


Filtered DataFrame:
         id                                              title  \
0  ug7v899j  Clinical features of culture-proven Mycoplasma...   
1  02tnwd4m  Nitric oxide: a pro-inflammatory mediator in l...   
2  ejv2xln0    Surfactant protein-D and pulmonary host defense   
3  9785vg6d  Gene expression in epithelial cells in respons...   
4  zjufx4fo  Sequence requirements for RNA strand transfer ...   

                                                text  
0  OBJECTIVE: This retrospective chart review des...  
1  Inflammatory diseases of the respiratory tract...  
2  Surfactant protein-D (SP-D) participates in th...  
3  Respiratory syncytial virus (RSV) and pneumoni...  
4  Nidovirus subgenomic mRNAs contain a leader se...  


[Nov 29, 18:03:17] #> Note: Output directory ./datasets/trec-covid/colbert/indexes/trec-covid-index already exists


[Nov 29, 18:03:17] #> Will delete 39 files already at ./datasets/trec-covid/colbert/indexes/trec-covid-index in 20 seconds...
[Nov 29,

0it [00:00, ?it/s]

[Nov 29, 18:07:46] [0] 		 #> Encoding 25000 passages..


1it [01:01, 61.28s/it]

[Nov 29, 18:08:47] [0] 		 #> Encoding 25000 passages..


2it [02:03, 61.98s/it]

[Nov 29, 18:09:50] [0] 		 #> Encoding 25000 passages..


3it [03:04, 61.48s/it]

[Nov 29, 18:10:51] [0] 		 #> Encoding 25000 passages..


4it [04:05, 61.35s/it]

[Nov 29, 18:11:52] [0] 		 #> Encoding 12561 passages..


5it [04:36, 55.36s/it]
100%|██████████| 5/5 [00:00<00:00, 17.14it/s]


[Nov 29, 18:12:27] #> Optimizing IVF to store map from centroids to list of pids..
[Nov 29, 18:12:27] #> Building the emb2pid mapping..
[Nov 29, 18:12:27] len(emb2pid) = 30859788


100%|██████████| 65536/65536 [00:03<00:00, 20010.07it/s]


[Nov 29, 18:12:31] #> Saved optimized IVF to ./datasets/trec-covid/colbert/indexes/trec-covid-index/ivf.pid.pt
Done indexing!
Indexing completed. Index saved at: datasets/trec-covid/colbert/indexes/trec-covid-index


### test

In [None]:
import os
from ragatouille import RAGPretrainedModel

# Location of the ColBERT index written by the indexing cell
index_name = f"{dataset}-index"
index_path = os.path.join("./datasets", dataset, 'colbert', 'indexes', index_name)
model_name = "colbert-ir/colbertv2.0"

# Re-open the RAG model directly from the stored index
colbert = RAGPretrainedModel.from_index(index_path)

# One example query and the number of hits to fetch
query = "Can donor and acceptor template homology facilitate recombinogenic transfers?"
k = 3

# Run the search
results = colbert.search(query, k=k)

# Print every returned hit as-is
for result in results:
    print(result)



Loading searcher for index trec-covid-index for the first time... This may take a few seconds
[Nov 29, 18:13:56] #> Loading codec...
[Nov 29, 18:13:56] #> Loading IVF...
[Nov 29, 18:13:56] #> Loading doclens...


100%|██████████| 5/5 [00:00<00:00, 831.11it/s]

[Nov 29, 18:13:56] #> Loading codes and residuals...



100%|██████████| 5/5 [00:02<00:00,  2.32it/s]

Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Can donor and acceptor template homology facilitate recombinogenic transfers?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2064, 15009,  1998,  5138,  2953, 23561, 24004,  6483,
        10956, 28667,  5358, 21891, 16505, 15210,  1029,   102,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

{'content': 'Abstract Reverse transcription requires two replicative template switches, called minus and plus strand strong stop transfer, and can include additional, recombinogenic switches. Donor and acceptor template homology facilitates both replicative and recombinogenic transfers, but homology-independent determinants ma




## BGE

### indexing

In [34]:
import os
import json
import pickle
import torch
import faiss
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Dataset paths
dataset_folder = f"./datasets/{dataset}"
corpus_file = os.path.join(dataset_folder, "corpus.json")
selected_indices_file = os.path.join(dataset_folder, "selected_indices.json")
index_folder = os.path.join(dataset_folder, "bge", "indexes", f"{dataset}-index")
os.makedirs(index_folder, exist_ok=True)

# Model setup
model_name = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The dtype must be passed by keyword: a positional argument after the model
# name is forwarded to the model constructor instead of selecting the weight
# precision, so the previous call did not request float16 weights.
model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")
model.eval()

# Row-wise L2 normalisation helper
def normalize_embeddings(embeddings):
    """Return `embeddings` with each row (dim=1) scaled to unit L2 norm."""
    l2_normalize = torch.nn.functional.normalize
    return l2_normalize(embeddings, p=2, dim=1)

# Document encoding function
def encode_documents(corpus, tokenizer, model, batch_size=256):
    """
    Encode documents using BGE.

    Each document is rendered as "<title> <text>", tokenized (truncated to
    512 tokens), run through `model`, CLS-pooled (hidden state of the first
    token) and L2-normalized.

    Args:
        corpus: Mapping of doc_id -> dict providing 'title' and 'text'.
        tokenizer: Tokenizer called on a list of strings; its result must
            support `.to(device)` and `**` unpacking into `model`.
        model: Encoder whose output exposes `last_hidden_state`; inputs are
            moved to `model.device`.
        batch_size: Number of documents encoded per forward pass.

    Returns:
        (doc_ids, embeddings): `doc_ids` in corpus iteration order and a
        float32 NumPy array whose row i is the embedding of `doc_ids[i]`.
    """
    doc_ids = list(corpus.keys())
    texts = [f"{corpus[doc_id]['title']} {corpus[doc_id]['text']}" for doc_id in doc_ids]
    # Follow the model's own placement instead of assuming a CUDA device.
    device = model.device
    embeddings = []

    print("Encoding documents...")
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding batches", unit="batch"):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            # Upcast before normalizing so the vectors are unit-length in
            # float32 (the dtype FAISS works with) even if the model is fp16.
            cls_embeddings = outputs.last_hidden_state[:, 0].float()  # CLS pooling
            pooled_embeddings = normalize_embeddings(cls_embeddings)

        # Move each batch off the accelerator right away so the embeddings of
        # the whole corpus never accumulate in device memory.
        embeddings.append(pooled_embeddings.cpu())

    embeddings = torch.cat(embeddings, dim=0).numpy()  # Single float32 array for FAISS
    return doc_ids, embeddings

# Read the corpus JSON (doc_id -> document record)
print("Loading corpus...")
with open(corpus_file, "r") as f:
    corpus = json.loads(f.read())

# Read the split file and collect the ids that must not be indexed
print("Loading selected indices...")
with open(selected_indices_file, "r") as f:
    selected_indices = json.loads(f.read())
non_mem_indices = set(selected_indices["non_mem_indices"])

# Keep only documents that are not listed as non-members
print("Filtering corpus to exclude non-members...")
filtered_corpus = dict(
    (doc_id, content)
    for doc_id, content in corpus.items()
    if doc_id not in non_mem_indices
)

# Embed the retained documents (encode_documents prints the same banner again)
print("Encoding documents...")
doc_ids, doc_embeddings = encode_documents(filtered_corpus, tokenizer, model)

# Build a flat inner-product FAISS index over the embeddings
print("Saving embeddings and document IDs...")
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(doc_embeddings)

# Persist the index together with the row -> doc_id mapping
doc_ids_path = os.path.join(index_folder, "doc_ids.pkl")
faiss_index_path = os.path.join(index_folder, "corpus_index.faiss")

faiss.write_index(index, faiss_index_path)
with open(doc_ids_path, "wb") as f:
    pickle.dump(doc_ids, f)

print(f"Indexing completed. Index saved to: {index_folder}")


Loading corpus...
Loading selected indices...
Filtering corpus to exclude non-members...
Encoding documents...
Encoding documents...


Encoding batches: 100%|██████████| 11/11 [00:55<00:00,  5.03s/batch]


Saving embeddings and document IDs...
Indexing completed. Index saved to: ./datasets/nfcorpus/bge/indexes/nfcorpus-index


### Test

In [17]:
import os
import json
import pickle
import torch
import faiss
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# --- Where the BGE index and the corpus live ---
dataset_folder = f"./datasets/{dataset}"
corpus_file = os.path.join(dataset_folder, "corpus.json")
index_folder = os.path.join(dataset_folder, "bge", "indexes", f"{dataset}-index")

# Files written by the BGE indexing cell
doc_ids_path = os.path.join(index_folder, "doc_ids.pkl")
faiss_index_path = os.path.join(index_folder, "corpus_index.faiss")

# --- BGE encoder on the GPU, inference mode ---
# (For Chinese text the 'BAAI/bge-large-zh-v1.5' checkpoint would be used instead.)
model_name = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to("cuda")
model.eval()

# Unit-length scaling of embedding rows
def normalize_embeddings(embeddings):
    """L2-normalise `embeddings` along dim=1 and return the result."""
    normalized = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return normalized

# Query encoder (all queries in a single forward pass)
def encode_query(queries, tokenizer, model):
    """
    Embed `queries` with BGE.

    The queries are tokenized as one padded batch (truncated to 512 tokens),
    moved to "cuda", CLS-pooled and L2-normalized via `normalize_embeddings`.
    Returns a NumPy array with one row per query.
    """
    encoded = tokenizer(queries, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded = encoded.to("cuda")
    with torch.no_grad():
        cls_states = model(**encoded).last_hidden_state[:, 0]  # first-token hidden state
        query_embeddings = normalize_embeddings(cls_states)
    on_cpu = query_embeddings.cpu()
    return on_cpu.numpy()

# Restore the FAISS index written by the BGE indexing cell
print("Loading FAISS index...")
faiss_index = faiss.read_index(faiss_index_path)

# Restore the row -> doc_id mapping.
# pickle.load can execute code embedded in the file; only load the doc_ids.pkl
# produced by the indexing cell of this notebook.
print("Loading document IDs...")
with open(doc_ids_path, "rb") as f:
    doc_ids = pickle.load(f)

# The full corpus is needed to show titles / text for the retrieved ids
print("Loading corpus...")
with open(corpus_file, "r") as f:
    corpus = json.loads(f.read())

# Queries used for the retrieval smoke test (extend as needed)
queries = [
            "Do passively acquired antibodies through colostrum help protect calves against neonatal calf diarrhea?",
            "Are enterotoxigenic Escherichia coli (ETEC) strains the most common cause of colibacillosis in newborn calves?",
            "Is there a significant difference in antibody titers between normal and diarrheic calf groups?",
            "Are the total IgG concentrations in diarrheic calves lower than those in normal calves?",
            "Do diarrheic calves have higher anti-E. coli antibody levels than normal calves?",
            "Is there a positive correlation between the total IgG in dam serum and the total IgG in calf serum?",
            "Is there a significant positive correlation between colostral anti-E. coli antibodies and anti-E. coli antibodies in calf serum?",
            "Does anti-E. coli antibody in calf serum have a negative correlation with total IgG in dam serum?",
            "Is there a negative correlation between anti-E. coli antibody levels in calf serum and total IgG in colostrum?",
            "Are higher anti-E. coli antibodies in diarrheic calves associated with lower total IgG levels?",
            "Is passive immunity to diarrhea potentially influenced by cytokines transferred by colostrum to neonatal calves?",
            "Do maternal cellular components potentially contribute to immunity against diarrhea in neonatal calves?",
            "Does the ratio of dam IgG to calf IgG equal 52.11?",
            "Is the correlation coefficient (r) between colostral anti-E. coli antibody and calf serum anti-E. coli antibody 0.345?",
            "Are maternally derived antibodies considered a good indicator of passive immunity against diarrhea in calves?"
        ]

# Embed all queries at once
print("Encoding queries...")
query_embeddings = encode_query(queries, tokenizer, model)

# Retrieve the k best-scoring documents for every query
k = 3
print(f"Performing search for top-{k} results...")
distances, indices = faiss_index.search(query_embeddings, k)

# Translate index rows back into corpus entries and print them
print("\nSearch Results:")
for query_idx, query in enumerate(queries):
    print(f"\nQuery: {query}")
    for rank, idx in enumerate(indices[query_idx], 1):
        doc_id = doc_ids[idx]
        # Unknown ids fall back to an empty record, hence the placeholder strings
        doc_content = corpus.get(doc_id, {})
        title = doc_content.get('title', 'No Title')
        full_text = doc_content.get('text', 'No Text')
        text_snippet = full_text[:200]

        print(f"  Rank {rank}:")
        print(f"    Doc ID: {doc_id}")
        print(f"    Title: {title}")
        print(f"    Text Snippet: {text_snippet}...")


Loading FAISS index...
Loading document IDs...
Loading corpus...
Encoding queries...
Performing search for top-3 results...

Search Results:

Query: Do passively acquired antibodies through colostrum help protect calves against neonatal calf diarrhea?
  Rank 1:
    Doc ID: emsavvl8
    Title: Correlation between neonatal calf diarrhea and the level of maternally derived antibodies.
    Text Snippet: Passively acquired antibodies through colostrum will protect calves against etiological agents of neonatal calf diarrhea. Among them enteric diseases due to strains of Enterotoxigenic Escherichia coli...
  Rank 2:
    Doc ID: gsa47y20
    Title: Passive immunity to bovine rotavirus in newborn calves fed colostrum supplements from immunized or nonimmunized cows.
    Text Snippet: Colostrum was collected and pooled from each of five cows in three experimental groups: group I cows received intramuscular and intramammary inoculations of adjuvanted modified live Ohio Agricultural ...
  Rank 3:
 

# Not used

## Contriever

### indexing

In [2]:
import os
import json
import pickle
import torch
import faiss
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# --- Filesystem layout for the Contriever index ---
dataset_folder = f"./datasets/{dataset}"
index_folder = os.path.join(dataset_folder, "contriever", "indexes", f"{dataset}-index")
corpus_file = os.path.join(dataset_folder, "corpus.json")
selected_indices_file = os.path.join(dataset_folder, "selected_indices.json")  # holds the non-member ids
os.makedirs(index_folder, exist_ok=True)

# --- Contriever encoder ---
model_name = "facebook/contriever-msmarco"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Prefer the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
model.eval()

# Masked mean over the token axis
def mean_pooling(token_embeddings, mask):
    """
    Average token embeddings over dim=1, ignoring masked-out positions.

    Positions where `mask` is zero are zeroed before summing, and the sum is
    divided by the per-row count of non-zero mask entries.
    """
    keep = mask.bool().unsqueeze(-1)
    zeroed = token_embeddings.masked_fill(~keep, 0.)
    token_counts = mask.sum(dim=1).unsqueeze(-1)
    summed = zeroed.sum(dim=1)
    return summed / token_counts

# Corpus encoder
def encode_documents(corpus, tokenizer, model, device, batch_size=32):
    """
    Embed every document of `corpus` with the Contriever model.

    Documents are rendered as "<title> <text>", tokenized in batches of
    `batch_size` (truncated to 512 tokens), mean-pooled over the attention
    mask and L2-normalized. Returns `(doc_ids, embeddings)` where row i of
    the NumPy array belongs to `doc_ids[i]`.
    """
    doc_ids = list(corpus.keys())
    texts = []
    for doc_id in doc_ids:
        record = corpus[doc_id]
        texts.append(f"{record['title']} {record['text']}")

    batch_starts = range(0, len(texts), batch_size)
    chunks = []
    for start in tqdm(batch_starts, desc="Encoding batches", unit="batch"):
        encoded = tokenizer(
            texts[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)

        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            pooled = mean_pooling(hidden_states, encoded['attention_mask'])
            pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)  # unit-length rows

        chunks.append(pooled.cpu())

    stacked = torch.cat(chunks, dim=0)
    return doc_ids, stacked.numpy()

# Query encoder
def encode_query(query, tokenizer, model, device):
    """
    Embed `query` with Contriever.

    The query is tokenized (truncated to 512 tokens), moved to `device`,
    mean-pooled over the attention mask and L2-normalized. Returns the
    embedding as a NumPy array.
    """
    encoded = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    encoded = encoded.to(device)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = mean_pooling(hidden_states, encoded['attention_mask'])
        query_embedding = torch.nn.functional.normalize(pooled, p=2, dim=1)
    on_cpu = query_embedding.cpu()
    return on_cpu.numpy()

# Read the corpus JSON (doc_id -> document record)
print("Loading corpus...")
with open(corpus_file, "r") as f:
    corpus = json.loads(f.read())

# Read the split file and collect the ids that must not be indexed
print("Loading selected indices...")
with open(selected_indices_file, "r") as f:
    selected_indices = json.loads(f.read())
non_mem_indices = set(selected_indices["non_mem_indices"])

# Keep only documents that are not listed as non-members
filtered_corpus = dict(
    (doc_id, content)
    for doc_id, content in corpus.items()
    if doc_id not in non_mem_indices
)
print(f"Filtered corpus size: {len(filtered_corpus)}")

# Embed the retained documents, 256 per forward pass
print("Encoding documents...")
doc_ids, doc_embeddings = encode_documents(
    filtered_corpus,
    tokenizer,
    model,
    device,
    batch_size=256,
)

# Build a flat inner-product FAISS index over the embeddings
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(doc_embeddings)

# Persist the index together with the row -> doc_id mapping
doc_ids_path = os.path.join(index_folder, "doc_ids.pkl")
faiss_index_path = os.path.join(index_folder, "corpus_index.faiss")

faiss.write_index(index, faiss_index_path)
with open(doc_ids_path, "wb") as f:
    pickle.dump(doc_ids, f)

print(f"Indexing completed. Index saved to: {index_folder}")

  from .autonotebook import tqdm as notebook_tqdm


Loading corpus...
Loading selected indices...
Filtered corpus size: 105520
Encoding documents...


Encoding batches: 100%|██████████| 413/413 [12:19<00:00,  1.79s/batch]


Indexing completed. Index saved to: ./datasets/trec-covid/contriever/indexes/trec-covid-index


### Test

In [4]:
# Retrieval smoke test. Relies on `index`, `doc_ids`, `corpus`, `encode_query`,
# `tokenizer`, `model` and `device` from the Contriever indexing cell.
query = "PURPOSE OF REVIEW: Effective acute pain management has evolved considerably in recent years and is a primary area of focus in attempts to defend against the opioid epidemic. Persistent postsurgical pain (PPP) has an incidence of up to 30\u201350% and has negative outcome of quality of life and negative burden on individuals, family, and society. The 2016 American Society of Anesthesiologists (ASA) guidelines states that enhanced recovery after surgery (ERAS) forms an integral part of Perioperative Surgical Home (PSH) and is now recommended to use a multimodal opioid-sparing approach for management of postoperative pain. As such, dexmedetomidine is now being used as part of ERAS protocols along with regional nerve blocks and other medications, to create a satisfactory postoperative outcome with reduced opioid consumption in the Post anesthesia care unit (PACU). RECENT FINDINGS: Dexmedetomidine, a selective alpha(2) agonist, possesses analgesic effects and has a different mechanism of action when compared with opioids. When dexmedetomidine is initiated at the end of a procedure, it has a better hemodynamic stability and pain response than ropivacaine. Dexmedetomidine can be used as an adjuvant in epidurals with local anesthetic sparing effects. Its use during nerve blocks results in reduced postoperative pain. Also, local infiltration of IV dexmedetomidine is associated with earlier discharge from PACU. SUMMARY: Perioperative use of dexmedetomidine has significantly improved postoperative outcomes when used as part of ERAS protocols. An in-depth review of the use of dexmedetomidine in ERAS protocols is presented for clinical anesthesiologist"
print(f"Encoding query: {query}")
query_embedding = encode_query(query, tokenizer, model, device)

# Fetch the k best-scoring documents from the in-memory index
k = 3
print(f"Performing search for top-{k} results...")
distances, indices = index.search(query_embedding, k)

# Translate index rows back into corpus entries and print them
print("\nSearch Results:")
for rank, idx in enumerate(indices[0], 1):
    doc_id = doc_ids[idx]
    doc_content = corpus[doc_id]
    title = doc_content.get('title', '')
    snippet = doc_content.get('text', '')[:200]  # first 200 characters only
    print(f"Rank {rank}:")
    print(f"  Doc ID: {doc_id}")
    print(f"  Title: {title}")
    print(f"  Text Snippet: {snippet}...")

Encoding query: PURPOSE OF REVIEW: Effective acute pain management has evolved considerably in recent years and is a primary area of focus in attempts to defend against the opioid epidemic. Persistent postsurgical pain (PPP) has an incidence of up to 30–50% and has negative outcome of quality of life and negative burden on individuals, family, and society. The 2016 American Society of Anesthesiologists (ASA) guidelines states that enhanced recovery after surgery (ERAS) forms an integral part of Perioperative Surgical Home (PSH) and is now recommended to use a multimodal opioid-sparing approach for management of postoperative pain. As such, dexmedetomidine is now being used as part of ERAS protocols along with regional nerve blocks and other medications, to create a satisfactory postoperative outcome with reduced opioid consumption in the Post anesthesia care unit (PACU). RECENT FINDINGS: Dexmedetomidine, a selective alpha(2) agonist, possesses analgesic effects and has a different mech

## Qwen

### indexing

In [None]:
import os
import json
import pickle
import torch
import faiss
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# --- Filesystem layout for the Qwen index ---
dataset_folder = f"./datasets/{dataset}"
index_folder = os.path.join(dataset_folder, "qwen", "indexes", f"{dataset}-index")
corpus_file = os.path.join(dataset_folder, "corpus.json")
os.makedirs(index_folder, exist_ok=True)

# --- GTE-Qwen2 encoder (remote model code enabled), on the GPU, inference mode ---
model_name = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model = model.to("cuda")
model.eval()

# Pooling function
def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Pool the last token embeddings based on the attention mask.

    Works for both padding sides:
    - left-padded batch (every row's final position is attended): the last
      real token of every sequence sits at position -1;
    - right-padded batch: the last real token of row i sits at index
      `attention_mask[i].sum() - 1`.

    Args:
        last_hidden_states: Hidden states indexed as [batch, position, ...].
        attention_mask: Mask indexed as [batch, position], 1 for real tokens
            and 0 for padding.

    Returns:
        One hidden-state vector per batch row, taken at that row's last real
        token.
    """
    # With left padding the final column of the mask is all ones. Indexing by
    # sequence length would then select the wrong position, so take the final
    # position directly.
    left_padding = bool(attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]

    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

# Normalization function
def normalize_embeddings(embeddings):
    """Rescale every row of ``embeddings`` (dim 1) to unit L2 norm."""
    row_norms = embeddings.norm(p=2, dim=1, keepdim=True)
    # Floor the norm at 1e-12 so an all-zero row divides to zeros, not NaN.
    return embeddings / row_norms.clamp_min(1e-12)

# Document encoding function
def encode_documents(corpus, tokenizer, model, batch_size=256):
    """
    Encode documents using Qwen.

    Each document is rendered as "<title> <text>", tokenized (truncated to 512
    tokens), passed through the model, last-token pooled and L2-normalized.

    Args:
        corpus: mapping of doc_id -> dict with a "text" entry and an optional
            "title" entry.
        tokenizer: tokenizer matching ``model``.
        model: the embedding model; inputs are moved to ``model.device``.
        batch_size: number of documents encoded per forward pass.

    Returns:
        (doc_ids, embeddings): the document ids in corpus order and a float32
        numpy array whose row i is the embedding of ``doc_ids[i]``. For an
        empty corpus the array has shape (0, model.config.hidden_size).
    """
    doc_ids = list(corpus.keys())
    if not doc_ids:
        # torch.cat() rejects an empty list, so hand back an empty matrix with
        # the right width instead of crashing at the end of the function.
        return doc_ids, torch.empty((0, model.config.hidden_size), dtype=torch.float32).numpy()

    # A missing title is treated as empty; the "<title> <text>" layout is kept
    # unchanged so embeddings stay comparable with previously built indexes.
    texts = [f"{doc.get('title', '')} {doc['text']}" for doc in (corpus[doc_id] for doc_id in doc_ids)]
    device = model.device  # follow the model rather than assuming "cuda"
    embeddings = []

    print("Encoding documents...")
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding batches", unit="batch"):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            pooled_embeddings = last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])
            normalized_embeddings = normalize_embeddings(pooled_embeddings)

        # FAISS works on float32; the cast is a no-op for a float32 model and
        # keeps the function usable if the model is loaded in half precision.
        embeddings.append(normalized_embeddings.float().cpu())

    embeddings = torch.cat(embeddings, dim=0).numpy()  # Combine and convert to numpy
    return doc_ids, embeddings

# Load corpus
print("Loading corpus...")
# Read the JSON as UTF-8 explicitly instead of relying on the platform's
# default locale encoding.
with open(corpus_file, "r", encoding="utf-8") as f:
    corpus = json.load(f)

# Encode documents
doc_ids, doc_embeddings = encode_documents(corpus, tokenizer, model)
# Row i of the matrix must describe doc_ids[i]; retrieval maps FAISS row
# numbers back to ids through this list.
assert doc_embeddings.shape[0] == len(doc_ids), "embedding rows and doc_ids are out of sync"

# Save document embeddings using FAISS
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner product index
index.add(doc_embeddings)
assert index.ntotal == len(doc_ids), "FAISS index size does not match the number of documents"

# Save FAISS index and document IDs
faiss_index_path = os.path.join(index_folder, "corpus_index.faiss")
doc_ids_path = os.path.join(index_folder, "doc_ids.pkl")

faiss.write_index(index, faiss_index_path)
with open(doc_ids_path, "wb") as f:
    pickle.dump(doc_ids, f)

print(f"Indexing completed. Index saved to: {index_folder}")


  from .autonotebook import tqdm as notebook_tqdm
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct:
- modeling_qwen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading shards: 100%|██████████| 2/2 [02:51<00:00, 85.53s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.08s/it]


Loading corpus...
Encoding documents...


Encoding batches: 100%|██████████| 414/414 [2:20:04<00:00, 20.30s/batch]  


Indexing completed. Index saved to: ./datasets/trec-covid/qwen/indexes/trec-covid-index


### test

In [None]:
import os
import json
import pickle
import torch
import faiss
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Dataset root, the corpus file, and the folder the indexing cell wrote to.
dataset_folder = f"./datasets/{dataset}"
corpus_file = os.path.join(dataset_folder, "corpus.json")
index_folder = os.path.join(dataset_folder, "qwen", "indexes", f"{dataset}-index")

# Artifacts inside the index folder: the FAISS index and the pickled list of
# document ids.
doc_ids_path = os.path.join(index_folder, "doc_ids.pkl")
faiss_index_path = os.path.join(index_folder, "corpus_index.faiss")

# Same embedding model that was used for indexing, loaded step by step:
# weights -> GPU -> inference mode.
model_name = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model = model.to("cuda")
model.eval()

# Pooling function
def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Return, for every sequence in the batch, the hidden state of its last real token.

    Handles both padding conventions: with left padding every row ends in a
    real token, so the final sequence position is used; with right padding the
    last real token of row ``i`` is at index ``attention_mask[i].sum() - 1``.

    Args:
        last_hidden_states: tensor indexed as (batch, sequence, hidden).
        attention_mask: (batch, sequence) mask with 1 for real tokens and 0 for padding.

    Returns:
        A (batch, hidden) tensor.
    """
    # Last mask column all ones => no row ends in padding (left-padded or
    # unpadded batch), so the final position is correct for every row.
    left_padding = bool(attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]

    # Right-padded batch: select a different position per row.
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

# Normalization function
def normalize_embeddings(embeddings):
    """Divide each row of ``embeddings`` by its L2 norm (taken over dim 1)."""
    # The 1e-12 floor keeps all-zero rows at zero instead of producing NaN.
    denom = embeddings.norm(p=2, dim=1, keepdim=True).clamp_min(1e-12)
    return embeddings / denom

# Query encoding function
def encode_query(queries, tokenizer, model):
    """
    Encode a list of queries using Qwen.

    Queries are tokenized (truncated to 512 tokens), passed through the model,
    last-token pooled and L2-normalized, mirroring how documents are embedded.

    Args:
        queries: list of query strings.
        tokenizer: tokenizer matching ``model``.
        model: the embedding model; inputs are moved to ``model.device``.

    Returns:
        A float32 numpy array with one row per query. An empty list yields an
        array of shape (0, model.config.hidden_size).
    """
    if isinstance(queries, (list, tuple)) and len(queries) == 0:
        # Nothing to encode: return an empty matrix instead of sending an
        # empty batch through the tokenizer and model.
        return torch.empty((0, model.config.hidden_size), dtype=torch.float32).numpy()

    # Follow the model's device rather than assuming "cuda".
    inputs = tokenizer(queries, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        query_embeddings = last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])
        query_embeddings = normalize_embeddings(query_embeddings)
    # FAISS search expects float32; the cast is a no-op for a float32 model.
    return query_embeddings.float().cpu().numpy()

# Load FAISS index and document IDs
print("Loading FAISS index...")
faiss_index = faiss.read_index(faiss_index_path)

print("Loading document IDs...")
# NOTE: pickle.load can execute arbitrary code from the file. Only load a
# doc_ids.pkl that was written by the indexing cell of this notebook.
with open(doc_ids_path, "rb") as f:
    doc_ids = pickle.load(f)

# Load corpus for mapping doc IDs to text
print("Loading corpus...")
# Read the JSON as UTF-8 explicitly instead of relying on the platform's
# default locale encoding.
with open(corpus_file, "r", encoding="utf-8") as f:
    corpus = json.load(f)

# Query retrieval test
queries = [
            "A total of 50 poultry farms of commercial broilers (N = 39) and commercial layers (N = 11) suffered from respiratory problems and mortality during the period"
        ]
print("Encoding queries...")
query_embeddings = encode_query(queries, tokenizer, model)

# Perform retrieval
k = 3  # Number of top results
print(f"Performing search for top-{k} results...")
distances, indices = faiss_index.search(query_embeddings, k)

# Map results back to corpus
print("\nSearch Results:")
for query_idx, query in enumerate(queries):
    print(f"\nQuery: {query}")
    for rank, idx in enumerate(indices[query_idx], start=1):
        if idx < 0:
            # FAISS fills missing results with -1 when the index holds fewer
            # than k vectors; doc_ids[-1] would silently show the last document.
            continue
        doc_id = doc_ids[idx]
        doc_content = corpus.get(doc_id, {})
        title = doc_content.get("title", "No Title")
        text_snippet = doc_content.get("text", "No Text")[:200]  # Display snippet
        print(f"  Rank {rank}:")
        print(f"    Doc ID: {doc_id}")
        print(f"    Title: {title}")
        print(f"    Text Snippet: {text_snippet}...")

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]


Loading FAISS index...
Loading document IDs...
Loading corpus...
Encoding queries...
Performing search for top-3 results...

Search Results:

Query: A total of 50 poultry farms of commercial broilers (N = 39) and commercial layers (N = 11) suffered from respiratory problems and mortality during the period
  Rank 1:
    Doc ID: dfx9z2g7
    Title: Seroprevalence of major avian respiratory diseases in broiler and sonali chicken in selected areas of Bangladesh
    Text Snippet: OBJECTIVE: This study was conducted to investigate different respiratory diseases in broiler and sonali birds in some selected districts of Bangladesh. MATERIALS AND METHODS: We were collected a total...
  Rank 2:
    Doc ID: dnswqp4j
    Title: Molecular survey and interaction of common respiratory pathogens in chicken flocks (field perspective)
    Text Snippet: AIM: The present study was designed for the detection of the most prevalent respiratory infections in chicken flocks and clarifying their interaction and