In [1]:
!pip install datasets
from datasets import load_dataset 



In [2]:
# Load only the 'train' split of the 'pqa_labeled' configuration of 'pubmed_qa'.
pubmed = load_dataset('pubmed_qa', 'pqa_labeled', split='train')
pubmed

Dataset({
    features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
    num_rows: 1000
})

In [3]:
# Each record keeps its passage as a list of paragraphs under 'contexts';
# join them with newlines so every record becomes a single string.
contexts = ['\n'.join(record['contexts']) for record in pubmed['context']]

# Preview the first 300 characters of the first three passages.
for context in contexts[:3]:
    print(f"{context[:300]}...")

Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cel...
Assessment of visual acuity depends on the optotypes used for measurement. The ability to recognize different optotypes differs even if their critical details appear under the same visual angle. Since optotypes are evaluated on individuals with good visual acuity and without eye disorders, differenc...
Apparent life-threatening events in infants are a difficult and frequent problem in pediatric practice. The prognosis is uncertain because of risk of sudden infant death syndrome.
Eight infants aged 2 to 15 months were admitted during a period of 6 years; they suffered from similar maladies in the b...


In [4]:
# Sparse Vector
!pip install transformers
from transformers import BertTokenizerFast
# Fast BERT tokenizer loaded from the 'bert-base-uncased' checkpoint; the
# token ids it produces are used below as sparse-vector indices.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')



In [None]:
from collections import Counter

def build_dict(input_batch):
    """Convert sequences of token ids into frequency-based sparse vectors.

    Args:
        input_batch: iterable of token-id sequences, one per document.

    Returns:
        A list with one dict per input sequence. ``'indices'`` holds the
        distinct token ids in first-seen order and ``'values'`` holds how
        many times each of those ids occurs in the sequence.
    """
    sparse_emb = []
    for token_ids in input_batch:
        frequencies = Counter(token_ids)
        sparse_emb.append({
            'indices': list(frequencies.keys()),
            'values': list(frequencies.values()),
        })
    return sparse_emb

def generate_sparse_vectors(context_batch):
    """Turn a batch of texts into token-frequency sparse vectors.

    Args:
        context_batch: list of strings to encode.

    Returns:
        A list with one ``{'indices': [...], 'values': [...]}`` dict per
        input text, as produced by ``build_dict`` from the token ids.
    """
    # No padding and no special tokens. With padding=True every text is padded
    # to the longest one in its batch, so [PAD] is counted as a term and the
    # same text gets a different vector depending on which batch it is in.
    # [CLS]/[SEP] would likewise be counted as terms in every vector.
    inputs = tokenizer(
        context_batch,
        padding=False,
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    )['input_ids']
    return build_dict(inputs)

# Rebuild the newline-joined passages so this cell does not depend on an
# earlier cell having populated `contexts`.
contexts = ['\n'.join(record['contexts']) for record in pubmed['context']]

# Frequency-based sparse vectors for the whole corpus.
sparse_vectors = generate_sparse_vectors(contexts)


In [None]:
!pip install rank_bm25
from rank_bm25 import BM25Okapi
from collections import Counter
import numpy as np

def build_bm25(input_batch):
    """Build one sparse vector of BM25 term weights per document.

    ``BM25Okapi.get_scores(query)`` returns one score per *document* in the
    corpus, so enumerating its result yields document positions, not token
    ids. To obtain per-token weights, each vocabulary term is scored on its
    own: the single-term query's score for a document is that term's BM25
    weight in the document.

    Args:
        input_batch: list of tokenized documents. Every token must be
            convertible with ``int()`` (e.g. stringified token ids), because
            the integer value is used as the sparse index.

    Returns:
        A list with one ``{'indices': [...], 'values': [...]}`` dict per
        document, where ``indices`` are token ids and ``values`` are the
        corresponding positive BM25 weights as Python floats.
    """
    # BM25Okapi divides by the corpus size, so an empty corpus must be
    # handled before constructing it.
    if not input_batch:
        return []

    bm25 = BM25Okapi(input_batch)
    sparse_emb = [{'indices': [], 'values': []} for _ in input_batch]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output order is deterministic.
    vocabulary = dict.fromkeys(
        token for doc in input_batch for token in doc
    )
    for token in vocabulary:
        # Weight of this one term in every document of the corpus.
        term_weights = bm25.get_scores([token])
        # Keep only documents where the term carries a positive weight.
        for doc_idx in np.flatnonzero(term_weights > 0):
            sparse_emb[doc_idx]['indices'].append(int(token))
            sparse_emb[doc_idx]['values'].append(float(term_weights[doc_idx]))
    return sparse_emb

def generate_sparse_vectors_bm25(context_batch):
    """Tokenize a batch of texts and build BM25-based sparse vectors.

    Args:
        context_batch: list of strings forming the BM25 corpus.

    Returns:
        Whatever ``build_bm25`` returns for the tokenized corpus: a list with
        one ``{'indices': [...], 'values': [...]}`` dict per input text.
    """
    # No padding and no special tokens: padding every text to a common length
    # would give all documents the same length and add a [PAD] term, which
    # defeats BM25's document-length normalisation.
    inputs = tokenizer(
        context_batch,
        padding=False,
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    )['input_ids']

    # build_bm25 is fed string tokens, so stringify every token id.
    inputs_str = [[str(token) for token in doc] for doc in inputs]

    return build_bm25(inputs_str)

# Example usage: rebuild the newline-joined passages, then encode the whole
# corpus with the BM25-based sparse encoder.
contexts = ['\n'.join(record['contexts']) for record in pubmed['context']]

sparse_vectors_bm25 = generate_sparse_vectors_bm25(contexts)


In [None]:
# Print the first document's BM25 sparse embedding as a quick visual check.
print(f"First sparse embedding: {sparse_vectors_bm25[0]}")

In [None]:
# Install the required libraries
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer

# Dense encoder; `model` is reused later for both passages and queries.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Encode a single passage to inspect what the encoder returns.
emb = model.encode(contexts[0])

# The embedding size shown here must match the `dimension` of the vector index.
print(emb.shape)


In [None]:
# Install the Pinecone client
!pip install pinecone-client

# Import necessary modules
import os
from pinecone import Pinecone, ServerlessSpec

# Initialize the Pinecone client. The API key is read from the environment
# rather than embedded in the notebook; a key that has been committed to
# source should be treated as compromised and rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Define the index name
index_name = "hybrid-search"

# Create the index only if it does not exist yet, so re-running this cell
# is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimensionality of dense model
        metric='dotproduct',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )


In [None]:
from tqdm.auto import tqdm
# Obtain the index handle from the already-authenticated client `pc`.
# (`pinecone` itself is never imported as a name, so `pinecone.Index(...)`
# would raise NameError, and the API key must not be repeated here.)
index = pc.Index(name=index_name)

batch_size = 32
for i in tqdm(range(0, len(contexts), batch_size)):
    # The last batch may be shorter than batch_size.
    end_batch = min(i + batch_size, len(contexts))
    context_batch = contexts[i:end_batch]

    # Dense and sparse representations of the same passages, in the same order.
    dense_embeds = model.encode(context_batch).tolist()
    sparse_embeds = generate_sparse_vectors(context_batch)

    # One record per passage: the id is the passage's position in `contexts`,
    # and the passage text itself is stored as metadata.
    vectors = [
        {
            'id': str(row_id),
            'values': dense,
            'sparse_values': {
                'indices': sparse['indices'],
                # Sparse values are sent as floats, not integer counts.
                'values': [float(value) for value in sparse['values']],
            },
            'metadata': {'context': context},
        }
        for row_id, context, dense, sparse in zip(
            range(i, end_batch), context_batch, dense_embeds, sparse_embeds
        )
    ]
    index.upsert(vectors)

index.describe_index_stats()


In [None]:
def hybrid_scale(dense, sparse, alpha: float):
    """Weight a sparse/dense vector pair by ``alpha``.

    Every dense component is multiplied by ``alpha`` and every sparse value
    by ``1 - alpha``; the sparse indices are passed through unchanged.

    Args:
        dense: iterable of dense vector components.
        sparse: dict with ``'indices'`` and ``'values'`` entries.
        alpha: weight of the dense side, within [0, 1].

    Returns:
        A ``(scaled_sparse, scaled_dense)`` tuple -- note that the sparse
        dict comes first.

    Raises:
        ValueError: if ``alpha`` is below 0 or above 1.
    """
    if alpha > 1 or alpha < 0:
        raise ValueError("Alpha must between 0 and 1")

    scaled_sparse = {
        'indices': sparse['indices'],
        'values': [weight * (1 - alpha) for weight in sparse['values']],
    }
    scaled_dense = [component * alpha for component in dense]
    return scaled_sparse, scaled_dense

def hybrid_query(question, top_k, alpha):
    """Run a hybrid (sparse + dense) query against the module-level ``index``.

    Args:
        question: query text.
        top_k: number of matches to request.
        alpha: dense weight in [0, 1] passed to ``hybrid_scale``; the sparse
            side is weighted by ``1 - alpha``.

    Returns:
        The response of ``index.query`` with metadata included.

    Raises:
        ValueError: propagated from ``hybrid_scale`` when ``alpha`` is outside
            [0, 1].
    """
    sparse_vec = generate_sparse_vectors([question])[0]

    # Encode the bare string (not a one-element list) so the result is a flat
    # vector. A nested [[...]] list makes hybrid_scale multiply a list by
    # alpha, which raises TypeError for any float alpha.
    dense_vec = model.encode(question).tolist()

    # hybrid_scale returns the sparse dict first, then the dense list.
    sparse_vec, dense_vec = hybrid_scale(dense_vec, sparse_vec, alpha)

    # The query keyword is `sparse_vector`; `sparse_values` is the field name
    # used inside upserted records, not a query argument.
    result = index.query(
        vector=dense_vec,
        sparse_vector=sparse_vec,
        top_k=top_k,
        include_metadata=True,
    )
    return result

In [None]:
# Example query. With alpha=1, hybrid_scale multiplies every sparse value by
# (1 - alpha) = 0, so the sparse side contributes nothing to this query.
question = "Can clinicians use the PHQ-9 to assess depression in people with vision loss?"
hybrid_query(question, top_k=3, alpha=1)