In [1]:
# Use Cohere / OpenAI / Claude embeddings 
# Weaviate Database
# Step 0: Llama-Index / Langchain for indexing


# sparse (BM-25) vs dense (Semantic) retrieval


# The textbooks have already been chunked, so we need to do indexing


In [2]:
# %load_ext autoreload
# %autoreload 2

%reload_ext autoreload

In [3]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os
load_dotenv()
embeddings = OpenAIEmbeddings()

No need for document / text splitting since the dataset has already done that for us.

In [4]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
loader = TextLoader("demo.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
print(docs[0].page_content)

Hello eversif sdjfkal fjdlsaf sdfnj asfdjl sdfnjsndfs
 sjdlf sdfnjs d'
  sdjf


In [5]:
# Testing it out
x = embeddings.embed_query("Hello")

In [6]:
# Compile and add all the doc strings
import json
def compile_docs(chunks_dir) -> list:
    arr = []
    current_dir = os.getcwd()
    for file in os.listdir(chunks_dir):
        with open(os.path.join(chunks_dir, file)) as jsonl_file:
            for line in jsonl_file:
                # Parse the JSON data from the line
                d = json.loads(line)
                arr.append(d)
    return arr
docs = compile_docs("textbooks/chunk")

In [7]:
len(docs)

125847

In [8]:
import weaviate
import os

client = weaviate.connect_to_wcs(
    cluster_url=os.getenv("WCS_URL"),  # Replace with your Weaviate Cloud URL
    auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WCS_API_KEY")),  # Replace with your Weaviate Cloud key
    headers={'X-OpenAI-Api-key': os.getenv("OPENAI_API_KEY")}  # Replace with your OpenAI API key
)

In [9]:
def import_batches_docs(docs, collection_name) -> None:
    collection = client.collections.get(collection_name)
    with collection.batch.dynamic() as batch:
        old_data = ""
        # Docs to check: Neurology_Adams
        for data in docs:
            if old_data != data["title"]:
                print(data["title"])
                old_data = data["title"]
            batch.add_object(
                properties={"title": data["title"], "content":data["content"]},
                vector= embeddings.embed_query(data["content"]),
            )

In [10]:
# import_batches_docs(docs, "Medical_RAG_Data")

In [11]:
medical_RAG_collection = client.collections.get("Medical_RAG_Data")

Testing out some features (ignore)

In [12]:
from weaviate.classes.query import MetadataQuery
# response = medical_RAG_collection.query.near_text(
#     query="What is anatomy? Anatomy includes those structures that can be seen grossly (without the aid of magnification) and microscopically (with the aid of magnification). Typically, when used by itself, the term anatomy tends to mean gross or macroscopic anatomy\u2014that is, the study of structures that can be seen without using a microscopic. Microscopic anatomy, also called histology, is the study of cells and tissues using a microscope. Anatomy forms the basis for the practice of medicine. Anatomy leads the physician toward an understanding of a patient\u2019s disease, whether he or she is carrying out a physical examination or using the most advanced imaging techniques. Anatomy is also important for dentists, chiropractors, physical therapists, and all others involved in any aspect of patient treatment that begins with an analysis of clinical signs. The ability to interpret a clinical observation correctly is therefore the endpoint of a sound anatomical understanding.",
#     limit=2,
#     # target_vector="title_country",  # Specify the target vector for named vector collections
#     # return_metadata=MetadataQuery(distance=True)
# )

In [13]:
agg = medical_RAG_collection.aggregate.over_all()
print(agg)

AggregateReturn(properties={}, total_count=114397)


In [14]:
response = medical_RAG_collection.query.fetch_objects(
    include_vector=True,
    limit=1
)
print(response.objects[0].properties)
# print(response.objects[0].vector)

{'title': 'InternalMed_Harrison', 'content': 'severe symptoms or circumstances in which other processes, e.g., infection, are strongly suspected. Nasal oxygen should be used as appropriate to protect arterial saturation. Most crises resolve in 1–7 days. Use of blood transfusion should be reserved for extreme cases: transfusions do not shorten the duration of the crisis.'}


## BM-25 Search

In [15]:
response = medical_RAG_collection.query.bm25(
    query="food",
    limit=1,
    query_properties=["content"],
)
print(response.objects[0])

Object(uuid=_WeaviateUUIDInt('2c4037b2-63d4-48c1-9ec0-6acc3c5e1a9b'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'content': 'Obsessive-compulsive features, both related and unrelated to food, are often prominent. Most individuals with anorexia nervosa are preoccupied with thoughts of food. Some col- lect recipes or hoard food. Observations of behaviors associated with other forms of star- vation suggest that obsessions ancl compulsions related to food may be exacerbated by undemutrition. When individuals with anorexia nervosa exhibit obsessions and compul- sions that are not related to food, body shape, or weight, an additional diagnosis of obses- sive-compulsive disorder (OCD) may be warranted.', 'title': 'Psichiatry_DSM-5'}, references=None, vector={}, collection='Medical_RAG_Data')


## Near Vector Search

In [16]:
response = medical_RAG_collection.query.near_vector(
    near_vector=embeddings.embed_query("What is the biggest bone in the body?"), # your query vector goes here
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)
print(response.objects[0])

Object(uuid=_WeaviateUUIDInt('49e8f90a-84c6-4fa0-9f02-11dc97806fbb'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.18653613328933716, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'content': 'The bones of the gluteal region and the thigh are the pelvic bone and the femur (Fig. 6.8). The large ball and socket joint between these two bones is the hip joint. The femur is the bone of the thigh. At its distal end, its major weight-bearing articulation is with the tibia, but it also articulates anteriorly with the patella (knee cap). The patella is the largest sesamoid bone in the body and is embedded in the quadriceps femoris tendon. The joint between the femur and tibia is the principal articulation of the knee joint, but the joint between the patella and femur shares the same articular cavity. Although the main movements at the knee are flexion and extension, the knee joint also allows the femur to rota

In [17]:
# Can also search with images. Could be useful for Multimodal benchmarking

## Near Text Search (cannot be done with what I currently have set up)

In [18]:
import weaviate.classes as wvc
client.collections.create(
    "Test",
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
    
    vectorizer_config=[ # NEED TO SPECIFY THIS FOR NEAR TEXT TO WORK
        wvc.config.Configure.NamedVectors.text2vec_openai(
            name="embeddings",
            source_properties=["content"],
            vectorize_collection_name=False,
            vector_index_config=wvc.config.Configure.VectorIndex.hnsw(),
        ),
    ],
)

UnexpectedStatusCodeError: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name "Test" already exists'}]}.

In [92]:
import_batches_docs(docs[0:2], "Test")

Histology_Ross


In [93]:
test = client.collections.get("Test")
agg = test.aggregate.over_all()
print(agg)

AggregateReturn(properties={}, total_count=2)


In [95]:
response = test.query.near_text(
    query="anatomy",
    limit=1,
    # target_vector="title_country",  # Specify the target vector for named vector collections
    # return_metadata=MetadataQuery(distance=True)
)
response

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('cd518dcc-e038-451e-84ea-689ef83b11f0'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'title': 'Histology_Ross', 'content': 'The objective of a histology course is to lead the student to understand the microanatomy of cells, tissues, and organs and to correlate structure with function.'}, references=None, vector={}, collection='Test')])

## RAG Pipeline

In [55]:
from template import *
from openai import OpenAI


class RAG_System:
    def __init__(self, model='gpt-4o', limit=5, retrieval_method="dense", collection_name="Medical_RAG_Data"):
        self.model = model
        self.limit = limit
        self.retrieval_method = retrieval_method.lower()
        self.client = weaviate.connect_to_wcs(
            cluster_url=os.getenv("WCS_URL"),  # Replace with your Weaviate Cloud URL
            auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WCS_API_KEY")),  # Replace with your Weaviate Cloud key
            headers={'X-OpenAI-Api-key': os.getenv("OPENAI_API_KEY")}  # Replace with your OpenAI API key
        )
        self.collection = self.client.collections.get(collection_name)
        self.model = model
    def answer_question(self, query, options=None, rag=False) -> dict:
        '''
        query (str): question to be answered
        options (Dict[str, str]): options to be chosen from (A,B,C,...)
        '''
        openai_client = OpenAI()
        if options is not None:
            options = '\n'.join([key+". "+options[key] for key in sorted(options.keys())])
        else:
            options = ''
        if rag:
            if self.retrieval_method == "dense":
                docs = self.collection.query.near_vector(
                            near_vector=embeddings.embed_query(query), # your query vector goes here
                            limit=self.limit,
                            return_metadata=MetadataQuery(distance=True)
                        )
            elif self.retrieval_method == "sparse":
                docs = self.collection.query.bm25(
                                query=query,
                                limit=self.limit,
                                query_properties=["content"],
                            )
            context = ""
            for i in range(len(docs.objects)):
                context += docs.objects[i].properties["content"]
                context += " "
            prompt_medrag = general_medrag.render(context=context, question=query, options=options)
            messages=[
                    {"role": "system", "content": general_medrag_system},
                    {"role": "user", "content": prompt_medrag}
            ]
        else:
            prompt_cot = general_cot.render(question=query, options=options)
            messages = [
                {"role": "system", "content": general_cot_system},
                {"role": "user", "content": prompt_cot}
            ]
        print(messages)
        response = openai_client.chat.completions.create(
                      model=self.model,
                      messages=messages
                    )
        ans = response.choices[0].message.content
        return ans

In [62]:
sys = RAG_System()
ans = sys.answer_question(query="What is the biggest bone in the body?", options={"A": "Femur", "B": "Humerus", "C": "Cubitus", "D": "Heart"}, rag=True)

  sys = RAG_System()


[{'role': 'system', 'content': 'You are a helpful medical expert, and your task is to answer a multi-choice medical question using the relevant documents. Please first think step-by-step and then choose the answer from the provided options. Organize your output in a json formatted as Dict{"step_by_step_thinking": Str(explanation), "answer_choice": Str{A/B/C/...}}. Your responses will be used for research purposes only, so please have a definite answer.'}, {'role': 'user', 'content': " Here are the relevant documents: The bones of the gluteal region and the thigh are the pelvic bone and the femur (Fig. 6.8). The large ball and socket joint between these two bones is the hip joint. The femur is the bone of the thigh. At its distal end, its major weight-bearing articulation is with the tibia, but it also articulates anteriorly with the patella (knee cap). The patella is the largest sesamoid bone in the body and is embedded in the quadriceps femoris tendon. The joint between the femur and 

In [63]:
ans
# WHY DOES IT OUTPUT \n???

'{\n  "step_by_step_thinking": "To answer the question about the biggest bone in the body, let\'s analyze the provided documents. The documents mention that the femur is the bone of the thigh and specifically state that it is the longest bone in the body. The humerus is another long bone found in the arm, but it is not mentioned to be the biggest or longest. The term \'cubitus\' typically refers to the elbow region, not a specific bone. Lastly, the heart is an organ and not a bone. Based on this information, the femur is identified as the largest bone in the body.",\n  "answer_choice": "A"\n}'

In [58]:
%reload_ext autoreload

In [59]:
%autoreload 2