In [1]:
# Use Cohere / OpenAI / Claude embeddings 
# Weaviate Database
# Step 0: Llama-Index / Langchain for indexing


# sparse (BM-25) vs dense (Semantic) retrieval


# The textbooks have already been chunked, so we need to do indexing


In [2]:
%load_ext autoreload
%autoreload 2


In [2]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os
# Pull variables from a local .env file (if present) into the process
# environment before any API client is constructed.
load_dotenv()
# Embedding client reused below for the smoke test, for indexing and for
# near-vector queries. Presumably reads OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings()

The textbook dataset has already been chunked, so no document / text splitting is needed for it. The next cell only demonstrates LangChain's text splitter on a toy file (`demo.txt`).

In [3]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
# Toy demonstration of LangChain's character splitter on a scratch file.
# The result is stored under `demo_docs` (not `docs`) so that re-running this
# cell out of order can never overwrite the textbook corpus, which is a list
# of dicts bound to `docs` further down and consumed by the import helper.
loader = TextLoader("demo.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
demo_docs = text_splitter.split_documents(documents)
print(demo_docs[0].page_content)

Hello eversif sdjfkal fjdlsaf sdfnj asfdjl sdfnjsndfs
 sjdlf sdfnjs d'
  sdjf


In [4]:
# Smoke test: embed a single word to check that the embeddings client works.
# The resulting vector `x` is not used anywhere else in this notebook.
x = embeddings.embed_query("Hello")

In [5]:
# Compile and add all the doc strings
import json
def compile_docs(chunks_dir) -> list:
    """Load every JSON Lines record stored in the files of ``chunks_dir``.

    Args:
        chunks_dir: Path of a directory whose regular files each contain one
            JSON value per line.

    Returns:
        A list with one parsed JSON value per non-blank line. Files are read
        in sorted-name order so the result is reproducible between runs.

    Raises:
        FileNotFoundError: If ``chunks_dir`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    arr = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for file in sorted(os.listdir(chunks_dir)):
        path = os.path.join(chunks_dir, file)
        # Ignore sub-directories (e.g. checkpoint folders) instead of crashing on open().
        if not os.path.isfile(path):
            continue
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, encoding="utf-8") as jsonl_file:
            for line in jsonl_file:
                # Tolerate blank lines such as a trailing newline at end of file.
                if not line.strip():
                    continue
                arr.append(json.loads(line))
    return arr
docs = compile_docs("textbooks/chunk")

In [6]:
len(docs)

125847

In [19]:
import weaviate
import os

# Fail fast with a readable message when a credential is missing, instead of
# handing None to the Weaviate client and getting an obscure error later.
_missing_env = [
    name for name in ("WCS_URL", "WCS_API_KEY", "OPENAI_API_KEY") if not os.getenv(name)
]
if _missing_env:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# NOTE(review): newer weaviate-client releases appear to rename this helper to
# connect_to_weaviate_cloud — confirm against the installed version.
client = weaviate.connect_to_wcs(
    cluster_url=os.getenv("WCS_URL"),  # Weaviate Cloud cluster URL (from the environment)
    auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WCS_API_KEY")),  # Weaviate Cloud API key (from the environment)
    headers={'X-OpenAI-Api-key': os.getenv("OPENAI_API_KEY")},  # OpenAI key forwarded to Weaviate as a request header
)

In [67]:
def import_batches_docs(docs, collection_name, embed_batch_size=64) -> None:
    """Embed text chunks and import them into a Weaviate collection.

    Uses the notebook-level ``client`` (Weaviate connection) and ``embeddings``
    (LangChain embedding client). Each distinct run of titles is printed once
    as a coarse progress indicator.

    Args:
        docs: Iterable of dicts, each with a ``"title"`` and a ``"content"`` key.
        collection_name: Name of the target Weaviate collection.
        embed_batch_size: Number of chunks embedded per ``embed_documents``
            call. Grouping avoids issuing one embedding request per chunk.

    Raises:
        ValueError: If ``embed_batch_size`` is not a positive integer.

    NOTE(review): for collections configured with *named* vectors the client
    may expect ``vector`` as a ``{name: vector}`` mapping — confirm.
    """
    if embed_batch_size < 1:
        raise ValueError("embed_batch_size must be a positive integer")

    docs = list(docs)  # allow any iterable while still slicing into groups
    collection = client.collections.get(collection_name)
    with collection.batch.dynamic() as batch:
        old_data = ""
        for start in range(0, len(docs), embed_batch_size):
            group = docs[start:start + embed_batch_size]
            # One embedding call per group instead of one per chunk.
            vectors = embeddings.embed_documents([data["content"] for data in group])
            for data, vector in zip(group, vectors):
                if old_data != data["title"]:
                    print(data["title"])
                    old_data = data["title"]
                batch.add_object(
                    properties={"title": data["title"], "content": data["content"]},
                    vector=vector,
                )

    # Batch errors are not raised by the context manager; surface them so a
    # partial import (fewer stored objects than input chunks) is not silent.
    failed = collection.batch.failed_objects
    if failed:
        print(f"{len(failed)} of {len(docs)} objects failed to import into {collection_name!r}")

In [33]:
# import_batches_docs(docs, "Medical_RAG_Data")

Pathoma_Husain
Pathology_Robbins
Anatomy_Gray


  serialized_response = await self._cython_call.unary_unary(


Obstentrics_Williams
Cell_Biology_Alberts
Surgery_Schwartz


In [35]:
# Handle to the main collection; every query cell below goes through it.
medical_RAG_collection = client.collections.get("Medical_RAG_Data")

Testing out some features (ignore)

In [49]:
from weaviate.classes.query import MetadataQuery
# response = medical_RAG_collection.query.near_text(
#     query="What is anatomy? Anatomy includes those structures that can be seen grossly (without the aid of magnification) and microscopically (with the aid of magnification). Typically, when used by itself, the term anatomy tends to mean gross or macroscopic anatomy\u2014that is, the study of structures that can be seen without using a microscopic. Microscopic anatomy, also called histology, is the study of cells and tissues using a microscope. Anatomy forms the basis for the practice of medicine. Anatomy leads the physician toward an understanding of a patient\u2019s disease, whether he or she is carrying out a physical examination or using the most advanced imaging techniques. Anatomy is also important for dentists, chiropractors, physical therapists, and all others involved in any aspect of patient treatment that begins with an analysis of clinical signs. The ability to interpret a clinical observation correctly is therefore the endpoint of a sound anatomical understanding.",
#     limit=2,
#     # target_vector="title_country",  # Specify the target vector for named vector collections
#     # return_metadata=MetadataQuery(distance=True)
# )

In [39]:
# Count the objects stored in the collection. Compare total_count with
# len(docs) to spot chunks that did not make it through the import.
agg = medical_RAG_collection.aggregate.over_all()
print(agg)

AggregateReturn(properties={}, total_count=114397)


In [56]:
# Sanity check: fetch a single stored object, vector included, and show its
# properties. The vector itself is requested but only printed if the last
# line is uncommented.
response = medical_RAG_collection.query.fetch_objects(
    include_vector=True,
    limit=1
)
print(response.objects[0].properties)
# print(response.objects[0].vector)

{'content': 'severe symptoms or circumstances in which other processes, e.g., infection, are strongly suspected. Nasal oxygen should be used as appropriate to protect arterial saturation. Most crises resolve in 1–7 days. Use of blood transfusion should be reserved for extreme cases: transfusions do not shorten the duration of the crisis.', 'title': 'InternalMed_Harrison'}


## BM-25 Search

In [89]:
# Sparse (keyword) retrieval: BM25 over the "content" property only, top hit.
# No metadata is requested here, so the printed object shows score=None.
# NOTE(review): presumably return_metadata=MetadataQuery(score=True) exposes the BM25 score — confirm.
response = medical_RAG_collection.query.bm25(
    query="food",
    limit=1,
    query_properties=["content"],
)
print(response.objects[0])

Object(uuid=_WeaviateUUIDInt('2c4037b2-63d4-48c1-9ec0-6acc3c5e1a9b'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'content': 'Obsessive-compulsive features, both related and unrelated to food, are often prominent. Most individuals with anorexia nervosa are preoccupied with thoughts of food. Some col- lect recipes or hoard food. Observations of behaviors associated with other forms of star- vation suggest that obsessions ancl compulsions related to food may be exacerbated by undemutrition. When individuals with anorexia nervosa exhibit obsessions and compul- sions that are not related to food, body shape, or weight, an additional diagnosis of obses- sive-compulsive disorder (OCD) may be warranted.', 'title': 'Psichiatry_DSM-5'}, references=None, vector={}, collection='Medical_RAG_Data')


## Near Vector Search

# Dense retrieval: embed the question client-side with the same `embeddings`
# client used at import time, then search by vector. Two hits are requested
# but only the first one is printed.
response = medical_RAG_collection.query.near_vector(
    near_vector=embeddings.embed_query("What is the biggest bone in the body?"), # query vector computed client-side
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)
print(response.objects[0])

In [None]:
# Can also search with images. Could be useful for Multimodal benchmarking

## Near Text Search (cannot be done with what I currently have set up)

In [77]:
import weaviate.classes as wvc
# Create the scratch "Test" collection only if it is not there yet, so this
# cell can be re-run without failing on an already-existing collection.
if not client.collections.exists("Test"):
    client.collections.create(
        "Test",
        properties=[
            wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
        ],
        vector_index_config=wvc.config.Configure.VectorIndex.hnsw(),
        vectorizer_config=[  # NEED TO SPECIFY THIS FOR NEAR TEXT TO WORK
            # Server-side OpenAI vectorizer over "content", stored as the
            # named vector "embeddings".
            wvc.config.Configure.NamedVectors.text2vec_openai(
                name="embeddings",
                source_properties=["content"],
                vectorize_collection_name=False,
            ),
        ],
    )

In [71]:
import_batches_docs(docs[0:2], "Test")

Histology_Ross


In [72]:
# Confirm the import into the scratch collection by counting its objects.
# Note: this rebinds `agg`, which an earlier cell used for the main collection.
test = client.collections.get("Test")
agg = test.aggregate.over_all()
print(agg)

AggregateReturn(properties={}, total_count=2)


In [75]:
# Near-text search on the scratch collection: the query string is passed as
# text rather than as a precomputed vector. The response is stored but not
# displayed by this cell.
response = test.query.near_text(
    query="anatomy",
    limit=1,
    # target_vector="embeddings",  # name of the named vector configured on the "Test" collection
    # return_metadata=MetadataQuery(distance=True)  # uncomment to include distances in the results
)