In [9]:
import json
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm

# Label sections that are indexed, mapped to the human-readable title stored
# in each document's "section" metadata field.
_SECTION_TITLES = {
    "drug_interactions": "Drug Interactions",
    "adverse_reactions": "Adverse Reactions",
    "contraindications": "Contraindications",
    "description": "Description",
}


def _first_or_default(values, default):
    """Return the first element of a non-empty list, otherwise *default*."""
    if isinstance(values, list) and values:
        return values[0]
    return default


def _section_documents(entry, sections):
    """Build one Document per populated section of a single label entry.

    Args:
        entry: A dict for one drug label. Section values are expected to be
            lists whose first element is the section text.
        sections: Mapping of entry key -> section title to store in metadata.

    Returns:
        A list of Document objects (possibly empty).
    """
    # "openfda" may be absent, null, or malformed; treat all of those as empty.
    openfda = entry.get("openfda")
    if not isinstance(openfda, dict):
        openfda = {}
    brand_name = _first_or_default(openfda.get("brand_name"), "Unknown Brand")
    generic_name = _first_or_default(openfda.get("generic_name"), "Unknown Generic")

    docs = []
    for key, section_name in sections.items():
        values = entry.get(key)
        if not (isinstance(values, list) and values):
            continue
        # Only the first element of the list is indexed.
        text = values[0]
        # Skip non-string or whitespace-only payloads instead of crashing on .strip().
        if not isinstance(text, str) or not text.strip():
            continue
        metadata = {"brand_name": brand_name, "generic_name": generic_name, "section": section_name}
        docs.append(Document(page_content=text, metadata=metadata))
    return docs


def create_and_save_vector_store(json_path="all_drug_data.json", save_path="faiss_drug_index"):
    """Load drug label data, chunk and embed it, and save a FAISS vector store.

    Args:
        json_path: Path to a JSON file containing a list of label entries.
        save_path: Directory name passed to ``FAISS.save_local``.

    Raises:
        ValueError: If no entry yields any usable section text, since an
            index cannot be built from zero documents.
    """
    # 1. Load the collected data. The encoding is explicit so that decoding
    #    does not depend on the platform's locale default.
    print(f"Loading data from {json_path}...")
    with open(json_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # 2. Turn raw JSON entries into LangChain Document objects.
    all_docs = []
    print("Processing raw data into documents...")
    for entry in tqdm(data, desc="Processing entries"):
        # Skip empty entries and anything that is not a JSON object.
        if not entry or not isinstance(entry, dict):
            continue
        all_docs.extend(_section_documents(entry, _SECTION_TITLES))

    print(f"Created {len(all_docs)} documents.")

    # Fail early with a clear message, before the embedding model is loaded.
    if not all_docs:
        raise ValueError(
            f"No usable documents were extracted from '{json_path}'; "
            "cannot build a vector store."
        )

    # 3. Split the documents into smaller, overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = text_splitter.split_documents(all_docs)
    print(f"Split into {len(split_docs)} chunks.")

    # 4. Initialize the embedding model.
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    print(f"Loading embedding model: {model_name}...")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # 5. Create the FAISS vector store and save it.
    print("Creating and saving the FAISS vector store...")
    vector_store = FAISS.from_documents(split_docs, embeddings)
    vector_store.save_local(save_path)
    print(f"✅ Vector store created and saved locally as '{save_path}'.")

# Build and save the index with the default paths, but only when this code is
# executed directly rather than imported.
if __name__ == "__main__":
    create_and_save_vector_store()

Loading data from all_drug_data.json...
Processing raw data into documents...


Processing entries: 100%|██████████| 10000/10000 [00:00<00:00, 147059.17it/s]

Created 13833 documents.





Split into 48614 chunks.
Loading embedding model: sentence-transformers/all-MiniLM-L6-v2...


  embeddings = HuggingFaceEmbeddings(model_name=model_name)
  from .autonotebook import tqdm as notebook_tqdm


Creating and saving the FAISS vector store...
✅ Vector store created and saved locally as 'faiss_drug_index'.
