In [None]:
import json
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Load the aggregated records from disk.
# The encoding is pinned to UTF-8 so decoding does not fall back to the
# platform's locale default, which can mis-read or reject non-ASCII text.
with open("aggregated_data.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

# Embedding wrapper used to vectorize the documents ("all-MiniLM-L6-v2" model)
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

# Prepare documents, metadata and ids as three parallel lists (same order as
# `all_data`, one entry per record).


def _join_keywords(raw):
    """Flatten a record's ``keywords`` value into one comma-separated string.

    A list of strings is joined with ", " (the original behavior). A missing
    or null value yields "", a plain string is kept as-is instead of being
    split into characters, and non-string items are converted with ``str``
    so the join cannot raise ``TypeError``.
    """
    if raw is None:
        return ""
    if isinstance(raw, str):
        return raw
    return ", ".join(str(keyword) for keyword in raw)


# Text to embed: "content" is required, so a record without it raises KeyError.
documents = [item["content"] for item in all_data]

# Per-document metadata; absent keys fall back to a placeholder value.
metadatas = [
    {
        "title": item.get("title", "Untitled"),
        "topic": item.get("topic", "No topic"),
        "keywords": _join_keywords(item.get("keywords")),
        "source": item.get("source", "Unknown"),
    }
    for item in all_data
]

# Position-based ids: "doc_0", "doc_1", ... — unique within this dataset.
ids = [f"doc_{i}" for i in range(len(all_data))]

# Build the Chroma vector store from the prepared lists:
#   - texts:             document contents to embed
#   - embedding:         the HuggingFace embedding model
#   - metadatas / ids:   per-document metadata and unique ids
#   - persist_directory: directory where the Chroma database is saved
vector_store = Chroma.from_texts(
    texts=documents,
    metadatas=metadatas,
    ids=ids,
    embedding=embedding_model,
    persist_directory="./chroma_db",
)
