In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import json

# Load the scraped pages. The encoding is pinned to UTF-8 so the read does not
# depend on the platform's default locale encoding.
with open("../../data/scrape_result.json", "r", encoding="utf-8") as f:
    raw = json.load(f)

documents = []

for page in raw["data"]:
    # Prefer the "markdown" field and fall back to "content". Using `or`
    # (instead of nested .get defaults) also covers keys that are present but
    # null/empty, so page_content is always a string and metadata a dict.
    content = page.get("markdown") or page.get("content") or ""
    metadata = page.get("metadata") or {}

    # create document with metadata
    doc = Document(
        page_content=content,
        metadata=metadata,
    )
    documents.append(doc)

print(f"Created {len(documents)} documents with metadata")

Created 10 documents with metadata


In [3]:
# initializing embedding model and vector store
from langchain_openai import OpenAIEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
from uuid import uuid4
import os

# Load environment variables from a .env file (if one is found) before reading the key.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=openai_api_key,
)

# Embed a throwaway query once; the length of the returned vector is the
# dimensionality the flat L2 index is created with.
probe_vector = embeddings.embed_query("hello world")
index = faiss.IndexFlatL2(len(probe_vector))

# Start from an empty store: empty in-memory docstore and no index-to-id mapping yet.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

# One random UUID string per document, passed as that document's id.
uuids = [str(uuid4()) for _ in documents]
vector_store.add_documents(ids=uuids, documents=documents)

['89e56f4a-13a3-458a-8361-76f384eaa50b',
 'db7d613f-45c9-41f6-a887-ab364e066ab9',
 '0912cd76-680d-403e-a592-a032d2820162',
 'fdbf5683-2945-4aeb-8913-df6844dea146',
 '4b53c024-e69f-4dd7-8397-8240e1883ab7',
 'e2db98ba-b119-47f6-a012-865ad9608c80',
 'a4bcd967-d112-40fc-a2f3-655afd3627f9',
 '2ab03c15-c25d-4b5e-8957-339d3b4220d2',
 '7a0203b6-3366-412b-8592-63ba8986e07b',
 'c57455e4-7b0f-4e73-a33c-b4d91857957e']

In [6]:
# keyword / sparse retriever + hybrid (ensemble) retriever
# BM25Retriever is imported from langchain_community (already used by this
# notebook for FAISS) rather than through the deprecated `langchain.retrievers`
# re-export.
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

# Number of documents each underlying retriever is asked to return per query.
TOP_K = 5

# FAISS retriever
faiss_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": TOP_K},
)

# BM25 retriever
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = TOP_K

# ensemble retriever: combines both retrievers, weighting FAISS 0.8 and BM25 0.2
ensemble_retriever = EnsembleRetriever(
    retrievers=[faiss_retriever, bm25_retriever],
    weights=[0.8, 0.2],
)

In [7]:
def test_retriever(retriever, query, max_results=3):
    print(f"Query: {query}")
    print("-" * 60)
    
    results = retriever.get_relevant_documents(query)
    
    print(f"Found {len(results)} documents")
    print()
    
    for i, doc in enumerate(results[:max_results], 1):
        print(f"Document {i}:")
        print(f"Source: {doc.metadata.get('source', 'No URL available')}")
        print(f"Title: {doc.metadata.get('title', 'No title')}")
        print(f"Content: {doc.page_content[:200]}...")
        print()
        
# mock questions
# Hand-written evaluation set: a list of dicts, each with a "question" string
# and a reference "answer" string (built from adjacent string literals, one
# per output line). The loop that follows in this cell reads only "question".
qa_list = [
# 1) end-to-end process and installation duration
{
    "question": "what are the step-by-step stages from requesting a quote to the solar system being activated, and how long does the installation itself take?",
    "answer": (
        "1. generate a free solar estimate instantly via the online calculator\n"
        "2. consult with a solar specialist (they’ll reach out to you, typically within 24 hrs)\n"
        "3. receive a personalized solar proposal by email\n"
        "4. confirmatory site survey and e-sign contract\n"
        "5. installation and commissioning (usually completed in 3–5 days by the getsolar team + licensed electrical worker)\n"
        "6. system ready for use\n"
        "7. post-installation maintenance and monitoring\n"
        "installation itself (steps 5–6) takes about 3–5 days."
    )
},
# 2) information/documents requested by the sales specialist
{
    "question": "what information and documents will i need to provide when i speak to a sales specialist?",
    "answer": (
        "you’ll be asked for:\n"
        "- your property address and proof of ownership or tenant consent\n"
        "- a copy of your nric or passport\n"
        "- recent electricity bills (to gauge your consumption)\n"
        "- roof dimensions or a digital property survey (they guide you through a quick sketch or photos)\n"
        "- meter configuration details"
    )
},
# 3) upfront payment for the rent-to-own plan
{
    "question": "is there any upfront payment or deposit required for the rent-to-own plan?",
    "answer": (
        "no. the getsolar rent-to-own plan has zero upfront cost—no deposit or hidden fees.\n"
        "if you prefer, you can also opt for an upfront purchase plan and pay in full before installation."
    )
},
# 4) contact channels
{
    "question": "how can i contact getsolar if i have questions, and what channels do they offer?",
    "answer": (
        "you can reach them via:\n"
        "- whatsapp: +65 8779 6122 (https://api.whatsapp.com/send?phone=6587796122)\n"
        "- email: support@getsolar.ai\n"
        "- contact form: https://getsolar.ai/contact\n"
        "they typically respond within one business day during standard office hours."
    )
},
# 5) post-installation maintenance and support
{
    "question": "what post-installation maintenance and support do they provide, and are any inspections free?",
    "answer": (
        "getsolar offers daily remote monitoring plus routine system inspections:\n"
        "- free inspections and maintenance for customers on the rent-to-own plan (for the full 5 or 10 year term)\n"
        "- upfront-purchase customers pay a small fee per inspection\n"
        "- ongoing technical support is available via whatsapp or email"
    )
},
]

# run
# Send every mock question through the ensemble retriever and print a preview
# of its hits, followed by a separator line. Only the "question" field is used.
# The loop variable is named `qa` rather than the easily misread `l`.
for qa in qa_list:
    test_retriever(ensemble_retriever, qa["question"])
    print("=" * 80)
    print()

Query: what are the step-by-step stages from requesting a quote to the solar system being activated, and how long does the installation itself take?
------------------------------------------------------------


  results = retriever.get_relevant_documents(query)


Found 6 documents

Document 1:
Source: No URL available
Title: Solar PV Systems | Installation in Singapore | GetSolar
Content: # Installing Solar PV Systems

Enjoy substantial savings with your very own PV system, while actively contributing to a greener planet.

[Get your free quote](https://calculator.getsolar.ai/en-SG/)

!...

Document 2:
Source: No URL available
Title: Commercial Solar Panel Installation | $0 Upfront | GetSolar
Content: # Commercial & Non-Residential Solar Panel Installation

Higher and higher electricity bills? Turn your rooftop into long-term savings. GetSolar offers tailored solar panel installation for commercial...

Document 3:
Source: No URL available
Title: GetSolar: Solar Panel Installation Company in Singapore
Content: # GetSolar – The Trusted Solar Panel Company in Singapore

Unlock your savings while saving the planet. We make the process of getting solar simple for you through our zero-upfront cost plans.

[Reque...


Query: what information and docume