In [None]:
import json

with open("cleaned_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Loaded {len(data)} documents")


Loaded 1552 documents


In [2]:
from langchain_core.documents import Document

docs = [
    Document(page_content=item["content"], metadata={"source": item["url"]})
    for item in data
]


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

chunks = text_splitter.split_documents(docs)
print(f"Generated {len(chunks)} document chunks.")


Generated 14402 document chunks.


In [9]:
!pip install -U langchain-community





[notice] A new release of pip available: 22.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:

import os
import pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")



In [13]:
from pinecone import Pinecone

pc = Pinecone(
    api_key="pinecone_api_key")

In [None]:
index_name = "changi-rag-chatbot"
from pinecone import ServerlessSpec

if not pc.has_index(name=index_name):
    pc.create_index(
        name=index_name,
        dimension=768, 
        metric="cosine",
        spec=ServerlessSpec(
           cloud="aws",
           region="us-east-1"
        )
    )

index = pc.Index(name=index_name)

index.describe_index_stats()

In [55]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
Collecting transformers<5.0.0,>=4.41.0
  Using cached transformers-4.53.3-py3-none-any.whl (10.8 MB)
Collecting torch>=1.11.0
  Using cached torch-2.7.1-cp311-cp311-win_amd64.whl (216.1 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp311-cp311-win_amd64.whl (8.9 MB)
Collecting scipy
  Using cached scipy-1.16.0-cp311-cp311-win_amd64.whl (38.6 MB)
Collecting huggingface-hub>=0.20.0
  Using cached huggingface_hub-0.33.5-py3-none-any.whl (515 kB)
Collecting Pillow
  Using cached pillow-11.3.0-cp311-cp311-win_amd64.whl (7.0 MB)
Collecting filelock
  Using cached filelock-3.18.0-py3-none-any.whl (16 kB)
Collecting fsspec>=2023.5.0
  Using cached fsspec-2025.7.0-py3-none-any.whl (199 kB)
Collecting sympy>=1.13.3
  Using cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Collecting networkx
  Using cached networkx-3.5-py3-none-any.whl (2.0 MB)
Collecting jinja2
  Using cached jinja2


[notice] A new release of pip available: 22.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [57]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [59]:
[doc.metadata for doc in chunks]


[{'source': 'http://www.changiairport.com/en/rewards/feedback.html',
  'chunk_num': 0,
  'text': "Feedback FEEDBACK FORM Find answers to your questions on our FAQs . Otherwise, reach out to us by completing the form below. Fields marked with * are mandatory. Please note that your personal data may be shared with our partners and other relevant third parties - to enable them to follow-up and reply to you directly on your feedback. By provision of your personal particulars, you are deemed to have provided your consent to the collection, use or disclosure of your personal data for this purpose.\xa0All processing of personal data will be in accordance with CAG's Privacy Policy . Category Select Category Account Services Compliments e-Voucher Enquiries General Enquiries Points - Retrospective Claim Appeal Rewards Redemption, Promotions Enquiries on BLACKPINK Presale access Name Email Contact Number Changi Rewards Card Number 0000 - 2001 - member I am not a Changi Rewards member member I am 

In [61]:
vector_store = PineconeVectorStore(index=index, embedding= embedding_model)

In [62]:
def clean_url_for_title(url):
    return url.split("/")[-1].replace(".md", "")

def generate_ids(doc_chunk):
    title = clean_url_for_title(doc_chunk.metadata['source'])
    chunk_num = doc_chunk.metadata['chunk_num']
    feature = doc_chunk.metadata.get('feature', 'na')
    return f"release_{title}#feature_{feature}#chunk_num{chunk_num}"

ids = [generate_ids(doc) for doc in chunks]


In [None]:
vector_store.add_documents(documents=chunks, ids=ids)

['release_feedback.html#feature_na#chunk_num0',
 'release_feedback.html#feature_na#chunk_num1',
 'release_feedback.html#feature_na#chunk_num2',
 'release_feedback.html#feature_na#chunk_num3',
 'release_feedback.html#feature_na#chunk_num4',
 'release_at-changi.html#feature_na#chunk_num5',
 'release_at-changi.html#feature_na#chunk_num6',
 'release_at-changi.html#feature_na#chunk_num7',
 'release_explore-by-terminal.html#feature_na#chunk_num8',
 'release_explore-by-terminal.html#feature_na#chunk_num9',
 'release_facilities-and-services-directory.html#feature_na#chunk_num10',
 'release_facilities-and-services-directory.html#feature_na#chunk_num11',
 'release_facilities-and-services-directory.html#feature_na#chunk_num12',
 'release_airport-parking.html#feature_na#chunk_num13',
 'release_airport-parking.html#feature_na#chunk_num14',
 'release_airport-parking.html#feature_na#chunk_num15',
 'release_airport-parking.html#feature_na#chunk_num16',
 'release_airport-parking.html#feature_na#chunk_n

In [None]:
from pinecone import ServerlessSpec
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 14402}},
 'total_vector_count': 14402,
 'vector_type': 'dense'}

In [None]:
# Test query
query_text = "What is the refund policy?"  # Replace with relevant test query
query_embedding = embedding_model.embed_query(query_text)

result = index.query(vector=query_embedding, top_k=3, include_metadata=True)

for match in result['matches']:
    print(f"Score: {match['score']}")
    print(f"ID: {match['id']}")
    print(f"Metadata: {match['metadata']}")
    print("-" * 50)


Score: 0.743133605
ID: release_return-policy.html#feature_na#chunk_num426
Metadata: {'chunk_num': 426.0, 'source': 'https://www.changiairport.com/au/en/dine-and-shop/shopping-concierge/return-policy.html', 'text': 'Return Policy SHOPPING CONCIERGE RETURN POLICY FOR DELIVERED OR COLLECTED PRODUCTS RETURN POLICY If you are not satisfied with your purchase after receiving your item, you may request for a product exchange or return within thirty (30) days of purchase. Please get in touch with our Concierge, indicating your order number in your request. Products must be returned in the condition as set out in this Return Policy. Once your request is processed, you will receive the refund via your original mode of payment within fourteen (14) business days. The following policy applies to purchases of products where (a) the products have been collected from the Shopping Concierge within Singapore Changi Airport; or (b) the products are to be delivered or have been delivered to the designated