In [6]:
import os
import re
import numpy as np
from PyPDF2 import PdfReader
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
import math

from langchain.embeddings import HuggingFaceEmbeddings

class PDFProcessor:
    """Extract text from PDFs, cut it into overlapping word chunks and embed them.

    Attributes:
        embedding_model: ``HuggingFaceEmbeddings`` built with the
            "all-mpnet-base-v2" model; its ``embed_documents`` method is called
            on every batch of chunks.
        batch_progress: No longer used by this class; kept as ``None`` only so
            code that reads the attribute does not break.
    """

    def __init__(self):
        self.embedding_model = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
        # Progress bars are local to each call now. process_pdfs runs
        # _process_single_pdf on several worker threads, and storing the bar on
        # the instance let one thread update or close another thread's bar.
        self.batch_progress = None

    def process_pdfs(self, pdf_folder):
        """Process every PDF in ``pdf_folder`` on a thread pool.

        Args:
            pdf_folder: Directory that is scanned (non-recursively) for files
                whose name ends in ".pdf", compared case-insensitively.

        Returns:
            ``(all_chunks, all_embeddings)``: the per-file chunk arrays and
            embedding arrays concatenated in sorted file-name order. Files that
            yield no chunks are skipped. When nothing yields a chunk, an empty
            string array and an empty ``(0, 0)`` float array are returned
            instead of raising.
        """
        # Sorting makes chunk order independent of the order os.listdir
        # happens to return.
        pdf_files = sorted(
            f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')
        )
        print(f" Found {len(pdf_files)} PDFs to process")

        results = []
        with tqdm(total=len(pdf_files), desc=" PDF Files", unit="file") as file_pbar:
            with ThreadPoolExecutor() as executor:
                futures = []
                for pdf_file in pdf_files:
                    future = executor.submit(
                        self._process_single_pdf,
                        os.path.join(pdf_folder, pdf_file)
                    )
                    future.add_done_callback(lambda _: file_pbar.update(1))
                    futures.append(future)

                # Collect in submission order; result() re-raises any worker error.
                for future in tqdm(futures, desc=" Processing", unit="task"):
                    results.append(future.result())

        # np.concatenate raises on an empty list, so drop files without chunks
        # and handle the "nothing at all" case explicitly.
        non_empty = [r for r in results if len(r[0]) > 0]
        if not non_empty:
            return np.array([], dtype=str), np.empty((0, 0))

        all_chunks = np.concatenate([r[0] for r in non_empty])
        all_embeddings = np.concatenate([r[1] for r in non_empty])

        return all_chunks, all_embeddings

    def _process_single_pdf(self, pdf_path):
        """Read one PDF and return ``(chunks, embeddings)`` as numpy arrays.

        Pages whose ``extract_text()`` returns a falsy value contribute an
        empty string. A PDF that produces no chunks returns an empty string
        array and an empty ``(0, 0)`` float array.
        """
        page_texts = []
        with open(pdf_path, 'rb') as f:
            reader = PdfReader(f)
            with tqdm(total=len(reader.pages), desc=f"📄 {os.path.basename(pdf_path)}", leave=False, unit="page") as page_pbar:
                for page in reader.pages:
                    page_texts.append(page.extract_text() or "")
                    page_pbar.update(1)

        # NOTE(review): every whitespace run (newlines included) is collapsed to
        # one space here, so the blank-line split in _semantic_chunking cannot
        # match and each PDF is chunked as a single section. Left unchanged
        # because altering it would change every chunk boundary.
        clean_text = re.sub(r'\s+', ' ', " ".join(page_texts)).strip()
        chunks = self._semantic_chunking(clean_text)
        if not chunks:
            return np.array([], dtype=str), np.empty((0, 0))

        return np.array(chunks), self._embed_chunks(chunks)

    def _embed_chunks(self, chunks, batch_size=32):
        """Embed ``chunks`` in batches and return one stacked array.

        Returns an empty ``(0, 0)`` array when ``chunks`` is empty.
        """
        embeddings = []
        # The context manager closes the bar even if embed_documents raises.
        with tqdm(total=math.ceil(len(chunks) / batch_size),
                  desc=" Generating Embeddings",
                  leave=False,
                  unit="batch") as batch_pbar:
            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i + batch_size]
                embedded = self.embedding_model.embed_documents(batch)
                embeddings.append(np.array(embedded))
                batch_pbar.update(1)

        if not embeddings:
            return np.empty((0, 0))
        return np.concatenate(embeddings)

    def _semantic_chunking(self, text, chunk_size=512, overlap=64):
        """Split ``text`` into overlapping word windows, section by section.

        Sections are separated by blank lines. Within a section, windows of
        ``chunk_size`` words start every ``chunk_size - overlap`` words.

        Args:
            text: Text to split.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by consecutive chunks.

        Returns:
            List of chunk strings; empty when ``text`` contains no words.

        Raises:
            ValueError: If ``overlap`` is not smaller than ``chunk_size``.
        """
        step = chunk_size - overlap
        if step <= 0:
            raise ValueError("overlap must be smaller than chunk_size")

        sections = re.split(r'\n\s*\n', text)
        chunks = []
        with tqdm(total=len(sections), desc=" Chunking Text", leave=False, unit="section") as chunk_pbar:
            for section in sections:
                words = section.split()
                for start in range(0, len(words), step):
                    chunks.append(' '.join(words[start:start + chunk_size]))
                    # This window already reached the end of the section; a
                    # further window would only repeat words from the overlap.
                    if start + chunk_size >= len(words):
                        break
                chunk_pbar.update(1)
        return chunks


if __name__ == "__main__":
    # Build the processor first; its constructor creates the embedding model.
    processor = PDFProcessor()

    print("Starting Indian Tourism PDF Processing Pipeline (with LangChain)")
    chunks, embeddings = processor.process_pdfs("pdfs")

    # Persist both arrays (chunks first, then embeddings) for the storage cell.
    for _path, _array in (("chunks.npy", chunks), ("embeddings.npy", embeddings)):
        np.save(_path, _array)

    # Short summary of what was produced.
    print(f"\n Processed {len(chunks)} chunks")
    print(f"Embeddings shape: {embeddings.shape}")


🚀 Starting Indian Tourism PDF Processing Pipeline (with LangChain)
🔍 Found 3 PDFs to process


📂 PDF Files:   0%|          | 0/3 [00:00<?, ?file/s]

🔄 Processing:   0%|          | 0/3 [00:00<?, ?task/s]

📄 tourism1.pdf:   0%|          | 0/48 [00:00<?, ?page/s]

📄 LonelyPlanet.pdf:   0%|          | 0/1207 [00:00<?, ?page/s]

📄 indiatourism2.pdf:   0%|          | 0/53 [00:00<?, ?page/s]

✂️ Chunking Text:   0%|          | 0/1 [00:00<?, ?section/s]

🧠 Generating Embeddings:   0%|          | 0/2 [00:00<?, ?batch/s]

✂️ Chunking Text:   0%|          | 0/1 [00:00<?, ?section/s]

🧠 Generating Embeddings:   0%|          | 0/2 [00:00<?, ?batch/s]

✂️ Chunking Text:   0%|          | 0/1 [00:00<?, ?section/s]

🧠 Generating Embeddings:   0%|          | 0/50 [00:00<?, ?batch/s]


✅ Done! Processed 1673 chunks
📊 Embeddings shape: (1673, 768)


In [15]:
import chromadb
import numpy as np
from tqdm import tqdm
import re
import json

class ChromaDBStore:
    """Thin wrapper around a persistent ChromaDB collection named "indian_tourism".

    Attributes:
        client: ``chromadb.PersistentClient`` rooted at ``persist_dir``.
        collection: The "indian_tourism" collection, created with cosine
            distance (``hnsw:space`` = ``cosine``) if it does not exist yet.
    """

    # Activity label -> keywords; a label is assigned when any keyword occurs
    # as a substring of the lower-cased chunk.
    _ACTIVITY_KEYWORDS = {
        'heritage': ['fort', 'palace', 'museum', 'temple'],
        'adventure': ['trek', 'hike', 'camp', 'rafting'],
        'spiritual': ['ashram', 'meditation', 'yoga'],
        'beach': ['beach', 'sunset', 'wave', 'coast']
    }

    # Capitalised words that are never reported as locations.
    _LOCATION_STOPWORDS = frozenset(['India', 'Indian', 'The', 'You'])

    def __init__(self, persist_dir="chroma_db"):
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.collection = self.client.get_or_create_collection(
            name="indian_tourism",
            metadata={"hnsw:space": "cosine"}
        )

    def store_embeddings(self, chunks, embeddings, metadatas=None):
        """Write chunks, embeddings and optional metadata in batches of 100.

        Ids are ``id_0 .. id_{n-1}``. Records are upserted, so running this
        again with the same data overwrites the same ids rather than trying to
        add ids that already exist.

        Args:
            chunks: Sequence of documents; each is converted with ``str``.
            embeddings: One vector per chunk; a numpy array or nested sequence.
            metadatas: Optional sequence of dicts, one per chunk. Values that
                are lists or dicts are JSON-encoded, other non-primitive values
                are converted with ``str``. When omitted, no metadata is sent.

        Raises:
            ValueError: If the inputs differ in length (checked before anything
                is written) or a batch fails ``_validate_batch``.
        """
        embeddings_list = np.asarray(embeddings).tolist()
        total = len(chunks)

        # Fail before the first write instead of after some batches went in.
        if len(embeddings_list) != total:
            raise ValueError(
                f"Length mismatch: chunks ({total}) vs embeddings ({len(embeddings_list)})"
            )
        if metadatas is not None and len(metadatas) != total:
            raise ValueError(
                f"Length mismatch: chunks ({total}) vs metadatas ({len(metadatas)})"
            )

        ids = [f"id_{i}" for i in range(total)]

        # Without caller metadata the field is left out entirely rather than
        # filled with empty dicts.
        # NOTE(review): some chromadb releases are believed to reject empty
        # metadata dicts - confirm against the installed version.
        safe_metadatas = None
        if metadatas is not None:
            safe_metadatas = [self._sanitize_metadata(meta) for meta in metadatas]

        batch_size = 100
        with tqdm(total=len(ids), desc="Storing in ChromaDB") as pbar:
            for i in range(0, len(ids), batch_size):
                batch = {
                    "ids": ids[i:i + batch_size],
                    "embeddings": embeddings_list[i:i + batch_size],
                    "documents": [str(chunk) for chunk in chunks[i:i + batch_size]],
                }
                if safe_metadatas is not None:
                    batch["metadatas"] = safe_metadatas[i:i + batch_size]

                self._validate_batch(batch)
                self.collection.upsert(**batch)
                pbar.update(len(batch["ids"]))

        print(f"Successfully stored {len(ids)} embeddings")

    @staticmethod
    def _sanitize_metadata(meta):
        """Return a copy of ``meta`` whose values are all str/int/float/bool."""
        safe_meta = {}
        for k, v in meta.items():
            if isinstance(v, (list, dict)):
                # Containers are stored as JSON text.
                safe_meta[k] = json.dumps(v)
            elif isinstance(v, (str, int, float, bool)):
                safe_meta[k] = v
            else:
                safe_meta[k] = str(v)
        return safe_meta

    def _validate_batch(self, batch):
        """Strict validation for ChromaDB requirements.

        Checks that "ids", "embeddings" and "documents" are present, are lists
        and have equal length, and - when "metadatas" is present - that it has
        the same length and holds only dicts with str/int/float/bool values.

        Raises:
            ValueError: On the first violated rule.
        """
        required_fields = ["ids", "embeddings", "documents"]
        for field in required_fields:
            if field not in batch:
                raise ValueError(f"Missing required field: {field}")

            if not isinstance(batch[field], list):
                raise ValueError(f"{field} must be a list")

        base_length = len(batch["ids"])
        for field in ["embeddings", "documents"]:
            if len(batch[field]) != base_length:
                raise ValueError(f"Length mismatch: ids ({base_length}) vs {field} ({len(batch[field])})")

        if "metadatas" in batch:
            if len(batch["metadatas"]) != base_length:
                raise ValueError(f"Metadata length mismatch: expected {base_length}, got {len(batch['metadatas'])}")

            for meta in batch["metadatas"]:
                if not isinstance(meta, dict):
                    raise ValueError("Each metadata must be a dictionary")
                for k, v in meta.items():
                    if not isinstance(v, (str, int, float, bool)):
                        raise ValueError(f"Metadata value {k} must be str/int/float/bool, got {type(v)}")

    def generate_metadata(self, chunks):
        """Derive simple keyword metadata for every chunk.

        For each chunk the result dict holds:
            * "locations": up to three distinct capitalised words (first
              occurrence order, stop words excluded), joined by ", ".
            * "activities": labels from ``_ACTIVITY_KEYWORDS`` whose keywords
              occur in the lower-cased chunk, joined by ", ".
            * "word_count": number of whitespace-separated words.

        A chunk that raises during processing gets empty strings and a word
        count of 0, and a message is printed.

        Returns:
            List of metadata dicts, one per chunk, in input order.
        """
        metadatas = []
        for chunk in chunks:
            try:
                chunk = str(chunk)

                candidates = [
                    word for word in re.findall(r'\b[A-Z][a-z]+\b', chunk)
                    if word not in self._LOCATION_STOPWORDS and len(word) > 2
                ]
                # dict.fromkeys drops repeats while keeping first-seen order, so
                # one frequently repeated name cannot fill all three slots.
                locations = list(dict.fromkeys(candidates))[:3]
                locations_str = ", ".join(locations)

                lowered = chunk.lower()  # lower-case once, not once per keyword
                activities = [
                    activity
                    for activity, keywords in self._ACTIVITY_KEYWORDS.items()
                    if any(kw in lowered for kw in keywords)
                ]
                activities_str = ", ".join(activities)

                metadatas.append({
                    "locations": locations_str,
                    "activities": activities_str,
                    "word_count": len(chunk.split())
                })
            except Exception as e:
                print(f" Metadata generation error: {str(e)[:100]}...")
                metadatas.append({
                    "locations": "",
                    "activities": "",
                    "word_count": 0
                })
        return metadatas


if __name__ == "__main__":

    # chunks.npy is written by np.save from an array of plain strings, which
    # numpy stores without pickle. Loading with allow_pickle=False therefore
    # works and avoids unpickling (which can execute arbitrary code) should the
    # file ever be replaced. If an object-dtype file is found instead, np.load
    # raises a ValueError here.
    chunks = np.load("chunks.npy", allow_pickle=False)
    embeddings = np.load("embeddings.npy")

    chroma_store = ChromaDBStore()

    print("Generating ChromaDB-compatible metadata...")
    metadatas = chroma_store.generate_metadata(chunks)

    # Storage errors are reported together with a few debugging hints rather
    # than as a bare traceback.
    try:
        chroma_store.store_embeddings(chunks, embeddings, metadatas)
    except Exception as e:
        print(f"Critical error: {e}")
        print("Debugging tips:")
        print("- Check for None values in chunks")
        print("- Verify embeddings are finite numbers")
        print("- Inspect first 5 metadata items:")
        for i, meta in enumerate(metadatas[:5]):
            print(f"  {i}: {meta}")

🏷️ Generating ChromaDB-compatible metadata...


📤 Storing in ChromaDB: 100%|██████████| 1673/1673 [00:05<00:00, 319.73it/s]

✅ Successfully stored 1673 embeddings





In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence
from langchain_community.chat_models import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

import chromadb
import os


class ItineraryAgent:
    """Retrieval-augmented itinerary generator backed by the "indian_tourism" collection.

    Attributes:
        embedding_model: ``HuggingFaceEmbeddings`` for
            "sentence-transformers/all-mpnet-base-v2", used to embed queries.
        chroma_client: ``chromadb.PersistentClient`` opened at ``db_path``.
        vector_db: LangChain ``Chroma`` wrapper over the "indian_tourism" collection.
        llm: ``ChatOpenAI`` client configured for model "llama3-8b-8192".
        prompt: Prompt template with ``{context}`` and ``{input}`` placeholders.
        retriever: Retriever over ``vector_db`` with ``k`` = 5.
        chain: ``prompt | llm``.
    """

    def __init__(self, db_path="chroma_db", api_key=None):
        """Build the retriever and LLM chain.

        Args:
            db_path: Directory of the persistent Chroma database.
            api_key: API key for the Groq OpenAI-compatible endpoint. When
                omitted it is taken from the ``GROQ_API_KEY`` or
                ``OPENAI_API_KEY`` environment variable, or prompted for.

        Raises:
            ValueError: If no API key can be obtained.
        """
        # SECURITY: a literal API key used to be committed on this line. That
        # key must be treated as leaked and revoked with the provider; secrets
        # are now supplied at run time only.
        os.environ["OPENAI_API_KEY"] = self._resolve_api_key(api_key)
        os.environ["OPENAI_API_BASE"] = "https://api.groq.com/openai/v1"

        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )

        self.chroma_client = chromadb.PersistentClient(path=db_path)
        self.vector_db = Chroma(
            client=self.chroma_client,
            collection_name="indian_tourism",
            embedding_function=self.embedding_model
        )

        self.llm = ChatOpenAI(
            model="llama3-8b-8192",
            temperature=0.7,
            max_tokens=2048
        )

        self.prompt = PromptTemplate.from_template(
            """You are a travel expert specializing in Indian tourism.
Use the following context to create a detailed itinerary based on the user's query.

Context:
{context}

User Query:
{input}

Create a day-by-day itinerary with:
1. Morning, afternoon, and evening activities
2. Travel time estimates between locations
3. Recommended dining options
4. Cultural tips and precautions

Format the itinerary like:

DAY 1
------
- Morning:
- Afternoon:
- Evening:
- Travel Time:
- Food Suggestions:
- Cultural Tips:

DAY 2
------
...and so on."""
        )

        self.retriever = self.vector_db.as_retriever(search_kwargs={"k": 5})
        self.chain = self.prompt | self.llm

    @staticmethod
    def _resolve_api_key(api_key=None):
        """Return the API key from the argument, the environment, or a prompt.

        Lookup order: explicit ``api_key``, ``GROQ_API_KEY``,
        ``OPENAI_API_KEY``, then an interactive ``getpass`` prompt.

        Raises:
            ValueError: If every source is empty.
        """
        if api_key:
            return api_key

        for variable in ("GROQ_API_KEY", "OPENAI_API_KEY"):
            value = os.environ.get(variable)
            if value:
                return value

        # Imported here because it is only needed on this fallback path.
        import getpass

        entered = getpass.getpass("Groq API key: ").strip()
        if not entered:
            raise ValueError(
                "No API key provided: pass api_key=... or set GROQ_API_KEY / OPENAI_API_KEY."
            )
        return entered

    def generate_itinerary(self, query):
        """Retrieve context for ``query`` and ask the LLM for an itinerary.

        Args:
            query: The user's travel request.

        Returns:
            The model's answer as a string. Any exception raised while
            retrieving or generating is caught and returned as
            ``"Error generating itinerary: <message>"``.
        """
        try:
            docs = self.retriever.invoke(query)

            # Retrieved passages are joined with blank lines to form the context.
            context_text = "\n\n".join([doc.page_content for doc in docs])

            response = self.chain.invoke({
                "context": context_text,
                "input": query
            })

            # Chat models return a message object; fall back to str() otherwise.
            return response.content if hasattr(response, "content") else str(response)

        except Exception as e:
            return f"Error generating itinerary: {str(e)}"
if __name__ == "__main__":
    agent = ItineraryAgent()

    print("Welcome to India Travel Planner!")
    print("Ask about destinations, activities, or request a full itinerary.")
    print("Type 'exit' to quit.\n")

    # Simple read-answer loop; ends on "exit", end-of-input or an interrupt.
    while True:
        try:
            query = input("Your travel query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C / kernel interrupt: leave the loop quietly
            # instead of ending the cell with a traceback.
            print()
            break

        # Compare after stripping so "exit " or " Exit" also quit.
        if query.lower() == 'exit':
            break
        # Do not spend a retrieval and an LLM call on a blank line.
        if not query:
            continue

        response = agent.generate_itinerary(query)
        print("\nSuggested Itinerary:")
        print(response)
        print("\n" + "=" * 80 + "\n")


Welcome to India Travel Planner!
Ask about destinations, activities, or request a full itinerary.
Type 'exit' to quit.

Your travel query: south india travel plan

Suggested Itinerary:
Here is a suggested 7-day itinerary for South India:

**DAY 1: Mumbai to Hampi**

* Morning: Explore the iconic Gateway of India and Marine Drive promenade in Mumbai
* Afternoon: Travel to Hospet (approximately 5 hours) and visit the scenic Tungabhadra Dam
* Evening: Check-in to a hotel in Hampi and enjoy a relaxing evening stroll around the ancient temple complex
* Travel Time: 5 hours by road
* Food Suggestions: Try some local street food at Hospet or enjoy a meal at a hotel restaurant
* Cultural Tips: Be prepared for crowds and heat in Hampi, and dress modestly when visiting temples

**DAY 2: Hampi**

* Morning: Visit the iconic Vitthala Temple and the Royal Enclosure in Hampi
* Afternoon: Explore the ancient city and its many temples, including the Virupaksha Temple
* Evening: Enjoy a sunset view of 