In [4]:
# ==============================================================================
# FINAL SCRIPT: VectorDB Creation with Data Cleaning
# This script creates a clean ChromaDB from your unified corpus.
# ==============================================================================

# --- Cell 1: Dependencies & Config ---
print("--- Initializing Setup for VectorDB Creation ---")
# IPython/Colab shell magic (not plain Python): installs the third-party packages
# this notebook relies on. No versions are pinned, so each run gets whatever
# release pip resolves at that moment. `-q` and `--progress-bar off` keep pip's
# output short.
!pip install chromadb sentence-transformers langchain langchain-community langdetect -q --progress-bar off

import json
import os
import shutil
import re
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langdetect import detect, LangDetectException # Used in clean_text_for_ner

# --- Configuration Variables ---
# Make sure these match your main script and Drive paths
UNIFIED_INPUT_FILE_DRIVE = "/content/drive/MyDrive/extracted_content/unified_corpus.json"
UNIFIED_INPUT_FILE_LOCAL = "/content/unified_corpus_vectordb.json" # Local path for VectorDB's corpus copy

CHROMA_DB_PATH_DRIVE = "/content/drive/MyDrive/chroma_db" # Path where ChromaDB will be saved/loaded from Drive
CHROMA_DB_PATH_LOCAL = "/content/local_chroma_db" # Local path where we'll work with DB

CHROMA_COLLECTION_NAME = "mosdac_knowledge_unified"
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
MAX_CHUNK_CHARS = 500 # Passed to the text splitter as chunk_size
OVERLAP_CHARS = 100 # Passed to the text splitter as chunk_overlap

# --- Mount Drive and Copy Corpus Locally for reliable I/O ---
from google.colab import drive
drive.mount('/content/drive', force_remount=True) # Ensure fresh mount
print("\nCopying unified_corpus.json from Google Drive to local Colab for VectorDB processing...")
os.makedirs(os.path.dirname(UNIFIED_INPUT_FILE_LOCAL), exist_ok=True)
# shutil.copyfile raises (e.g. FileNotFoundError) when the Drive corpus is missing
# or unreadable, so the success message below is only printed after a real copy.
# A `!cp` shell call cannot stop the cell on failure, which would let later steps
# silently run against a stale local corpus.
shutil.copyfile(UNIFIED_INPUT_FILE_DRIVE, UNIFIED_INPUT_FILE_LOCAL)
print("Copy complete. VectorDB will use local corpus.")

print("✅ Cell 1: Dependencies installed and configuration set.")

# --- Cell 2: Main Logic with Data Cleaning ---

# --- Data Cleaning Function (Proven effective from KG script) ---
def clean_text_for_ner(text): # Using this function name for consistency with KG
    """Clean raw corpus text to remove noise before processing for VectorDB.

    Args:
        text: Candidate text. Anything that is not a ``str`` is rejected.

    Returns:
        The cleaned string (possibly empty when everything was stripped as
        noise), or ``None`` when the input is not a string, is shorter than
        20 characters, is detected as non-English, or language detection
        raises ``LangDetectException``.
    """
    if not isinstance(text, str) or len(text) < 20:
        return None

    # Only try to detect language for longer strings.
    if len(text.strip()) > 50:
        try:
            detected_language = detect(text)
        except LangDetectException:
            return None
        if detected_language != 'en':
            return None

    # Remove URLs. The dot after "www" must be escaped: an unescaped `www.`
    # matches "www" followed by ANY character and would delete ordinary
    # tokens such as "wwwxyz".
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text) # Remove HTML tags
    text = re.sub(r'\[.*?\]\(.*?\)', '', text) # Remove markdown links
    text = re.sub(r'\w+=\S+|&[a-z_]+=', '', text) # Remove URL parameters
    text = re.sub(r'[^A-Za-z0-9\s\.\-]', ' ', text) # Keep alphanumeric, spaces, dots, dashes
    text = re.sub(r'\s+', ' ', text).strip() # Collapse whitespace runs left by the removals
    return text

def create_and_populate_vectordb():
    """Build a fresh local ChromaDB collection from the unified corpus and smoke-test it.

    Steps: load the corpus JSON from ``UNIFIED_INPUT_FILE_LOCAL``, clean each
    entry's ``text_content`` with ``clean_text_for_ner``, split the surviving
    documents into overlapping chunks, embed them with ``EMBEDDING_MODEL_NAME``,
    store them in a new Chroma collection persisted at ``CHROMA_DB_PATH_LOCAL``
    (any existing directory there is deleted first), and print the top 3 hits
    for a sample query.

    Returns:
        None. Progress and results are reported via ``print``. If the corpus
        file cannot be read or parsed, an error is printed and the function
        returns early without touching the existing local database.
    """
    print("\n--- Starting VectorDB Creation with Data Cleaning ---")

    # 1. Load Corpus from LOCAL path
    try:
        with open(UNIFIED_INPUT_FILE_LOCAL, 'r', encoding='utf-8') as f:
            unified_corpus = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: covers JSONDecodeError
        # and UnicodeDecodeError. Other exceptions indicate a programming error
        # and are deliberately not swallowed.
        print(f"Error loading local corpus: {e}"); return
    print(f"Successfully loaded {len(unified_corpus)} documents from local corpus.")

    # 2. Convert and CLEAN raw documents
    print("Cleaning and preparing documents for VectorDB...")
    documents = []
    for doc_entry in unified_corpus:
        # Skip malformed entries instead of crashing on `.get`.
        if not isinstance(doc_entry, dict):
            continue
        # Apply the proven clean_text_for_ner function; it returns None (or an
        # empty string) for entries that should not be indexed.
        cleaned_page_content = clean_text_for_ner(doc_entry.get("text_content", ""))
        if cleaned_page_content and cleaned_page_content.strip():
            documents.append(
                Document(
                    page_content=cleaned_page_content,
                    metadata={"source": doc_entry.get("source_url", "N/A")}
                )
            )
    print(f"Prepared {len(documents)} clean documents for VectorDB processing.")

    # 3. Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=MAX_CHUNK_CHARS, chunk_overlap=OVERLAP_CHARS)
    doc_chunks = text_splitter.split_documents(documents)
    print(f"Split documents into {len(doc_chunks)} chunks.")

    # 4. Initialize Embedding Model
    # NOTE(review): the recorded run output shows a warning emitted for this
    # line - presumably a LangChain deprecation notice; confirm and migrate.
    embedding_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Embedding model loaded.")

    # 5. Create and Populate ChromaDB
    # Delete old local DB if exists so the new collection starts empty
    if os.path.exists(CHROMA_DB_PATH_LOCAL):
        shutil.rmtree(CHROMA_DB_PATH_LOCAL)
        print(f"Deleted old local database at {CHROMA_DB_PATH_LOCAL}.")

    print(f"Creating new VectorDB at local path {CHROMA_DB_PATH_LOCAL}... This may take a while.")
    vector_store = Chroma.from_documents(
        documents=doc_chunks,
        embedding=embedding_model,
        collection_name=CHROMA_COLLECTION_NAME,
        persist_directory=CHROMA_DB_PATH_LOCAL # Persist to local Colab disk
    )
    print(f"✅ VectorDB creation complete. Total chunks: {vector_store._collection.count()}")

    # 6. Perform a Sample Semantic Search to verify
    print("\n--- Performing a Sample Semantic Search on the NEW clean database ---")
    query_text = "What are the details of INSAT-3DR mission and its features?"
    print(f"Query: '{query_text}'")
    retriever = vector_store.as_retriever(search_kwargs={'k': 3})
    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated, and the dependencies are installed unpinned, so a newer
    # release may no longer provide it.
    results = retriever.invoke(query_text)
    if results:
        print("\nTop 3 Retrieved Chunks:")
        for i, doc in enumerate(results):
            print(f"\n--- Result {i+1} ---")
            print(f"Source URL: {doc.metadata.get('source', 'N/A')}")
            print(f"Content: {doc.page_content}")
    else:
        print("No results found for the query.")

    print("\n--- VectorDB Creation and Population Completed ---")

# Run the creation process
create_and_populate_vectordb()

# --- OPTIONAL: Copy Local ChromaDB back to Google Drive (for persistence across sessions) ---
# Only touch the Drive copy when a local database directory actually exists.
# Without this guard, a failed build would delete the previous Drive database
# and then crash in copytree, leaving no database on Drive at all.
if os.path.isdir(CHROMA_DB_PATH_LOCAL):
    print(f"\nCopying local ChromaDB from {CHROMA_DB_PATH_LOCAL} back to Google Drive {CHROMA_DB_PATH_DRIVE}...")
    # Ensure parent directory exists for Drive copy
    os.makedirs(os.path.dirname(CHROMA_DB_PATH_DRIVE), exist_ok=True)
    if os.path.exists(CHROMA_DB_PATH_DRIVE): # Clear old Drive copy first (copytree needs a fresh target)
        shutil.rmtree(CHROMA_DB_PATH_DRIVE)
        print("Deleted old Drive copy.")
    shutil.copytree(CHROMA_DB_PATH_LOCAL, CHROMA_DB_PATH_DRIVE)
    print("✅ Local ChromaDB copied back to Google Drive.")
else:
    print(f"\nSkipping Drive copy: no local ChromaDB found at {CHROMA_DB_PATH_LOCAL}.")

--- Initializing Setup for VectorDB Creation ---
Mounted at /content/drive

Copying unified_corpus.json from Google Drive to local Colab for VectorDB processing...
Copy complete. VectorDB will use local corpus.
✅ Cell 1: Dependencies installed and configuration set.

--- Starting VectorDB Creation with Data Cleaning ---
Successfully loaded 67838 documents from local corpus.
Cleaning and preparing documents for VectorDB...
Prepared 58382 clean documents for VectorDB processing.
Split documents into 61606 chunks.


  embedding_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model loaded.
Creating new VectorDB at local path /content/local_chroma_db... This may take a while.
✅ VectorDB creation complete. Total chunks: 61606

--- Performing a Sample Semantic Search on the NEW clean database ---
Query: 'What are the details of INSAT-3DR mission and its features?'


  results = retriever.get_relevant_documents(query_text)



Top 3 Retrieved Chunks:

--- Result 1 ---
Source URL: https://www.mosdac.gov.in/node?qt-latest_products=4&qt-services_quicktab=4
Content: Link Text INSAT-3A Target URL Context ...ttps INSAT-3D KALPANA-1 INSAT-3A MeghaTropiques SARAL-AltiKa OCEANSAT-2 htt...

--- Result 2 ---
Source URL: https://www.mosdac.gov.in/node?qt-latest_products=4
Content: Link Text INSAT-3A Target URL Context ...ttps INSAT-3D KALPANA-1 INSAT-3A MeghaTropiques SARAL-AltiKa OCEANSAT-2 htt...

--- Result 3 ---
Source URL: https://www.mosdac.gov.in/node?qt-latest_products=3%2F&qt-services_quicktab=3
Content: Link Text INSAT-3A Target URL Context ...ttps INSAT-3D KALPANA-1 INSAT-3A MeghaTropiques SARAL-AltiKa OCEANSAT-2 htt...

--- VectorDB Creation and Population Completed ---

Copying local ChromaDB from /content/local_chroma_db back to Google Drive /content/drive/MyDrive/chroma_db...
✅ Local ChromaDB copied back to Google Drive.
