In [None]:
!pip uninstall -y pinecone-client pinecone -q
!pip install -q pinecone
!pip install -q transformers accelerate sentence-transformers
!pip install -q jsonlines ftfy unidecode


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.8/44.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m235.8/235.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gc
import os
import re
import time
from getpass import getpass
from typing import List, Dict

import ftfy
import jsonlines
import numpy as np
import torch
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from unidecode import unidecode


In [None]:
# Use the GPU whenever torch can see one; everything else runs on the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
print(f" Running on: {DEVICE}")

# Report which accelerator was picked up and how much memory it offers.
if DEVICE == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


üöÄ Running on: cuda
GPU: Tesla T4
Memory: 15.83 GB


In [None]:
class OCRTextCleaner:
    """
    Specialized cleaner for OCR-extracted Ayurvedic texts.

    Handles: encoding issues, OCR artifacts, page numbers, headers, gibberish.

    ``clean`` runs the full pipeline (encoding repair -> artifact removal ->
    whitespace normalisation -> collapsing of repeated punctuation).
    ``is_meaningful`` is a separate quality gate that callers apply to the
    cleaned text.
    """

    # Apostrophe-based transliterations, applied in this order.
    _APOSTROPHE_FIXES = (
        ("S'", "Sh"),  # S'ira -> Shira
        ("s'", "sh"),
        ("A'", "A"),   # A'yur -> Ayur
        ("a'", "a"),
    )

    # A single ASCII letter standing alone between word boundaries (OCR noise).
    _SINGLE_LETTER = re.compile(r'\b[a-zA-Z]\b')

    def __init__(self):
        # Common OCR artifacts and patterns to clean. Every match is replaced
        # by a single space in remove_ocr_artifacts().
        self.patterns_to_remove = [
            r'Vol\.\s*Page\.',  # Volume and page references
            r'Vol\.\s*Rage\.',  # OCR typo
            r'Page\.\s*\d+',    # Page numbers
            r'\d+\s*Vol\.',     # Volume numbers
            r'INDEX\.',         # Index markers
            r'i{2,}',           # Multiple 'i' characters (OCR artifacts)
            r'\^+',             # Caret symbols
            r'\.{3,}',          # Multiple dots
            r'\s{3,}',          # Multiple spaces
            # Bullet symbols, written as escapes so the character class cannot
            # be corrupted by an encoding round-trip of this file.
            r'[\u25a0\u25a1\u25cf\u25cb\u25aa\u25ab\u25c6\u25c7\u2605\u2606]+',
            r'\ufffd+',         # Unicode replacement characters
            r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]',  # Control chars
        ]

        # Compile patterns
        self.compiled_patterns = [re.compile(p) for p in self.patterns_to_remove]

    def fix_encoding(self, text: str) -> str:
        """Repair mojibake with ftfy; return the input unchanged if ftfy fails."""
        try:
            return ftfy.fix_text(text)
        except Exception:
            # Best effort: an unrepairable record should not abort the run.
            # (A bare `except:` would also swallow KeyboardInterrupt.)
            return text

    def remove_ocr_artifacts(self, text: str) -> str:
        """Remove common OCR artifacts and normalise apostrophe transliterations."""
        for pattern in self.compiled_patterns:
            text = pattern.sub(' ', text)

        # Fix apostrophe transliterations BEFORE stripping lone letters.
        # In "S'arkara" the leading "S" sits between two word boundaries, so
        # stripping first would delete it and leave "'arkara".
        for old, new in self._APOSTROPHE_FIXES:
            text = text.replace(old, new)

        # Remove standalone single characters (OCR noise)
        text = self._SINGLE_LETTER.sub('', text)

        return text

    def normalize_whitespace(self, text: str) -> str:
        """Turn newlines into spaces and collapse whitespace runs to one space."""
        # str.split() with no argument splits on any whitespace run and drops
        # leading/trailing whitespace, so the join yields single spaces only.
        return ' '.join(text.replace('\n', ' ').split())

    def is_meaningful(self, text: str, min_words: int = 5, min_avg_word_len: float = 2.5) -> bool:
        """
        Heuristically decide whether ``text`` is real content.

        Returns False when the text is empty or shorter than 20 characters
        after stripping, has fewer than ``min_words`` words, has an average
        word length below ``min_avg_word_len``, or has less than 70% of its
        characters alphabetic or whitespace. Returns True otherwise.
        """
        if not text or len(text.strip()) < 20:
            return False

        words = text.split()
        if len(words) < min_words:
            return False

        # Check average word length (gibberish tends to have short words)
        avg_word_len = sum(len(w) for w in words) / len(words)
        if avg_word_len < min_avg_word_len:
            return False

        # Check if contains mostly alphabetic characters
        alpha_ratio = sum(c.isalpha() or c.isspace() for c in text) / len(text)
        if alpha_ratio < 0.7:
            return False

        return True

    def clean(self, text: str) -> str:
        """Run the complete cleaning pipeline; falsy input returns ""."""
        if not text:
            return ""

        # Step 1: Fix encoding
        text = self.fix_encoding(text)

        # Step 2: Remove OCR artifacts
        text = self.remove_ocr_artifacts(text)

        # Step 3: Normalize whitespace
        text = self.normalize_whitespace(text)

        # Step 4: Collapse three or more repeats of one punctuation character
        text = re.sub(r'([^\w\s])\1{2,}', r'\1', text)

        return text.strip()

# Shared cleaner instance used by the rest of the notebook
cleaner = OCRTextCleaner()

# Smoke test: a noisy index-page fragment with apostrophe transliterations,
# a "Vol. Page." marker and runs of stray 'i' characters.
sample_text = """IDfiX. Their number S'arkara S'arkaras'mari S'arkaratvuda Sarpa-satlva S'ara-pumkha S'ar;iri-mukha S'arira-Sihana Vol. Page.

H ii iiiSarpih (clarified butter) i Sarpir-meha ii"""
cleaned = cleaner.clean(sample_text)

# Show the first 150 characters before and after, plus the quality verdict.
print(" Testing cleaner on sample:")
print(f"Original: {sample_text[:150]}...")
print(f"Cleaned: {cleaned[:150]}...")
print(f"Meaningful: {cleaner.is_meaningful(cleaned)}\n")


üß™ Testing cleaner on sample:
Original: IDfiX. Their number S'arkara S'arkaras'mari S'arkaratvuda Sarpa-satlva S'ara-pumkha S'ar;iri-mukha S'arira-Sihana Vol. Page.

H ii iiiSarpih (clarifie...
Cleaned: IDfiX. Their number 'arkara 'arkarashmari 'arkaratvuda Sarpa-satlva 'ara-pumkha 'ar;iri-mukha 'arira-Sihana Sarpih (clarified butter) Sarpir-meha...
Meaningful: True



In [None]:
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIM = 384  # dimension the Pinecone index is created with

print(f" Loading model: {MODEL_NAME}")

# Release cached GPU allocations and collect garbage before loading weights.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gc.collect()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision on the GPU, full precision on the CPU.
# NOTE(review): the recorded output of this cell warns that `torch_dtype` is
# deprecated in favour of `dtype`; switch once the transformers version is pinned.
model = AutoModel.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32 if DEVICE != "cuda" else torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)
model.eval()

print(" Model loaded!")
if DEVICE == "cuda":
    print(f"üíæ GPU Memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB\n")


üì¶ Loading model: sentence-transformers/all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

‚úÖ Model loaded!
üíæ GPU Memory: 0.05 GB



In [None]:
def load_and_clean_data(file_path: str) -> List[Dict]:
    """
    Load a JSONL file, clean each record's OCR text and keep the usable ones.

    Each record is expected to be an object whose text is under "content"
    (preferred) or "text", with an optional "metadata" object carrying
    "source" and "page". Records are cleaned with the module-level
    ``cleaner`` and dropped when ``cleaner.is_meaningful`` rejects them.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list of dicts with keys "text", "source", "page" and "doc_id",
        where "doc_id" is the zero-based position of the record in the file
        (skipped records still consume a position).
    """
    dataset = []
    stats = {"total": 0, "valid": 0, "empty": 0, "gibberish": 0}

    print(f"üìñ Loading from: {file_path}")

    with jsonlines.open(file_path) as reader:
        for idx, obj in enumerate(reader):
            stats["total"] += 1

            # Anything that is not a JSON object is counted as empty.
            if not isinstance(obj, dict):
                stats["empty"] += 1
                continue

            # Get text content; a non-string payload is treated like a
            # missing one instead of crashing on .strip().
            text = obj.get("content") or obj.get("text") or ""
            if not isinstance(text, str) or not text.strip():
                stats["empty"] += 1
                continue

            # Clean text
            cleaned_text = cleaner.clean(text)

            # Validate quality
            if not cleaner.is_meaningful(cleaned_text):
                stats["gibberish"] += 1
                continue

            # Extract metadata. `"metadata": null` (or any non-object) must
            # behave like a missing key, and null source/page values must fall
            # back to the defaults: later cells call .split() on the source
            # and int() on the page.
            meta = obj.get("metadata")
            if not isinstance(meta, dict):
                meta = {}
            page = meta.get("page")
            dataset.append({
                "text": cleaned_text,
                "source": meta.get("source") or "unknown",
                "page": -1 if page is None else page,
                "doc_id": idx
            })
            stats["valid"] += 1

    print(f"‚úÖ Loaded {stats['valid']}/{stats['total']} valid documents")
    print(f"‚ö†Ô∏è  Filtered: {stats['empty']} empty, {stats['gibberish']} gibberish\n")

    return dataset

# Corpus location (a path under /content/drive, presumably a mounted Drive)
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/ayurveda_docs_final.jsonl"
dataset = load_and_clean_data(DATA_PATH)

# Stop here if nothing survived cleaning; later cells need at least one record.
if len(dataset) == 0:
    raise ValueError("‚ùå No valid data loaded! Check your JSONL file and cleaning thresholds.")


üìñ Loading from: /content/drive/MyDrive/Colab Notebooks/ayurveda_docs_final.jsonl
‚úÖ Loaded 4605/4748 valid documents
‚ö†Ô∏è  Filtered: 60 empty, 83 gibberish



In [None]:
class SmartChunker:
    """
    Sentence-aware chunking with a one-sentence overlap between neighbours.

    Text is split into sentences, fragments of 15 characters or fewer are
    dropped, and sentences are packed greedily into chunks of roughly
    ``chunk_size`` characters (joining spaces are not counted, and a single
    sentence longer than ``chunk_size`` is emitted as its own chunk).

    Args:
        chunk_size: Soft upper bound on the summed sentence lengths of a chunk.
        overlap: When truthy, the last sentence of an emitted multi-sentence
            chunk is repeated at the start of the next chunk. ``0`` or
            ``None`` disables the repetition. The numeric value is not used
            as a character budget.
    """

    def __init__(self, chunk_size=400, overlap=80):
        self.chunk_size = chunk_size
        self.overlap = overlap
        # Split on the whitespace that FOLLOWS sentence-ending punctuation.
        # The look-behind keeps '.', '!' or '?' attached to its sentence;
        # consuming it would fuse sentences when a chunk is re-joined.
        self.sent_pattern = re.compile(r'(?<=[.!?])\s+')

    def chunk_text(self, text: str) -> List[str]:
        """Split ``text`` into overlapping chunks; returns [] if nothing usable."""
        # Split into sentences and drop very short fragments
        stripped = (s.strip() for s in self.sent_pattern.split(text))
        sentences = [s for s in stripped if len(s) > 15]

        if not sentences:
            return []

        chunks = []
        current = []
        current_len = 0

        for sent in sentences:
            sent_len = len(sent)

            # Flush before adding a sentence that would overflow the chunk.
            if current_len + sent_len > self.chunk_size and current:
                chunks.append(" ".join(current))

                # Carry the last sentence over as overlap. A chunk holding a
                # single sentence is not carried over, since that would
                # duplicate the whole chunk.
                if self.overlap and len(current) > 1:
                    current = [current[-1]]
                    current_len = len(current[-1])
                else:
                    current = []
                    current_len = 0

            current.append(sent)
            current_len += sent_len

        if current:
            chunks.append(" ".join(current))

        return chunks

chunker = SmartChunker(chunk_size=400, overlap=80)

# Split every cleaned document into chunks, copying the document's metadata
# onto each chunk and numbering chunks within their document.
print("üî™ Chunking documents...")
chunked_dataset = []

for item in tqdm(dataset, desc="Chunking"):
    chunked_dataset.extend(
        {
            "text": chunk,
            "source": item["source"],
            "page": item["page"],
            "doc_id": item["doc_id"],
            "chunk_id": idx,
        }
        for idx, chunk in enumerate(chunker.chunk_text(item["text"]))
        if len(chunk) >= 30  # Minimum chunk size
    )

print(f"‚úÖ Created {len(chunked_dataset)} chunks")
if chunked_dataset:
    avg_len = np.mean([len(record["text"]) for record in chunked_dataset])
    print(f"üìä Average chunk length: {avg_len:.0f} chars\n")


üî™ Chunking documents...


Chunking:   0%|          | 0/4605 [00:00<?, ?it/s]

‚úÖ Created 26844 chunks
üìä Average chunk length: 383 chars



In [None]:
@torch.no_grad()
def embed_texts(texts: List[str], batch_size: int = 16) -> np.ndarray:
    """
    Embed texts with the module-level ``tokenizer`` and ``model``.

    Each batch is tokenised (padded, truncated to 256 tokens), run through the
    model, mean-pooled over non-padding tokens and L2-normalised.

    Args:
        texts: Strings to embed.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A 2-D array with one row per input text. An empty ``texts`` yields an
        array of shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # np.vstack([]) raises, so answer the empty case explicitly.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(DEVICE)

        outputs = model(**encoded)

        # Mean pooling: average token embeddings, ignoring padding positions.
        attention_mask = encoded['attention_mask']
        token_embeddings = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp guards against division by zero for an all-padding row.
        embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        # Normalize to unit length
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        all_embeddings.append(embeddings.cpu().numpy())

        # Cleanup: drop references to this batch's tensors before the next one
        del encoded, outputs, token_embeddings, attention_mask, input_mask_expanded, embeddings
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

    return np.vstack(all_embeddings)


In [None]:
# Read the key from the environment, or prompt for it, so that the secret is
# never stored in the notebook itself.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
PINECONE_INDEX = "ayurveda-rag-v2"  # New index name

INDEX_WAIT_TIMEOUT_S = 300   # give up waiting on Pinecone after this long
INDEX_POLL_INTERVAL_S = 2    # pause between status checks


def _wait_for(condition, what: str) -> None:
    """Poll ``condition()`` until it is truthy; raise TimeoutError if it never is."""
    deadline = time.monotonic() + INDEX_WAIT_TIMEOUT_S
    while not condition():
        if time.monotonic() > deadline:
            raise TimeoutError(f"Timed out after {INDEX_WAIT_TIMEOUT_S}s waiting for {what}")
        time.sleep(INDEX_POLL_INTERVAL_S)


def _index_exists() -> bool:
    """Return True if an index named PINECONE_INDEX is currently listed."""
    return PINECONE_INDEX in [idx.name for idx in pc.list_indexes()]


print("üîå Connecting to Pinecone...")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Delete old index if exists with wrong dimensions.
# WARNING: this discards every vector stored in the existing index.
if _index_exists():
    print(f"  Deleting existing index: {PINECONE_INDEX}")
    pc.delete_index(PINECONE_INDEX)
    # Poll instead of sleeping a fixed time: the name must be free before
    # create_index is called.
    _wait_for(lambda: not _index_exists(), "index deletion")

# Create new index with correct dimensions
print(f" Creating new index with dimension {EMBEDDING_DIM}...")
pc.create_index(
    name=PINECONE_INDEX,
    dimension=EMBEDDING_DIM,  # 384 for all-MiniLM-L6-v2
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

print(" Waiting for index initialization...")
# Continue as soon as the index reports ready rather than after a fixed delay.
_wait_for(lambda: pc.describe_index(PINECONE_INDEX).status["ready"], "index initialization")

index = pc.Index(PINECONE_INDEX)
print(f" Connected to index: {PINECONE_INDEX}\n")


üîå Connecting to Pinecone...
üìù Creating new index with dimension 384...
‚è≥ Waiting for index initialization...
‚úÖ Connected to index: ayurveda-rag-v2



In [None]:
BATCH_SIZE = 100  # Can be larger now with smaller embeddings


def _source_filename(source: str) -> str:
    """Return the last component of a path written with backslash or slash separators."""
    return re.split(r"[\\/]", source)[-1]


print(f" Starting upsert (batch size: {BATCH_SIZE})")
total_batches = (len(chunked_dataset) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

for i in tqdm(range(0, len(chunked_dataset), BATCH_SIZE), total=total_batches, desc="Uploading"):
    batch = chunked_dataset[i:i + BATCH_SIZE]

    # Resolve each filename once; it feeds both the vector ID and the metadata.
    # Doing it outside the f-string also avoids a backslash inside an f-string
    # expression, which is a SyntaxError before Python 3.12.
    filenames = [_source_filename(b["source"]) for b in batch]

    # Prepare IDs: <filename>|p<page>|d<doc_id>|c<chunk_id>
    ids = [
        f"{name}|p{b['page']}|d{b['doc_id']}|c{b['chunk_id']}"
        for name, b in zip(filenames, batch)
    ]

    # Generate embeddings
    texts = [b["text"] for b in batch]
    embeddings = embed_texts(texts, batch_size=16)

    # Prepare metadata
    metadata = [
        {
            "source": name,  # Just filename
            "page": int(b["page"]),
            "doc_id": int(b["doc_id"]),
            "chunk_id": int(b["chunk_id"]),
            "text": b["text"][:1000]  # stored text is capped at 1000 characters
        }
        for name, b in zip(filenames, batch)
    ]

    # Upsert (id, values, metadata) tuples
    vectors = list(zip(ids, embeddings.tolist(), metadata))
    index.upsert(vectors=vectors)

    # Cleanup
    del embeddings, vectors
    if DEVICE == "cuda":
        torch.cuda.empty_cache()

print(f" Upload complete!")
print(f" Index stats: {index.describe_index_stats()}\n")

üöÄ Starting upsert (batch size: 100)


Uploading:   0%|          | 0/269 [00:00<?, ?it/s]

‚úÖ Upload complete!
üìä Index stats: {'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '189',
                                    'content-type': 'application/json',
                                    'date': 'Fri, 21 Nov 2025 21:55:09 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '33',
                                    'x-pinecone-request-id': '5649497424917596348',
                                    'x-pinecone-request-latency-ms': '32'}},
 'dimension': 384,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 26844}},
 'storageFullness': 0.0,
 'total_vector_count': 26844,
 'vector_type': 'dense'}



In [None]:
def query_rag(query: str, top_k: int = 5) -> List[Dict]:
    """
    Retrieve the ``top_k`` nearest chunks for ``query`` from the Pinecone index.

    The query goes through the same ``cleaner.clean`` pipeline as the corpus,
    is embedded with ``embed_texts`` and sent to ``index.query`` with metadata
    included.

    Returns:
        The ``matches`` attribute of the query response.
        NOTE(review): the annotation says List[Dict], but callers in this
        notebook read ``.score`` and ``.metadata`` attributes; confirm the
        element type.
    """
    normalized_query = cleaner.clean(query)

    # embed_texts returns one row per input text; take the only row.
    query_vector = embed_texts([normalized_query], batch_size=1)[0].tolist()

    response = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
    return response.matches

# Retrieval smoke test: run a few representative questions and show the top hits
print(" Testing retrieval...\n")
test_queries = [
    "What are the three doshas?",
    "treatment of burns and scalds",
    "pitta vata kapha characteristics",
    "Sarpih clarified butter",
]

for query in test_queries:
    # Question banner
    print("=" * 60, f"Q: {query}", "=" * 60, sep="\n")

    results = query_rag(query, top_k=3)

    # One entry per match: similarity score, origin, first 200 chars of text
    for rank, match in enumerate(results, start=1):
        print(
            f"\n{rank}. Score: {match.score:.4f}",
            f"   Source: {match.metadata['source']}, Page: {match.metadata['page']}",
            f"   Text: {match.metadata['text'][:200]}...",
            sep="\n",
        )
    print()

# Closing summary
print("\n".join([
    "=" * 60,
    " PIPELINE COMPLETE!",
    f" Total vectors: {len(chunked_dataset)}",
    " Ready for queries using: query_rag('your question', top_k=5)",
    "=" * 60,
]))

üîç Testing retrieval...

Q: What are the three doshas?

1. Score: 0.7053
   Source: sushutra samhita 3.pdf, Page: 444
   Text: The three Doshas, the (seven) Dhatus, feces and unne‚Äîthese, in their normal state, hold together the corporeal frame in conjunction with the (six Rasas necessary (for the constitution) Puru-'ha or hum...

2. Score: 0.6741
   Source: sushutra samhita 3.pdf, Page: 365
   Text: .\'mptoms specifically belonging to the three ])reccding Dosha-originated )'pes, are simultaneous )- present in the type due to ihe concerted action of the three Doshas (Tri-Dhosliaja) together with i...

3. Score: 0.6737
   Source: sushutra samhita 3.pdf, Page: 444
   Text: The three Dhatus Vayu, Pitta and Kapha in their normal state, cannot properly be called Doshas The state in which all the three Dhatus are in their normal state, is said to be the 63 rd Combination (s...

Q: treatment of burns and scalds

1. Score: 0.6427
   Source: Sushruta Samhita 1.pdf, Page: 197
   Text: ), are 