In [16]:
# Cell 1: Install required packages (simplified)
!pip install torch transformers sentence-transformers tqdm scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
# Cell 2: Configure environment and logging
import os
import logging

# Suppress unnecessary warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Reduce TensorFlow logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Configure main logger
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)
logger.info("Logging configured successfully")

In [18]:
# Cell 3: Import packages with verification
import numpy as np
import sqlite3
import hashlib
import torch
from typing import List, Dict
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

logger.info("Core packages imported successfully")

# Verify CUDA availability
if torch.cuda.is_available():
    logger.info(f"CUDA enabled using {torch.cuda.get_device_name(0)}")
else:
    logger.warning("CUDA not available, using CPU")

In [19]:
# Cell 4: QnA System Class (enhanced)
class QnASystem:
    def __init__(self, db_path: str = "/kaggle/working/qna_db.sqlite"):
        self.db_path = db_path
        self.conn = None
        self.model = None
        self.tokenizer = None
        self._initialize_components()
        self._verify_system_health()

    def _initialize_components(self):
        """Initialize database and ML components with verification"""
        try:
            # Database setup with check_same_thread=False for notebook environments
            self.conn = sqlite3.connect(self.db_path, check_same_thread=False)
            self._initialize_db()
            logger.info("Database component initialized")
            
            # Model setup
            self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
            self.model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
            if torch.cuda.is_available():
                self.model = self.model.to('cuda')
            logger.info("Model loaded successfully")
            
            # Test embedding generation
            test_embed = self._text_to_vector("test")
            logger.debug(f"Test embedding shape: {test_embed.shape}")
            
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            raise

    def _verify_system_health(self):
        """Perform system health checks"""
        cursor = self.conn.execute("SELECT count(*) FROM sqlite_master")
        if cursor.fetchone()[0] < 1:
            raise RuntimeError("Database tables not initialized properly")
            
        test_text = "system health check"
        emb = self._text_to_vector(test_text)
        if emb.shape != (768,):
            raise RuntimeError(f"Invalid embedding shape: {emb.shape}")
            
        logger.info("System health verification passed")

    def _initialize_db(self):
        """Create database schema with error recovery"""
        try:
            self.conn.executescript("""
                CREATE TABLE IF NOT EXISTS qna_pairs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    question TEXT UNIQUE NOT NULL,
                    answer TEXT NOT NULL,
                    category TEXT,
                    keywords TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
                CREATE TABLE IF NOT EXISTS qna_embeddings (
                    qna_id INTEGER PRIMARY KEY,
                    question_vector BLOB NOT NULL,
                    FOREIGN KEY(qna_id) REFERENCES qna_pairs(id)
                );
            """)
        except sqlite3.Error as e:
            logger.error(f"Database error: {str(e)}")
            raise

    def _text_to_vector(self, text: str) -> np.ndarray:
        """Convert text to embedding vector with proper pooling"""
        try:
            inputs = self.tokenizer(
                text, 
                return_tensors="pt", 
                padding=True, 
                truncation=True, 
                max_length=512
            )
            
            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = self.model(**inputs)
            
            # Mean pooling implementation
            token_embeddings = outputs.last_hidden_state
            attention_mask = inputs['attention_mask']
            input_mask_expanded = (
                attention_mask
                .unsqueeze(-1)
                .expand(token_embeddings.size())
                .float()
            )
            embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            embeddings = embeddings / sum_mask
            
            return embeddings.cpu().numpy().squeeze()
        
        except Exception as e:
            logger.error(f"Vectorization failed: {str(e)}")
            raise

    def _get_embeddings(self) -> Dict[int, np.ndarray]:
            """Retrieve all stored embeddings from database"""
            try:
                cursor = self.conn.execute("""
                    SELECT qna_id, question_vector 
                    FROM qna_embeddings
                """)
                
                embeddings = {}
                for qna_id, vec_blob in cursor.fetchall():
                    embeddings[qna_id] = np.frombuffer(vec_blob, dtype=np.float32)
                
                logger.debug(f"Loaded {len(embeddings)} embeddings from database")
                return embeddings
                
            except sqlite3.Error as e:
                logger.error(f"Failed to load embeddings: {str(e)}")
                raise

    def _preload_embeddings(self):
        """Load all embeddings into memory for fast access"""
        try:
            cursor = self.conn.execute("SELECT qna_id, question_vector FROM qna_embeddings")
            results = cursor.fetchall()
            
            self.embedding_ids = []
            vectors = []
            for qna_id, vec_blob in results:
                self.embedding_ids.append(qna_id)
                vectors.append(np.frombuffer(vec_blob, dtype=np.float32))
            
            self.embedding_matrix = np.array(vectors)
            logger.info(f"Loaded {len(self.embedding_ids)} embeddings into memory")

        except sqlite3.Error as e:
            logger.error(f"Embedding preload failed: {str(e)}")
            raise

    def _batch_text_to_vectors(self, texts: List[str]) -> np.ndarray:
        """Convert batch of texts to embeddings with GPU optimization"""
        try:
            inputs = self.tokenizer(
                texts, 
                return_tensors="pt", 
                padding=True, 
                truncation=True, 
                max_length=512
            )
            
            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}

            with torch.no_grad(), torch.cuda.amp.autocast():
                outputs = self.model(**inputs)
            
            # Optimized mean pooling
            attention_mask = inputs['attention_mask']
            last_hidden = outputs.last_hidden_state.masked_fill(
                ~attention_mask[..., None].bool(), 0.0
            )
            embeddings = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
            
            return embeddings.cpu().numpy().astype(np.float32)
        
        except Exception as e:
            logger.error(f"Batch vectorization failed: {str(e)}")
            raise
    
    # Modified data ingestion method
    def ingest_batch(self, qna_batch: List[Dict], batch_size: int = 500):
        """Optimized batch ingestion with embeddings"""
        try:
            with self.conn:
                # Insert QnA pairs
                self.conn.executemany(
                    """INSERT OR IGNORE INTO qna_pairs 
                    (question, answer, category, keywords)
                    VALUES (?, ?, ?, ?)""",
                    [(q["question"], q["answer"], q["category"], q["keywords"]) 
                     for q in qna_batch]
                )

                # Get inserted IDs
                questions = [q["question"] for q in qna_batch]
                cursor = self.conn.execute(
                    "SELECT id, question FROM qna_pairs WHERE question IN (%s)" 
                    % ",".join("?"*len(questions)), questions)
                id_map = {row[1]: row[0] for row in cursor.fetchall()}

            # Generate embeddings in batches
            embedding_data = []
            for i in range(0, len(qna_batch), batch_size):
                batch = qna_batch[i:i+batch_size]
                texts = [q["question"] for q in batch]
                vectors = self._batch_text_to_vectors(texts)
                
                for q, vec in zip(batch, vectors):
                    if q["question"] in id_map:
                        embedding_data.append((
                            id_map[q["question"]],
                            vec.tobytes()
                        ))

            # Insert embeddings
            with self.conn:
                self.conn.executemany(
                    "INSERT OR REPLACE INTO qna_embeddings VALUES (?, ?)",
                    embedding_data
                )
            
            # Update in-memory embeddings
            self._preload_embeddings()

            logger.info(f"Ingested {len(qna_batch)} QnAs with {len(embedding_data)} embeddings")

        except Exception as e:
            logger.error(f"Batch ingestion failed: {str(e)}")
            raise

    def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Optimized search using preloaded embeddings"""
        try:
            # Convert query once
            query_vector = self._text_to_vector(query)
            
            # Use preloaded matrix
            similarities = cosine_similarity(
                [query_vector], 
                self.embedding_matrix
            ).squeeze()

            # Get top indices
            top_indices = np.argpartition(similarities, -top_k)[-top_k:]
            top_indices = top_indices[np.argsort(similarities[top_indices])][::-1]

            # Retrieve results
            results = []
            for idx in top_indices:
                qna_id = self.embedding_ids[idx]
                cursor = self.conn.execute(
                    "SELECT question, answer, category FROM qna_pairs WHERE id = ?",
                    (qna_id,)
                )
                question, answer, category = cursor.fetchone()
                results.append({
                    "qna_id": qna_id,
                    "question": question,
                    "answer": answer,
                    "category": category,
                    "similarity": float(similarities[idx])
                })

            return results

        except Exception as e:
            logger.error(f"Search error: {str(e)}")
            return []

    def close(self):
        """Close database connection"""
        if self.conn:
            self.conn.close()
            self.conn = None
            logger.info("Database connection closed")
            
    def __del__(self):
        """Cleanup resources"""
        self.close()
        logger.info("System shutdown complete")

In [21]:
# Cell 5: Data ingestion with verification (fixed)
qna_data = [
    {
        "question": "What is blockchain?",
        "answer": "A decentralized digital ledger technology...",
        "category": "Blockchain Basics",
        "keywords": "distributed ledger, cryptography"
    }
]

# Close any existing connection
if 'system' in locals():
    system.conn.close()
    del system

system = QnASystem()

# Cell 5: Batch Data Ingestion
try:
    logger.info("Starting bulk data ingestion...")
    system = QnASystem()
    
    # Example batch loading - replace with your actual data
    large_dataset = [...]  # Your 10k+ QnA pairs
    
    batch_size = 500  # Adjust based on available memory
    for i in range(0, len(large_dataset), batch_size):
        batch = large_dataset[i:i+batch_size]
        system.ingest_batch(batch)
        logger.info(f"Ingested {i+batch_size} items")
        
    logger.info("Bulk ingestion completed successfully")

except Exception as e:
    logger.error(f"Ingestion failed: {str(e)}")
    raise
finally:
    system.conn.close()

TypeError: 'ellipsis' object is not subscriptable

In [None]:
# Cell 6: Enhanced search test (fixed)
try:
    logger.info("\n=== Testing Search Functionality ===")
    test_query = "distributed ledger technology"
    
    # Create new connection for search
   # In your search test cell:
    with sqlite3.connect("/kaggle/working/qna_db.sqlite", check_same_thread=False) as temp_conn:
        system.conn = temp_conn
        results = system.semantic_search(test_query)
    
    if not results:
        logger.warning("No results found for test query")
    else:
        logger.info("Top 3 results:")
        for i, result in enumerate(results[:3], 1):
            logger.info(f"{i}. {result['answer']}")
            
except Exception as e:
    logger.error(f"Search test failed: {str(e)}")
    raise