In [None]:
# Cell 1: Install required packages (simplified)
!pip install torch transformers sentence-transformers tqdm scikit-learn

In [None]:
# Cell 2: Configure environment and logging
import os
import logging

# Suppress unnecessary warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Reduce TensorFlow logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Configure main logger.
# force=True replaces any handlers already attached to the root logger. Without
# it basicConfig() is a silent no-op whenever the root logger is already
# configured (as some notebook front ends do, and as happens when this cell is
# re-run), and the INFO messages below would never be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
    force=True,
)
logger = logging.getLogger(__name__)
logger.info("Logging configured successfully")

In [None]:
# Cell 3: Import packages with verification
import numpy as np
import sqlite3
import hashlib
import torch
from typing import List, Dict
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

logger.info("Core packages imported successfully")

# Report which compute device torch can see: warn when only the CPU is
# available, otherwise log the name of CUDA device 0.
if not torch.cuda.is_available():
    logger.warning("CUDA not available, using CPU")
else:
    logger.info(f"CUDA enabled using {torch.cuda.get_device_name(0)}")

In [None]:
# Cell 4: QnA System Class (enhanced)
class QnASystem:
    """SQLite-backed question/answer store with embedding-based search.

    Constructing an instance opens (or creates) the SQLite database at
    ``db_path``, creates the ``qna_pairs`` and ``qna_embeddings`` tables if
    they do not exist, loads the tokenizer and encoder named by ``MODEL_NAME``
    (moved to CUDA when available) and runs a health check. If any of those
    steps fails the database connection is closed before the error propagates.

    The instance can be used as a context manager; leaving the ``with`` block
    closes the database connection.
    """

    # Hugging Face checkpoint used for both the tokenizer and the encoder.
    MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
    # Expected length of one embedding vector; enforced by the health check.
    EMBEDDING_DIM = 768
    # Tables created by _initialize_db and required by the health check.
    REQUIRED_TABLES = ("qna_pairs", "qna_embeddings")

    def __init__(self, db_path: str = "qna.db"):
        """Open the database, load the model and verify both.

        Args:
            db_path: Path of the SQLite database file.

        Raises:
            RuntimeError: If the health check fails.
            Exception: Any error raised while opening the database or loading
                the model is logged and re-raised.
        """
        self.db_path = db_path
        self.conn = None
        self.model = None
        self.tokenizer = None
        self._initialize_components()
        try:
            self._verify_system_health()
        except Exception:
            # Do not leave an open connection behind a half-built instance.
            self.close()
            raise

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the database connection; exceptions are not suppressed."""
        self.close()

    def close(self):
        """Close the database connection if it is open (safe to call twice)."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def _initialize_components(self):
        """Initialize database and ML components with verification"""
        try:
            # Database setup
            self.conn = sqlite3.connect(self.db_path)
            # SQLite only enforces the FOREIGN KEY declared on qna_embeddings
            # when enforcement is switched on for the connection.
            self.conn.execute("PRAGMA foreign_keys = ON")
            self._initialize_db()
            logger.info("Database component initialized")

            # Model setup
            self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
            self.model = AutoModel.from_pretrained(self.MODEL_NAME)
            if torch.cuda.is_available():
                self.model = self.model.to('cuda')
            logger.info("Model loaded successfully")

            # Test embedding generation
            test_embed = self._text_to_vector("test")
            logger.debug(f"Test embedding shape: {test_embed.shape}")

        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            # Release the connection that may have been opened above.
            self.close()
            raise

    def _verify_system_health(self):
        """Perform system health checks

        Raises:
            RuntimeError: If a required table is missing, or if an embedding
                does not have shape ``(EMBEDDING_DIM,)``.
        """
        placeholders = ", ".join("?" for _ in self.REQUIRED_TABLES)
        found = {
            row[0]
            for row in self.conn.execute(
                "SELECT name FROM sqlite_master "
                f"WHERE type = 'table' AND name IN ({placeholders})",
                self.REQUIRED_TABLES,
            )
        }
        if found != set(self.REQUIRED_TABLES):
            raise RuntimeError("Database tables not initialized properly")

        test_text = "system health check"
        emb = self._text_to_vector(test_text)
        if emb.shape != (self.EMBEDDING_DIM,):
            raise RuntimeError(f"Invalid embedding shape: {emb.shape}")

        logger.info("System health verification passed")

    def _initialize_db(self):
        """Create database schema with error recovery

        Both statements use IF NOT EXISTS, so calling this on an existing
        database leaves its tables untouched.

        Raises:
            sqlite3.Error: Logged and re-raised if the schema script fails.
        """
        try:
            self.conn.executescript("""
                CREATE TABLE IF NOT EXISTS qna_pairs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    question TEXT UNIQUE NOT NULL,
                    answer TEXT NOT NULL,
                    category TEXT,
                    keywords TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
                CREATE TABLE IF NOT EXISTS qna_embeddings (
                    qna_id INTEGER PRIMARY KEY,
                    question_vector BLOB NOT NULL,
                    FOREIGN KEY(qna_id) REFERENCES qna_pairs(id)
                );
            """)
        except sqlite3.Error as e:
            logger.error(f"Database error: {str(e)}")
            raise

    def _text_to_vector(self, text: str) -> "np.ndarray":
        """Encode ``text`` into a single L2-normalised float32 vector.

        The token embeddings from the encoder's last hidden state are averaged
        over the positions selected by the attention mask, then normalised to
        unit length.

        Args:
            text: Text to encode; inputs longer than the tokenizer's limit are
                truncated.

        Returns:
            A 1-D ``numpy`` array of dtype float32.
        """
        device = next(self.model.parameters()).device
        encoded = self.tokenizer(
            text, padding=True, truncation=True, return_tensors="pt"
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            output = self.model(**encoded)

        token_embeddings = output.last_hidden_state
        mask = encoded["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        # Masked mean; the clamp guards against division by zero.
        pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        return pooled[0].cpu().numpy().astype(np.float32)

    def _ensure_embeddings(self) -> int:
        """Store a vector for every question that does not have one yet.

        The insert runs inside ``with self.conn``, which commits the
        connection's open transaction on success and rolls it back on error.

        Returns:
            The number of vectors that were added.
        """
        missing = self.conn.execute(
            """
            SELECT p.id, p.question
            FROM qna_pairs AS p
            LEFT JOIN qna_embeddings AS e ON e.qna_id = p.id
            WHERE e.qna_id IS NULL
            """
        ).fetchall()
        if not missing:
            return 0

        new_rows = [
            (
                qna_id,
                np.asarray(self._text_to_vector(question), dtype=np.float32).tobytes(),
            )
            for qna_id, question in missing
        ]
        with self.conn:
            self.conn.executemany(
                "INSERT OR REPLACE INTO qna_embeddings (qna_id, question_vector) "
                "VALUES (?, ?)",
                new_rows,
            )
        return len(new_rows)

    def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return the stored QnA pairs whose questions are closest to ``query``.

        Questions without a stored vector are embedded and saved first, then
        every stored question vector is ranked by cosine similarity with the
        query vector.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results; values below 1 yield ``[]``.

        Returns:
            A list of dicts with keys ``question``, ``answer``, ``category``,
            ``keywords`` and ``score`` (cosine similarity as a float), ordered
            from most to least similar. Empty when nothing is stored.
        """
        if top_k < 1:
            return []

        self._ensure_embeddings()
        rows = self.conn.execute(
            """
            SELECT p.question, p.answer, p.category, p.keywords, e.question_vector
            FROM qna_pairs AS p
            JOIN qna_embeddings AS e ON e.qna_id = p.id
            """
        ).fetchall()
        if not rows:
            return []

        query_vec = np.asarray(self._text_to_vector(query), dtype=np.float32)
        matrix = np.vstack([np.frombuffer(row[4], dtype=np.float32) for row in rows])
        # Explicit cosine similarity so stored vectors need not be unit length.
        denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
        scores = (matrix @ query_vec) / np.maximum(denom, 1e-12)
        # Stable sort keeps the ordering deterministic when scores tie.
        ranking = np.argsort(-scores, kind="stable")[:top_k]

        return [
            {
                "question": rows[idx][0],
                "answer": rows[idx][1],
                "category": rows[idx][2],
                "keywords": rows[idx][3],
                "score": float(scores[idx]),
            }
            for idx in ranking
        ]

In [None]:
# Cell 5: Data ingestion with verification
# Seed records; "category" and "keywords" are optional (the columns allow NULL).
qna_data = [
    {
        "question": "What is blockchain?",
        "answer": "A decentralized digital ledger technology...",
        "category": "Blockchain Basics",
        "keywords": "distributed ledger, cryptography"
    }
]

system = QnASystem()

try:
    logger.info("Starting data ingestion...")
    rows = [
        (q["question"], q["answer"], q.get("category"), q.get("keywords"))
        for q in qna_data
    ]

    # Using the connection as a context manager commits the inserts on success
    # and rolls them back on error; without a commit the rows would be lost
    # when the connection is closed.
    with system.conn:
        system.conn.executemany("""
            INSERT OR IGNORE INTO qna_pairs 
            (question, answer, category, keywords)
            VALUES (?, ?, ?, ?)
        """, rows)

    # Verify insertion (counts every row in the table, not only the new ones)
    cursor = system.conn.execute("SELECT COUNT(*) FROM qna_pairs")
    count = cursor.fetchone()[0]
    logger.info(f"Successfully stored {count} QnA pairs")

    if count < 1:
        raise RuntimeError("Data insertion failed")

except Exception as e:
    logger.error(f"Data ingestion failed: {str(e)}")
    raise

In [None]:
# Cell 6: Enhanced search test
# Smoke-test the search path: run one fixed query and log up to three answers.
# Any failure is logged and then re-raised so the cell visibly fails.
try:
    logger.info("\n=== Testing Search Functionality ===")
    test_query = "distributed ledger technology"
    results = system.semantic_search(test_query)

    if results:
        logger.info("Top 3 results:")
        for i, result in enumerate(results[:3], start=1):
            logger.info(f"{i}. {result['answer']}")
    else:
        logger.warning("No results found for test query")

except Exception as e:
    logger.error(f"Search test failed: {str(e)}")
    raise