In [5]:
# Cell 1: Install required packages
!pip install torch transformers sentence-transformers psycopg2-binary pgvector tqdm scikit-learn
!sudo apt-get update && apt-get install -y postgresql postgresql-contrib postgresql-14-pgvector
!service postgresql start
!sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector;"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0% [Working]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease                         
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease                 
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease               
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease               
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease                     
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ERROR:  could not open extension control file "/usr/share/postgresql/14/extension/vector.control": No such file or directory


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Cell 2: Configure environment
import os
# Only provide the local default when the caller has not already configured a
# connection string, so an externally supplied POSTGRES_URL is not overwritten.
os.environ.setdefault("POSTGRES_URL", "postgres://postgres@localhost/postgres")

# Cell 3: Import packages
import numpy as np
import sqlite3
import logging
import hashlib
import psycopg2
import re
import pandas as pd
import torch
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from pgvector.psycopg2 import register_vector

# Configure logging
# force=True replaces handlers that are already attached to the root logger;
# without it basicConfig() is a no-op in that case and INFO messages never appear.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)
logger.info("All packages imported successfully!")

In [7]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow warnings
# The install cell's output repeats the huggingface/tokenizers "process just got
# forked" warning; that message itself asks for TOKENIZERS_PARALLELISM to be set
# explicitly. setdefault keeps any value the user has already chosen.
os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Re-imported so this cell can also be run on its own.
import torch
from transformers import AutoModel, AutoTokenizer

In [8]:
# Cell 4: Database connection verification
# Smoke test: open a connection, register the pgvector adapter, and report the
# outcome through the logger. Failures are logged, not raised, so the notebook
# keeps running without PostgreSQL.
pg_conn = None
try:
    pg_conn = psycopg2.connect(os.environ["POSTGRES_URL"])
    register_vector(pg_conn)
    logger.info("PostgreSQL connection successful!")
except Exception as e:
    logger.error(f"PostgreSQL connection failed: {str(e)}")
finally:
    # Close in `finally` so the connection is released even when
    # register_vector() raises after a successful connect.
    if pg_conn is not None:
        pg_conn.close()

class QnASystem:
    """Question/answer store backed by SQLite with transformer embeddings.

    QnA pairs live in the ``qna_pairs`` table; their question and answer
    embeddings are stored as raw float32 bytes in ``qna_embeddings``.
    Search is a brute-force cosine ranking over the stored question vectors,
    and ``migrate_to_postgres`` copies both tables into PostgreSQL/pgvector.
    """

    def __init__(self, db_path: str = "qna.db", model_name: str = "sentence-transformers/all-mpnet-base-v2"):
        """Open the SQLite database at ``db_path`` and load ``model_name``.

        Raises whatever ``_initialize_components`` raises (after logging it).
        """
        self.db_path = db_path
        self.model_name = model_name
        self.conn = None
        self.model = None
        self.tokenizer = None
        # Fallback vector size; replaced by the model's hidden size once loaded.
        self.embedding_dim = 768
        self._initialize_components()

    def _initialize_components(self):
        """Initialize database and ML components.

        On any failure the error is logged, the SQLite connection (if it was
        opened) is closed again, and the exception is re-raised.
        """
        try:
            # Configure SQLite for bulk operations
            self.conn = sqlite3.connect(self.db_path)
            self.conn.execute("PRAGMA journal_mode = WAL")
            self.conn.execute("PRAGMA cache_size = -10000")  # 10MB cache
            self._initialize_db()

            # Initialize ML model
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)
            if torch.cuda.is_available():
                self.model = self.model.to('cuda')
            self.model.eval()
            # Keep the advertised dimension in sync with the model actually loaded.
            self.embedding_dim = getattr(self.model.config, "hidden_size", self.embedding_dim)
            logger.info("System initialized successfully")
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            # Do not leave a half-initialised object holding an open database.
            if self.conn is not None:
                self.conn.close()
                self.conn = None
            raise

    def close(self):
        """Close the SQLite connection; safe to call more than once."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def _initialize_db(self):
        """Create the tables and indexes if they do not exist yet.

        Raises:
            sqlite3.Error: logged and re-raised when the schema cannot be created.
        """
        try:
            with self.conn:
                # Create tables
                self.conn.executescript("""
                    CREATE TABLE IF NOT EXISTS qna_pairs (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        question TEXT NOT NULL CHECK(length(question) BETWEEN 10 AND 1000),
                        answer TEXT NOT NULL CHECK(length(answer) BETWEEN 20 AND 5000),
                        category TEXT CHECK(length(category) <= 50),
                        keywords TEXT CHECK(length(keywords) <= 200),
                        question_hash TEXT UNIQUE,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        last_accessed TIMESTAMP,
                        usage_count INTEGER DEFAULT 0
                    );
    
                    CREATE TABLE IF NOT EXISTS qna_embeddings (
                        qna_id INTEGER PRIMARY KEY,
                        question_vector BLOB NOT NULL,
                        answer_vector BLOB NOT NULL,
                        FOREIGN KEY (qna_id) REFERENCES qna_pairs(id)
                    );
                """)

                # Create indexes with existence checks
                self._create_index_if_not_exists(
                    "idx_qna_search",
                    "qna_pairs(question_hash, category)"
                )
                self._create_index_if_not_exists(
                    "idx_embedding_search",
                    "qna_embeddings(qna_id)"
                )

            logger.info("Database initialized")
        except sqlite3.Error as e:
            logger.error(f"Database error: {str(e)}")
            raise

    def _create_index_if_not_exists(self, index_name: str, columns: str):
        """Create ``index_name`` on ``columns`` (``"table(col, ...)"``) if it is missing.

        Both arguments are interpolated into the statement as identifiers, so
        they must come from trusted code, never from user input.
        """
        # IF NOT EXISTS makes the statement idempotent without parsing error text.
        self.conn.execute(f"""
            CREATE INDEX IF NOT EXISTS {index_name}
            ON {columns}
        """)

    def _text_to_vector(self, text: str) -> np.ndarray:
        """Embed ``text`` with the loaded model and return a float32 numpy vector.

        The token states are averaged using the attention mask, so padding
        tokens (present only when several texts are encoded together) do not
        dilute the mean. Errors are logged and re-raised.
        """
        try:
            # Follow the model's actual device instead of guessing from CUDA availability.
            device = next(self.model.parameters()).device
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            hidden = outputs.last_hidden_state
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            # float32 is the on-disk format that semantic_search decodes.
            return pooled.float().cpu().numpy().squeeze()
        except Exception as e:
            logger.error(f"Embedding error: {str(e)}")
            raise

    @staticmethod
    def _cosine(a, b) -> float:
        """Cosine similarity of two 1-D vectors; 0.0 when either has zero norm."""
        a = np.asarray(a, dtype=np.float64).reshape(-1)
        b = np.asarray(b, dtype=np.float64).reshape(-1)
        denom = float(np.linalg.norm(a) * np.linalg.norm(b))
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def validate_qna(self, qna: Dict) -> bool:
        """Return True when ``qna`` is acceptable for insertion.

        An entry is accepted when it has string ``question`` (>= 10 chars) and
        ``answer`` (>= 20 chars), carries a ``category`` key, and the cosine
        similarity between question and answer embeddings is below 0.7.
        Missing or non-string fields yield False rather than an exception.
        """
        question = qna.get('question')
        answer = qna.get('answer')
        if not isinstance(question, str) or not isinstance(answer, str):
            return False
        # Cheap structural checks first: embeddings are only computed for
        # entries that could still pass.
        if len(question) < 10 or len(answer) < 20 or 'category' not in qna:
            return False
        similarity = self._cosine(
            self._text_to_vector(question),
            self._text_to_vector(answer)
        )
        return bool(similarity < 0.7)

    def batch_insert(self, qna_list: List[Dict], batch_size: int = 500):
        """Validate ``qna_list`` and insert the new entries with their embeddings.

        Entries are de-duplicated by the SHA-256 of the question text, both
        against rows already stored and within a batch (first occurrence
        wins). Each batch is written in its own transaction. ``batch_size``
        is also the number of SQL parameters used per lookup, so it must stay
        below SQLite's bound-variable limit.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
            Exception: any failure is logged, rolled back and re-raised.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        try:
            valid_data = [q for q in qna_list if self.validate_qna(q)]
            logger.info(f"Processing {len(valid_data)} valid entries")

            for batch in self._chunk_list(valid_data, batch_size):
                with self.conn:
                    # Hash every question once; keep only the first entry per hash
                    # so in-batch duplicates cannot overwrite each other's embeddings.
                    by_hash = {}
                    for q in batch:
                        digest = hashlib.sha256(q['question'].encode()).hexdigest()
                        by_hash.setdefault(digest, q)
                    hashes = list(by_hash)

                    # Check existing hashes
                    placeholders = ','.join('?' * len(hashes))
                    cursor = self.conn.execute(f"""
                        SELECT question_hash FROM qna_pairs
                        WHERE question_hash IN ({placeholders})
                    """, hashes)
                    existing_hashes = {row[0] for row in cursor.fetchall()}

                    new_items = [
                        (h, q) for h, q in by_hash.items()
                        if h not in existing_hashes
                    ]
                    if not new_items:
                        continue

                    # Insert only new QnA pairs
                    self.conn.executemany("""
                        INSERT OR IGNORE INTO qna_pairs 
                        (question, answer, category, keywords, question_hash)
                        VALUES (?, ?, ?, ?, ?)
                    """, [(
                        q['question'],
                        q['answer'],
                        q['category'],
                        q.get('keywords', ''),
                        h
                    ) for h, q in new_items])

                    # Get IDs of newly inserted items
                    new_hashes = [h for h, _ in new_items]
                    placeholders = ','.join('?' * len(new_hashes))
                    cursor = self.conn.execute(f"""
                        SELECT id, question_hash FROM qna_pairs
                        WHERE question_hash IN ({placeholders})
                    """, new_hashes)
                    id_mapping = {row[1]: row[0] for row in cursor.fetchall()}

                    # Rows rejected by a table constraint (INSERT OR IGNORE) have
                    # no id and therefore get no embedding.
                    embeddings = [
                        self._process_embeddings(q, id_mapping[h])
                        for h, q in new_items
                        if h in id_mapping
                    ]

                    # Insert embeddings
                    if embeddings:
                        self.conn.executemany("""
                            INSERT OR REPLACE INTO qna_embeddings 
                            (qna_id, question_vector, answer_vector)
                            VALUES (?, ?, ?)
                        """, embeddings)

            logger.info(f"Successfully processed {len(valid_data)} records")
        except Exception as e:
            logger.error(f"Batch insert failed: {str(e)}")
            self.conn.rollback()
            raise

    def _process_embeddings(self, qna: Dict, qna_id: int):
        """Return the ``(qna_id, question_bytes, answer_bytes)`` row for ``qna``."""
        return (
            qna_id,
            self._text_to_vector(qna['question']).tobytes(),
            self._text_to_vector(qna['answer']).tobytes()
        )

    def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to ``top_k`` QnA records ranked by similarity to ``query``.

        SQLite has no vector functions, so every stored question vector is
        loaded and scored in numpy (a full scan). Records are returned most
        similar first. Any failure is logged and an empty list is returned.
        """
        try:
            if top_k <= 0:
                return []

            query_embedding = np.asarray(
                self._text_to_vector(query), dtype=np.float32
            ).reshape(-1)

            rows = self.conn.execute("""
                SELECT qna_id, question_vector
                FROM qna_embeddings
            """).fetchall()
            if not rows:
                return []

            qna_ids = [row[0] for row in rows]
            # Vectors were written with float32 .tobytes(); decode with the same dtype.
            matrix = np.vstack([
                np.frombuffer(row[1], dtype=np.float32) for row in rows
            ])

            denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_embedding)
            denom[denom == 0] = 1.0  # zero vectors score 0 instead of dividing by zero
            scores = (matrix @ query_embedding) / denom

            ranked = np.argsort(-scores, kind="stable")[:top_k]
            top_ids = [qna_ids[i] for i in ranked]
            return self._get_qna_by_ids(top_ids)
        except Exception as e:
            logger.error(f"Search failed: {str(e)}")
            return []

    def _get_qna_by_ids(self, ids: List[int]) -> List[Dict]:
        """Fetch full ``qna_pairs`` records for ``ids``, in the order of ``ids``.

        Unknown ids are skipped and repeated ids are returned once. Database
        errors are logged and produce an empty list.
        """
        ordered_ids = list(dict.fromkeys(ids))
        if not ordered_ids:
            return []
        try:
            placeholders = ','.join('?' * len(ordered_ids))
            cursor = self.conn.execute(f"""
                SELECT * FROM qna_pairs 
                WHERE id IN ({placeholders})
            """, ordered_ids)
            columns = [col[0] for col in cursor.description]
            records = {}
            for row in cursor.fetchall():
                record = dict(zip(columns, row))
                records[record['id']] = record
            # IN (...) gives no ordering guarantee; restore the caller's order.
            return [records[i] for i in ordered_ids if i in records]
        except sqlite3.Error as e:
            logger.error(f"Record retrieval failed: {str(e)}")
            return []

    @staticmethod
    def _partition_name(category: str) -> str:
        """Build a PostgreSQL-safe, collision-free partition name for ``category``."""
        slug = re.sub(r'[^a-z0-9]+', '_', category.lower()).strip('_')[:40] or 'x'
        # The digest keeps names distinct when different categories share a slug
        # and keeps the total length under PostgreSQL's 63-byte identifier limit.
        digest = hashlib.sha1(category.encode()).hexdigest()[:8]
        return f"qna_pairs_p_{slug}_{digest}"

    def migrate_to_postgres(self):
        """Copy QnA pairs and embeddings into PostgreSQL (pgvector), re-runnably.

        ``qna_pairs`` is created list-partitioned by category with one
        partition per category found in SQLite plus a default partition.
        NULL categories are migrated as ``'uncategorized'`` because the
        partition key is part of the primary key. Embeddings are copied into
        ``qna_embeddings`` (matched to the new ids via ``question_hash``) and
        indexed with HNSW. Requires the ``vector`` extension in the target
        database. Errors are logged, rolled back and re-raised.
        """
        # Imported locally: the file only imports the top-level psycopg2 package.
        from psycopg2 import sql

        pg_conn = None
        try:
            pg_conn = psycopg2.connect(os.environ["POSTGRES_URL"])
            register_vector(pg_conn)

            with pg_conn.cursor() as cursor:
                # Create partitioned table. A unique constraint on a partitioned
                # table must contain the partition key, hence (question_hash, category).
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS qna_pairs (
                        id INT GENERATED ALWAYS AS IDENTITY,
                        question TEXT NOT NULL,
                        answer TEXT NOT NULL,
                        category TEXT NOT NULL,
                        keywords TEXT,
                        question_hash TEXT,
                        created_at TIMESTAMP,
                        last_accessed TIMESTAMP,
                        usage_count INT,
                        PRIMARY KEY (id, category),
                        UNIQUE (question_hash, category)
                    ) PARTITION BY LIST (category);
                """)

                # Create partitions
                categories = [row[0] for row in self.conn.execute("""
                    SELECT DISTINCT COALESCE(category, 'uncategorized') FROM qna_pairs
                """).fetchall()]

                for category in categories:
                    # Identifiers and literals are composed by psycopg2 so category
                    # text can never be interpreted as SQL.
                    cursor.execute(
                        sql.SQL(
                            "CREATE TABLE IF NOT EXISTS {} "
                            "PARTITION OF qna_pairs FOR VALUES IN ({})"
                        ).format(
                            sql.Identifier(self._partition_name(category)),
                            sql.Literal(category)
                        )
                    )

                # Catch-all for categories that appear after this migration.
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS qna_pairs_default
                    PARTITION OF qna_pairs DEFAULT;
                """)

                # Migrate data
                data = self.conn.execute("""
                    SELECT question, answer, COALESCE(category, 'uncategorized'),
                           keywords, question_hash,
                           created_at, last_accessed, usage_count
                    FROM qna_pairs
                """).fetchall()

                cursor.executemany("""
                    INSERT INTO qna_pairs 
                    (question, answer, category, keywords, question_hash,
                     created_at, last_accessed, usage_count)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, data)

                # Embedding table: must exist before the HNSW index is built on it.
                cursor.execute(
                    sql.SQL("""
                        CREATE TABLE IF NOT EXISTS qna_embeddings (
                            qna_id INT PRIMARY KEY,
                            question_vector vector({dim}) NOT NULL,
                            answer_vector vector({dim}) NOT NULL
                        );
                    """).format(dim=sql.Literal(int(self.embedding_dim)))
                )

                embedding_rows = self.conn.execute("""
                    SELECT p.question_hash, e.question_vector, e.answer_vector
                    FROM qna_embeddings e
                    JOIN qna_pairs p ON p.id = e.qna_id
                """).fetchall()

                # PostgreSQL assigns fresh ids, so rows are matched by question_hash.
                cursor.executemany("""
                    INSERT INTO qna_embeddings (qna_id, question_vector, answer_vector)
                    SELECT p.id, %s::vector, %s::vector
                    FROM qna_pairs p
                    WHERE p.question_hash = %s
                    ON CONFLICT (qna_id) DO NOTHING
                """, [
                    (
                        np.frombuffer(q_vec, dtype=np.float32),
                        np.frombuffer(a_vec, dtype=np.float32),
                        question_hash
                    )
                    for question_hash, q_vec, a_vec in embedding_rows
                ])

                # Create HNSW index (named, so re-running does not add duplicates)
                cursor.execute("""
                    CREATE INDEX IF NOT EXISTS idx_qna_embeddings_question_hnsw
                    ON qna_embeddings 
                    USING hnsw (question_vector vector_cosine_ops);
                """)

            pg_conn.commit()
            logger.info("Migration to PostgreSQL completed")
        except Exception as e:
            logger.error(f"Migration failed: {str(e)}")
            # pg_conn is still None when connect() itself failed.
            if pg_conn is not None:
                pg_conn.rollback()
            raise
        finally:
            if pg_conn is not None:
                pg_conn.close()

    def _chunk_list(self, lst: List, n: int):
        """Yield consecutive slices of ``lst`` holding at most ``n`` items each."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

In [9]:
# Cell 5: Sample QnA Data
# Each row follows the field order below; the rows are expanded into the
# dict-per-entry shape that QnASystem.batch_insert consumes.
_QNA_FIELDS = ("question", "answer", "category", "keywords")
_QNA_ROWS = [
    (
        "What is a blockchain?",
        "Decentralized digital ledger recording transactions across networked computers. Features: Immutability, transparency, security through cryptography.",
        "Blockchain Basics",
        "distributed ledger, cryptography, decentralization",
    ),
    (
        "What is proof-of-stake?",
        "Consensus mechanism where validators stake crypto to verify transactions. Benefits: Energy efficiency vs PoW. Risks: Centralization through pooling.",
        "Consensus",
        "staking, validation, energy efficiency",
    ),
]
qna_data = [dict(zip(_QNA_FIELDS, row)) for row in _QNA_ROWS]

# Cell 6: System Initialization
# Build the store with its default database path and model, then load the samples.
qna_system = QnASystem()
qna_system.batch_insert(qna_data)

# Cell 7: Test Search
# Run one query and log the answer of the best match, or a placeholder
# message when the search returns nothing.
try:
    test_query = "energy efficient consensus mechanism"
    results = qna_system.semantic_search(test_query)
    logger.info("Top result for test query:")
    if results:
        top_answer = results[0]['answer']
    else:
        top_answer = "No results found"
    logger.info(top_answer)
except Exception as exc:
    logger.error(f"Test failed: {str(exc)}")

# Cell 8: PostgreSQL Migration (Optional)
# qna_system.migrate_to_postgres()