In [14]:
# Cell 1: Install required packages
!pip install torch transformers sentence-transformers psycopg2-binary pgvector tqdm scikit-learn
!apt-get update && apt-get install -y postgresql postgresql-contrib
!service postgresql start
!sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector;"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0% [Working]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease                          
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease                                    
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease                                          
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease                                              
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ERROR:  could not open extension control file "/usr/share/postgresql/14/extension/vector.control": No such file or directory


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# Cell 2: Configure environment
import os

# Keep a POSTGRES_URL that is already configured for this kernel (for example by the
# hosting environment) and only fall back to the local default used by this notebook.
os.environ.setdefault("POSTGRES_URL", "postgres://postgres@localhost/postgres")

# Cell 3: Import packages
import numpy as np
import sqlite3
import logging
import hashlib
import psycopg2
import time
import pandas as pd
import torch
from typing import List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from pgvector.psycopg2 import register_vector

# Logging setup: create this notebook's logger, then attach the root handler at INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
print("All packages imported successfully!")

All packages imported successfully!


In [16]:
# Cell 4: Verify PostgreSQL connection
# Opens a connection, registers the pgvector type adapters on it and logs the outcome.
# The connection is closed in `finally` so it is not leaked when `register_vector`
# raises (which happens when the `vector` extension is missing in the database).
pg_conn = None
try:
    pg_conn = psycopg2.connect(os.environ["POSTGRES_URL"])
    register_vector(pg_conn)
    logger.info("PostgreSQL connection successful!")
except Exception as e:
    logger.error(f"PostgreSQL connection failed: {str(e)}")
finally:
    if pg_conn is not None:
        pg_conn.close()

class QnASystem:
    """Question/answer store backed by SQLite with transformer embeddings.

    Records live in three SQLite tables:

    * ``qna_pairs``      - the question/answer text and metadata,
    * ``qna_embeddings`` - float32 embedding vectors stored as raw bytes (BLOB),
    * ``qna_search``     - an FTS5 index whose ``rowid`` equals ``qna_pairs.id``.

    ``semantic_search`` first narrows candidates with an FTS5 keyword query and then
    ranks them by cosine similarity against the query embedding.
    """

    # Weights used by ``semantic_search`` to blend question and answer similarity.
    QUESTION_WEIGHT = 0.6
    ANSWER_WEIGHT = 0.4
    # Columns written by ``batch_insert`` and read by ``migrate_to_postgres``; they are
    # added to ``qna_pairs`` when an existing database file predates them.
    _REQUIRED_PAIR_COLUMNS = (("word_count", "INTEGER"), ("normalized_question", "TEXT"))

    def __init__(self, db_path: str = "qna.db", model_name: str = "sentence-transformers/all-mpnet-base-v2"):
        """Open the SQLite database at ``db_path`` and load ``model_name``.

        Raises whatever ``_initialize_components`` raises when the database or the
        model cannot be set up.
        """
        self.db_path = db_path
        self.model_name = model_name
        self.conn = None
        self.model = None
        self.tokenizer = None
        # Fallback value; replaced by the loaded model's hidden size when available.
        self.embedding_dim = 768
        self._initialize_components()

    def _initialize_components(self):
        """Initialize database and ML components with error handling.

        On failure the error is logged, the SQLite connection (if it was opened) is
        closed so it does not leak, and the exception is re-raised.
        """
        try:
            self.conn = sqlite3.connect(self.db_path)
            self._initialize_db()
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)
            # Keep the vector width in sync with the model actually loaded, since
            # `model_name` is configurable and the width is used for the Postgres schema.
            hidden_size = getattr(self.model.config, "hidden_size", None)
            if hidden_size:
                self.embedding_dim = int(hidden_size)
            if torch.cuda.is_available():
                self.model = self.model.to('cuda')
            logger.info("System initialized successfully")
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            if self.conn is not None:
                self.conn.close()
                self.conn = None
            raise

    def _initialize_db(self):
        """Create the SQLite schema (tables plus the FTS5 index) if it is missing.

        Also adds the columns listed in ``_REQUIRED_PAIR_COLUMNS`` to an existing
        ``qna_pairs`` table that lacks them.

        Raises:
            sqlite3.Error: if any statement fails (logged before re-raising).
        """
        try:
            with self.conn:
                self.conn.executescript("""
                    CREATE TABLE IF NOT EXISTS qna_pairs (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        question TEXT NOT NULL,
                        answer TEXT NOT NULL,
                        category TEXT,
                        word_count INTEGER,
                        keywords TEXT,
                        normalized_question TEXT,
                        question_hash TEXT UNIQUE,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        last_accessed TIMESTAMP,
                        usage_count INTEGER DEFAULT 0
                    );

                    CREATE TABLE IF NOT EXISTS qna_embeddings (
                        qna_id INTEGER PRIMARY KEY,
                        question_vector BLOB,
                        answer_vector BLOB,
                        keywords_vector BLOB,
                        FOREIGN KEY (qna_id) REFERENCES qna_pairs(id)
                    );

                    CREATE VIRTUAL TABLE IF NOT EXISTS qna_search
                        USING fts5(question, answer, keywords);
                """)

                # CREATE TABLE IF NOT EXISTS leaves an older table untouched, so add
                # any column the rest of the class relies on.
                existing_columns = {
                    row[1] for row in self.conn.execute("PRAGMA table_info(qna_pairs)")
                }
                for column, sql_type in self._REQUIRED_PAIR_COLUMNS:
                    if column not in existing_columns:
                        self.conn.execute(
                            f"ALTER TABLE qna_pairs ADD COLUMN {column} {sql_type}"
                        )

            logger.info("Database initialized successfully")
        except sqlite3.Error as e:
            logger.error(f"Database initialization error: {str(e)}")
            raise

    def _text_to_vector(self, text: str) -> np.ndarray:
        """Embed ``text`` with the loaded model using attention-masked mean pooling.

        Runs on CUDA when available, otherwise on CPU.

        Returns:
            A float32 NumPy array (the pooled hidden state with singleton axes removed).

        Raises:
            Exception: any tokenizer/model error is logged and re-raised.
        """
        try:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.model = self.model.to(device)

            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            # Mean over the token axis, counting only positions the mask marks as real.
            last_hidden_state = outputs.last_hidden_state
            attention_mask = inputs.attention_mask.unsqueeze(-1)
            token_counts = attention_mask.sum(dim=1).clamp(min=1)  # never divide by zero
            vector = (last_hidden_state * attention_mask).sum(dim=1) / token_counts
            # Always hand out float32 so stored bytes can be decoded with a fixed dtype.
            return vector.cpu().numpy().squeeze().astype(np.float32)
        except Exception as e:
            logger.error(f"Embedding generation failed: {str(e)}")
            raise

    def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Hybrid search: FTS5 keyword candidates re-ranked by embedding similarity.

        Up to ``top_k * 3`` keyword matches are scored with
        ``QUESTION_WEIGHT * cos(query, question) + ANSWER_WEIGHT * cos(query, answer)``.

        Returns:
            Up to ``top_k`` ``qna_pairs`` rows as dicts, best match first. An empty
            list when there are no keyword matches or when any step fails (the error
            is logged, not raised).
        """
        try:
            if top_k <= 0:
                return []

            keyword_matches = self.keyword_search(query, limit=top_k*3)
            if not keyword_matches:
                return []

            candidate_ids = [row[0] for row in keyword_matches]
            placeholders = ','.join('?' * len(candidate_ids))
            rows = self.conn.execute(f"""
                SELECT qna_id, question_vector, answer_vector 
                FROM qna_embeddings 
                WHERE qna_id IN ({placeholders})
            """, candidate_ids).fetchall()
            # Rows without both vectors cannot be scored.
            rows = [row for row in rows if row[1] is not None and row[2] is not None]
            if not rows:
                return []

            query_matrix = np.asarray(self._text_to_vector(query), dtype=np.float32).reshape(1, -1)
            question_matrix = np.vstack([self._blob_to_vector(row[1]) for row in rows])
            answer_matrix = np.vstack([self._blob_to_vector(row[2]) for row in rows])

            # One similarity call per column instead of two calls per candidate.
            q_sims = cosine_similarity(query_matrix, question_matrix)[0]
            a_sims = cosine_similarity(query_matrix, answer_matrix)[0]
            combined = self.QUESTION_WEIGHT * q_sims + self.ANSWER_WEIGHT * a_sims

            ranked = sorted(
                zip((row[0] for row in rows), combined),
                key=lambda pair: pair[1],
                reverse=True,
            )
            top_ids = [qna_id for qna_id, _ in ranked[:top_k]]
            return self.get_qna_by_ids(top_ids)
        except Exception as e:
            logger.error(f"Search failed: {str(e)}")
            return []

    def keyword_search(self, query: str, limit: int = 15) -> List[tuple]:
        """Keyword-based search over the ``question`` column of the FTS5 index.

        The query is reduced to its alphanumeric words, each passed to FTS5 as a
        quoted string, so punctuation in user text (``?``, ``%``, ``-`` ...) cannot
        cause an FTS5 syntax error. All words must match.

        Returns:
            ``(rowid, question, answer)`` tuples ordered by ``bm25``; an empty list
            when the query contains no words or the SQLite query fails (logged).
        """
        match_expression = self._build_match_expression(query)
        if not match_expression:
            return []
        try:
            cursor = self.conn.execute("""
                SELECT rowid, question, answer 
                FROM qna_search 
                WHERE question MATCH ? 
                ORDER BY bm25(qna_search) 
                LIMIT ?
            """, (match_expression, limit))
            return cursor.fetchall()
        except sqlite3.Error as e:
            logger.error(f"Keyword search failed: {str(e)}")
            return []

    def get_qna_by_ids(self, ids: List[int]) -> List[Dict]:
        """Retrieve full QnA records by IDs, in the order the IDs were given.

        Returns:
            One dict per matching ``qna_pairs`` row (column name -> value). Rows
            follow the first occurrence of their id in ``ids``. An empty list for
            empty ``ids`` or when the SQLite query fails (logged).
        """
        if not ids:
            return []
        try:
            cursor = self.conn.execute(f"""
                SELECT * FROM qna_pairs 
                WHERE id IN ({','.join(['?']*len(ids))})
            """, list(ids))
            columns = [col[0] for col in cursor.description]
            records = [dict(zip(columns, row)) for row in cursor.fetchall()]
        except sqlite3.Error as e:
            logger.error(f"Get QnA by IDs failed: {str(e)}")
            return []

        # SQL returns rows in table order; restore the caller's order (e.g. a ranking).
        # Ids are compared by their string form so both 1 and "1" are honoured.
        requested_position = {}
        for position, raw_id in enumerate(ids):
            requested_position.setdefault(str(raw_id), position)
        records.sort(
            key=lambda record: requested_position.get(str(record.get("id")), len(requested_position))
        )
        return records

    def batch_insert(self, qna_list: List[Dict], batch_size: int = 100):
        """Insert QnA dicts in batches, one SQLite transaction per batch.

        Each dict needs ``question`` and ``answer``; ``category`` and ``keywords``
        are optional. For every record a ``qna_pairs`` row, an FTS5 row with the same
        rowid, and a ``qna_embeddings`` row (float32 bytes) are written.

        Raises:
            Exception: any failure is logged, the open transaction is rolled back,
            and the exception is re-raised (e.g. ``sqlite3.IntegrityError`` for a
            duplicate ``question_hash``).
        """
        try:
            for batch in tqdm(self._chunk_list(qna_list, batch_size), desc="Processing batches"):
                # Embed first so the write transaction is not held open during inference.
                prepared = []
                for qna in batch:
                    keywords = qna.get('keywords') or ''
                    vectors = tuple(
                        np.asarray(self._text_to_vector(text), dtype=np.float32).tobytes()
                        for text in (qna['question'], qna['answer'], keywords)
                    )
                    prepared.append((qna, keywords, vectors))

                with self.conn:
                    for qna, keywords, (q_vec, a_vec, k_vec) in prepared:
                        cursor = self.conn.execute("""
                            INSERT INTO qna_pairs 
                            (question, answer, category, word_count, keywords, normalized_question, question_hash)
                            VALUES (?, ?, ?, ?, ?, ?, ?)
                        """, (
                            qna['question'],
                            qna['answer'],
                            qna.get('category'),
                            len(qna['answer'].split()),
                            keywords,
                            self._normalize_text(qna['question']),
                            self._generate_hash(qna['question']),
                        ))
                        # Use the id SQLite actually assigned instead of guessing it
                        # from the highest ids in the table.
                        qna_id = cursor.lastrowid

                        # Same rowid as qna_pairs.id so search hits map to records.
                        self.conn.execute("""
                            INSERT INTO qna_search 
                            (rowid, question, answer, keywords)
                            VALUES (?, ?, ?, ?)
                        """, (qna_id, qna['question'], qna['answer'], keywords))

                        self.conn.execute("""
                            INSERT INTO qna_embeddings 
                            (qna_id, question_vector, answer_vector, keywords_vector)
                            VALUES (?, ?, ?, ?)
                        """, (qna_id, q_vec, a_vec, k_vec))
        except Exception as e:
            logger.error(f"Batch insert failed: {str(e)}")
            self.conn.rollback()
            raise

    def migrate_to_postgres(self):
        """Copy ``qna_pairs`` and ``qna_embeddings`` from SQLite into PostgreSQL.

        Connects using the ``POSTGRES_URL`` environment variable, enables the
        ``vector`` extension, creates the target tables if needed and inserts all
        rows in a single transaction. Rows that already exist in PostgreSQL are
        skipped, so the migration can be re-run.

        Raises:
            Exception: any failure is logged, the PostgreSQL transaction is rolled
            back and the exception is re-raised. The connection is always closed.
        """
        pg_conn = None
        try:
            pg_conn = psycopg2.connect(os.environ["POSTGRES_URL"])
            # The `vector` type must exist before its adapters can be registered.
            with pg_conn.cursor() as cursor:
                cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
            register_vector(pg_conn)

            dim = int(self.embedding_dim)
            with pg_conn.cursor() as cursor:
                # Create PostgreSQL schema
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS qna_pairs (
                        id INTEGER PRIMARY KEY,
                        question TEXT NOT NULL,
                        answer TEXT NOT NULL,
                        category TEXT,
                        word_count INTEGER,
                        created_at TIMESTAMP,
                        last_accessed TIMESTAMP,
                        usage_count INTEGER,
                        keywords TEXT,
                        normalized_question TEXT,
                        question_hash TEXT UNIQUE
                    )""")

                cursor.execute(f"""
                    CREATE TABLE IF NOT EXISTS qna_embeddings (
                        qna_id INTEGER PRIMARY KEY,
                        question_vector VECTOR({dim}),
                        answer_vector VECTOR({dim}),
                        keywords_vector VECTOR({dim})
                    )""")

                # Migrate qna_pairs
                sqlite_data = self.conn.execute("""
                    SELECT id, question, answer, category, word_count, created_at,
                           last_accessed, usage_count, keywords, normalized_question, question_hash
                    FROM qna_pairs
                """).fetchall()

                cursor.executemany("""
                    INSERT INTO qna_pairs 
                    (id, question, answer, category, word_count, created_at,
                     last_accessed, usage_count, keywords, normalized_question, question_hash)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, sqlite_data)

                # Migrate embeddings: decode the float32 bytes back into arrays, which
                # the adapters registered by register_vector send as `vector` values.
                embedding_data = self.conn.execute("""
                    SELECT qna_id, question_vector, answer_vector, keywords_vector 
                    FROM qna_embeddings
                """).fetchall()

                converted_embeddings = [
                    (
                        qna_id,
                        self._blob_to_vector(question_blob),
                        self._blob_to_vector(answer_blob),
                        self._blob_to_vector(keywords_blob),
                    )
                    for qna_id, question_blob, answer_blob, keywords_blob in embedding_data
                ]

                cursor.executemany("""
                    INSERT INTO qna_embeddings 
                    (qna_id, question_vector, answer_vector, keywords_vector)
                    VALUES (%s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, converted_embeddings)

            pg_conn.commit()
            logger.info("Migration completed successfully")
        except Exception as e:
            logger.error(f"Migration failed: {str(e)}")
            if pg_conn is not None and not pg_conn.closed:
                pg_conn.rollback()
            raise
        finally:
            if pg_conn is not None:
                pg_conn.close()

    # Helper methods
    def _chunk_list(self, lst: List, n: int):
        """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

        Raises:
            ValueError: if ``n`` is not positive (raised on first iteration).
        """
        if n <= 0:
            raise ValueError("batch size must be a positive integer")
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def _normalize_text(self, text: str) -> str:
        """Return ``text`` lower-cased with surrounding whitespace removed."""
        return text.lower().strip()

    def _generate_hash(self, text: str) -> str:
        """Return the hex SHA-256 digest of ``text`` (encoded with the default UTF-8)."""
        return hashlib.sha256(text.encode()).hexdigest()

    @staticmethod
    def _build_match_expression(query: str) -> str:
        """Turn free text into an FTS5 expression of double-quoted words.

        Every non-alphanumeric character is treated as a separator; the result is
        an empty string when ``query`` contains no words.
        """
        cleaned = "".join(ch if ch.isalnum() else " " for ch in (query or ""))
        return " ".join(f'"{token}"' for token in cleaned.split())

    @staticmethod
    def _blob_to_vector(blob) -> Optional[np.ndarray]:
        """Decode bytes written by ``batch_insert`` into a float32 array (None stays None)."""
        if blob is None:
            return None
        # dtype must match what was written; the NumPy default (float64) would yield
        # half as many, meaningless values.
        return np.frombuffer(blob, dtype=np.float32)

In [17]:
# Sample dataset: twenty blockchain/cryptocurrency Q&A records. Every record carries the
# same four string fields, in this order: question, answer, category, keywords.
qna_data = [
    dict(
        question="What is a Merkle Tree?",
        answer="A Merkle tree is a binary tree with hash pointers that allows efficient and secure verification of large data structures in blockchain systems.",
        category="Blockchain Fundamentals",
        keywords="data structure, hash function, verification",
    ),
    dict(
        question="What does the output refer to in Bitcoin transactions?",
        answer="The output refers to the destination address used in the Bitcoin transaction, specifying where funds are being sent.",
        category="Bitcoin Transactions",
        keywords="UTXO, scripting, receiver address",
    ),
    dict(
        question="What is confidentiality in cryptography?",
        answer="Confidentiality means that transmitted messages are only received and readable by authorized parties through encryption.",
        category="Cryptography",
        keywords="encryption, data privacy, secure communication",
    ),
    dict(
        question="What is authentication in blockchain?",
        answer="Authentication is the verification of the sender's identity at the receiver end using cryptographic signatures.",
        category="Network Security",
        keywords="digital signatures, identity verification",
    ),
    dict(
        question="What is the Genesis Block?",
        answer="The Genesis Block is the first block in a blockchain, serving as the foundation of the entire chain structure.",
        category="Blockchain Basics",
        keywords="block height 0, initial block",
    ),
    dict(
        question="What are smart contracts?",
        answer="Self-executing contracts with terms directly written into code that automatically execute when conditions are met.",
        category="Smart Contracts",
        keywords="automation, if-then logic, decentralized",
    ),
    dict(
        question="What is proof-of-work?",
        answer="A consensus mechanism where miners solve cryptographic puzzles to validate transactions and create new blocks.",
        category="Consensus Mechanisms",
        keywords="mining, computational power, security",
    ),
    dict(
        question="What is cold storage?",
        answer="A security method keeping cryptocurrency wallets offline to protect from internet-based attacks.",
        category="Wallet Security",
        keywords="offline storage, hardware wallets",
    ),
    dict(
        question="What is Bitcoin mining?",
        answer="The process of validating transactions and securing the network through computational work rewarded with new BTC.",
        category="Mining",
        keywords="block reward, hashing, ASICs",
    ),
    dict(
        question="What is a hard fork?",
        answer="A permanent divergence in blockchain creating two separate networks with shared history but different protocols.",
        category="Blockchain Governance",
        keywords="protocol change, chain split",
    ),
    dict(
        question="What is a 51% attack?",
        answer="When a single entity controls majority of network hashing power, enabling transaction manipulation.",
        category="Network Security",
        keywords="double-spend, consensus attack",
    ),
    dict(
        question="What is SHA-256?",
        answer="The cryptographic hash function used in Bitcoin mining and blockchain security protocols.",
        category="Cryptography",
        keywords="hashing algorithm, mining",
    ),
    dict(
        question="What is a decentralized exchange (DEX)?",
        answer="Peer-to-peer trading platform operating without central authority using smart contracts.",
        category="Trading",
        keywords="P2P, non-custodial",
    ),
    dict(
        question="What is yield farming?",
        answer="Generating returns by providing liquidity to DeFi protocols through token staking.",
        category="DeFi",
        keywords="liquidity pools, APY",
    ),
    dict(
        question="What is the Lightning Network?",
        answer="Layer-2 solution enabling fast, low-cost Bitcoin transactions through payment channels.",
        category="Scaling Solutions",
        keywords="micropayments, off-chain",
    ),
    dict(
        question="What is an NFT?",
        answer="Non-fungible token representing unique digital ownership on blockchain networks.",
        category="Digital Assets",
        keywords="collectibles, digital art",
    ),
    dict(
        question="What is staking?",
        answer="Locking cryptocurrency to support network operations and earn rewards in proof-of-stake systems.",
        category="Consensus Mechanisms",
        keywords="validation, passive income",
    ),
    dict(
        question="What is Web3?",
        answer="Decentralized internet paradigm using blockchain and token-based economics.",
        category="Blockchain Ecosystem",
        keywords="dApps, decentralized web",
    ),
    dict(
        question="What is gas fee?",
        answer="Payment required to execute transactions or smart contracts on Ethereum network.",
        category="Ethereum",
        keywords="transaction cost, Gwei",
    ),
    dict(
        question="What is a DAO?",
        answer="Decentralized Autonomous Organization governed by smart contracts and member voting.",
        category="Governance",
        keywords="smart contracts, community-led",
    ),
]

In [18]:
# Cell 4: User QnA Data Input (BEFORE system initialization)
# Stored under its own name: re-assigning `qna_data` here would throw away the larger
# dataset defined in the previous cell before it is ever inserted.
custom_qna_data = [
    {
        "question": "What is Bitcoin?",
        "answer": "A decentralized digital currency...",
        "category": "Cryptocurrency Basics",
        "keywords": "blockchain, digital currency"
    },
    # Add your custom QnAs here
]

# Cell 5: System Initialization & Data Insertion
qna_system = QnASystem()  # Initialize AFTER data input
# Insert the dataset from the previous cell together with the custom entries above.
qna_system.batch_insert(qna_data + custom_qna_data)

Processing batches: 1it [00:04,  4.70s/it]


In [19]:
logger.info("System initialized successfully!")

# Cell 6 (Revised)
# Smoke test: insert one record, run a search for it and log the top answer
# (or 'No matches'). Any failure is logged rather than raised.
try:
    test_qna = dict(
        question="What is Kaggle?",
        answer="A data science competition platform",
        keywords="platform",
    )

    with qna_system.conn:
        qna_system.batch_insert([test_qna])

    results = qna_system.semantic_search("data science platform")
    logger.info(f"Test results: {'No matches' if not results else results[0]['answer']}")
except Exception as e:
    logger.error(f"Test failed: {str(e)}")

Processing batches: 0it [00:00, ?it/s]
