In [1]:
import os
import sqlite3
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from typing import List, Dict, Optional
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

2025-05-19 22:56:01.229228: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747695361.256650     388 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747695361.264817     388 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Configuration
class Config:
    DB_PATH = "qna_database.db"
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"
    CHUNK_SIZE = 500  # For batch processing
    VECTOR_DIM = 384  # Dimension for all-MiniLM-L6-v2 embeddings

In [3]:
class QnADatabase:
    def __init__(self, db_path: str = Config.DB_PATH):
        """Initialize with optimized SQLite settings"""
        self.db_path = db_path
        self.conn = None
        self.embedding_model = None
        self._initialize_db()
        
    def _initialize_db(self):
        """Create database with optimized schema"""
        self.conn = sqlite3.connect(self.db_path, timeout=30)
        self.conn.execute("PRAGMA journal_mode = WAL")
        self.conn.execute("PRAGMA synchronous = NORMAL")
        self.conn.execute("PRAGMA cache_size = -100000")  # 100MB cache
        
        # Main Q&A table
        self.conn.execute("""
        CREATE TABLE IF NOT EXISTS qna_pairs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            question TEXT NOT NULL,
            answer TEXT NOT NULL,
            category TEXT,
            word_count INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_accessed TIMESTAMP,
            usage_count INTEGER DEFAULT 0
        )""")
        
        # Vector embeddings table
        self.conn.execute(f"""
        CREATE TABLE IF NOT EXISTS qna_embeddings (
            qna_id INTEGER PRIMARY KEY,
            question_vector BLOB,
            answer_vector BLOB,
            FOREIGN KEY (qna_id) REFERENCES qna_pairs(id)
        )""")
        
        # Create indexes
        self.conn.execute("CREATE INDEX IF NOT EXISTS idx_category ON qna_pairs(category)")
        self.conn.execute("CREATE INDEX IF NOT EXISTS idx_word_count ON qna_pairs(word_count)")
        
        # Full-text search
        self.conn.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS qna_search 
        USING fts5(question, answer, tokenize='porter unicode61')
        """)

    def _get_embedding_model(self):
        """Lazy load embedding model"""
        if self.embedding_model is None:
            self.embedding_model = SentenceTransformer(Config.EMBEDDING_MODEL)
        return self.embedding_model

    def _text_to_vector(self, text: str) -> bytes:
        """Convert text to compressed vector"""
        model = self._get_embedding_model()
        vector = model.encode(text)
        return vector.tobytes()

    def _vector_to_array(self, blob: bytes) -> np.ndarray:
        """Convert blob back to numpy array"""
        return np.frombuffer(blob, dtype=np.float32)

    def batch_insert(self, qna_list: List[Dict]):
        """Optimized bulk insert with embeddings"""
        if not qna_list:
            return
            
        with self.conn:
            cursor = self.conn.cursor()
            
            # Check for existing questions to prevent duplicates
            existing_questions = set()
            cursor.execute("SELECT question FROM qna_pairs")
            for row in cursor.fetchall():
                existing_questions.add(row[0].strip().lower())
            
            # Filter out duplicates
            unique_qna = []
            for qna in qna_list:
                norm_question = qna['question'].strip().lower()
                if norm_question not in existing_questions:
                    unique_qna.append(qna)
                    existing_questions.add(norm_question)
            
            if not unique_qna:
                print("No new Q&A pairs to insert")
                return
            
            # Insert only unique Q&A pairs
            cursor.executemany("""
            INSERT INTO qna_pairs (question, answer, category, word_count)
            VALUES (?, ?, ?, ?)
            """, [(q['question'], q['answer'], q.get('category'), 
                  len(q['answer'].split())) for q in unique_qna])
            
            # Get inserted IDs - more reliable method
            cursor.execute("SELECT last_insert_rowid() - ? + 1, last_insert_rowid()", (len(unique_qna),))
            first_id, last_id = cursor.fetchone()
            
            # Generate and store embeddings
            for i in tqdm(range(len(unique_qna)), desc="Generating embeddings"):
                qna = unique_qna[i]
                q_vector = self._text_to_vector(qna['question'])
                a_vector = self._text_to_vector(qna['answer'])
                cursor.execute("""
                INSERT INTO qna_embeddings (qna_id, question_vector, answer_vector)
                VALUES (?, ?, ?)
                """, (first_id + i, q_vector, a_vector))
            
            # Update full-text search index
            cursor.executemany("""
            INSERT INTO qna_search (question, answer)
            VALUES (?, ?)
            """, [(q['question'], q['answer']) for q in unique_qna])

    def semantic_search(self, query: str, top_k: int = 5, threshold: float = 0.5):
        """Properly implemented semantic search"""
        cursor = self.conn.cursor()
        
        # Get query embedding
        query_vec = self._text_to_vector(query)
        query_arr = self._vector_to_array(query_vec)
        
        # Get all stored embeddings
        cursor.execute("""
        SELECT qna_pairs.id, qna_pairs.question, qna_embeddings.question_vector 
        FROM qna_pairs
        JOIN qna_embeddings ON qna_pairs.id = qna_embeddings.qna_id
        """)
        
        results = []
        seen_questions = set()
        
        for qna_id, question, q_vec_blob in cursor.fetchall():
            q_vec = self._vector_to_array(q_vec_blob)
            similarity = cosine_similarity([query_arr], [q_vec])[0][0]
            
            if similarity >= threshold:
                # Get the full answer
                cursor.execute("SELECT answer FROM qna_pairs WHERE id = ?", (qna_id,))
                answer = cursor.fetchone()[0]
                
                # Deduplicate by question text
                norm_question = question.strip().lower()
                if norm_question not in seen_questions:
                    results.append((question, answer, similarity))
                    seen_questions.add(norm_question)
        
        # Sort by similarity and get top unique results
        results.sort(key=lambda x: x[2], reverse=True)
        return [(q, a) for q, a, _ in results[:top_k]]

    def keyword_search(self, query: str, limit: int = 5):
        """Traditional keyword search"""
        cursor = self.conn.cursor()
        
        # Try FTS first
        cursor.execute("""
        SELECT question, answer 
        FROM qna_search 
        WHERE qna_search MATCH ?
        ORDER BY rank
        LIMIT ?
        """, (f'"{query}"', limit))
        
        results = cursor.fetchall()
        
        # Fallback to LIKE if no results
        if not results:
            cursor.execute("""
            SELECT question, answer 
            FROM qna_pairs 
            WHERE question LIKE ? OR answer LIKE ?
            LIMIT ?
            """, (f'%{query}%', f'%{query}%', limit))
            results = cursor.fetchall()
            
        return results

    def get_all_data(self, limit: Optional[int] = None):
        """Export all data with optional limit"""
        query = "SELECT * FROM qna_pairs"
        if limit:
            query += f" LIMIT {limit}"
        return pd.read_sql(query, self.conn)

    def optimize(self):
        """Database maintenance"""
        print("Optimizing database...")
        self.conn.execute("VACUUM")
        self.conn.execute("ANALYZE")
        self.conn.execute("PRAGMA optimize")

    def close(self):
        """Clean up resources"""
        if self.conn:
            self.conn.close()
        if self.embedding_model:
            del self.embedding_model

In [4]:
# Data Processing Utilities
class DataProcessor:
    @staticmethod
    def parse_text_file(file_path: str, question_prefix: str = "Q:", answer_prefix: str = "A:"):
        """Parse Q&A from text files with paragraph answers"""
        qna_pairs = []
        current_q = None
        current_a = []
        
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith(question_prefix):
                    if current_q is not None:
                        qna_pairs.append({
                            "question": current_q,
                            "answer": "\n".join(current_a).strip()
                        })
                    current_q = line[len(question_prefix):].strip()
                    current_a = []
                elif line.startswith(answer_prefix):
                    current_a.append(line[len(answer_prefix):].strip())
                elif current_a and line:
                    current_a.append(line)
            
            # Add the last pair
            if current_q is not None:
                qna_pairs.append({
                    "question": current_q,
                    "answer": "\n".join(current_a).strip()
                })
                
        return qna_pairs

    @staticmethod
    def chunk_list(lst, chunk_size):
        """Yield successive chunk_size chunks from lst"""
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

In [5]:
def main():
    # Initialize database with cleanup
    if os.path.exists(Config.DB_PATH):
        os.remove(Config.DB_PATH)
    db = QnADatabase()
    
    # Sample data for demonstration
    sample_data = [
        {
            "question": "What is P2P? ",
            "answer": "A Peer-to-Peer (P2P) payment system, seamlessly integrated with blockchain technology, a decentralized application (DApp), and MetaMask wallet, orchestrates a streamlined and secure process for transparent transactions among users."
        },
        {
            "question": "Compare and Contrast Private and Public Key: ",
            "answer": "The private key allows you to have access to your funds through the crypto wallet. it is used to send Bitcoin and must be protected and secured. As for the public key, it is used to receive Bitcoin and can be published anywhere safely."
        }
    ]
    
    # Auto-categorize questions
    def detect_category(question: str) -> str:
        question_lower = question.lower()
        if 'p2p' in question_lower or 'peer-to-peer' in question_lower:
            return "networking"
        elif 'private key' in question_lower or 'public key' in question_lower:
            return "security"
        elif 'blockchain' in question_lower:
            return "fundamentals"
        elif 'proof of work' in question_lower or 'pow' in question_lower:
            return "consensus"
        elif 'wallet' in question_lower:
            return "wallets"
        elif 'smart contract' in question_lower:
            return "development"
        else:
            return "general"
    
    # Add categories to sample data
    for item in sample_data:
        item["category"] = detect_category(item["question"])
    
    # Process and insert data
    print("Inserting sample data...")
    db.batch_insert(sample_data)
    
    # For large files
    try:
        file_path = "/kaggle/input/db-19-txt"
        if os.path.exists(file_path):
            print("Processing large file...")
            qna_pairs = DataProcessor.parse_text_file(file_path)
            
            # Auto-categorize parsed questions
            for item in qna_pairs:
                item["category"] = detect_category(item["question"])
            
            print(f"Processing {len(qna_pairs)} Q&A pairs...")
            for chunk in DataProcessor.chunk_list(qna_pairs, Config.CHUNK_SIZE):
                db.batch_insert(chunk)
    except Exception as e:
        print(f"Error processing large file: {e}")
    
    # Test with new questions not in the sample data
    test_questions = [
        "How does blockchain ensure security?",
        "What are the advantages of P2P networks?",
        "Explain the difference between hot and cold wallets",
        "How do smart contracts work?",
        "What is the role of miners in blockchain?"
    ]
    
    print("\n" + "="*50)
    print("Testing with new questions not in sample data")
    print("="*50)
    
    for question in test_questions:
        print(f"\nQuestion: '{question}'")
        
        # Semantic search
        print("\nSemantic search results:")
        semantic_results = db.semantic_search(question)
        if semantic_results:
            for i, (q, a) in enumerate(semantic_results, 1):
                print(f"{i}. Question: {q}")
                print(f"   Answer: {a}")
                print(f"   {'-'*50}")
        else:
            print("No semantic matches found")
        
        # Keyword search
        print("\nKeyword search results:")
        keyword_results = db.keyword_search(question)
        if keyword_results:
            for i, (q, a) in enumerate(keyword_results, 1):
                print(f"{i}. Question: {q}")
                print(f"   Answer: {a}")
                print(f"   {'-'*50}")
        else:
            print("No keyword matches found")
    
    # Export data with categories
    df = db.get_all_data(limit=10)
    print("\nSample data from database:")
    print(df[['question', 'category']].head())
    
    # Show category distribution
    print("\nCategory distribution:")
    print(df['category'].value_counts())
    
    # Maintenance
    db.optimize()
    db.close()

if __name__ == "__main__":
    main()

Inserting sample data...


Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Testing with new questions not in sample data

Question: 'How does blockchain ensure security?'

Semantic search results:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No semantic matches found

Keyword search results:
No keyword matches found

Question: 'What are the advantages of P2P networks?'

Semantic search results:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

1. Question: What is P2P? 
   Answer: A Peer-to-Peer (P2P) payment system, seamlessly integrated with blockchain technology, a decentralized application (DApp), and MetaMask wallet, orchestrates a streamlined and secure process for transparent transactions among users.
   --------------------------------------------------

Keyword search results:
No keyword matches found

Question: 'Explain the difference between hot and cold wallets'

Semantic search results:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No semantic matches found

Keyword search results:
No keyword matches found

Question: 'How do smart contracts work?'

Semantic search results:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No semantic matches found

Keyword search results:
No keyword matches found

Question: 'What is the role of miners in blockchain?'

Semantic search results:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No semantic matches found

Keyword search results:
No keyword matches found

Sample data from database:
                                        question    category
0                                  What is P2P?   networking
1  Compare and Contrast Private and Public Key:     security

Category distribution:
category
networking    1
security      1
Name: count, dtype: int64
Optimizing database...
