In [9]:
# Cell 1: Install required packages
!pip install torch transformers sentence-transformers psycopg2-binary pgvector tqdm scikit-learn
!apt-get update && apt-get install -y postgresql postgresql-contrib
!service postgresql start
!sudo -u postgres psql -c "CREATE EXTENSION IF NOT EXISTS vector;"

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinu

In [10]:
# Cell 2: Configure environment
import os

# Only supply the local default when the host has not already provided a
# connection string, so an externally configured POSTGRES_URL is not clobbered.
os.environ.setdefault("POSTGRES_URL", "postgres://postgres@localhost/postgres")

# Cell 3: Import packages
import numpy as np
import sqlite3
import logging
import hashlib
import psycopg2
import time
import pandas as pd
import torch
from typing import List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from pgvector.psycopg2 import register_vector

# Logging setup: grab this notebook's logger, then have the root logger
# emit INFO-and-above records.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
print("All packages imported successfully!")

All packages imported successfully!


In [11]:
# Cell 4: Verify PostgreSQL connection
# Best-effort check: a failure is logged rather than raised so the rest of the
# notebook can still run, but the connection is always released.
pg_conn = None
try:
    pg_conn = psycopg2.connect(os.environ["POSTGRES_URL"])
    register_vector(pg_conn)
    logger.info("PostgreSQL connection successful!")
except Exception as e:
    logger.error(f"PostgreSQL connection failed: {str(e)}")
finally:
    # Close even when register_vector raised, so no connection is leaked.
    if pg_conn is not None:
        pg_conn.close()

class QnASystem:
    """Question-and-answer store with hybrid (FTS5 keyword + embedding) search.

    Records live in a SQLite database: ``qna_pairs`` holds the text and
    metadata, ``qna_search`` is an FTS5 index over question/answer/keywords,
    and ``qna_embeddings`` stores one float32 vector blob per text field.
    Embeddings are produced by masked mean pooling over a Hugging Face
    ``AutoModel``. The stored data can be copied into PostgreSQL with
    ``migrate_to_postgres``.
    """

    # Relative weight of question vs. answer similarity in the combined score.
    QUESTION_WEIGHT = 0.6
    ANSWER_WEIGHT = 0.4

    def __init__(self, db_path: str = "qna.db", model_name: str = "sentence-transformers/all-mpnet-base-v2"):
        """Open the database at ``db_path`` and load ``model_name``.

        Raises:
            Exception: whatever the database or model loading raised; it is
                logged and re-raised by ``_initialize_components``.
        """
        self.db_path = db_path
        self.model_name = model_name
        self.conn = None
        self.model = None
        self.tokenizer = None
        # Fallback width; replaced by the loaded model's hidden size when the
        # model config exposes one.
        self.embedding_dim = 768
        self._initialize_components()

    def _initialize_components(self):
        """Initialize database and ML components with error handling.

        On failure the SQLite connection (if already opened) is closed before
        the exception is re-raised, so a failed model download does not leak it.
        """
        try:
            self.conn = sqlite3.connect(self.db_path)
            self._initialize_db()
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)
            if torch.cuda.is_available():
                self.model = self.model.to('cuda')
            self.model.eval()
            # Keep the vector width in sync with whichever model was requested.
            hidden_size = getattr(self.model.config, "hidden_size", None)
            if hidden_size:
                self.embedding_dim = int(hidden_size)
            logger.info("System initialized successfully")
        except Exception as e:
            logger.error(f"Initialization failed: {str(e)}")
            if self.conn is not None:
                self.conn.close()
                self.conn = None
            raise

    def close(self):
        """Close the SQLite connection if it is open."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def _initialize_db(self):
        """Create the tables, indexes and FTS5 index if they do not exist.

        Raises:
            sqlite3.Error: logged and re-raised when schema creation fails.
        """
        try:
            with self.conn:
                # Create tables
                self.conn.executescript("""
                    CREATE TABLE IF NOT EXISTS qna_pairs (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        question TEXT NOT NULL,
                        answer TEXT NOT NULL,
                        category TEXT,
                        word_count INTEGER,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        last_accessed TIMESTAMP,
                        usage_count INTEGER DEFAULT 0,
                        keywords TEXT,
                        normalized_question TEXT,
                        question_hash TEXT UNIQUE
                    );
                    
                    CREATE TABLE IF NOT EXISTS qna_embeddings (
                        qna_id INTEGER PRIMARY KEY,
                        question_vector BLOB,
                        answer_vector BLOB,
                        keywords_vector BLOB,
                        FOREIGN KEY (qna_id) REFERENCES qna_pairs(id)
                    );
                    
                    CREATE INDEX IF NOT EXISTS idx_category ON qna_pairs(category);
                    CREATE INDEX IF NOT EXISTS idx_keywords ON qna_pairs(keywords);
                    
                    CREATE VIRTUAL TABLE IF NOT EXISTS qna_search 
                    USING fts5(question, answer, keywords, tokenize='porter unicode61');
                """)
            logger.info("Database initialized successfully")
        except sqlite3.Error as e:
            logger.error(f"Database initialization error: {str(e)}")
            raise

    def _text_to_vector(self, text: str) -> np.ndarray:
        """Embed ``text`` as a 1-D float32 vector.

        The text is tokenized (truncated to 512 tokens), run through the model
        without gradients, and the token states are averaged using the
        attention mask so padding does not contribute.

        Raises:
            Exception: logged and re-raised when tokenization or the forward
                pass fails.
        """
        try:
            # The model was placed on its device at init time; reuse that
            # instead of re-moving the model on every call.
            device = next(self.model.parameters()).device

            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True
            ).to(device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            # Masked mean pooling over the token dimension.
            last_hidden_state = outputs.last_hidden_state
            mask = inputs.attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            summed = (last_hidden_state * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
            vector = summed / counts
            # Always float32 so stored blobs decode with a single known dtype.
            return vector.cpu().numpy().squeeze().astype(np.float32)
        except Exception as e:
            logger.error(f"Embedding generation failed: {str(e)}")
            raise

    def _blob_to_vector(self, blob) -> Optional[np.ndarray]:
        """Decode a stored embedding blob; returns None for a NULL blob.

        Blobs are written from float32 arrays, so they must be read back as
        float32 (``np.frombuffer`` defaults to float64, which would halve the
        vector length and scramble the values).
        """
        if blob is None:
            return None
        return np.frombuffer(blob, dtype=np.float32)

    def _build_fts_query(self, query: str) -> str:
        """Turn free text into a safe FTS5 query string.

        Punctuation is dropped (characters such as ``?`` are FTS5 syntax
        errors), every remaining token is double-quoted so words like ``AND``
        or ``NEAR`` are treated as plain terms, and tokens are OR-ed so a
        natural-language query yields candidates when any term matches.
        Returns an empty string when the query has no alphanumeric tokens.
        """
        if not query:
            return ""
        cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in query)
        tokens = list(dict.fromkeys(cleaned.split()))  # de-duplicate, keep order
        return ' OR '.join(f'"{token}"' for token in tokens)

    def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Hybrid search: FTS5 candidates re-ranked by embedding similarity.

        Up to ``top_k * 3`` keyword candidates are fetched, scored by a
        weighted sum of cosine similarity against their question and answer
        vectors, and the best ``top_k`` records are returned best-first.

        Returns:
            Full ``qna_pairs`` rows as dicts; an empty list when nothing
            matches or when any step fails (the failure is logged).
        """
        try:
            if top_k <= 0:
                return []
            keyword_matches = self.keyword_search(query, limit=top_k*3)
            if not keyword_matches:
                return []

            query_embedding = self._text_to_vector(query)

            ids = [row[0] for row in keyword_matches]
            cursor = self.conn.execute(f"""
                SELECT qna_id, question_vector, answer_vector 
                FROM qna_embeddings 
                WHERE qna_id IN ({','.join(['?']*len(ids))})
            """, ids)

            results = []
            for qna_id, q_blob, a_blob in cursor.fetchall():
                q_vec = self._blob_to_vector(q_blob)
                a_vec = self._blob_to_vector(a_blob)
                if q_vec is None or a_vec is None:
                    continue  # record has no usable embeddings
                q_sim = cosine_similarity([query_embedding], [q_vec])[0][0]
                a_sim = cosine_similarity([query_embedding], [a_vec])[0][0]
                combined_score = self.QUESTION_WEIGHT*q_sim + self.ANSWER_WEIGHT*a_sim
                results.append((qna_id, combined_score))

            top_ids = [x[0] for x in sorted(results, key=lambda x: x[1], reverse=True)[:top_k]]
            # get_qna_by_ids keeps the order of top_ids, i.e. best match first.
            return self.get_qna_by_ids(top_ids)
        except Exception as e:
            logger.error(f"Search failed: {str(e)}")
            return []

    def keyword_search(self, query: str, limit: int = 15) -> List[tuple]:
        """Keyword-based search using FTS5.

        The query is sanitized by ``_build_fts_query`` and matched against all
        indexed columns (question, answer, keywords), ordered by BM25.

        Returns:
            ``(rowid, question, answer)`` tuples; an empty list when the query
            has no searchable tokens or the lookup fails (logged).
        """
        fts_query = self._build_fts_query(query)
        if not fts_query:
            return []
        try:
            cursor = self.conn.execute("""
                SELECT rowid, question, answer 
                FROM qna_search 
                WHERE qna_search MATCH ? 
                ORDER BY bm25(qna_search) 
                LIMIT ?
            """, (fts_query, limit))
            return cursor.fetchall()
        except sqlite3.Error as e:
            logger.error(f"Keyword search failed: {str(e)}")
            return []

    def get_qna_by_ids(self, ids: List[int]) -> List[Dict]:
        """Retrieve full QnA records by IDs, in the order the IDs were given.

        Each matching record is returned once; IDs with no record are skipped.
        Returns an empty list for empty input or when the lookup fails (logged).
        """
        if not ids:
            return []
        try:
            cursor = self.conn.execute(f"""
                SELECT * FROM qna_pairs 
                WHERE id IN ({','.join(['?']*len(ids))})
            """, list(ids))
            columns = [col[0] for col in cursor.description]
            rows = [dict(zip(columns, row)) for row in cursor.fetchall()]

            # SQL IN returns rows in storage order; restore the caller's order
            # so ranked results stay ranked. Keys are compared as strings so
            # ids passed as "3" and 3 are treated alike.
            position = {}
            for index, raw_id in enumerate(ids):
                position.setdefault(str(raw_id), index)
            rows.sort(key=lambda record: position.get(str(record.get("id")), len(position)))
            return rows
        except sqlite3.Error as e:
            logger.error(f"Get QnA by IDs failed: {str(e)}")
            return []

    def batch_insert(self, qna_list: List[Dict], batch_size: int = 100):
        """Insert QnA dicts in batches, one transaction per batch.

        Each dict needs ``question`` and ``answer``; ``category`` and
        ``keywords`` are optional. For every record the row, its FTS entry
        (sharing the row's id as rowid) and its three embeddings are written.

        Raises:
            ValueError: if ``batch_size`` is not positive.
            Exception: any insert or embedding failure is logged, the current
                batch is rolled back, and the exception is re-raised.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        total_batches = (len(qna_list) + batch_size - 1) // batch_size
        try:
            for batch in tqdm(self._chunk_list(qna_list, batch_size), total=total_batches, desc="Processing batches"):
                # Embed before opening the transaction so the write lock is
                # not held during model inference.
                prepared = []
                for q in batch:
                    keywords = q.get('keywords') or ''
                    prepared.append((
                        q,
                        keywords,
                        self._text_to_vector(q['question']).tobytes(),
                        self._text_to_vector(q['answer']).tobytes(),
                        self._text_to_vector(keywords).tobytes(),
                    ))

                with self.conn:
                    for q, keywords, q_vec, a_vec, k_vec in prepared:
                        cursor = self.conn.execute("""
                            INSERT INTO qna_pairs 
                            (question, answer, category, word_count, keywords, normalized_question, question_hash)
                            VALUES (?, ?, ?, ?, ?, ?, ?)
                        """, (
                            q['question'],
                            q['answer'],
                            q.get('category'),
                            len(q['answer'].split()),
                            keywords,
                            self._normalize_text(q['question']),
                            self._generate_hash(q['question'])
                        ))
                        # Use the id SQLite just assigned instead of guessing
                        # it from the table's highest ids.
                        qna_id = cursor.lastrowid

                        # Pin the FTS rowid to the record id: keyword_search
                        # results are joined to the other tables by rowid.
                        self.conn.execute("""
                            INSERT INTO qna_search 
                            (rowid, question, answer, keywords)
                            VALUES (?, ?, ?, ?)
                        """, (qna_id, q['question'], q['answer'], keywords))

                        self.conn.execute("""
                            INSERT INTO qna_embeddings 
                            (qna_id, question_vector, answer_vector, keywords_vector)
                            VALUES (?, ?, ?, ?)
                        """, (qna_id, q_vec, a_vec, k_vec))
        except Exception as e:
            logger.error(f"Batch insert failed: {str(e)}")
            self.conn.rollback()
            raise

    def migrate_to_postgres(self):
        """Copy ``qna_pairs`` and ``qna_embeddings`` into PostgreSQL.

        Connects using the ``POSTGRES_URL`` environment variable, creates the
        target tables if needed and inserts every SQLite row. Rows that already
        exist in PostgreSQL are skipped, so the migration can be re-run.

        Raises:
            Exception: any failure is logged, the PostgreSQL transaction is
                rolled back, and the exception is re-raised. The PostgreSQL
                connection is closed in every case.
        """
        pg_conn = None
        try:
            pg_conn = psycopg2.connect(os.environ["POSTGRES_URL"])
            register_vector(pg_conn)

            with pg_conn.cursor() as cursor:
                # Create PostgreSQL schema
                cursor.execute("""
                    CREATE TABLE IF NOT EXISTS qna_pairs (
                        id INTEGER PRIMARY KEY,
                        question TEXT NOT NULL,
                        answer TEXT NOT NULL,
                        category TEXT,
                        word_count INTEGER,
                        created_at TIMESTAMP,
                        last_accessed TIMESTAMP,
                        usage_count INTEGER,
                        keywords TEXT,
                        normalized_question TEXT,
                        question_hash TEXT UNIQUE
                    )""")

                dim = int(self.embedding_dim)
                cursor.execute(f"""
                    CREATE TABLE IF NOT EXISTS qna_embeddings (
                        qna_id INTEGER PRIMARY KEY,
                        question_vector VECTOR({dim}),
                        answer_vector VECTOR({dim}),
                        keywords_vector VECTOR({dim})
                    )""")

                # Migrate qna_pairs
                sqlite_data = self.conn.execute("""
                    SELECT id, question, answer, category, word_count, created_at,
                           last_accessed, usage_count, keywords, normalized_question, question_hash
                    FROM qna_pairs
                """).fetchall()

                cursor.executemany("""
                    INSERT INTO qna_pairs 
                    (id, question, answer, category, word_count, created_at,
                     last_accessed, usage_count, keywords, normalized_question, question_hash)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, sqlite_data)

                # Migrate embeddings with vector conversion
                embedding_data = self.conn.execute("""
                    SELECT qna_id, question_vector, answer_vector, keywords_vector 
                    FROM qna_embeddings
                """).fetchall()

                converted_embeddings = []
                for qna_id, *blobs in embedding_data:
                    vectors = [self._blob_to_vector(blob) for blob in blobs]
                    converted_embeddings.append((
                        qna_id,
                        *[None if vec is None else vec.tolist() for vec in vectors]
                    ))

                cursor.executemany("""
                    INSERT INTO qna_embeddings 
                    (qna_id, question_vector, answer_vector, keywords_vector)
                    VALUES (%s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, converted_embeddings)

            pg_conn.commit()
            logger.info("Migration completed successfully")
        except Exception as e:
            logger.error(f"Migration failed: {str(e)}")
            if pg_conn is not None and not pg_conn.closed:
                pg_conn.rollback()
            raise
        finally:
            if pg_conn is not None and not pg_conn.closed:
                pg_conn.close()

    # Helper methods
    def _chunk_list(self, lst: List, n: int):
        """Yield consecutive slices of ``lst`` holding at most ``n`` items.

        Raises:
            ValueError: if ``n`` is not positive (a negative step would
                otherwise silently yield nothing).
        """
        if n <= 0:
            raise ValueError("chunk size must be a positive integer")
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def _normalize_text(self, text: str) -> str:
        """Lower-case ``text`` and strip surrounding whitespace."""
        return text.lower().strip()

    def _generate_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of ``text`` (default str encoding)."""
        return hashlib.sha256(text.encode()).hexdigest()

In [12]:
# Cell 5: Initialize system
# Builds the SQLite schema and loads the embedding model.
qna_system = QnASystem()
logger.info("System initialized successfully!")

# Cell 6: Test functionality
# Smoke test: store a single QnA pair, then query it through the hybrid search.
# Failures are logged instead of raised.
try:
    test_data = [
        dict(
            question="What is Kaggle?",
            answer="A data science competition platform",
            keywords="platform",
        )
    ]
    qna_system.batch_insert(test_data)
    results = qna_system.semantic_search("data science platform")
    logger.info(f"Test search results: {results}")
except Exception as exc:
    logger.error(f"Initial test failed: {str(exc)}")

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

2025-05-20 19:32:43.019183: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747769563.322478      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747769563.407733      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Processing batches: 1it [00:00,  2.61it/s]
