In [None]:
# Enhanced database schema with proper indexing and normalization
def _initialize_db(self):
    """Create optimized database schema"""
    self.conn.execute("""
    CREATE TABLE IF NOT EXISTS qna_pairs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        question TEXT NOT NULL,
        answer TEXT NOT NULL,
        category TEXT,
        word_count INTEGER,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        last_accessed TIMESTAMP,
        usage_count INTEGER DEFAULT 0,
        keywords TEXT,
        normalized_question TEXT,
        question_hash TEXT UNIQUE  -- For duplicate detection
    )""")
    
    # Vector embeddings table with proper indexing
    self.conn.execute("""
    CREATE TABLE IF NOT EXISTS qna_embeddings (
        qna_id INTEGER PRIMARY KEY,
        question_vector BLOB,
        answer_vector BLOB,
        keywords_vector BLOB,
        FOREIGN KEY (qna_id) REFERENCES qna_pairs(id)
    )""")
    
    # Create optimized indexes
    self.conn.execute("CREATE INDEX IF NOT EXISTS idx_category ON qna_pairs(category)")
    self.conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords ON qna_pairs(keywords)")
    self.conn.execute("CREATE INDEX IF NOT EXISTS idx_normalized ON qna_pairs(normalized_question)")
    
    # Full-text search with proper tokenizer
    self.conn.execute("""
    CREATE VIRTUAL TABLE IF NOT EXISTS qna_search 
    USING fts5(question, answer, keywords, tokenize='porter unicode61')
    """)

In [None]:
def semantic_search(self, query: str, top_k: int = 5) -> List[Dict]:
    """Return the ``top_k`` stored Q&A entries most similar to ``query``.

    Candidates are first narrowed with ``self.keyword_search`` (asking for
    ``top_k * 3`` matches), then re-ranked by a weighted cosine similarity
    between the query embedding and each candidate's stored question and
    answer vectors (0.6 question / 0.4 answer).

    Args:
        query: Free-text search string.
        top_k: Maximum number of entries to return.

    Returns:
        Whatever ``self.get_qna_by_ids`` returns for the best-scoring ids,
        or an empty list when the keyword pre-filter finds nothing.
    """
    query_embedding = self._text_to_vector(query)

    candidates = self.keyword_search(query, limit=top_k*3)
    if not candidates:
        return []

    # Each keyword match is unpacked as a 3-tuple whose first item is the id.
    candidate_ids = [str(match_id) for match_id, _, _ in candidates]
    placeholders = ','.join(['?']*len(candidate_ids))
    rows = self.conn.execute(f"""
    SELECT qna_id, question_vector, answer_vector 
    FROM qna_embeddings 
    WHERE qna_id IN ({placeholders})
    """, candidate_ids).fetchall()

    def similarity_to_query(stored_vector):
        # cosine_similarity works on 2-D inputs; take the single scalar out.
        candidate = self._vector_to_array(stored_vector)
        return cosine_similarity([query_embedding], [candidate])[0][0]

    scored = [
        (row_id, 0.6*similarity_to_query(question_blob) + 0.4*similarity_to_query(answer_blob))
        for row_id, question_blob, answer_blob in rows
    ]

    # Highest combined score first; keep only the ids of the best top_k.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    best_ids = [row_id for row_id, _ in scored[:top_k]]
    return self.get_qna_by_ids(best_ids)

In [None]:
def _text_to_vector(self, text: str) -> np.ndarray:
    """GPU-accelerated embedding generation"""
    if torch.cuda.is_available():
        self.model = self.model.to('cuda')
    
    inputs = self.tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=512,
        padding=True
    )
    
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = self.model(**inputs)
    
    # Average pooling
    last_hidden_state = outputs.last_hidden_state
    vector = last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return vector

In [None]:
def batch_insert(self, qna_list: List[Dict], batch_size: int = 100):
    """Insert Q&A records and their embedding rows in batches.

    For each chunk produced by ``self._chunk_list`` the embedding rows are
    computed concurrently with ``self._process_single_qna``; then the
    ``qna_pairs`` rows and the ``qna_embeddings`` rows are written inside a
    single ``with self.conn`` block, so each batch is written as a unit.

    Args:
        qna_list: Records to insert. Each dict must have ``'question'``,
            ``'answer'`` and ``'keywords'``; ``'category'`` is optional.
        batch_size: Chunk size passed to ``self._chunk_list``.

    Raises:
        KeyError: If a record lacks a required key.
    """
    for batch in tqdm(self._chunk_list(qna_list, batch_size), desc="Processing batches"):
        # Compute embedding rows concurrently before opening the write block.
        # NOTE(review): each row must be (qna_id, question_vector,
        # answer_vector, keywords_vector); _process_single_qna is not visible
        # here, so confirm how it obtains a qna_id matching the row inserted
        # below.
        with ThreadPoolExecutor() as executor:
            embeddings = list(executor.map(self._process_single_qna, batch))

        # One parameter tuple per record. The original list comprehension
        # never closed this tuple, which made the whole function a SyntaxError.
        pair_rows = [
            (
                q['question'],
                q['answer'],
                q.get('category'),
                len(q['answer'].split()),  # word_count is taken from the answer
                q['keywords'],
                self._normalize_text(q['question']),
                self._generate_hash(q['question']),
            )
            for q in batch
        ]

        # Presumably a sqlite3 connection: as a context manager it commits on
        # success and rolls back if either executemany raises.
        with self.conn:
            self.conn.executemany("""
            INSERT INTO qna_pairs 
            (question, answer, category, word_count, keywords, normalized_question, question_hash)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """, pair_rows)

            # Insert embeddings
            self.conn.executemany("""
            INSERT INTO qna_embeddings 
            (qna_id, question_vector, answer_vector, keywords_vector)
            VALUES (?, ?, ?, ?)
            """, embeddings)

In [None]:
def migrate_to_postgres(self):
    """Create the Q&A schema in PostgreSQL with pgvector columns.

    Connects using the ``POSTGRES_URL`` environment variable, ensures the
    ``vector`` extension exists, and creates the ``qna_pairs`` and
    ``qna_embeddings`` tables plus full-text and vector indexes. Only the
    schema is created; no rows are copied from the current database.

    All DDL runs in one transaction that is committed on success and rolled
    back on error, and the connection is always closed. Every statement is
    guarded with ``IF NOT EXISTS`` so the method can be re-run safely.
    """
    import psycopg2
    from pgvector.psycopg2 import register_vector

    # Connect to PostgreSQL
    pg_conn = psycopg2.connect(os.getenv("POSTGRES_URL"))
    try:
        # psycopg2 connections used as context managers commit on a clean
        # exit and roll back on an exception; they do NOT close the
        # connection, hence the explicit close() in ``finally``.
        with pg_conn:
            with pg_conn.cursor() as cursor:
                # register_vector() needs the ``vector`` type to exist, so the
                # extension has to be created before it is called.
                cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
                register_vector(pg_conn)

                cursor.execute("""
                CREATE TABLE IF NOT EXISTS qna_pairs (
                    id SERIAL PRIMARY KEY,
                    question TEXT NOT NULL,
                    answer TEXT NOT NULL,
                    category TEXT,
                    word_count INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    last_accessed TIMESTAMP,
                    usage_count INTEGER DEFAULT 0,
                    keywords TEXT,
                    normalized_question TEXT,
                    question_hash TEXT UNIQUE
                )""")

                # NOTE(review): VECTOR(1024) must equal the embedding model's
                # output size - confirm against the model in use.
                cursor.execute("""
                CREATE TABLE IF NOT EXISTS qna_embeddings (
                    qna_id INTEGER PRIMARY KEY REFERENCES qna_pairs(id),
                    question_vector VECTOR(1024),
                    answer_vector VECTOR(1024),
                    keywords_vector VECTOR(1024)
                )""")

                # Named indexes with IF NOT EXISTS: an unnamed CREATE INDEX
                # would add another duplicate index on every re-run.
                cursor.execute(
                    "CREATE INDEX IF NOT EXISTS idx_qna_pairs_question_fts "
                    "ON qna_pairs USING GIN(to_tsvector('english', question))"
                )
                cursor.execute(
                    "CREATE INDEX IF NOT EXISTS idx_qna_pairs_answer_fts "
                    "ON qna_pairs USING GIN(to_tsvector('english', answer))"
                )
                # NOTE(review): this index serves L2-distance queries; if the
                # PostgreSQL search path will rank by cosine distance,
                # vector_cosine_ops is the matching operator class.
                cursor.execute(
                    "CREATE INDEX IF NOT EXISTS idx_qna_embeddings_question_vec "
                    "ON qna_embeddings USING ivfflat (question_vector vector_l2_ops)"
                )
    finally:
        pg_conn.close()

In [None]:
def evaluate_domain_coverage(self, domain_terms: List[str]) -> Dict:
    """Report how many domain terms appear verbatim in the tokenizer vocabulary.

    A term counts as covered when its lower-cased form is a key of
    ``self.tokenizer.get_vocab()``.

    Args:
        domain_terms: Terms to look up.

    Returns:
        Dict with ``total_terms``, ``covered_terms``, ``coverage_percentage``
        (0-100; ``0.0`` when ``domain_terms`` is empty) and ``missing_terms``
        (at most the first 20 uncovered terms, in input order).
    """
    vocab = set(self.tokenizer.get_vocab())
    missing = [term for term in domain_terms if term.lower() not in vocab]

    total = len(domain_terms)
    covered = total - len(missing)
    # Guard the division: an empty term list previously raised ZeroDivisionError.
    coverage = covered / total * 100 if total else 0.0

    return {
        "total_terms": total,
        "covered_terms": covered,
        "coverage_percentage": coverage,
        "missing_terms": missing[:20],  # Show first 20 missing terms
    }

In [None]:
def benchmark_search(self, test_queries: List[str], iterations: int = 10):
    """Time ``self.semantic_search`` on each query and summarize the latency.

    Args:
        test_queries: Queries to benchmark; each is run ``iterations`` times.
        iterations: Number of timed runs per query (must be at least 1).

    Returns:
        A pandas DataFrame with one row per query and the columns ``query``,
        ``avg_time``, ``min_time`` and ``max_time`` (seconds). The columns are
        present even when ``test_queries`` is empty.

    Raises:
        ValueError: If ``iterations`` is less than 1.
    """
    if iterations < 1:
        # Previously surfaced as a ZeroDivisionError from the average.
        raise ValueError("iterations must be at least 1")

    columns = ["query", "avg_time", "min_time", "max_time"]
    results = []
    for query in test_queries:
        times = []
        for _ in range(iterations):
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump with system clock adjustments and skew short intervals.
            start = time.perf_counter()
            self.semantic_search(query)
            times.append(time.perf_counter() - start)

        results.append({
            "query": query,
            "avg_time": sum(times)/len(times),
            "min_time": min(times),
            "max_time": max(times)
        })

    # Explicit columns keep the schema stable for an empty query list.
    return pd.DataFrame(results, columns=columns)

In [None]:
def evaluate_answer_quality(self, test_set: List[Dict]) -> Dict:
    """Measure exact-match accuracy of the top search result on a test set.

    For each item, ``self.semantic_search`` is called with the item's
    ``"question"`` and ``top_k=1``; the item counts as correct when a result
    is returned and its ``"answer"`` equals the item's ``"expected_answer"``.

    Args:
        test_set: Dicts with ``"question"`` and ``"expected_answer"`` keys.

    Returns:
        Dict with ``total_questions``, ``correct_answers`` and ``accuracy``
        (0-100; ``0.0`` when ``test_set`` is empty).
    """
    correct = 0
    for item in test_set:
        result = self.semantic_search(item["question"], top_k=1)
        # An empty result list counts as a miss.
        if result and result[0]["answer"] == item["expected_answer"]:
            correct += 1

    total = len(test_set)
    return {
        "total_questions": total,
        "correct_answers": correct,
        # Guard the division: an empty test set previously raised ZeroDivisionError.
        "accuracy": correct / total * 100 if total else 0.0,
    }