# Vector Database Response Logger for Format A and Format B

This notebook logs and examines Pinecone vector database responses for binary queries in both Format A and Format B architectures.

In [1]:
import json
import logging
import sys
import os
from datetime import datetime
from typing import Dict, List, Any, Optional
import pandas as pd
from pinecone import Pinecone
import openai
import numpy as np

# Add project root to path
sys.path.append(os.path.dirname(os.path.abspath('')))

# Import the RAG architectures
from src.architectures.rag_format_a import FormatARAG
from src.architectures.rag_format_b import FormatBRAG

print("Imports successful - configuring OpenAI...")

Imports successful - configuring OpenAI...


## Setup Logging Configuration

In [2]:
# Create logs directory if it doesn't exist
log_dir = "vector_db_logs"
os.makedirs(log_dir, exist_ok=True)

# Setup file logger for vector DB responses
log_filename = f"{log_dir}/pinecone_responses_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
json_log_filename = f"{log_dir}/pinecone_responses_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger('vector_db_logger')
logger.info(f"Logging vector DB responses to: {log_filename}")
logger.info(f"JSON responses will be saved to: {json_log_filename}")

INFO:vector_db_logger:Logging vector DB responses to: vector_db_logs/pinecone_responses_20250921_001945.log
INFO:vector_db_logger:JSON responses will be saved to: vector_db_logs/pinecone_responses_20250921_001945.json


## Enhanced Query Functions with Detailed Logging

In [3]:
class VectorDBResponseLogger:
    """Logger for capturing detailed Pinecone responses"""
    
    def __init__(self, config_path="experiments/config.json"):
        self.config_path = config_path
        self.all_responses = []  # Store all responses for later analysis
        
        # Load config from experiments folder
        with open(config_path, 'r') as f:
            self.config = json.load(f)
        
        logger.info(f"Loaded config from: {config_path}")
        
        # Initialize Pinecone
        self.pc = Pinecone(api_key=self.config['pinecone_api_key'])
        self.index = self.pc.Index(self.config['pinecone_index_name'])
        
        # Configure OpenAI directly
        openai.api_key = self.config['openai_api_key']
        self.openai_client = openai.OpenAI(api_key=self.config['openai_api_key'])
        
        logger.info("VectorDBResponseLogger initialized with direct OpenAI client")
        logger.info(f"Using OpenAI key starting with: {self.config['openai_api_key'][:20]}...")
    
    def get_embedding_direct(self, text: str) -> Optional[List[float]]:
        """Generate embedding using direct OpenAI client"""
        try:
            response = self.openai_client.embeddings.create(
                input=text,
                model="text-embedding-ada-002"
            )
            embedding = response.data[0].embedding
            logger.debug(f"Successfully generated embedding for: {text[:50]}...")
            return embedding
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            logger.error(f"API Key prefix: {self.config['openai_api_key'][:20]}...")
            return None
    
    def query_format_a_with_logging(self, drug: str, side_effect: str, top_k: int = 10):
        """Query Format A with detailed logging of Pinecone response"""
        
        query_info = {
            "format": "A",
            "drug": drug,
            "side_effect": side_effect,
            "query_text": f"{drug} {side_effect}",
            "timestamp": datetime.now().isoformat()
        }
        
        logger.info("="*80)
        logger.info(f"FORMAT A QUERY: Drug='{drug}', Side Effect='{side_effect}'")
        
        # Generate embedding with direct client
        query_embedding = self.get_embedding_direct(query_info["query_text"])
        
        if not query_embedding:
            logger.error("Failed to generate embedding - check OpenAI API key")
            return None
        
        logger.info(f"Embedding generated successfully, dimension: {len(query_embedding)}")
        
        try:
            # Query Pinecone
            logger.info(f"Querying Pinecone index: {self.config['pinecone_index_name']}")
            logger.info(f"Namespace: drug-side-effects-formatA, top_k={top_k}")
            
            results = self.index.query(
                vector=query_embedding,
                top_k=top_k,
                namespace="drug-side-effects-formatA",
                include_metadata=True
            )
            
            # Log raw response
            logger.info(f"✅ Received {len(results.matches)} matches from Pinecone")
            
            # Process and log each match
            processed_matches = []
            for i, match in enumerate(results.matches):
                match_data = {
                    "rank": i + 1,
                    "score": float(match.score),
                    "id": match.id,
                    "metadata": dict(match.metadata) if match.metadata else {}
                }
                processed_matches.append(match_data)
                
                logger.info(f"  Match {i+1}:")
                logger.info(f"    Score: {match.score:.4f}")
                logger.info(f"    ID: {match.id}")
                if match.metadata:
                    logger.info(f"    Drug: {match.metadata.get('drug', 'N/A')}")
                    text_preview = str(match.metadata.get('text', ''))[:200]
                    logger.info(f"    Text preview: {text_preview}...")
            
            # Compile full response
            full_response = {
                "query": query_info,
                "pinecone_response": {
                    "total_matches": len(results.matches),
                    "namespace": "drug-side-effects-formatA",
                    "matches": processed_matches
                }
            }
            
            self.all_responses.append(full_response)
            return full_response
            
        except Exception as e:
            logger.error(f"Pinecone query failed: {e}")
            return None
    
    def query_format_b_with_logging(self, drug: str, side_effect: str, top_k: int = 10):
        """Query Format B with detailed logging of Pinecone response"""
        
        query_info = {
            "format": "B",
            "drug": drug,
            "side_effect": side_effect,
            "query_text": f"{drug} {side_effect}",
            "timestamp": datetime.now().isoformat()
        }
        
        logger.info("="*80)
        logger.info(f"FORMAT B QUERY: Drug='{drug}', Side Effect='{side_effect}'")
        
        # Generate embedding with direct client
        query_embedding = self.get_embedding_direct(query_info["query_text"])
        
        if not query_embedding:
            logger.error("Failed to generate embedding - check OpenAI API key")
            return None
        
        logger.info(f"Embedding generated successfully, dimension: {len(query_embedding)}")
        
        try:
            # Query Pinecone
            logger.info(f"Querying Pinecone index: {self.config['pinecone_index_name']}")
            logger.info(f"Namespace: drug-side-effects-formatB, top_k={top_k}")
            
            results = self.index.query(
                vector=query_embedding,
                top_k=top_k,
                namespace="drug-side-effects-formatB",
                include_metadata=True
            )
            
            # Log raw response
            logger.info(f"✅ Received {len(results.matches)} matches from Pinecone")
            
            # Process and log each match
            processed_matches = []
            drug_relevant_matches = 0
            
            for i, match in enumerate(results.matches):
                match_data = {
                    "rank": i + 1,
                    "score": float(match.score),
                    "id": match.id,
                    "metadata": dict(match.metadata) if match.metadata else {}
                }
                processed_matches.append(match_data)
                
                logger.info(f"  Match {i+1}:")
                logger.info(f"    Score: {match.score:.4f}")
                logger.info(f"    ID: {match.id}")
                if match.metadata:
                    pair_drug = match.metadata.get('drug', 'N/A')
                    pair_effect = match.metadata.get('side_effect', 'N/A')
                    logger.info(f"    Drug-Effect Pair: {pair_drug} → {pair_effect}")
                    
                    # Check if this match is relevant to the queried drug
                    if drug.lower() in str(pair_drug).lower():
                        drug_relevant_matches += 1
                        logger.info(f"    ✓ RELEVANT to queried drug '{drug}'")
                    else:
                        logger.info(f"    ✗ Not relevant to queried drug '{drug}'")
            
            logger.info(f"\nSummary: {drug_relevant_matches}/{len(results.matches)} matches are relevant to '{drug}'")
            
            # Compile full response
            full_response = {
                "query": query_info,
                "pinecone_response": {
                    "total_matches": len(results.matches),
                    "drug_relevant_matches": drug_relevant_matches,
                    "namespace": "drug-side-effects-formatB",
                    "matches": processed_matches
                }
            }
            
            self.all_responses.append(full_response)
            return full_response
            
        except Exception as e:
            logger.error(f"Pinecone query failed: {e}")
            return None
    
    def test_connections(self):
        """Test OpenAI and Pinecone connections"""
        print("\n" + "="*60)
        print("TESTING CONNECTIONS")
        print("="*60)
        
        # Test OpenAI
        print("\n1. Testing OpenAI connection...")
        test_embedding = self.get_embedding_direct("test connection")
        if test_embedding:
            print(f"   ✅ OpenAI connection successful")
            print(f"   Embedding dimension: {len(test_embedding)}")
        else:
            print(f"   ❌ OpenAI connection failed")
            print(f"   Check your API key in experiments/config.json")
        
        # Test Pinecone
        print("\n2. Testing Pinecone connection...")
        try:
            stats = self.index.describe_index_stats()
            print(f"   ✅ Pinecone connection successful")
            print(f"   Index name: {self.config['pinecone_index_name']}")
            print(f"   Total vectors: {stats.total_vector_count}")
            print(f"   Namespaces: {list(stats.namespaces.keys())}")
            for ns_name, ns_stats in stats.namespaces.items():
                print(f"      - {ns_name}: {ns_stats.vector_count} vectors")
        except Exception as e:
            print(f"   ❌ Pinecone connection failed: {e}")
        
        return test_embedding is not None
    
    def save_responses_to_json(self):
        """Save all collected responses to JSON file"""
        with open(json_log_filename, 'w') as f:
            json.dump(self.all_responses, f, indent=2)
        logger.info(f"\nSaved {len(self.all_responses)} responses to {json_log_filename}")
        return json_log_filename

## Initialize the Logger

In [4]:
# Initialize the vector DB response logger with experiments/config.json
vdb_logger = VectorDBResponseLogger(config_path="experiments/config.json")
print(f"Vector DB Logger initialized using experiments/config.json")
print(f"Log file: {log_filename}")
print(f"JSON file: {json_log_filename}")

INFO:vector_db_logger:Loaded config from: experiments/config.json
  from .autonotebook import tqdm as notebook_tqdm
INFO:vector_db_logger:VectorDBResponseLogger initialized with direct OpenAI client
INFO:vector_db_logger:Using OpenAI key starting with: sk-proj-6XfIoTQOgxt8...


Vector DB Logger initialized using experiments/config.json
Log file: vector_db_logs/pinecone_responses_20250921_001945.log
JSON file: vector_db_logs/pinecone_responses_20250921_001945.json


In [5]:
# Test connections before running queries
if vdb_logger.test_connections():
    print("\n✅ All connections successful - ready to log vector DB responses")
else:
    print("\n⚠️ Connection issues detected - check configuration")
    print("\nTroubleshooting tips:")
    print("1. Verify OpenAI API key is valid and has credits")
    print("2. Check if the key starts with 'sk-' and is not expired")
    print("3. Ensure Pinecone API key and index name are correct")


TESTING CONNECTIONS

1. Testing OpenAI connection...
   ✅ OpenAI connection successful
   Embedding dimension: 1536

2. Testing Pinecone connection...
   ✅ Pinecone connection successful
   Index name: drug-side-effects-text-embedding-ada-002
   Total vectors: 246346
   Namespaces: ['drug-side-effects-enhanced-formatB', 'drug-side-effects-formatA', 'drug-side-effects-clinical', 'drug-side-effects-formatB']
      - drug-side-effects-enhanced-formatB: 122579 vectors
      - drug-side-effects-formatA: 976 vectors
      - drug-side-effects-clinical: 190 vectors
      - drug-side-effects-formatB: 122601 vectors

✅ All connections successful - ready to log vector DB responses


## Test Queries - Format A

In [6]:
# Define test queries
test_queries = [
    {"drug": "aspirin", "side_effect": "headache"},
    {"drug": "ibuprofen", "side_effect": "nausea"},
    {"drug": "metformin", "side_effect": "dizziness"},
    {"drug": "lisinopril", "side_effect": "cough"},
    {"drug": "atorvastatin", "side_effect": "muscle pain"},
]

print("\n" + "="*80)
print("TESTING FORMAT A QUERIES")
print("="*80)

format_a_responses = []
for query in test_queries:
    response = vdb_logger.query_format_a_with_logging(
        drug=query["drug"],
        side_effect=query["side_effect"],
        top_k=5  # Get top 5 matches for each query
    )
    if response:
        format_a_responses.append(response)
        print(f"\n✓ Logged Format A response for: {query['drug']} - {query['side_effect']}")
        print(f"  Total matches: {response['pinecone_response']['total_matches']}")
        if response['pinecone_response']['matches']:
            print(f"  Top match score: {response['pinecone_response']['matches'][0]['score']:.4f}")

INFO:vector_db_logger:FORMAT A QUERY: Drug='aspirin', Side Effect='headache'



TESTING FORMAT A QUERIES


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatA, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8547
INFO:vector_db_logger:    ID: format_a_aspirin_594
INFO:vector_db_logger:    Drug: aspirin
INFO:vector_db_logger:    Text preview: The drug aspirin causes the following side effects or adverse reactions: acidosis, agitation, anaemia, anaphylactic shock, angioedema, arthralgia, asthma, blood cholesterol increased, breast feeding, ...
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8087
INFO:vector_db_logger:    ID: format_a_salicylate_842
INFO:vector_db_logger:    Drug: salicylate
INFO:vector_db_logger:    Text preview: The drug salicylate causes the following side effects or adverse reactions: abdominal discomfort,


✓ Logged Format A response for: aspirin - headache
  Total matches: 5
  Top match score: 0.8547


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatA, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8472
INFO:vector_db_logger:    ID: format_a_ibuprofen_77
INFO:vector_db_logger:    Drug: ibuprofen
INFO:vector_db_logger:    Text preview: The drug ibuprofen causes the following side effects or adverse reactions: abdominal discomfort, abdominal distension, abdominal pain, abdominal pain upper, abnormal dreams, acidosis, acute coronary s...
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8163
INFO:vector_db_logger:    ID: format_a_flurbiprofen_122
INFO:vector_db_logger:    Drug: flurbiprofen
INFO:vector_db_logger:    Text preview: The drug flurbiprofen causes the following side effects or adverse reactions: abdominal di


✓ Logged Format A response for: ibuprofen - nausea
  Total matches: 5
  Top match score: 0.8472


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatA, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8466
INFO:vector_db_logger:    ID: format_a_metformin_395
INFO:vector_db_logger:    Drug: metformin
INFO:vector_db_logger:    Text preview: The drug metformin causes the following side effects or adverse reactions: abdominal discomfort, abdominal distension, abdominal pain, abdominal pain upper, abnormal faeces, acetonaemia, acidosis, acu...
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8056
INFO:vector_db_logger:    ID: format_a_metolazone_471
INFO:vector_db_logger:    Drug: metolazone
INFO:vector_db_logger:    Text preview: The drug metolazone causes the following side effects or adverse reactions: abdominal discomf


✓ Logged Format A response for: metformin - dizziness
  Total matches: 5
  Top match score: 0.8466


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatA, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8356
INFO:vector_db_logger:    ID: format_a_lisinopril_134
INFO:vector_db_logger:    Drug: lisinopril
INFO:vector_db_logger:    Text preview: The drug lisinopril causes the following side effects or adverse reactions: abdominal pain, acute coronary syndrome, acute myocardial infarction, agranulocytosis, alopecia, anaemia, anaphylactic shock...
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8078
INFO:vector_db_logger:    ID: format_a_benazepril_474
INFO:vector_db_logger:    Drug: benazepril
INFO:vector_db_logger:    Text preview: The drug benazepril causes the following side effects or adverse reactions: abdominal pain,


✓ Logged Format A response for: lisinopril - cough
  Total matches: 5
  Top match score: 0.8356


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatA, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8570
INFO:vector_db_logger:    ID: format_a_atorvastatin_156
INFO:vector_db_logger:    Drug: atorvastatin
INFO:vector_db_logger:    Text preview: The drug atorvastatin causes the following side effects or adverse reactions: abdominal discomfort, abdominal pain, abnormal dreams, affect lability, ageusia, agitation, alanine aminotransferase incre...
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8506
INFO:vector_db_logger:    ID: format_a_pitavastatin_789
INFO:vector_db_logger:    Drug: pitavastatin
INFO:vector_db_logger:    Text preview: The drug pitavastatin causes the following side effects or adverse reactions: abdom


✓ Logged Format A response for: atorvastatin - muscle pain
  Total matches: 5
  Top match score: 0.8570


## Test Queries - Format B

In [7]:
print("\n" + "="*80)
print("TESTING FORMAT B QUERIES")
print("="*80)

format_b_responses = []
for query in test_queries:
    response = vdb_logger.query_format_b_with_logging(
        drug=query["drug"],
        side_effect=query["side_effect"],
        top_k=5  # Get top 5 matches for each query
    )
    if response:
        format_b_responses.append(response)
        print(f"\n✓ Logged Format B response for: {query['drug']} - {query['side_effect']}")
        print(f"  Total matches: {response['pinecone_response']['total_matches']}")
        print(f"  Drug-relevant matches: {response['pinecone_response']['drug_relevant_matches']}")
        if response['pinecone_response']['matches']:
            print(f"  Top match score: {response['pinecone_response']['matches'][0]['score']:.4f}")

INFO:vector_db_logger:FORMAT B QUERY: Drug='aspirin', Side Effect='headache'



TESTING FORMAT B QUERIES


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatB, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8710
INFO:vector_db_logger:    ID: format_b_aspirin_arthralgia_105304
INFO:vector_db_logger:    Drug-Effect Pair: aspirin → arthralgia
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'aspirin'
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8672
INFO:vector_db_logger:    ID: format_b_aspirin_vertigo_105278
INFO:vector_db_logger:    Drug-Effect Pair: aspirin → vertigo
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'aspirin'
INFO:vector_db_logger:  Match 3:
INFO:vector_db_logger:    Score: 0.8668
INFO:vector_db_logger:    ID: format_b_aspirin_hypertension_105321
INFO:vector_db_logger:    Drug-Effect Pair: aspir


✓ Logged Format B response for: aspirin - headache
  Total matches: 5
  Drug-relevant matches: 5
  Top match score: 0.8710


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatB, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.9003
INFO:vector_db_logger:    ID: format_b_ibuprofen_nausea_30923
INFO:vector_db_logger:    Drug-Effect Pair: ibuprofen → nausea
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'ibuprofen'
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8682
INFO:vector_db_logger:    ID: format_b_ibuprofen_vomiting_31046
INFO:vector_db_logger:    Drug-Effect Pair: ibuprofen → vomiting
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'ibuprofen'
INFO:vector_db_logger:  Match 3:
INFO:vector_db_logger:    Score: 0.8657
INFO:vector_db_logger:    ID: format_b_flurbiprofen_nausea_42575
INFO:vector_db_logger:    Drug-Effect Pair: flu


✓ Logged Format B response for: ibuprofen - nausea
  Total matches: 5
  Drug-relevant matches: 4
  Top match score: 0.9003


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatB, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.9017
INFO:vector_db_logger:    ID: format_b_metformin_dizziness_86966
INFO:vector_db_logger:    Drug-Effect Pair: metformin → dizziness
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'metformin'
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8599
INFO:vector_db_logger:    ID: format_b_metformin_dyspnoea_86975
INFO:vector_db_logger:    Drug-Effect Pair: metformin → dyspnoea
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'metformin'
INFO:vector_db_logger:  Match 3:
INFO:vector_db_logger:    Score: 0.8564
INFO:vector_db_logger:    ID: format_b_metformin_dysgeusia_86978
INFO:vector_db_logger:    Drug-Effect Pai


✓ Logged Format B response for: metformin - dizziness
  Total matches: 5
  Drug-relevant matches: 4
  Top match score: 0.9017


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatB, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8979
INFO:vector_db_logger:    ID: format_b_lisinopril_cough_45406
INFO:vector_db_logger:    Drug-Effect Pair: lisinopril → cough
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'lisinopril'
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8659
INFO:vector_db_logger:    ID: format_b_lisinopril_bronchitis_45308
INFO:vector_db_logger:    Drug-Effect Pair: lisinopril → bronchitis
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'lisinopril'
INFO:vector_db_logger:  Match 3:
INFO:vector_db_logger:    Score: 0.8617
INFO:vector_db_logger:    ID: format_b_lisinopril_bronchospasm_45437
INFO:vector_db_logger:    Drug-Effe


✓ Logged Format B response for: lisinopril - cough
  Total matches: 5
  Drug-relevant matches: 5
  Top match score: 0.8979


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatB, top_k=5
INFO:vector_db_logger:✅ Received 5 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8949
INFO:vector_db_logger:    ID: format_b_atorvastatin_musculoskeletal pain_50041
INFO:vector_db_logger:    Drug-Effect Pair: atorvastatin → musculoskeletal pain
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'atorvastatin'
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8878
INFO:vector_db_logger:    ID: format_b_atorvastatin_pain in extremity_50181
INFO:vector_db_logger:    Drug-Effect Pair: atorvastatin → pain in extremity
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'atorvastatin'
INFO:vector_db_logger:  Match 3:
INFO:vector_db_logger:    Score: 0.8839
INFO:vector_db_logger:    ID: format_b_atorvasta


✓ Logged Format B response for: atorvastatin - muscle pain
  Total matches: 5
  Drug-relevant matches: 5
  Top match score: 0.8949


## Analyze Response Patterns

In [8]:
def analyze_responses(responses, format_name):
    """Analyze patterns in vector DB responses"""
    print(f"\n{'='*60}")
    print(f"ANALYSIS: {format_name}")
    print(f"{'='*60}")
    
    if not responses:
        print("No responses to analyze")
        return
    
    # Collect statistics
    all_scores = []
    match_counts = []
    
    for resp in responses:
        query_info = resp['query']
        pinecone_resp = resp['pinecone_response']
        
        print(f"\nQuery: {query_info['drug']} - {query_info['side_effect']}")
        print(f"  Matches returned: {pinecone_resp['total_matches']}")
        
        if format_name == "Format B" and 'drug_relevant_matches' in pinecone_resp:
            print(f"  Drug-relevant matches: {pinecone_resp['drug_relevant_matches']}")
        
        match_counts.append(pinecone_resp['total_matches'])
        
        # Analyze score distribution
        if pinecone_resp['matches']:
            scores = [m['score'] for m in pinecone_resp['matches']]
            all_scores.extend(scores)
            print(f"  Score range: {min(scores):.4f} - {max(scores):.4f}")
            print(f"  Average score: {sum(scores)/len(scores):.4f}")
            
            # Show top metadata fields
            top_match = pinecone_resp['matches'][0]
            if top_match['metadata']:
                print(f"  Top match metadata keys: {list(top_match['metadata'].keys())}")
    
    # Overall statistics
    print(f"\nOverall Statistics for {format_name}:")
    print(f"  Total queries: {len(responses)}")
    print(f"  Average matches per query: {sum(match_counts)/len(match_counts):.1f}")
    if all_scores:
        print(f"  Overall score range: {min(all_scores):.4f} - {max(all_scores):.4f}")
        print(f"  Overall average score: {sum(all_scores)/len(all_scores):.4f}")

# Analyze Format A responses
analyze_responses(format_a_responses, "Format A")

# Analyze Format B responses  
analyze_responses(format_b_responses, "Format B")


ANALYSIS: Format A

Query: aspirin - headache
  Matches returned: 5
  Score range: 0.7986 - 0.8547
  Average score: 0.8146
  Top match metadata keys: ['drug', 'format', 'paper_spec', 'text']

Query: ibuprofen - nausea
  Matches returned: 5
  Score range: 0.8071 - 0.8472
  Average score: 0.8191
  Top match metadata keys: ['drug', 'format', 'paper_spec', 'text']

Query: metformin - dizziness
  Matches returned: 5
  Score range: 0.7930 - 0.8466
  Average score: 0.8079
  Top match metadata keys: ['drug', 'format', 'paper_spec', 'text']

Query: lisinopril - cough
  Matches returned: 5
  Score range: 0.7996 - 0.8356
  Average score: 0.8096
  Top match metadata keys: ['drug', 'format', 'paper_spec', 'text']

Query: atorvastatin - muscle pain
  Matches returned: 5
  Score range: 0.8394 - 0.8570
  Average score: 0.8470
  Top match metadata keys: ['drug', 'format', 'paper_spec', 'text']

Overall Statistics for Format A:
  Total queries: 5
  Average matches per query: 5.0
  Overall score range: 

## Save All Responses to JSON

In [9]:
# Save all responses to JSON file
saved_file = vdb_logger.save_responses_to_json()
print(f"\n✅ All vector DB responses saved to: {saved_file}")
print(f"\nYou can now examine the detailed responses in:")
print(f"  - Log file: {log_filename}")
print(f"  - JSON file: {json_log_filename}")

INFO:vector_db_logger:
Saved 10 responses to vector_db_logs/pinecone_responses_20250921_001945.json



✅ All vector DB responses saved to: vector_db_logs/pinecone_responses_20250921_001945.json

You can now examine the detailed responses in:
  - Log file: vector_db_logs/pinecone_responses_20250921_001945.log
  - JSON file: vector_db_logs/pinecone_responses_20250921_001945.json


## Display Sample Response Structure

In [10]:
# Display a sample response structure for examination
if vdb_logger.all_responses:
    print("\nSAMPLE RESPONSE STRUCTURE (first query):")
    print("="*60)
    sample = vdb_logger.all_responses[0]
    print(json.dumps(sample, indent=2)[:2000])  # Show first 2000 chars
    print("\n... (truncated for display)")
else:
    print("No responses collected yet")


SAMPLE RESPONSE STRUCTURE (first query):
{
  "query": {
    "format": "A",
    "drug": "aspirin",
    "side_effect": "headache",
    "query_text": "aspirin headache",
    "timestamp": "2025-09-21T00:19:47.829724"
  },
  "pinecone_response": {
    "total_matches": 5,
    "namespace": "drug-side-effects-formatA",
    "matches": [
      {
        "rank": 1,
        "score": 0.854675412,
        "id": "format_a_aspirin_594",
        "metadata": {
          "drug": "aspirin",
          "format": "A",
          "paper_spec": "aggregated_side_effects",
          "text": "The drug aspirin causes the following side effects or adverse reactions: acidosis, agitation, anaemia, anaphylactic shock, angioedema, arthralgia, asthma, blood cholesterol increased, breast feeding, chest discomfort, chest pain, coagulopathy, coma, confusional state, contusion, convulsion, deafness, dermatitis, diabetes mellitus, diarrhoea, dizziness, dyspepsia, dyspnoea, epigastric discomfort, feeling abnormal, flushing, f

## Custom Query Testing

In [12]:
# Test with your own custom queries
custom_drug = "aspirin"  # Change this
custom_side_effect = "bleeding"  # Change this

print(f"\nTesting custom query: {custom_drug} - {custom_side_effect}")
print("="*60)

# Test Format A
print("\nFormat A Response:")
format_a_custom = vdb_logger.query_format_a_with_logging(custom_drug, custom_side_effect, top_k=10)

# Test Format B
print("\nFormat B Response:")
format_b_custom = vdb_logger.query_format_b_with_logging(custom_drug, custom_side_effect, top_k=10)

# Save updated responses
vdb_logger.save_responses_to_json()
print("\n✅ Custom query responses logged and saved")

INFO:vector_db_logger:FORMAT A QUERY: Drug='aspirin', Side Effect='bleeding'



Testing custom query: aspirin - bleeding

Format A Response:


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatA, top_k=10
INFO:vector_db_logger:✅ Received 10 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8556
INFO:vector_db_logger:    ID: format_a_aspirin_594
INFO:vector_db_logger:    Drug: aspirin
INFO:vector_db_logger:    Text preview: The drug aspirin causes the following side effects or adverse reactions: acidosis, agitation, anaemia, anaphylactic shock, angioedema, arthralgia, asthma, blood cholesterol increased, breast feeding, ...
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8025
INFO:vector_db_logger:    ID: format_a_fondaparinux_442
INFO:vector_db_logger:    Drug: fondaparinux
INFO:vector_db_logger:    Text preview: The drug fondaparinux causes the following side effects or adverse reactions: abdominal pai


Format B Response:


INFO:vector_db_logger:Embedding generated successfully, dimension: 1536
INFO:vector_db_logger:Querying Pinecone index: drug-side-effects-text-embedding-ada-002
INFO:vector_db_logger:Namespace: drug-side-effects-formatB, top_k=10
INFO:vector_db_logger:✅ Received 10 matches from Pinecone
INFO:vector_db_logger:  Match 1:
INFO:vector_db_logger:    Score: 0.8795
INFO:vector_db_logger:    ID: format_b_aspirin_haemorrhage_105306
INFO:vector_db_logger:    Drug-Effect Pair: aspirin → haemorrhage
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'aspirin'
INFO:vector_db_logger:  Match 2:
INFO:vector_db_logger:    Score: 0.8778
INFO:vector_db_logger:    ID: format_b_aspirin_gastrointestinal haemorrhage_105292
INFO:vector_db_logger:    Drug-Effect Pair: aspirin → gastrointestinal haemorrhage
INFO:vector_db_logger:    ✓ RELEVANT to queried drug 'aspirin'
INFO:vector_db_logger:  Match 3:
INFO:vector_db_logger:    Score: 0.8747
INFO:vector_db_logger:    ID: format_b_aspirin_haematemesis_105275
INF


✅ Custom query responses logged and saved
