In [1]:
# Acne-Sense Dermatologist Assistant - Retrieval Evaluation
import pandas as pd
import json
import numpy as np
import sys
import requests
from pathlib import Path
from tqdm.auto import tqdm
from typing import Dict, List, Any, Optional, Union

# Make sure we can import from scripts
sys.path.append(str(Path.cwd().parent))
from scripts.minsearch import Index



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the two knowledge-base tables (semicolon-delimited CSV files)
acne_types_df = pd.read_csv('../data/knowledge-base/acne_types.csv', sep=';')
faqs_df = pd.read_csv('../data/knowledge-base/faqs.csv', sep=';')

# One dict per table row, each tagged with the table it came from so the
# origin can later be used as a keyword filter
acne_documents = [
    {**row, 'source': 'acne_types'}
    for row in acne_types_df.to_dict(orient='records')
]
faq_documents = [
    {**row, 'source': 'faqs'}
    for row in faqs_df.to_dict(orient='records')
]

all_documents = acne_documents + faq_documents

# Search index: free-text columns from both tables, plus 'source' as an
# exact-match keyword field
index = Index(
    text_fields=[
        # columns of the acne-types table
        'Acne Type',
        'Description',
        'Common Locations',
        'Common Causes',
        'Initial Treatment',
        'OTC Ingredients',
        'Skincare Recommendations',
        'Skincare Ingredients to Avoid',
        'When to Consult Dermatologist',
        'Expected Timeline',
        'Combination Considerations',
        'Skin Type Adjustments',
        'Age-Specific Considerations',
        # columns of the FAQ table
        'Question',
        'Answer',
        'Category',
    ],
    keyword_fields=['source'],
)

# Build the index over every document (acne types followed by FAQs)
index.fit(all_documents)

<scripts.minsearch.Index at 0x1d649a088c0>

In [5]:
# Part 1: Data Generation for Retrieval Evaluation

# Define prompt template for generating questions.
# `{record_text}` is the only placeholder filled in by str.format(); the doubled
# braces around the JSON example are format escapes that render as a literal
# `{` and `}` in the final prompt. The trailing .strip() drops the newlines
# that sit directly inside the triple quotes.
acne_prompt_template = """
You emulate a user of our dermatology assistant application focused on acne diagnosis and treatment.
Formulate 5 questions this user might ask based on the provided acne information.
Make the questions specific to this acne type or FAQ.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record.

The record:
{record_text}

Provide the output in parsable JSON without using code blocks:
{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

# Function to call LLM (using Ollama in this case)
def call_llm(
    prompt: str,
    model: str = "qwen2:7b",
    base_url: str = "http://localhost:11434",
    timeout: float = 120,
) -> str:
    """Send a single, non-streaming generation request to an Ollama server.

    Args:
        prompt: Full prompt text to send to the model.
        model: Name of the Ollama model to use.
        base_url: Root URL of the Ollama server; ``/api/generate`` is appended.
            Defaults to the local server the notebook was written against.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``response`` field of the server's JSON reply, or
        ``'No response generated'`` if that field is absent. This function
        never raises: a non-200 reply yields ``"Error: <status> - <body>"``
        and any exception yields ``"Error connecting to Ollama: <details>"``,
        so callers must be prepared to receive an error message instead of
        model output.
    """
    try:
        # Built inside the try block so that a malformed base_url is reported
        # through the same error-string channel as a failed request.
        endpoint = f"{base_url.rstrip('/')}/api/generate"
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,  # ask for one complete JSON reply, not chunks
        }
        response = requests.post(endpoint, json=payload, timeout=timeout)

        if response.status_code != 200:
            return f"Error: {response.status_code} - {response.text}"
        return response.json().get('response', 'No response generated')
    except Exception as e:
        # Deliberately broad: the notebook treats any failure as a soft error
        # and keeps iterating over the remaining documents.
        return f"Error connecting to Ollama: {str(e)}"

# Function to format a document as text
def format_doc_text(doc):
    """Render a knowledge-base record as ``Label: value`` lines for the prompt.

    Args:
        doc: A record dict. Its ``'source'`` entry selects the layout:
            ``'acne_types'`` or ``'faqs'``.

    Returns:
        One ``"<field>: <value>"`` line per field, joined with newlines.
        For ``'acne_types'`` only the eight fields listed below are included
        (the remaining acne-type columns are left out of the prompt text).
        Records with any other ``'source'`` fall back to ``str(doc)``.

    Missing keys, ``None`` and float NaN are all rendered as an empty value.
    pandas reads empty CSV cells as NaN, which would otherwise end up in the
    prompt as the literal text "nan".
    """
    fields_by_source = {
        'acne_types': (
            'Acne Type',
            'Description',
            'Common Locations',
            'Common Causes',
            'Initial Treatment',
            'OTC Ingredients',
            'Skincare Recommendations',
            'Skincare Ingredients to Avoid',
        ),
        'faqs': ('Question', 'Answer', 'Category'),
    }

    def clean(value):
        # NaN is the only float that is not equal to itself; this avoids
        # needing numpy/pandas just to detect a missing cell.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return value

    source = doc.get('source')
    # Guard the lookup so an unhashable 'source' value cannot raise.
    field_names = fields_by_source.get(source) if isinstance(source, str) else None
    if field_names is None:
        return str(doc)

    return "\n".join(f"{name}: {clean(doc.get(name, ''))}" for name in field_names)

# Function to generate questions for a document
def generate_questions(doc):
    """Ask the LLM for user-style questions about one knowledge-base record.

    The record is rendered with ``format_doc_text``, inserted into
    ``acne_prompt_template`` and sent through ``call_llm``. The reply is
    expected to contain a JSON object of the form
    ``{"questions": ["...", ...]}``; surrounding prose or markdown code
    fences are tolerated.

    Args:
        doc: A knowledge-base record dict.

    Returns:
        The non-empty string entries of the ``"questions"`` list. An empty
        list is returned (after printing a short diagnostic) when no JSON
        object can be recovered from the reply, e.g. when ``call_llm``
        returned an error message instead of model output. An empty list is
        also returned when ``"questions"`` is missing or is not a list.
    """
    doc_text = format_doc_text(doc)
    prompt = acne_prompt_template.format(record_text=doc_text)

    response = call_llm(prompt)

    parsed = _extract_json_object(response) if isinstance(response, str) else None
    if parsed is None:
        # Report and skip this record instead of aborting the whole run.
        print(f"Failed to parse JSON for document with ID: {doc.get('Acne Type', doc.get('Question', ''))}")
        print(f"Response snippet: {str(response)[:200]}...")
        return []

    questions = parsed.get('questions', [])
    if not isinstance(questions, list):
        return []
    # Keep only usable question strings; anything else would end up as junk
    # rows in the evaluation table.
    return [q for q in questions if isinstance(q, str) and q.strip()]


def _extract_json_object(text):
    """Return the first JSON object (as a dict) embedded in ``text``, else ``None``.

    Decoding is attempted at every ``{`` in turn, so leading chatter, markdown
    code fences and trailing text around the object are ignored without any
    string surgery.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


In [6]:
# Run question generation over the whole knowledge base (acne types + FAQs).
# For a quicker trial run, slice the inputs instead, e.g.
# acne_documents[:5] + faq_documents[:5].
acne_docs = acne_documents + faq_documents

# Map each record's identifier (its 'Acne Type', else its FAQ 'Question') to
# the questions generated for it; records that yield nothing are left out.
results = {}
for doc in tqdm(acne_docs, desc="Generating questions"):
    questions = generate_questions(doc)
    if not questions:
        continue
    doc_id = doc.get('Acne Type', doc.get('Question', ''))
    results[doc_id] = questions


Generating questions: 100%|██████████| 28/28 [01:49<00:00,  3.91s/it]


In [7]:
# Flatten the {document id: [questions]} mapping into one record per question
final_results = [
    {'id': doc_id, 'question': question}
    for doc_id, generated in results.items()
    for question in generated
]

# Tabular view of the (id, question) records
df_questions = pd.DataFrame(final_results)



In [8]:
# Display the generated question table (one row per document id / question pair)
df_questions

Unnamed: 0,id,question
0,Papule,What are common locations for papule acne?
1,Papule,Which treatments are initially recommended for...
2,Papule,Can over-the-counter products help with papule...
3,Papule,What skincare ingredients should I avoid when ...
4,Papule,How can I prevent or manage the inflammation a...
...,...,...
135,Is Acne Sense free to use?,Is Acne Sense free?
136,Is Acne Sense free to use?,What does the free version of Acne Sense offer?
137,Is Acne Sense free to use?,Do I need a subscription for premium features ...
138,Is Acne Sense free to use?,How can I track my progress with Acne Sense?


In [11]:
# Load the evaluation question set from a semicolon-delimited CSV and label
# its two columns 'id' and 'question'.
# NOTE(review): with `names=` given and no `header=`, pandas treats every row
# of the file, including the first, as data. Confirm the CSV has no header row.
# NOTE(review): the step that writes this file is not visible in this part of
# the notebook. Confirm it exists so a fresh Restart & Run All can reach here.
df_eval_questions = pd.read_csv('../data/knowledge-base/acne_retrieval_eval_questions.csv', 
                               sep=';',
                               names=['id', 'question'])