In [17]:
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv

from simple_xml_lookup import lookup_alphabetical_index, lookup_tabular_list
from descriptions import EXTRACT_ICD10_SYSTEM_PROMPT, PREPROCESSING_PROMPT, CONFIDENCE_SCORING_PROMPT
from openai_tools_converter import ICD10ContextVariables, execute_function_with_context, extract_tool_call_from_response, get_openai_tools_for_icd10




In [4]:
# Relative paths to the two XML files used by the lookup helpers
# (file names suggest the ICD-10-CM 2025 alphabetical index and tabular list).
alphabetical_index_path = "icd10cm_index_2025.xml" 
tabular_list_path = "icd10cm_tabular_2025.xml"
# Try the alphabetical-index lookup with a sample term. `query` is not
# referenced by any of the later cells shown in this notebook.
query = lookup_alphabetical_index("diabetes", xml_file_path=alphabetical_index_path)

In [5]:
# Try the tabular-list lookup with a sample code. `table` is not referenced by
# any of the later cells shown in this notebook.
table = lookup_tabular_list("E11.01", xml_file_path=tabular_list_path)

In [18]:
# Model name passed to every chat-completions call in this notebook.
# Alternative that was tried here: "gemini-2.5-flash".
MODEL_ID = "gpt-4.1"

# Tool descriptions are substituted into the extraction system prompt template.
tools = get_openai_tools_for_icd10()
SYSTEM_PROMPT = EXTRACT_ICD10_SYSTEM_PROMPT.substitute(tools=tools)

# Cap on the number of model turns in the extraction loop for one transcript.
MAX_TURNS = 50

# Load variables from a .env file (if present) before reading the API key.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

def get_response(string, messages, system_prompt=SYSTEM_PROMPT):
    """Send the conversation to the chat model and return its reply.

    Args:
        string: Transcript text. Only used to seed the conversation when
            ``messages`` is empty.
        messages: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts. An empty list starts a new conversation.
        system_prompt: System prompt used when seeding a new conversation.
            Defaults to the module-level ``SYSTEM_PROMPT``.

    Returns:
        A ``(response, messages)`` tuple: the raw chat-completions response and
        the message list that was sent (the freshly seeded list when the input
        was empty, otherwise the list object that was passed in).
    """
    client = OpenAI(api_key=openai_api_key)

    if not messages:
        # Honor the caller-supplied prompt; the global is only the default.
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Here is the transcript: {string}\n\nWhat's your first step?"},
        ]

    response = client.chat.completions.create(
        model=MODEL_ID,
        messages=messages,
    )
    return response, messages
    







In [7]:
def preprocess_transcript(transcript):
    """
    Pipeline step one: ask the model to condense a raw transcript.

    Sends ``transcript`` to the chat model under ``PREPROCESSING_PROMPT`` and
    returns the text content of the first completion choice.
    """
    user_request = f"Please extract the relevant medical information from this transcript and remove all fluff:\n\n{transcript}"
    conversation = [
        {"role": "system", "content": PREPROCESSING_PROMPT},
        {"role": "user", "content": user_request},
    ]

    completion = OpenAI(api_key=openai_api_key).chat.completions.create(
        model=MODEL_ID,
        messages=conversation,
    )
    return completion.choices[0].message.content


In [8]:
# The only confidence labels the scoring prompt asks the model to produce.
_ALLOWED_CONFIDENCE = ("confident", "requires_human_review")


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    stripped = (text or "").strip()
    if stripped.startswith("```"):
        # Drop the opening fence line, which may carry a language tag ("```json").
        _first_line, newline, rest = stripped.partition("\n")
        stripped = rest if newline else stripped[3:]
        stripped = stripped.rstrip()
        if stripped.endswith("```"):
            stripped = stripped[:-3]
    return stripped.strip()


def _find_assessment(position, entry, assessments):
    """Pick the assessment for ``entry``: ``entry_index`` match first, ICD-10 code as fallback."""
    candidates = [item for item in assessments if isinstance(item, dict)]
    for candidate in candidates:
        if candidate.get('entry_index') == position:
            return candidate
    code = entry.get('icd10_code')
    if code is not None:
        for candidate in candidates:
            if candidate.get('icd10_code') == code:
                return candidate
    return None


def _with_confidence(entry, reasoning, confidence):
    """Return a shallow copy of ``entry`` with the two confidence fields set."""
    scored = entry.copy()
    scored['confidence_reasoning'] = reasoning
    scored['confidence'] = confidence
    return scored


def add_confidence_scores(chat_history, entries):
    """
    Post-processing step: Add confidence scores to each ICD-10 entry based on chat history

    Args:
        chat_history: List of ``{"role": ..., "content": ...}`` message dicts
            from the extraction conversation.
        entries: List of entry dicts; ``icd10_code``, ``icd10_condition_name``
            and ``reasoning`` are read with defaults when absent.

    Returns:
        A new list with one shallow copy per entry, each carrying
        ``confidence_reasoning`` and ``confidence``. The input dicts are not
        modified. Entries default to ``"requires_human_review"`` when the model
        reply cannot be parsed, when no assessment matches, or when the
        assessment carries an unexpected label. An empty ``entries`` returns
        ``[]`` without calling the model.
    """
    import json  # kept local: json is not imported by the notebook's import cell

    if not entries:
        # Nothing to score, so skip the model call entirely.
        return []

    client = OpenAI(api_key=openai_api_key)

    # Format the chat history and entries for review
    chat_summary = "\n".join(f"{msg['role']}: {msg['content']}" for msg in chat_history)

    # Format entries with codes for matching
    entries_for_review = [
        {
            "entry_index": i,
            "icd10_code": entry.get('icd10_code', 'Unknown'),
            "icd10_condition_name": entry.get('icd10_condition_name', 'Unknown'),
            "reasoning": entry.get('reasoning', 'No reasoning provided')
        }
        for i, entry in enumerate(entries)
    ]

    messages = [
        {"role": "system", "content": CONFIDENCE_SCORING_PROMPT},
        {"role": "user", "content": f"""
Please review this ICD-10 coding conversation and assign confidence scores to each final entry.

CHAT HISTORY:
{chat_summary}

FINAL ENTRIES TO REVIEW:
{entries_for_review}

Please return ONLY the confidence assessments as a JSON list in the same order as the entries, with format:
[
    {{
        "entry_index": 0,
        "icd10_code": "E11.9",
        "confidence_reasoning": "brief explanation of why you assigned this confidence level",
        "confidence": "confident" or "requires_human_review"
    }},
    ...
]
"""}
    ]

    response = client.chat.completions.create(
        model=MODEL_ID,
        messages=messages,
    )

    try:
        # Models often wrap JSON in a Markdown fence; remove it before parsing.
        confidence_assessments = json.loads(_strip_code_fence(response.choices[0].message.content))
        if not isinstance(confidence_assessments, list):
            raise ValueError(f"expected a JSON list, got {type(confidence_assessments).__name__}")
    except Exception as e:
        print(f"Failed to parse confidence scores: {e}")
        # Fallback: return original entries with default confidence
        return [
            _with_confidence(entry, "Confidence scoring failed", "requires_human_review")
            for entry in entries
        ]

    enhanced_entries = []
    for i, entry in enumerate(entries):
        matching_assessment = _find_assessment(i, entry, confidence_assessments)
        if matching_assessment is None:
            # Fallback if no match found
            enhanced_entries.append(
                _with_confidence(entry, "Unable to match confidence assessment", "requires_human_review")
            )
            continue

        # A missing or unexpected label is routed to human review rather than
        # discarding every other assessment in the reply.
        confidence = matching_assessment.get('confidence')
        if confidence not in _ALLOWED_CONFIDENCE:
            confidence = "requires_human_review"
        reasoning = matching_assessment.get('confidence_reasoning') or "No confidence reasoning provided"
        enhanced_entries.append(_with_confidence(entry, reasoning, confidence))

    return enhanced_entries

In [9]:
# Load the dataset and preview one row's reference answer.
DATASET_CSV = "~/Downloads/Test_Project_ICD10_Dataset.csv"
df = pd.read_csv(DATASET_CSV)
index = 3  # row whose reference answer is printed below
print(df.columns)
print(df["reference_answer"].iloc[index])

Index(['encounter_id', 'age', 'age_unit', 'sex', 'visit_reason',
       'reference_answer', 'transcript'],
      dtype='object')
J98.8 Wheezing-associated respiratory infection (WARI)
Z23 Encounter for immunization
J45.990 Exercise-induced RAD (reactive airway disease)


In [10]:
def get_icd10_codes_with_chat_history(transcript):
    """
    Run the tool-calling extraction loop and return both entries and chat history.

    Each turn asks the model for its next step via ``get_response``, looks for
    a tool call in the reply, executes it against ``context_vars`` and feeds
    the result back as the next user message. The loop ends when a reply
    contains "stop" or after ``MAX_TURNS`` model calls.

    Args:
        transcript: Transcript text used to seed the conversation.

    Returns:
        ``(entries, messages)``: ``context_vars.entries`` as left by the
        executed tool calls, and the accumulated message list. The reply that
        ends the loop is not appended to ``messages``.
    """
    messages = []
    context_vars = ICD10ContextVariables(
        xml_file_path_alphabetical=alphabetical_index_path,
        xml_file_path_tabular=tabular_list_path,
        entries=[]
    )

    # range(MAX_TURNS) makes the cap exact; the previous `counter > MAX_TURNS`
    # check allowed one extra model call.
    for counter in range(MAX_TURNS):
        response, messages = get_response(transcript, messages)
        # Message content can be None; treat that as an empty reply.
        text = response.choices[0].message.content or ""
        # NOTE(review): this is a substring match, so ordinary words such as
        # "stopped" also end the loop; confirm against the system prompt.
        if "stop" in text:
            break
        print("text", counter, text)
        tool_call = extract_tool_call_from_response(text)
        print("tool_call", counter, tool_call)
        if tool_call:
            result = execute_function_with_context(tool_call["function_name"], tool_call["arguments"], context_vars)
            print("result", counter, result)
            tool_call_message = {"role": "user", "content": f"Function returned: {result}. What's your next step?"}
        else:
            tool_call_message = {"role": "user", "content": "No function call found. What's your next step?"}

        messages += [
            {"role": "assistant", "content": text},
            tool_call_message
        ]

    return context_vars.entries, messages

def get_icd10_codes_with_full_pipeline(transcript):
    """
    End-to-end run: clean the transcript, extract ICD-10 entries, then score them.

    Progress for each stage is printed; the confidence-scored entries are returned.
    """
    divider = "\n" + "=" * 50 + "\n"

    # Stage 1: reduce the transcript to the relevant medical content.
    print("Step 1: Preprocessing transcript to extract relevant medical information...")
    cleaned_transcript = preprocess_transcript(transcript)
    print(f"Cleaned transcript: {cleaned_transcript}")
    print(divider)

    # Stage 2: run the extraction loop, keeping the conversation for scoring.
    print("Step 2: Extracting ICD-10 codes from cleaned transcript...")
    extraction = get_icd10_codes_with_chat_history(cleaned_transcript)
    entries, chat_history = extraction
    print(f"Extracted {len(entries)} entries")
    print(divider)

    # Stage 3: attach a confidence label to every extracted entry.
    print("Step 3: Adding confidence scores to entries...")
    scored_entries = add_confidence_scores(chat_history, entries)
    print(f"Added confidence scores to {len(scored_entries)} entries")

    return scored_entries

def extract_confidence_summary(entries):
    """
    Summarise the confidence labels of ``entries`` as one compact string.

    Returns "no_entries" when ``entries`` is empty or None; otherwise
    "<confident>_confident_<review>_review_<total>_total", where the counts
    cover the 'confident' and 'requires_human_review' labels respectively.
    """
    if not entries:
        return "no_entries"

    labels = [item.get('confidence') for item in entries]
    n_confident = labels.count('confident')
    n_review = labels.count('requires_human_review')

    return f"{n_confident}_confident_{n_review}_review_{len(entries)}_total"

In [14]:
# Resume-aware batch run: reload whatever an earlier run saved, process only
# the rows that are still missing results, and save after every row.
RESULTS_CSV = "test_run_with_confidence.csv"


def _is_missing(value):
    """Return True for None and for the float NaN that pandas yields for empty CSV cells."""
    return value is None or (isinstance(value, float) and pd.isna(value))


def _saved_column(saved_df, column, n_rows):
    """Return ``column`` of ``saved_df`` as a list, or ``n_rows`` Nones when it is unavailable."""
    if saved_df is not None and column in saved_df.columns:
        return saved_df[column].tolist()
    return [None] * n_rows


# Load existing entries
try:
    existing_df = pd.read_csv(RESULTS_CSV)
except FileNotFoundError:
    existing_df = None  # first run: nothing has been saved yet
except Exception as e:
    # Still best-effort, but report why the saved results are being ignored.
    print(f"Could not read {RESULTS_CSV} ({e}); starting from scratch")
    existing_df = None

if existing_df is not None and len(existing_df) != len(df):
    # Fail before any model call: the per-row lists are assigned as columns of
    # a copy of df below, which requires matching lengths.
    raise ValueError(
        f"{RESULTS_CSV} has {len(existing_df)} rows but the dataset has {len(df)}"
    )

entries = _saved_column(existing_df, "entries", len(df))
confidences = _saved_column(existing_df, "confidence_summary", len(df))
# Reload saved cleaned transcripts too, so saving progress does not blank them
# out for rows that are not reprocessed in this run.
cleaned_transcripts = _saved_column(existing_df, "cleaned_transcript", len(df))

# Find indices that need processing (either empty entries or missing confidence)
indices_to_process = [
    i for i in range(len(df))
    if entries[i] == "[]" or _is_missing(entries[i])
    or _is_missing(confidences[i]) or confidences[i] == "no_entries"
]

print(f"Found {len(indices_to_process)} entries to process: {indices_to_process}")

# Process entries with full pipeline
for position, row_index in enumerate(indices_to_process, start=1):
    try:
        print(f"Processing index {row_index} with full pipeline...")

        # Step 1: Get the cleaned transcript from preprocessing
        cleaned_transcript = preprocess_transcript(df.transcript.iloc[row_index])

        # Step 2: Get ICD-10 codes from the cleaned transcript
        entries_result, chat_history = get_icd10_codes_with_chat_history(cleaned_transcript)

        # Step 3: Add confidence scores
        scored_entries = add_confidence_scores(chat_history, entries_result)

        # Store all outputs for this row
        entries[row_index] = scored_entries
        confidences[row_index] = extract_confidence_summary(scored_entries)
        cleaned_transcripts[row_index] = cleaned_transcript

        print(f"Successfully processed index {row_index}")
        print(f"Confidence summary: {confidences[row_index]}")

    except Exception as e:
        print(f"Error processing index {row_index}: {e}")
        entries[row_index] = []
        confidences[row_index] = "processing_failed"
        cleaned_transcripts[row_index] = "preprocessing_failed"

    # Save after each iteration
    temp_df = df.copy()
    temp_df["entries"] = entries
    temp_df["confidence_summary"] = confidences
    temp_df["cleaned_transcript"] = cleaned_transcripts
    temp_df.to_csv(RESULTS_CSV, index=False)
    print(f"Saved progress: {position}/{len(indices_to_process)} entries processed")

print("Processing complete!")

Found 3 entries to process: [21, 24, 27]
Processing index 21 with full pipeline...
text 0 Step 1: Write out all conditions that might apply to the patient.

Based on the transcript, the following chronic and acute conditions or relevant findings are identified:
1. Hypertension (well controlled)
2. Diabetes mellitus (not on insulin)
3. History of venous blood clot in the head (cerebral venous thrombosis? Cranial blood clot)
4. Peripheral vascular disease (PVD) with a history of femoral-popliteal bypass surgery
5. History of mechanical fall with resultant pain (“pain in butt” after fall, may represent contusion)
6. Dyslipidemia (cholesterol well managed)
7. Long-term (chronic) use of anticoagulants (warfarin) for blood clot
8. Long-term (chronic) use of aspirin for vascular/heart protection
9. Reduced activity level, mainly due to leg issues (possibly claudication or residual vascular compromise)
10. Chronic pain/complications related to post-surgical state or vascular disease
11. Allerg

In [None]:
# Retry rows of test_run_1.csv whose saved entries are empty or missing.
# Load existing entries
try:
    existing_df = pd.read_csv("test_run_1.csv")
except FileNotFoundError:
    existing_df = None  # nothing saved yet
except Exception as e:
    # Still best-effort, but report why the saved results are being ignored.
    print(f"Could not read test_run_1.csv ({e}); starting from scratch")
    existing_df = None

if existing_df is not None and "entries" in existing_df.columns:
    entries = existing_df["entries"].tolist()
else:
    entries = [None] * len(df)

if len(entries) != len(df):
    # The list is assigned as a column of a copy of df below, which requires
    # matching lengths; fail before making any model call.
    raise ValueError(f"test_run_1.csv has {len(entries)} rows but the dataset has {len(df)}")

# Find indices that have empty lists (failed entries). Empty CSV cells are
# loaded by pandas as float NaN, so check for that as well as None.
failed_indices = [
    i for i, entry in enumerate(entries)
    if entry == "[]" or entry is None or (isinstance(entry, float) and pd.isna(entry))
]

print(f"Found {len(failed_indices)} failed entries to reprocess: {failed_indices}")

# Reprocess only the failed indices
for position, row_index in enumerate(failed_indices, start=1):
    try:
        print(f"Reprocessing failed index {row_index}")
        # `get_icd10_codes` is neither defined nor imported in the cells shown
        # in this notebook; the chat-history variant returns (entries,
        # messages), so keep only the entries.
        extracted_entries, _chat_history = get_icd10_codes_with_chat_history(df.transcript.iloc[row_index])
        entries[row_index] = extracted_entries
        print(f"Successfully processed index {row_index}")
    except Exception as e:
        print(f"Error reprocessing index {row_index}: {e}")
        entries[row_index] = []  # Keep as empty list if it fails again

    # Save after each iteration
    temp_df = df.copy()
    temp_df["entries"] = entries
    temp_df.to_csv("test_run_1.csv", index=False)
    print(f"Saved progress: {position}/{len(failed_indices)} failed entries processed")

Found 0 failed entries to reprocess: []


In [None]:
# Another reprocessing pass over test_run_1.csv (same logic as the preceding
# cell): retries rows whose saved entries are still empty or missing.
# Load existing entries
try:
    existing_df = pd.read_csv("test_run_1.csv")
except FileNotFoundError:
    existing_df = None  # nothing saved yet
except Exception as e:
    # Still best-effort, but report why the saved results are being ignored.
    print(f"Could not read test_run_1.csv ({e}); starting from scratch")
    existing_df = None

if existing_df is not None and "entries" in existing_df.columns:
    entries = existing_df["entries"].tolist()
else:
    entries = [None] * len(df)

if len(entries) != len(df):
    # The list is assigned as a column of a copy of df below, which requires
    # matching lengths; fail before making any model call.
    raise ValueError(f"test_run_1.csv has {len(entries)} rows but the dataset has {len(df)}")

# Find indices that have empty lists (failed entries). Empty CSV cells are
# loaded by pandas as float NaN, so check for that as well as None.
failed_indices = [
    i for i, entry in enumerate(entries)
    if entry == "[]" or entry is None or (isinstance(entry, float) and pd.isna(entry))
]

print(f"Found {len(failed_indices)} failed entries to reprocess: {failed_indices}")

# Reprocess only the failed indices
for position, row_index in enumerate(failed_indices, start=1):
    try:
        print(f"Reprocessing failed index {row_index}")
        # `get_icd10_codes` is neither defined nor imported in the cells shown
        # in this notebook; the chat-history variant returns (entries,
        # messages), so keep only the entries.
        extracted_entries, _chat_history = get_icd10_codes_with_chat_history(df.transcript.iloc[row_index])
        entries[row_index] = extracted_entries
        print(f"Successfully processed index {row_index}")
    except Exception as e:
        print(f"Error reprocessing index {row_index}: {e}")
        entries[row_index] = []  # Keep as empty list if it fails again

    # Save after each iteration
    temp_df = df.copy()
    temp_df["entries"] = entries
    temp_df.to_csv("test_run_1.csv", index=False)
    print(f"Saved progress: {position}/{len(failed_indices)} failed entries processed")

Found 0 failed entries to reprocess: []
