In [2]:
import pandas as pd
from transformers import pipeline
import ast
from extractor import ExtractingPrompt
from LLM import LLM
from tqdm import tqdm
import re
import numpy as np

import warnings
warnings.filterwarnings('ignore')

2025-04-05 22:08:15.489247: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-05 22:08:15.499836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743883695.513566 3994732 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743883695.517916 3994732 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-05 22:08:15.531384: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [13]:
df = pd.read_csv('/home/laajila/mima_newcode/clean_code/Final_dataset_generated_one_symptom.csv')

In [14]:
df.shape

(1313, 11)

In [9]:
ctcae = pd.read_excel('PRO-CTCAE_Questionnaire_Terminology.xls', sheet_name = 'PRO')
symptoms_list = ctcae['PRO-CTCAE PT'].unique()[:-25]

In [10]:
def extract_symptom_scores(output_str):
    # This pattern matches a key enclosed in single or double quotes followed by a colon and a number (integer or float)
    pattern = r'["\']([^"\']+)["\']\s*:\s*([0-9]*\.?[0-9]+)'
    matches = re.findall(pattern, output_str)
    # Convert the extracted values to float and build the dictionary
    return {key: float(value) for key, value in matches}

In [None]:
def calculate_accuracy(df_results):
        """
        Calculate accuracy of symptom extraction, case-insensitive and with proper handling of extracted symptoms.
        
        Args:
            df_results: DataFrame containing columns 'True_Symptom' and 'Extracted_Symptom'
            
        Returns:
            accuracy: Float between 0 and 1 representing match accuracy
        """
        score = 0
        total = len(df_results)
        
        if total == 0:
            return 0.0  # handle empty dataframe case
        
        for i in range(total):
            true_symptom = str(df_results.iloc[i]['True_Symptom']).strip().lower()
            
            try:
                # Safely handle the extracted symptom (assuming it might be a string representation of a list)
                extracted = df_results.iloc[i]['Extracted_Symptom']
                
                # Handle different possible formats:
                if isinstance(extracted, str):

                    # Try to evaluate if it's a string representation of a list
                    if extracted.startswith('[') and extracted.endswith(']'):
                        extracted_list = [s.strip().lower() for s in extracted[1:-1].split(',')]
                    else:
                        extracted_list = [extracted.strip().lower()]

                elif isinstance(extracted, (list, tuple)):
                    extracted_list = [str(s).strip().lower() for s in extracted]
                    
                else:
                    extracted_list = [str(extracted).strip().lower()]
                    
                # Check if true symptom exists in extracted list
                if true_symptom in extracted_list:
                    score += 1
                    
            except Exception as e:
                print(f"Error processing row {i}: {e}")
                continue
        
        accuracy = score / total
        return accuracy

In [11]:
class ExtractingPrompt:
    """
    A class to generate prompts for extracting structured symptoms from patient dialogues.
    """
    def __init__(self, symptom_list: list[str]) -> None:
        """
        Initialize the prompt generator with a predefined list of symptoms.
        """
        self.symptom_list = symptom_list

    def build_extraction_prompt(self, dialogue: str) -> list[dict[str, str]]:
        
        symptoms_str = ", ".join(self.symptom_list)
        
        messages = [
            {
                "role": "system",
                "content": (
                    "You are a medical symptom extraction expert. Your task is to analyze patient dialogues "

                    "You are an AI assistant specialized in extracting medical symptoms. "
                    "Given a patient dialogue, identify which symptoms from the provided list are mentioned with high precision using these rules:\n"
                    " STRICT MATCHING: Only output symptoms that are mentioned in the symptoms list {symptoms_str}.\n"
                    " CONTEXT AWARENESS: Consider negation (e.g., 'no fever') and temporal aspects (e.g., 'had headache yesterday')\n"
                    " CONFIDENCE SCORING: Assign scores using this scale:\n"
                    "   - 0.9-1.0: Explicitly mentioned (e.g., 'I have fever')\n"
                    "   - 0.7-0.8: Strongly implied (e.g., 'my head is pounding' → headache)\n"
                    "   - 0.5-0.6: Weakly implied (only include if no better matches exist)\n"
                    "4. OUTPUT FORMAT: Strict JSON with {symptom: score} pairs, no additional text\n\n"
                    "Example Output:\n"
                    "{\"fever\": 0.95, \"headache\": 0.8}\n\n"
                    "Available symptoms: " + symptoms_str
                )
            },
            {
                "role": "user",
                "content": (
                    "DIALOGUE:\n\"\"\"\n" + dialogue.strip() + "\n\"\"\"\n\n"
                    "Analyze this dialogue and extract symptoms according to the rules above. "
                    "Output ONLY valid JSON with no additional commentary."
                )
            }
        ]
        return messages

## Import the LLM model : Biollama 

In [4]:
model = LLM(model_name="iRASC/BioLlama-Ko-8B", max_length=50)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


In [None]:
extractor = ExtractingPrompt(symptoms_list)

results = []

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    # Extract symptoms with scores > 0.80
    symptom_scores = extract_symptom_scores(symptoms_extracted_llm)

    symptoms_extracted = [symptom for symptom, score in symptom_scores.items() if score > 0.80]
    

    # symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
    #                      list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

        
    # Format the output appropriately
    if len(symptoms_extracted) == 0:
        formatted_symptoms = None  # or "" if you prefer empty string
    elif len(symptoms_extracted) == 1:
        formatted_symptoms = symptoms_extracted[0]  # single symptom as string
    else:
        formatted_symptoms = ", ".join(symptoms_extracted)  # multiple as comma-separated string
    

    true_symptom = df['Symptom'][i]
    
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": formatted_symptoms})
    
    df_results = pd.DataFrame(results)

    df_results.to_csv("Final_Extracting_one_symptom.csv")

9it [00:19,  2.17s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
1313it [47:38,  2.18s/it]


In [15]:
calculate_accuracy(df_results)

0.8141660319878141

In [18]:
model = LLM(model_name="meta-llama/Llama-3.2-3B-Instruct", max_length=50)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


In [None]:
extractor = ExtractingPrompt(symptoms_list)

results = []

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    # Extract symptoms with scores > 0.80
    symptom_scores = extract_symptom_scores(symptoms_extracted_llm)

    symptoms_extracted = [symptom for symptom, score in symptom_scores.items() if score > 0.80]

    # Format the output appropriately
    if len(symptoms_extracted) == 0:
        formatted_symptoms = None  # or "" if you prefer empty string
    elif len(symptoms_extracted) == 1:
        formatted_symptoms = symptoms_extracted[0]  # single symptom as string
    else:
        formatted_symptoms = ", ".join(symptoms_extracted)  # multiple as comma-separated string
    

    true_symptom = df['Symptom'][i]
    
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": formatted_symptoms})
    
    df_results = pd.DataFrame(results)

    df_results.to_csv("Final_Extracting_one_symptom_llama7b.csv")

10it [00:19,  1.93s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
452it [13:15,  1.52s/it]

In [3]:
df_results = pd.read_csv('Final_Extracting_one_symptom_llama7b.csv')

In [5]:
calculate_accuracy(df_results)

0.4314159292035398