In [29]:
import pandas as pd
from transformers import pipeline
import ast
from extractor import ExtractingPrompt
from LLM import LLM
from tqdm import tqdm
import re
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('/home/laajila/mima_newcode/clean_code/New_generated_dataset_with_biollama8B.csv')

In [42]:
df = pd.read_csv('/home/laajila/mima_newcode/clean_code/New_generated_dataset_with_biollama8B_3.csv')

In [43]:
df.shape

(314, 11)

## ner method :

In [4]:
ner_pipeline = pipeline("ner", model="d4data/biomedical-ner-all", aggregation_strategy="first")


extracted_data = []
for i , dialogue in enumerate(df['Dialogue_Generated']) :
    ner_results = ner_pipeline(dialogue)
    symptoms = [entity["word"] for entity in ner_results if "symptom" in entity["entity_group"].lower()]
    
    extracted_data.append({
        "Dialogue": dialogue,
        "Real symptoms": df['symptom'][i],
        "Extracted_Symptoms": symptoms
    })
    
df_extracted = pd.DataFrame(extracted_data)
df_extracted.head(5)

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,Dialogue,Real symptoms,Extracted_Symptoms
0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,[cracks]
1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,[]
2,"""I am fed up with this bloody cracking at the...",Cracking at the corners of the mouth (cheilosi...,[]
3,"""I have been experiencing a cracking at the c...",Cracking at the corners of the mouth (cheilosi...,[cracking]
4,"""I am fed up with this cracking at the corner...",Cracking at the corners of the mouth (cheilosi...,"[fed, cracking]"


In [6]:
df_extracted

Unnamed: 0,Dialogue,Real symptoms,Extracted_Symptoms
0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,[cracks]
1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,[]
2,"""I am fed up with this bloody cracking at the...",Cracking at the corners of the mouth (cheilosi...,[]
3,"""I have been experiencing a cracking at the c...",Cracking at the corners of the mouth (cheilosi...,[cracking]
4,"""I am fed up with this cracking at the corner...",Cracking at the corners of the mouth (cheilosi...,"[fed, cracking]"
...,...,...,...
818,"""I have been experiencing pain and swelling a...",Pain and swelling at injection site,"[pain, swelling]"
819,"""I'm still in a lot of pain and swelling at t...",Pain and swelling at injection site,"[pain, swelling]"
820,"""Oh, the agony! My arm is on fire and twice a...",Pain and swelling at injection site,[]
821,"""I have a lot of pain and swelling at the inj...",Pain and swelling at injection site,"[pain, swelling]"


In [5]:
nothing_extracted = [1 for i in range(len(df_extracted['Extracted_Symptoms'])) if df_extracted['Extracted_Symptoms'][i]==[]]
len(nothing_extracted)

221

## Evaluation functions :

* If the model extracts extra symptoms, it lowers precision.

* If the model misses true symptoms, recall decreases.

In [6]:
def evaluate_penalization_degree_1(true_symptoms, extracted_symptoms, df_results):
    
    # Compute precision (only penalizes false positives)
    precision_scores = [
        len(pred & true) / len(pred) if pred else 0 
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute recall (penalizes missing true symptoms)
    recall_scores = [
        len(pred & true) / len(true) if true else 0 
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute F1-score (avoiding division by zero)
    f1_scores = [
        (2 * p * r) / (p + r) if (p + r) > 0 else 0
        for p, r in zip(precision_scores, recall_scores)
    ]

    precision = np.mean(precision_scores)
    recall = np.mean(recall_scores)
    f1_score = np.mean(f1_scores)

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}")

Consider True Positives when at least one true symptom is extracted

In [7]:
def evaluate_penalization_degree_0(true_symptoms, extracted_symptoms, df_results):
    
    # Compute True Positives: At least one true symptom is extracted

    true_positives = [
        1 if len(pred & true) > 0 else 0  # If there's at least one correct symptom
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute Precision: Is at least one extracted symptom correct?

    precision_scores = [
        1 if len(pred & true) > 0 else 0  # If at least one match, it's a precision hit
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute Recall: Did the model miss all true symptoms?

    recall_scores = [
        1 if len(pred & true) > 0 else 0  # If at least one match, recall is 1
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute F1-Score (since precision and recall are binary in this case)

    f1_scores = [
        1 if len(pred & true) > 0 else 0  # If at least one match, F1 is 1
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    precision = np.mean(precision_scores)
    recall = np.mean(recall_scores)
    f1_score = np.mean(f1_scores)

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}")

## Import the LLM model : Biollama 

In [6]:
model = LLM(model_name="iRASC/BioLlama-Ko-8B", max_length=50)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


In [7]:
ctcae = pd.read_excel('PRO-CTCAE_Questionnaire_Terminology.xls', sheet_name = 'PRO')
symptoms_list = ctcae['PRO-CTCAE PT'].unique()[:-25]
symptoms_list

array(['Cracking at the corners of the mouth (cheilosis/cheilitis)',
       'Difficulty Swallowing', 'Dry Mouth', 'Hoarseness',
       'Mouth/throat sores', 'Voice quality changes', 'Abdominal pain',
       'Bloating', 'Constipation', 'Decreased appetite', 'Diarrhea',
       'Fecal incontinence', 'Gas', 'Heartburn', 'Hiccups', 'Nausea',
       'Taste Changes', 'Vomiting', 'Coughing', 'Shortness of breath',
       'Wheezing', 'Heart palpitations', 'Swelling (arms or legs)',
       'Acne', 'Bed/pressure sores', 'Hair loss',
       'Hand-foot syndrome (a rash of the hands and feet that can cause cracking, peeling, redness or pain)',
       'Hives', 'Itching', 'Nail discoloration', 'Nail loss',
       'Nail ridging', 'Radiation skin reaction', 'Rash',
       'Sensitivity to sunlight', 'Skin darkening', 'Skin dryness',
       'Stretch marks', 'Dizziness', 'Numbness & tingling',
       'Blurred vision', 'Flashing lights', 'Ringing in ears',
       'Visual floaters', 'Watery eyes', 'Concentra

In [8]:
def extract_symptom_scores(output_str):
    # This pattern matches a key enclosed in single or double quotes followed by a colon and a number (integer or float)
    pattern = r'["\']([^"\']+)["\']\s*:\s*([0-9]*\.?[0-9]+)'
    matches = re.findall(pattern, output_str)
    # Convert the extracted values to float and build the dictionary
    return {key: float(value) for key, value in matches}

In [None]:
class ExtractingPrompt:
    """
    A class to generate prompts for extracting structured symptoms from patient dialogues.
    """
    def __init__(self, symptom_list: list[str]) -> None:
        """
        Initialize the prompt generator with a predefined list of symptoms.
        """
        self.symptom_list = symptom_list

    def build_extraction_prompt(self, dialogue: str) -> list[dict[str, str]]:

        """
        Builds a prompt instructing the LLM to extract symptoms in a structured JSON format.
        """
        # Join your list of 80 symptoms into a comma-separated string.
        symptoms_str = ", ".join(self.symptom_list)
        
        messages = [
            {
                "role": "system",                                                                             
                "content": (

                    "You are an AI assistant specialized in extracting medical symptoms. "
                    "Given a patient dialogue, identify which symptoms from the provided list are mentioned. "
                    "For each detected symptom, assign a confidence score between 0 and 1 indicating how likely it is present. "
                    "Return your response as a JSON object where keys are symptom names (only those detected) and values are the corresponding scores. "
                    "Only include a symptom if its score is above 0. "
                ),
            },
            {
                "role": "user",
                "content": (
                    f"Patient dialogue: \"{dialogue.strip()}\"\n\n"
                    f"Symptom list: [{symptoms_str}]\n\n"
                    "Extract the symptoms and output them as instructed in valid JSON format."
                ),
            },
        ]
        

        return messages

In [20]:
extractor = ExtractingPrompt(symptoms_list)

In [None]:
results = []

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
                         list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

    true_symptom = df['symptom'][i]
    
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": symptoms_extracted
    })
    
    df_results = pd.DataFrame(results)

    # df_results.to_csv("Extracting_symptoms_using_LLM_updated.csv")

88it [03:32,  2.74s/it]

In [23]:
df_explicit_symptom = df[df['Explicit_symptom']==True]
df_explicit_symptom = df_explicit_symptom.reset_index(drop=True)
df_explicit_symptom

Unnamed: 0.1,Unnamed: 0,Dialogue_Generated,symptom,description,meta,language_style,Tone,Detail_level,Enumeration,Explicit_symptom,Spelling_errors
0,1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,Severity,Not sexually active,Vulgar Register,Friendly,3,False,True,True
1,2,"""I've been having this weird thing going on w...",Cracking at the corners of the mouth (cheilosi...,Severity,Prefer not to answer,Informal Register,Neutral,4,False,True,True
2,12,"""I can't swallow nothin' proper like. It's li...",Difficulty Swallowing,Severity,Not applicable,Vulgar Register,Angry,3,False,True,False
3,14,"""I can't swallow nothin' proper like. It's li...",Difficulty Swallowing,Severity,Mild,Vulgar Register,Angry,4,False,True,True
4,21,"""I have a dry mouth all the time and it makes...",Dry Mouth,Severity,Moderate,Neutral/Standard Register,Confused,4,False,True,True
5,28,"""My voice is hoarse and I have difficulty spe...",Hoarseness,Severity,Not applicable,Formal Register,Fearful,4,True,True,False
6,34,"""I have mouth/throat sores.""",Mouth/throat sores,Interference (with daily activities),Not at all,Neutral/Standard Register,Friendly,1,False,True,True
7,38,"""I have mouth/throat sores.""",Mouth/throat sores,Interference (with daily activities),Not at all,Neutral/Standard Register,Confused,2,True,True,False
8,39,"""I've been having these sores in my mouth and...",Mouth/throat sores,Interference (with daily activities),,Informal Register,Confused,2,False,True,False
9,40,"""I've been having these little sores in my mo...",Mouth/throat sores,Interference (with daily activities),Not at all,Informal Register,Friendly,4,False,True,False


In [None]:
# results = []

# for i, phrase in tqdm(enumerate(df_explicit_symptom['Dialogue_Generated'])):

#     prompt = extractor.build_extraction_prompt(phrase)

#     symptoms_extracted_llm = model.generate_text(messages=prompt)

#     symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
#                          list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

#     true_symptom = df_explicit_symptom['symptom'][i]
    
#     results.append({
#         "Dialogue": phrase,
#         "True_Symptom": true_symptom,
#         "Extracted_Symptom": symptoms_extracted
#     })
#     df_results_2 = pd.DataFrame(results)
#     df_results_2.to_csv("Extracting_explicit_symptoms_using_LLM.csv")

0it [00:00, ?it/s]

10it [00:22,  1.94s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
172it [06:55,  2.42s/it]


In [None]:
# df_results_2 = pd.read_csv('Extracting_explicit_symptoms_using_LLM.csv')

In [39]:
true_symptoms = [{el} for el in list(df_results['True_Symptom'])]
true_symptoms

[{'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Hoarseness'},
 {'Hoarseness'},
 {'Hoarseness'},
 {'Hoarseness'},
 {'Hoarseness'},
 {'Hoarseness'},
 {'Hoarseness'},
 {'Hoarse

In [40]:
extracted_symptoms = [el for el in list(df_results['Extracted_Symptom'])]
extracted_symptoms

[['Cracking at the corners of the mouth (cheilosis/cheilitis)'],
 ['Cracking at the corners of the mouth (cheilosis/cheilitis)'],
 ['Cracking at the corners of the mouth (cheilosis/cheilitis)',
  'Difficulty Swallowing',
  'Dry Mouth',
  'Taste Changes'],
 ['Cracking at the corners of the mouth (cheilosis/cheilitis)',
  'Fatigue',
  'Dizziness',
  'Headache'],
 ['Decreased appetite', 'Weight loss', 'Fatigue'],
 ['Cracking at the corners of the mouth (cheilosis/cheilitis)'],
 ['Cracking at the corners of the mouth (cheilosis/cheilitis)'],
 ['Cracking at the corners of the mouth (cheilosis/cheilitis)'],
 ['Difficulty Swallowing', 'Coughing'],
 ['Difficulty Swallowing', 'Dry Mouth', 'Voice quality changes'],
 ['Difficulty Swallowing', 'Dry Mouth', 'Voice quality changes'],
 ['Difficulty Swallowing'],
 ['Difficulty Swallowing'],
 ['Difficulty Swallowing', 'Dry Mouth', 'Voice quality changes'],
 ['Difficulty Swallowing'],
 ['Difficulty Swallowing'],
 ['Dry Mouth'],
 ['Dry Mouth'],
 ['Dry Mo

In [27]:
# extracted_symptoms = [{ast.literal_eval(el)[i] for i in range(len(ast.literal_eval(el)))} for el in list(df_results['Extracted_Symptom'])]
# extracted_symptoms

In [19]:
score = 0
for i in range(len(df_results)) : 
    True_symptom = true_symptoms[i]
    Extracted_Symptom = extracted_symptoms[i]
    
    if True_symptom == Extracted_Symptom :
        score +=1 

accuracy = score / len(df_results)
accuracy

0.31470230862697446

In [41]:
score = 0
for i in range(len(df_results)) : 
    True_symptom = df_results.iloc[i]['True_Symptom']
    #Extracted_Symptom = ast.literal_eval(df_results.iloc[i]['Extracted_Symptom'])[0] # we extract the symptom with the highest score 
    Extracted_Symptom = (df_results.iloc[i]['Extracted_Symptom'])[0] 

    if True_symptom == Extracted_Symptom :
        score +=1 

accuracy = score / len(df_results)
accuracy

0.903448275862069

In [None]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms, df_results)

Precision: 0.4890, Recall: 0.7093, F1-Score: 0.5419


In [21]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms, df_results)

NameError: name 'evaluate_penalization_degree_0' is not defined

### We evaluate the extraction :

Accuracy :

In [3]:
score = 0
for i in range(len(df_results)) : 
    True_symptom = df_results.iloc[i]['True_Symptom']
    Extracted_Symptom = ast.literal_eval(df_results.iloc[i]['Extracted_Symptom'])[0]

    if True_symptom == Extracted_Symptom :
        score +=1 

accuracy = score / len(df_results)
accuracy

0.5054678007290401

In [4]:
true_symptoms = [{el} for el in list(df_results['True_Symptom'])]
extracted_symptoms = [{ast.literal_eval(el)[i] for i in range(len(ast.literal_eval(el)))} for el in list(df_results['Extracted_Symptom'])] 

In [None]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms, df_results)

Precision: 0.4228, Recall: 0.6282, F1-Score: 0.4722


In [128]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms, df_results)

Precision: 0.6282, Recall: 0.6282, F1-Score: 0.6282


## Extracting from a multi_symptoms dataset :

In [10]:
df = pd.read_csv("Many_symptoms_generated.csv")

In [11]:
df.head(2)

Unnamed: 0,Dialogue_Generated,Symptoms,Descriptions,Meta,Language_Style,Tone,Detail_Level,Enumeration,Explicit_Symptom,Spelling_Errors
0,"""My skin is pale and dry, with a rash on my h...","['Voice quality changes', 'Hand-foot syndrome ...","['Presence/Absence', 'Severity', 'Interference...","['Prefer not to answer', 'None', 'Somewhat', '...",Poetic/Literary Register,Angry,3,True,False,False
1,"""I got swollen arms and legs, and I'm super s...","['Swelling (arms or legs)', 'Sensitivity to su...","['Frequency', 'Presence/Absence', 'Severity', ...","['Prefer not to answer', 'Not applicable', 'No...",Informal Register,Angry,1,False,False,True


In [12]:
results = []
ctcae = pd.read_excel('PRO-CTCAE_Questionnaire_Terminology.xls', sheet_name = 'PRO')
symptoms_list = ctcae['PRO-CTCAE PT'].unique()[:-25]
extractor = ExtractingPrompt(symptoms_list)

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
                         list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

    true_symptom = df['Symptoms'][i]
    
    # Append a dictionary with your desired columns
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": symptoms_extracted
    })
    df_results = pd.DataFrame(results)
    df_results.to_csv("Extracting_symptoms_using_LLM_2.csv")

0it [00:00, ?it/s]

9it [00:22,  2.48s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100it [04:30,  2.71s/it]


In [14]:
df_results.head(5)

Unnamed: 0,Dialogue,True_Symptom,Extracted_Symptom
0,"""My skin is pale and dry, with a rash on my h...","['Voice quality changes', 'Hand-foot syndrome ...",[Hand-foot syndrome (a rash of the hands and f...
1,"""I got swollen arms and legs, and I'm super s...","['Swelling (arms or legs)', 'Sensitivity to su...","[Swelling (arms or legs), Sensitivity to sunli..."
2,"""I've been having these itchy red bumps all o...","['Hives', 'Hair loss', 'Painful urination']","[Hair loss, Itchy red bumps, Painful urination]"
3,"""I have been having trouble sleeping lately, ...","['Nail ridging', 'Insomnia', 'Sensitivity to s...",[Nail ridging]
4,"""I'm so scared, I keep seeing flashing lights...","['Flashing lights', 'Bed/pressure sores']","[Mouth/throat sores, Bed/pressure sores, Flash..."


In [75]:
true_symptoms = [set(ast.literal_eval(el)) for el in df_results['True_Symptom']]
extracted_symptoms = [set(el) for el in list(df_results['Extracted_Symptom'])]

In [81]:
score = 0
for i in range(len(df_results)) : 

    True_symptom = true_symptoms[i]
    Extracted_Symptom = extracted_symptoms[i]

    if True_symptom <= Extracted_Symptom : # Subset check (penalizes only missing ones)
        score +=1 

accuracy = score / len(df_results)
accuracy

0.15

In [None]:
score = 0
for i in range(len(df_results)) : 
    True_symptom = df_results.iloc[i]['True_Symptom']
    Extracted_Symptom = df_results.iloc[i]['Extracted_Symptom']

    if True_symptom == Extracted_Symptom :
        score +=1 

accuracy = score / len(df_results)
accuracy

0.0

In [None]:
true_symptoms = [set(ast.literal_eval(el)) for el in df_results['True_Symptom']]
extracted_symptoms = [set(el) for el in list(df_results['Extracted_Symptom'])] 

In [None]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms, df_results)

Precision: 0.5238, Recall: 0.4542, F1-Score: 0.4665


In [None]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms, df_results)

Precision: 0.8200, Recall: 0.8200, F1-Score: 0.8200


## Evaluation :

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, 
    hamming_loss, jaccard_score, average_precision_score, confusion_matrix
)

class MultiLabelEvaluator:
    def _init_(self, y_true, y_pred, class_names=None, threshold=0.5, alpha=5):
        """
        Initializes the evaluator with true labels and predicted probabilities.
        
        Parameters:
            - y_true: np.array, shape (N, C), true binary labels (0 or 1)
            - y_pred: np.array, shape (N, C), predicted probabilities
            - class_names: list, names of classes
            - threshold: float, probability threshold to convert to binary labels
            - alpha: float, parameter for alpha-Softmax/Softmin aggregation
        """
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        self.class_names = class_names if class_names else [f"Class {i}" for i in range(self.y_true.shape[1])]
        self.threshold = threshold
        self.alpha = alpha
        self.y_pred_binary = (self.y_pred >= self.threshold).astype(int)

        # Confusion matrix components for each class
        self.tps, self.fps, self.fns, self.tns = [], [], [], []
        num_labels = self.y_true.shape[1]
        fig, axes = plt.subplots(1, num_labels, figsize=(4 * num_labels, 4))
        for c in range(num_labels):
            cm = confusion_matrix(self.y_true[:, c], self.y_pred_binary[:, c])
            self.tns.append(cm[0, 0])
            self.fps.append(cm[0, 1])
            self.fns.append(cm[1, 0])
            self.tps.append(cm[1, 1])
            ax = axes[c] if num_labels > 1 else axes
            sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", ax=ax)
            ax.set_title(f"{self.class_names[c]}")
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")

        self.tps, self.fps, self.fns, self.tns = map(np.array, [self.tps, self.fps, self.fns, self.tns])
        plt.tight_layout()
        plt.show()

    def compute_binary_metrics(self):
        """Computes AUC, Accuracy, Precision, Recall for each class."""
        aucs, accs, precs, recs = [], [], [], []
        for c in range(self.y_true.shape[1]):
            aucs.append(roc_auc_score(self.y_true[:, c], self.y_pred[:, c]))
            accs.append(accuracy_score(self.y_true[:, c], self.y_pred_binary[:, c]))
            precs.append(precision_score(self.y_true[:, c], self.y_pred_binary[:, c], zero_division=1))
            recs.append(recall_score(self.y_true[:, c], self.y_pred_binary[:, c], zero_division=1))
        return np.array(aucs), np.array(accs), np.array(precs), np.array(recs)

    def aggregate_metrics(self, metrics):
        """Computes average, alpha-Softmax, and alpha-Softmin for a given metric set."""
        avg = np.mean(metrics)
        softmax = np.sum(metrics * np.exp(self.alpha * metrics)) / np.sum(np.exp(self.alpha * metrics))
        softmin = np.sum(metrics * np.exp(-self.alpha * metrics)) / np.sum(np.exp(-self.alpha * metrics))
        return {"Average": avg, "Alpha-Softmax": softmax, "Alpha-Softmin": softmin}

    def compute_false_distributions(self):
        """Computes false positive & false negative distributions across all samples."""
        false_pos_per_sample = np.sum(self.y_pred_binary > self.y_true, axis=1)
        false_neg_per_sample = np.sum(self.y_pred_binary < self.y_true, axis=1)
        return {
            "Avg False Positives Over all samples": np.mean(false_pos_per_sample),
            "Avg False Negatives Over all samples": np.mean(false_neg_per_sample),
            "FP Distribution": false_pos_per_sample,
            "FN Distribution": false_neg_per_sample
        }

    def compute_work_saved(self):
        """Computes the work saved metric."""
        T1 = np.sum(self.y_true, axis=1)  # All Positives (TP + FN)
        T2 = np.sum(self.y_pred_binary != self.y_true, axis=1)  # FN + FP (corrections needed)
        work_saved = 1 - (T2 / np.maximum(T1, 1))  # Avoid division by zero
        return {"Avg Work Saved": np.mean(work_saved)}
    
    def overall_metrics(self):
        """Compute overall multi-label classification metrics, including MAP."""
        overall_metrics = {
            "Hamming Loss": hamming_loss(self.y_true, self.y_pred_binary),
            "Micro F1-score": f1_score(self.y_true, self.y_pred_binary, average='micro'),
            "Macro F1-score": f1_score(self.y_true, self.y_pred_binary, average='macro'),
            "Weighted F1-score": f1_score(self.y_true, self.y_pred_binary, average='weighted'),
            "Jaccard Index (Macro)": jaccard_score(self.y_true, self.y_pred_binary, average='macro'),
            "Subset Accuracy": accuracy_score(self.y_true, self.y_pred_binary),
            "Mean Average Precision (MAP)": np.mean([
                average_precision_score(self.y_true[:, i], self.y_pred_binary[:, i]) 
                for i in range(self.y_true.shape[1]) if np.unique(self.y_true[:, i]).size > 1
            ])
        }
        return overall_metrics
    
    def evaluate(self):
        """Computes and returns all evaluation metrics."""
        aucs, accs, precs, recs = self.compute_binary_metrics()
        metrics = {
            "AUC": self.aggregate_metrics(aucs),
            "Accuracy": self.aggregate_metrics(accs),
            "Precision": self.aggregate_metrics(precs),
            "Recall": self.aggregate_metrics(recs),
        }
        false_distributions = self.compute_false_distributions()
        work_saved = self.compute_work_saved()
        overall_metrics = self.overall_metrics()
        return {**overall_metrics, **metrics, **false_distributions, **work_saved}


class SymptomMultiLabelEvaluator(MultiLabelEvaluator):

    def _init_(self, true_symptoms, extracted_symptoms, symptom_universe=None, threshold=0.5, alpha=5):
        """
        Adapts the MultiLabelEvaluator to work directly with sets of symptoms.
        
        Parameters:
            - true_symptoms: list of sets, each containing the true symptoms for a sentence.
            - extracted_symptoms: list of sets, each containing the symptoms extracted by the model.
            - symptom_universe: list of all possible symptoms. If None, it is computed as the union of all symptoms.
            - threshold: unused here (kept for compatibility), since we already have binary predictions.
            - alpha: parameter for alpha-softmax/softmin aggregation.
        """
        self.true_symptom_sets = true_symptoms
        self.predicted_symptom_sets = extracted_symptoms
        
        if symptom_universe is None:
            # Compute the union of all symptoms from both true and predicted sets
            symptom_universe = sorted(set().union(*true_symptoms, *extracted_symptoms))

        self.symptom_universe = symptom_universe
        
        # Convert sets to binary vectors
        y_true = self.sets_to_binary(true_symptoms, symptom_universe)
        y_pred = self.sets_to_binary(extracted_symptoms, symptom_universe)

        # self.y_true = y_true
        # self.y_pred = y_pred

        super()._init_(y_true, y_pred, class_names=symptom_universe, threshold=threshold, alpha=alpha)

    @staticmethod
    def sets_to_binary(sets_list, symptom_universe):
        """
        Converts a list of symptom sets into a binary matrix.
        
        Parameters:
            - sets_list: list of sets (true or predicted symptoms).
            - symptom_universe: list of all possible symptoms.
        
        Returns:
            - A binary numpy array of shape (number_of_samples, number_of_symptoms)
        """
        num_samples = len(sets_list)
        num_symptoms = len(symptom_universe)
        binary_matrix = np.zeros((num_samples, num_symptoms), dtype=int)
        symptom_to_idx = {sym: idx for idx, sym in enumerate(symptom_universe)}
        
        for i, symptom_set in enumerate(sets_list):
            for sym in symptom_set:
                if sym in symptom_to_idx:
                    binary_matrix[i, symptom_to_idx[sym]] = 1
        return binary_matrix

In [None]:
# Instantiate the evaluator for symptom extraction
evaluator = SymptomMultiLabelEvaluator()
results = evaluator.evaluate()

# Print the evaluation results
for key, value in results.items():
    print(f"{key}: {value}")

AttributeError: 'SymptomMultiLabelEvaluator' object has no attribute 'y_true'