In [2]:
import pandas as pd
from transformers import pipeline
import ast
from extractor import ExtractingPrompt
from LLM import LLM
from tqdm import tqdm
import re
import numpy as np

In [3]:
df = pd.read_csv('/home/laajila/mima_newcode/clean_code/outputs/2025-02-05/14-14-20/output_20250205_144847.csv')

In [4]:
df.head(2)

Unnamed: 0,Dialogue_Generated,symptom,description,meta,language_style,Tone,Detail_level,Enumeration,Explicit_symptom,Spelling_errors
0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,Severity,Very severe,Informal Register,Fearful,1,False,True,True
1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,Severity,Not sexually active,Vulgar Register,Fearful,2,False,True,True


In [5]:
df.shape

(823, 10)

## ner method :

In [4]:
ner_pipeline = pipeline("ner", model="d4data/biomedical-ner-all", aggregation_strategy="first")


extracted_data = []
for i , dialogue in enumerate(df['Dialogue_Generated']) :
    ner_results = ner_pipeline(dialogue)
    symptoms = [entity["word"] for entity in ner_results if "symptom" in entity["entity_group"].lower()]
    
    extracted_data.append({
        "Dialogue": dialogue,
        "Real symptoms": df['symptom'][i],
        "Extracted_Symptoms": symptoms
    })
    
df_extracted = pd.DataFrame(extracted_data)
df_extracted.head(5)

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,Dialogue,Real symptoms,Extracted_Symptoms
0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,[cracks]
1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,[]
2,"""I am fed up with this bloody cracking at the...",Cracking at the corners of the mouth (cheilosi...,[]
3,"""I have been experiencing a cracking at the c...",Cracking at the corners of the mouth (cheilosi...,[cracking]
4,"""I am fed up with this cracking at the corner...",Cracking at the corners of the mouth (cheilosi...,"[fed, cracking]"


In [6]:
df_extracted

Unnamed: 0,Dialogue,Real symptoms,Extracted_Symptoms
0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,[cracks]
1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,[]
2,"""I am fed up with this bloody cracking at the...",Cracking at the corners of the mouth (cheilosi...,[]
3,"""I have been experiencing a cracking at the c...",Cracking at the corners of the mouth (cheilosi...,[cracking]
4,"""I am fed up with this cracking at the corner...",Cracking at the corners of the mouth (cheilosi...,"[fed, cracking]"
...,...,...,...
818,"""I have been experiencing pain and swelling a...",Pain and swelling at injection site,"[pain, swelling]"
819,"""I'm still in a lot of pain and swelling at t...",Pain and swelling at injection site,"[pain, swelling]"
820,"""Oh, the agony! My arm is on fire and twice a...",Pain and swelling at injection site,[]
821,"""I have a lot of pain and swelling at the inj...",Pain and swelling at injection site,"[pain, swelling]"


In [5]:
nothing_extracted = [1 for i in range(len(df_extracted['Extracted_Symptoms'])) if df_extracted['Extracted_Symptoms'][i]==[]]
len(nothing_extracted)

221

## Evaluation functions :

* If the model extracts extra symptoms, it lowers precision.

* If the model misses true symptoms, recall decreases.

In [6]:
def evaluate_penalization_degree_1(true_symptoms, extracted_symptoms, df_results):
    
    # Compute precision (only penalizes false positives)
    precision_scores = [
        len(pred & true) / len(pred) if pred else 0 
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute recall (penalizes missing true symptoms)
    recall_scores = [
        len(pred & true) / len(true) if true else 0 
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute F1-score (avoiding division by zero)
    f1_scores = [
        (2 * p * r) / (p + r) if (p + r) > 0 else 0
        for p, r in zip(precision_scores, recall_scores)
    ]

    precision = np.mean(precision_scores)
    recall = np.mean(recall_scores)
    f1_score = np.mean(f1_scores)

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}")

Consider True Positives when at least one true symptom is extracted

In [7]:
def evaluate_penalization_degree_0(true_symptoms, extracted_symptoms, df_results):
    
    # Compute True Positives: At least one true symptom is extracted

    true_positives = [
        1 if len(pred & true) > 0 else 0  # If there's at least one correct symptom
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute Precision: Is at least one extracted symptom correct?

    precision_scores = [
        1 if len(pred & true) > 0 else 0  # If at least one match, it's a precision hit
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute Recall: Did the model miss all true symptoms?

    recall_scores = [
        1 if len(pred & true) > 0 else 0  # If at least one match, recall is 1
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    # Compute F1-Score (since precision and recall are binary in this case)

    f1_scores = [
        1 if len(pred & true) > 0 else 0  # If at least one match, F1 is 1
        for true, pred in zip(true_symptoms, extracted_symptoms)
    ]

    precision = np.mean(precision_scores)
    recall = np.mean(recall_scores)
    f1_score = np.mean(f1_scores)

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}")

## Import the LLM model : Biollama 

In [8]:
model = LLM(model_name="iRASC/BioLlama-Ko-8B", max_length=50)



Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


In [9]:
ctcae = pd.read_excel('PRO-CTCAE_Questionnaire_Terminology.xls', sheet_name = 'PRO')
symptoms_list = ctcae['PRO-CTCAE PT'].unique()[:-25]
symptoms_list

array(['Cracking at the corners of the mouth (cheilosis/cheilitis)',
       'Difficulty Swallowing', 'Dry Mouth', 'Hoarseness',
       'Mouth/throat sores', 'Voice quality changes', 'Abdominal pain',
       'Bloating', 'Constipation', 'Decreased appetite', 'Diarrhea',
       'Fecal incontinence', 'Gas', 'Heartburn', 'Hiccups', 'Nausea',
       'Taste Changes', 'Vomiting', 'Coughing', 'Shortness of breath',
       'Wheezing', 'Heart palpitations', 'Swelling (arms or legs)',
       'Acne', 'Bed/pressure sores', 'Hair loss',
       'Hand-foot syndrome (a rash of the hands and feet that can cause cracking, peeling, redness or pain)',
       'Hives', 'Itching', 'Nail discoloration', 'Nail loss',
       'Nail ridging', 'Radiation skin reaction', 'Rash',
       'Sensitivity to sunlight', 'Skin darkening', 'Skin dryness',
       'Stretch marks', 'Dizziness', 'Numbness & tingling',
       'Blurred vision', 'Flashing lights', 'Ringing in ears',
       'Visual floaters', 'Watery eyes', 'Concentra

In [10]:
def extract_symptom_scores(output_str):
    # This pattern matches a key enclosed in single or double quotes followed by a colon and a number (integer or float)
    pattern = r'["\']([^"\']+)["\']\s*:\s*([0-9]*\.?[0-9]+)'
    matches = re.findall(pattern, output_str)
    # Convert the extracted values to float and build the dictionary
    return {key: float(value) for key, value in matches}

In [11]:
extractor = ExtractingPrompt(symptoms_list)

In [12]:
results = []

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'][:10])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
                         list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

    true_symptom = df['symptom'][i]
    
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": symptoms_extracted
    })
    df_results = pd.DataFrame(results)
    df_results.to_csv("Extracting_symptoms_using_LLM_updated.csv")

10it [00:24,  2.44s/it]


In [None]:
df_results = pd.read_csv("Extracting_symptoms_using_LLM.csv")
df_results.head(5)

Unnamed: 0.1,Unnamed: 0,Dialogue,True_Symptom,Extracted_Symptom
0,0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,['Cracking at the corners of the mouth (cheilo...
1,1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,['Cracking at the corners of the mouth (cheilo...
2,2,"""I am fed up with this bloody cracking at the...",Cracking at the corners of the mouth (cheilosi...,['Cracking at the corners of the mouth (cheilo...
3,3,"""I have been experiencing a cracking at the c...",Cracking at the corners of the mouth (cheilosi...,['Cracking at the corners of the mouth (cheilo...
4,4,"""I am fed up with this cracking at the corner...",Cracking at the corners of the mouth (cheilosi...,['Cracking at the corners of the mouth (cheilo...


In [10]:
df_explicit_symptom = df[df['Explicit_symptom']==True]
df_explicit_symptom = df_explicit_symptom.reset_index(drop=True)
df_explicit_symptom

Unnamed: 0,Dialogue_Generated,symptom,description,meta,language_style,Tone,Detail_level,Enumeration,Explicit_symptom,Spelling_errors
0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,Severity,Very severe,Informal Register,Fearful,1,False,True,True
1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,Severity,Not sexually active,Vulgar Register,Fearful,2,False,True,True
2,"""I have been experiencing a cracking at the c...",Cracking at the corners of the mouth (cheilosi...,Severity,,Formal Register,Confused,2,False,True,True
3,"""My mouth corners crack and peel, a map of my...",Cracking at the corners of the mouth (cheilosi...,Severity,Moderate,Poetic/Literary Register,Friendly,1,False,True,False
4,"""I'm tellin' ya, doc, I got this crackin' in ...",Cracking at the corners of the mouth (cheilosi...,Severity,Mild,Vulgar Register,Angry,4,False,True,False
...,...,...,...,...,...,...,...,...,...,...
167,"""My nose bleeds so rarely, but when it does, ...",Nosebleed,Frequency,Rarely,Poetic/Literary Register,Angry,2,False,True,False
168,"""I been gettin nosebleeds all day long, man. ...",Nosebleed,Severity,,Vulgar Register,Confused,4,False,True,True
169,"""I've been having some nosebleeds lately, and...",Nosebleed,Severity,Moderate,Neutral/Standard Register,Friendly,5,False,True,False
170,"""I'm still in a lot of pain and swelling at t...",Pain and swelling at injection site,Presence/Absence,Prefer not to answer,Neutral/Standard Register,Insulting,3,False,True,True


In [10]:
results = []

for i, phrase in tqdm(enumerate(df_explicit_symptom['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
                         list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

    true_symptom = df_explicit_symptom['symptom'][i]
    
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": symptoms_extracted
    })
    df_results_2 = pd.DataFrame(results)
    df_results_2.to_csv("Extracting_explicit_symptoms_using_LLM.csv")

0it [00:00, ?it/s]

10it [00:22,  1.94s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
172it [06:55,  2.42s/it]


In [8]:
df_results_2 = pd.read_csv('Extracting_explicit_symptoms_using_LLM.csv')

In [None]:
true_symptoms = [{el} for el in list(df_results_2['True_Symptom'])]
true_symptoms

[{'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Hoarseness'},
 {'Hoarseness'},
 {'Mouth/throat sores'},
 {'Mouth/throat sores'},
 {'Voice quality changes'},
 {'Voice quality changes'},
 {'Abdominal pain'},
 {'Abdominal pain'},
 {'Bloating'},
 {'Bloating'},
 {'Bloating'},
 {'Bloating'},
 {'Bloating'},
 {'Constipation'},
 {'Constipation'},
 {'Constipation'},
 {'Decreased appetite'},
 {'Decreased appetite'},
 {'Fecal incontinence'},
 {'Fecal incontinence'},
 {'Fecal incontinence'},
 {'Fecal incontinence'},
 {'Gas'},
 {'Heartburn'},
 {'Heartburn'},
 {'Hiccups'},
 {'Nausea'},
 {'Nausea'},
 {'Nausea'},


In [None]:
extracted_symptoms = [{ast.literal_eval(el)[i] for i in range(len(ast.literal_eval(el)))} for el in list(df_results_2['Extracted_Symptom'])]
extracted_symptoms

[{'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Cracking at the corners of the mouth (cheilosis/cheilitis)'},
 {'Difficulty Swallowing'},
 {'Difficulty Swallowing'},
 {'Coughing',
  'Difficulty Swallowing',
  'Heartburn',
  'Nausea',
  'Shortness of breath'},
 {'Dry Mouth'},
 {'Dry Mouth'},
 {'Hoarseness'},
 {'Difficulty Swallowing', 'Hoarseness'},
 {'Mouth/throat sores'},
 {'Difficulty Swallowing',
  'Dry Mouth',
  'Hoarseness',
  'Mouth/throat sores',
  'Voice quality changes'},
 {'Hoarseness'},
 {'Fatigue', 'Hoarseness', 'Voice quality changes'},
 {'Abdominal pain'},
 {'Abdominal pain'},
 {'Bloating', 'Gas'},
 {'Abdominal pain', 'Bloating'},
 {'Bloating', 'Constipation', 'Diarrhea', 'Nausea', 'Vomiting'},
 {'Abdominal pain', 'Bloating', 'Constipation'},
 {'Abdominal pain'

In [None]:
score = 0
for i in range(len(df_results_2)) : 
    True_symptom = true_symptoms[i]
    Extracted_Symptom = extracted_symptoms[i]
    
    if True_symptom == Extracted_Symptom :
        score +=1 

accuracy = score / len(df_results_2)
accuracy

0.37209302325581395

In [None]:
score = 0
for i in range(len(df_results_2)) : 
    True_symptom = df_results_2.iloc[i]['True_Symptom']
    Extracted_Symptom = ast.literal_eval(df_results_2.iloc[i]['Extracted_Symptom'])[0] # we extract the symptom with the highest score 

    if True_symptom == Extracted_Symptom :
        score +=1 

accuracy = score / len(df_results_2)
accuracy

0.5813953488372093

In [None]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms, df_results_2)

Precision: 0.4890, Recall: 0.7093, F1-Score: 0.5419


In [19]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms, df_results_2)

Precision: 0.7093, Recall: 0.7093, F1-Score: 0.7093


### We evaluate the extraction :

Accuracy :

In [95]:
score = 0
for i in range(len(df_results)) : 
    True_symptom = df_results.iloc[i]['True_Symptom']
    Extracted_Symptom = ast.literal_eval(df_results.iloc[i]['Extracted_Symptom'])[0]

    if True_symptom == Extracted_Symptom :
        score +=1 

accuracy = score / len(df_results)
accuracy

0.5054678007290401

In [129]:
true_symptoms = [{el} for el in list(df_results['True_Symptom'])]
extracted_symptoms = [{ast.literal_eval(el)[i] for i in range(len(ast.literal_eval(el)))} for el in list(df_results['Extracted_Symptom'])] 

In [None]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms, df_results)

Precision: 0.4228, Recall: 0.6282, F1-Score: 0.4722


In [128]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms, df_results)

Precision: 0.6282, Recall: 0.6282, F1-Score: 0.6282


## Extracting from a multi_symptoms dataset :

In [10]:
df = pd.read_csv("Many_symptoms_generated.csv")

In [11]:
df.head(2)

Unnamed: 0,Dialogue_Generated,Symptoms,Descriptions,Meta,Language_Style,Tone,Detail_Level,Enumeration,Explicit_Symptom,Spelling_Errors
0,"""My skin is pale and dry, with a rash on my h...","['Voice quality changes', 'Hand-foot syndrome ...","['Presence/Absence', 'Severity', 'Interference...","['Prefer not to answer', 'None', 'Somewhat', '...",Poetic/Literary Register,Angry,3,True,False,False
1,"""I got swollen arms and legs, and I'm super s...","['Swelling (arms or legs)', 'Sensitivity to su...","['Frequency', 'Presence/Absence', 'Severity', ...","['Prefer not to answer', 'Not applicable', 'No...",Informal Register,Angry,1,False,False,True


In [12]:
results = []
ctcae = pd.read_excel('PRO-CTCAE_Questionnaire_Terminology.xls', sheet_name = 'PRO')
symptoms_list = ctcae['PRO-CTCAE PT'].unique()[:-25]
extractor = ExtractingPrompt(symptoms_list)

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
                         list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

    true_symptom = df['Symptoms'][i]
    
    # Append a dictionary with your desired columns
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": symptoms_extracted
    })
    df_results = pd.DataFrame(results)
    df_results.to_csv("Extracting_symptoms_using_LLM_2.csv")

0it [00:00, ?it/s]

9it [00:22,  2.48s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100it [04:30,  2.71s/it]


In [14]:
df_results.head(5)

Unnamed: 0,Dialogue,True_Symptom,Extracted_Symptom
0,"""My skin is pale and dry, with a rash on my h...","['Voice quality changes', 'Hand-foot syndrome ...",[Hand-foot syndrome (a rash of the hands and f...
1,"""I got swollen arms and legs, and I'm super s...","['Swelling (arms or legs)', 'Sensitivity to su...","[Swelling (arms or legs), Sensitivity to sunli..."
2,"""I've been having these itchy red bumps all o...","['Hives', 'Hair loss', 'Painful urination']","[Hair loss, Itchy red bumps, Painful urination]"
3,"""I have been having trouble sleeping lately, ...","['Nail ridging', 'Insomnia', 'Sensitivity to s...",[Nail ridging]
4,"""I'm so scared, I keep seeing flashing lights...","['Flashing lights', 'Bed/pressure sores']","[Mouth/throat sores, Bed/pressure sores, Flash..."


In [75]:
true_symptoms = [set(ast.literal_eval(el)) for el in df_results['True_Symptom']]
extracted_symptoms = [set(el) for el in list(df_results['Extracted_Symptom'])]

In [81]:
score = 0
for i in range(len(df_results)) : 

    True_symptom = true_symptoms[i]
    Extracted_Symptom = extracted_symptoms[i]

    if True_symptom <= Extracted_Symptom : # Subset check (penalizes only missing ones)
        score +=1 

accuracy = score / len(df_results)
accuracy

0.15

In [None]:
score = 0
for i in range(len(df_results)) : 
    True_symptom = df_results.iloc[i]['True_Symptom']
    Extracted_Symptom = df_results.iloc[i]['Extracted_Symptom']

    if True_symptom == Extracted_Symptom :
        score +=1 

accuracy = score / len(df_results)
accuracy

0.0

In [None]:
true_symptoms = [set(ast.literal_eval(el)) for el in df_results['True_Symptom']]
extracted_symptoms = [set(el) for el in list(df_results['Extracted_Symptom'])] 

In [None]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms, df_results)

Precision: 0.5238, Recall: 0.4542, F1-Score: 0.4665


In [None]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms, df_results)

Precision: 0.8200, Recall: 0.8200, F1-Score: 0.8200
