In [2]:
import pandas as pd
from transformers import pipeline
import ast
from extractor import ExtractingPrompt
from LLM import LLM
from tqdm import tqdm
import re
import numpy as np
from evaluation_functions import calculate_accuracy_degree_1, calculate_accuracy_degree_1_bis, evaluate_penalization_degree_0, evaluate_penalization_degree_1

In [3]:
df = pd.read_csv('/home/laajila/mima_newcode/clean_code/outputs/2025-02-05/14-14-20/output_20250205_144847.csv')

## Utility functions 

In [4]:
def extract_symptom_scores(output_str):
    # This pattern matches a key enclosed in single or double quotes followed by a colon and a number (integer or float)
    pattern = r'["\']([^"\']+)["\']\s*:\s*([0-9]*\.?[0-9]+)'
    matches = re.findall(pattern, output_str)
    # Convert the extracted values to float and build the dictionary
    return {key: float(value) for key, value in matches}

In [5]:
def true_vs_extracted_symptoms(df_results):
    
    true_symptoms = [{el} for el in list(df_results['True_Symptom'])]
    extracted_symptoms = [{ast.literal_eval(el)[i] for i in range(len(ast.literal_eval(el)))} for el in list(df_results['Extracted_Symptom'])]
    return true_symptoms, extracted_symptoms

## Extract using Biollama

In [6]:
model = LLM(model_name="iRASC/BioLlama-Ko-8B", max_length=50)



Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


In [7]:
ctcae = pd.read_excel('PRO-CTCAE_Questionnaire_Terminology.xls', sheet_name = 'PRO')
symptoms_list = ctcae['PRO-CTCAE PT'].unique()[:-25]

In [8]:
extractor = ExtractingPrompt(symptoms_list)

In [12]:
results = []

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
                         list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

    true_symptom = df['symptom'][i]
    
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": symptoms_extracted
    })
    df_results = pd.DataFrame(results)
    df_results.to_csv("Extracting_symptoms_using_LLM_new_prompt.csv")
    

1it [00:06,  6.88s/it]


IndexError: list index out of range

In [11]:
results = []

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted_llm = model.generate_text(messages=prompt)

    symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
                         list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

    true_symptom = df['symptom'][i]
    
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": symptoms_extracted
    })
    df_results = pd.DataFrame(results)
    df_results.to_csv("Extracting_symptoms_using_LLM_updated_prompt.csv")

10it [00:24,  2.65s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
823it [31:50,  2.32s/it]


In [2]:
df_results =pd.read_csv("Extracting_symptoms_using_LLM_updated_prompt.csv")

In [6]:
true_symptoms, extracted_symptoms = true_vs_extracted_symptoms(df_results)

In [8]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms)

Precision: 0.5614, Recall: 0.5614, F1-Score: 0.5614


In [None]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms)   # perfect metric

Precision: 0.4749, Recall: 0.5614, F1-Score: 0.4919


In [None]:
calculate_accuracy_degree_1(true_symptoms, extracted_symptoms, df_results)  # perfect accuracy

0.4422843256379101

In [12]:
calculate_accuracy_degree_1_bis(df_results)

0.5066828675577156

### Without few shots :

In [15]:
df_results = pd.read_csv("Extracting_symptoms_using_LLM.csv")

In [16]:
true_symptoms, extracted_symptoms = true_vs_extracted_symptoms(df_results)

In [17]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms)

Precision: 0.6282, Recall: 0.6282, F1-Score: 0.6282


In [18]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms)

Precision: 0.4228, Recall: 0.6282, F1-Score: 0.4722


In [21]:
calculate_accuracy_degree_1_bis(df_results)  ## 50% at the first try

0.5054678007290401

## When symptom is explicit

In [None]:
# df_explicit_symptom = df[df['Explicit_symptom']==True]
# df_explicit_symptom = df_explicit_symptom.reset_index(drop=True)
# df_explicit_symptom

In [None]:
# results = []

# for i, phrase in tqdm(enumerate(df_explicit_symptom['Dialogue_Generated'])):

#     prompt = extractor.build_extraction_prompt(phrase)

#     symptoms_extracted_llm = model.generate_text(messages=prompt)

#     symptoms_extracted = [list(extract_symptom_scores(symptoms_extracted_llm).keys())[0] ] + [el for el in
#                          list(extract_symptom_scores(symptoms_extracted_llm).keys())[1:] if extract_symptom_scores(symptoms_extracted_llm)[el] > 0.80 ]

#     true_symptom = df_explicit_symptom['symptom'][i]
    
#     results.append({
#         "Dialogue": phrase,
#         "True_Symptom": true_symptom,
#         "Extracted_Symptom": symptoms_extracted
#     })
#     df_results_2 = pd.DataFrame(results)
#     df_results_2.to_csv("Extracting_explicit_symptoms_using_LLM.csv")

In [22]:
df_results_2 = pd.read_csv("Extracting_explicit_symptoms_using_LLM.csv")

In [23]:
true_symptoms, extracted_symptoms = true_vs_extracted_symptoms(df_results_2)

In [24]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms)

Precision: 0.7093, Recall: 0.7093, F1-Score: 0.7093


In [25]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms)

Precision: 0.4890, Recall: 0.7093, F1-Score: 0.5419


In [29]:
calculate_accuracy_degree_1_bis(df_results_2)  # 58%

0.5813953488372093

In [53]:
df_results = pd.read_csv('Extracting_symptoms_using_LLM_2.csv')
df_results

Unnamed: 0.1,Unnamed: 0,Dialogue,True_Symptom,Extracted_Symptom
0,0,"""My skin is pale and dry, with a rash on my h...","['Voice quality changes', 'Hand-foot syndrome ...",['Hand-foot syndrome (a rash of the hands and ...
1,1,"""I got swollen arms and legs, and I'm super s...","['Swelling (arms or legs)', 'Sensitivity to su...","['Swelling (arms or legs)', 'Sensitivity to su..."
2,2,"""I've been having these itchy red bumps all o...","['Hives', 'Hair loss', 'Painful urination']","['Hair loss', 'Itchy red bumps', 'Painful urin..."
3,3,"""I have been having trouble sleeping lately, ...","['Nail ridging', 'Insomnia', 'Sensitivity to s...",['Nail ridging']
4,4,"""I'm so scared, I keep seeing flashing lights...","['Flashing lights', 'Bed/pressure sores']","['Mouth/throat sores', 'Bed/pressure sores', '..."
...,...,...,...,...
95,95,"""I have been experiencing acne on my face for...","['Acne', 'Painful urination']",['Acne']
96,96,"""I have been experiencing chills and general ...","['Chills', 'General pain', 'Difficulty Swallow...","['Difficulty Swallowing', 'Diarrhea', 'General..."
97,97,"""I cant swallow properly and I have the runs ...","['Difficulty Swallowing', 'Diarrhea']","['Difficulty Swallowing', 'Diarrhea', 'Shortne..."
98,98,"""My mouth is a battlefield, with sores that e...","['Mouth/throat sores', 'Bruising', 'Urinary ur...","['Mouth/throat sores', 'Dry Mouth', 'Taste Cha..."


In [57]:
df_results.iloc[0]['Extracted_Symptom']

"['Hand-foot syndrome (a rash of the hands and feet that can cause cracking, peeling, redness or pain)', 'Dry Mouth', 'Joint pain']"

In [40]:
true_symptoms = [set(ast.literal_eval(el)) for el in df_results['True_Symptom']]
extracted_symptoms = [set(ast.literal_eval(el)) for el in (df_results['Extracted_Symptom'])]

In [None]:
evaluate_penalization_degree_0(true_symptoms, extracted_symptoms) # at least one symptom is extracted

Precision: 0.8200, Recall: 0.8200, F1-Score: 0.8200


In [43]:
evaluate_penalization_degree_1(true_symptoms, extracted_symptoms)

Precision: 0.5238, Recall: 0.4542, F1-Score: 0.4665


In [44]:
calculate_accuracy_degree_1(true_symptoms, extracted_symptoms, df_results)

0.1

In [47]:
score = 0
for i in range(len(df_results)) : 

    True_symptom = true_symptoms[i]
    Extracted_Symptom = extracted_symptoms[i]

    if True_symptom <= Extracted_Symptom : # Subset check (penalizes only missing ones)
        score +=1 

accuracy = score / len(df_results)
accuracy

0.15

In [48]:
df = pd.read_csv("Many_symptoms_generated_2.csv")

In [51]:
df.iloc[0]['Dialogue_Generated']

' "I feel like I\'m always down in the dumps and I can\'t seem to shake it. My mouth is always dry and I can\'t sleep at night."'

In [52]:
df.iloc[0]

Dialogue_Generated     "I feel like I'm always down in the dumps and...
Symptoms                                           ['Sad', 'Dry Mouth']
Descriptions                                  ['Frequency', 'Severity']
Meta                                        ['Never', 'Not applicable']
Language_Style                                          Vulgar Register
Tone                                                           Confused
Detail_Level                                                          4
Enumeration                                                       False
Explicit_Symptom                                                  False
Spelling_Errors                                                   False
Name: 0, dtype: object