In [3]:
import pandas as pd
from transformers import pipeline
from extractor import ExtractingPrompt
from LLM import LLM
from tqdm import tqdm

In [4]:
df = pd.read_csv('/home/laajila/mima_newcode/clean_code/outputs/2025-02-05/14-14-20/output_20250205_144847.csv')

In [5]:
df.head(2)

Unnamed: 0,Dialogue_Generated,symptom,description,meta,language_style,Tone,Detail_level,Enumeration,Explicit_symptom,Spelling_errors
0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,Severity,Very severe,Informal Register,Fearful,1,False,True,True
1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,Severity,Not sexually active,Vulgar Register,Fearful,2,False,True,True


In [6]:
df.shape

(823, 10)

In [7]:
ner_pipeline = pipeline("ner", model="d4data/biomedical-ner-all", aggregation_strategy="first")


extracted_data = []
for i , dialogue in enumerate(df['Dialogue_Generated']) :
    ner_results = ner_pipeline(dialogue)
    symptoms = [entity["word"] for entity in ner_results if "symptom" in entity["entity_group"].lower()]
    
    extracted_data.append({
        "Dialogue": dialogue,
        "Real symptoms": df['symptom'][i],
        "Extracted_Symptoms": symptoms
    })
    
df_extracted = pd.DataFrame(extracted_data)
df_extracted.head(5)

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,Dialogue,Real symptoms,Extracted_Symptoms
0,"""I'm so scared, I've got these cracks at the ...",Cracking at the corners of the mouth (cheilosi...,[cracks]
1,"""I got these cracks at the corners of me mout...",Cracking at the corners of the mouth (cheilosi...,[]
2,"""I am fed up with this bloody cracking at the...",Cracking at the corners of the mouth (cheilosi...,[]
3,"""I have been experiencing a cracking at the c...",Cracking at the corners of the mouth (cheilosi...,[cracking]
4,"""I am fed up with this cracking at the corner...",Cracking at the corners of the mouth (cheilosi...,"[fed, cracking]"


In [7]:
nothing_extracted = [1 for i in range(len(df_extracted['Extracted_Symptoms'])) if df_extracted['Extracted_Symptoms'][i]==[]]
len(nothing_extracted)

217

### Import the LLM model : Biollama 

In [14]:
model = LLM(model_name="iRASC/BioLlama-Ko-8B", max_length=50)



Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Device set to use cuda:0


Model loaded on device(s): {'': 0}


In [20]:
extractor = ExtractingPrompt()
phrase = "I\'m so scared, I\'ve got these cracks at the corners of my mouth that won\'t go away, it\'s so painful and itchy, I\'m so worried I\'ll get an infection."
prompt = extractor.build_extraction_prompt(phrase)

In [21]:
prompt

[{'role': 'system',
  'content': 'You are an AI assistant trained to extract structured symptoms from patient inputs based on the PRO-CTCAE dataset. You must identify and return only the symptoms mentioned, ensuring accuracy and completeness.'},
 {'role': 'user',
  'content': 'Identify and extract the symptoms described in the following patient statement:\n\n"I\'m so scared, I\'ve got these cracks at the corners of my mouth that won\'t go away, it\'s so painful and itchy, I\'m so worried I\'ll get an infection."\n\nReturn only the symptoms, separated by commas if multiple, without any additional text, comments, or explanations.'}]

In [22]:
symptoms_extracted = model.generate_text(messages=prompt)

In [23]:
symptoms_extracted

' mouth dryness, mouth ulceration, mouth pain, mouth tingling, mouth numbness, mouth burning, mouth itching, mouth cracking, mouth discomfort, mouth irritation, mouth infection, mouth inflammation, mouth redness, mouth swelling, mouth tenderness'

In [24]:
results = []
extractor = ExtractingPrompt()

for i, phrase in tqdm(enumerate(df['Dialogue_Generated'])):

    prompt = extractor.build_extraction_prompt(phrase)

    symptoms_extracted = model.generate_text(messages=prompt)

    true_symptom = df['symptom'][i]
    
    # Append a dictionary with your desired columns
    results.append({
        "Dialogue": phrase,
        "True_Symptom": true_symptom,
        "Extracted_Symptom": symptoms_extracted
    })
    df_results = pd.DataFrame(results)
    df_results.to_csv("Extracting_symptoms_using_LLM.csv")

8it [00:18,  2.32s/it]


KeyboardInterrupt: 

In [10]:
results

[{'Dialogue': ' "I\'m so scared, I\'ve got these cracks at the corners of my mouth that won\'t go away, it\'s so painful and itchy, I\'m so worried I\'ll get an infection."',
  'True_Symptom': 'Cracking at the corners of the mouth (cheilosis/cheilitis)',
  'Extracted_Symptom': ' "dry mouth, mouth ulcers, mouth pain"'},
 {'Dialogue': ' "I got these cracks at the corners of me mouth, an\' I\'m too scared to go out an\' get help, I\'m too scared to even look in the mirror, I\'m so ashamed of meself, I\'m so ashamed."',
  'True_Symptom': 'Cracking at the corners of the mouth (cheilosis/cheilitis)',
  'Extracted_Symptom': ' "oral mucosa dryness, oral mucosa cracking"'},
 {'Dialogue': ' "I am fed up with this bloody cracking at the corners of my mouth, it is so annoying and I cannot stand it anymore."',
  'True_Symptom': 'Cracking at the corners of the mouth (cheilosis/cheilitis)',
  'Extracted_Symptom': ' "oral mucosa dryness, oral mucosa cracking"'},
 {'Dialogue': ' "I have been experienci

68," ""I have been experiencing abdominal pain for the past three days.""",Abdominal pain, abdominal pain