In [8]:
%cd /home/is/dwipraseetyo-a/NAS_HAI/Project/Qwen2.5-Omni
%pwd

import pandas as pd
import numpy as np
import soundfile as sf
from pathlib import Path
import os, librosa, random, pickle, pydicom, requests, torch, re, shutil
from pydicom.datadict import keyword_for_tag
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
from IPython.display import Markdown, display
from openai import OpenAI
from PIL import Image
from tqdm import tqdm

import commons, const_variable

from huggingface_hub import login
from dotenv import load_dotenv
from datasets import Dataset, concatenate_datasets

load_dotenv()
login(token=os.getenv("HG_AUTH_KEY_WRITE"))

random.seed(42)

disease_map = {
    "Healthy": 0,
    "Tuberculosis": 1,
    "Covid-19": 2,
    "Pneumonia": 3,
    "Others": 4,
}
disease_map_reverse = {v: k for k, v in disease_map.items()}

prompt_templates = {
    ("audio",): [
        "Based on the provided cough audio",
        "Listen to this cough sound",
        "Does the cough audio indicate",
        "Analyze the following cough sound"
    ],
    ("xray",): [
        "Examine this chest x-ray",
        "Does the provided x-ray image show",
        "Analyze the radiographic scan",
        "From this x-ray image"
    ],
    ("symptoms",): [
        "Given these symptoms",
        "Do the presented symptoms indicate",
        "Analyze the following symptoms",
        "Based on the symptom description"
    ],
    ("audio", "xray"): [
        "Based on the chest x-ray and cough audio",
        "Examine the x-ray and cough sound",
        "Given the radiograph and cough recording",
        "Using both the x-ray and cough audio"
    ],
    ("audio", "symptoms"): [
        "Given the cough audio and symptoms",
        "Analyze the patient symptoms and cough",
        "Do the symptoms and cough sound indicate",
        "Using both the cough sound and symptoms"
    ],
    ("xray", "symptoms"): [
        "From the x-ray and symptoms",
        "Analyze the x-ray and clinical symptoms",
        "Given the chest scan and symptoms",
        "What is the diagnosis based on the x-ray and symptoms"
    ],
    ("audio", "xray", "symptoms"): [
        "Based on the x-ray, cough audio, and symptoms",
        "Evaluate the x-ray, cough sound, and symptoms",
        "Given the combined evidence",
        "Considering all inputs (image, sound, and symptoms)"
    ]
}


DATA_PATH = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets"
DESTINATION_PATH = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets"

system_prompt = '''
You are an advanced medical assistant AI specialized in analyzing and diagnosing clinical conditions, capable of perceiving auditory and visual inputs. 
You can interpret and reason over various medical inputs, including auditory inputs, visual inputs, and patient symptoms, individually or in combination, depending on what is provided. 
Your task is to analyze the given input, explain your reasoning, and give a possible diagnosis. Always respond in the following format:
<think>{Analyze the available data X-ray findings if present, cough audio characteristics if provided, and reported symptoms to determine the most likely disease.}</think>

<answer>{Determined disease}</answer>
'''

/home/ldap-users-2/dwipraseetyo-a/Project/Qwen2.5-Omni


In [4]:
for split in ['val']:
    df1 = pd.read_csv(f"{DATA_PATH}/coda/metadata_llm.csv.{split}").rename(columns={"participant": "user_id", "tb_status": "disease", "path_file": "path_file_audio"})
    df1 = df1[['user_id', 'disease', 'path_file_audio', 'symptoms_sentence', 'llm_analyze_symptoms']]
    df1["disease"] = df1["disease"].replace(0, 4)
    df1 = df1.groupby("user_id", group_keys=False).apply(
        lambda x: x.sample(n=min(len(x), 70), random_state=42)
    )
    df2 = pd.read_csv(f"{DATA_PATH}/ukcovid/metadata_llm.csv.{split}").rename(columns={"participant_identifier": "user_id", "covid_test_result": "disease"})
    df2["disease"] = df2["disease"].replace(1, 2)
    df2["disease"] = df2["disease"].replace(0, 4)
    df2 = df2[['user_id', 'disease', 'path_file_audio', 'symptoms_sentence', 'llm_analyze_symptoms']]
    df3 = pd.read_csv(f"{DATA_PATH}/xray_metadata_llm_final.csv.{split}").rename(columns={"path_file": "path_file_image", "formatted_llm_analyze_image": "llm_analyze_image"})
    df_combined = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

    instruct_array = []
    for now_row in tqdm(df_combined.itertuples(), desc=f"Processing {split}", total=len(df_combined)):
        row_dict = now_row._asdict()
        modalities = []
        modalities_tags = ""
        if not pd.isna(now_row.path_file_audio):
            modalities_tags += "<audio>"
            modalities.append("audio")
        if not pd.isna(now_row.path_file_image):
            modalities_tags += "<image>"
            modalities.append("xray")
        if not pd.isna(now_row.symptoms_sentence):
            modalities.append("symptoms")

        key = tuple(modalities)
        if key in prompt_templates:
            last_sentence_question = random.choice(prompt_templates[key]) + ", Determine what disease this could indicate, or answer 'Others' if you cannot determine."
        
        question = ""
        think_sentence = "Okay, let's see. "
        if not pd.isna(now_row.llm_analyze_symptoms):
            question += f"The patient symptoms are {now_row.symptoms_sentence}."
            think_sentence += now_row.llm_analyze_symptoms + " "
        if not pd.isna(now_row.llm_analyze_image):
            think_sentence += now_row.llm_analyze_image + " "
        think_sentence += f"Therefore, the disease is {disease_map_reverse[now_row.disease]}."
        answer = f"<think>{think_sentence}</think>\n\n<answer>{disease_map_reverse[now_row.disease]}</answer>"

        question = modalities_tags + "" + question
        question += last_sentence_question

        temp_instruct = {"messages": [
            {"role": "system",
                "content": [
                    {"type": "text", "text": system_prompt}
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                ],
            },
            {"role": "assistant", "content": [
                {"type": "text", "text": answer}]},
        ]}

        if not pd.isna(now_row.path_file_audio):
            os.makedirs(f"{DESTINATION_PATH}/{split}/", exist_ok=True)
            filename_audio = now_row.path_file_audio.split("/")[-1].split(".")[0]
            start_sec, end_sec = commons.random_3sec_segment(now_row.path_file_audio, segment_duration=3.0)
            temp_instruct["messages"][1]['content'].append({
                "type": "audio", 
                "audio": f"{filename_audio}.wav",
                "audio_start": float(start_sec),
                "audio_end": float(end_sec)
            })
            #temp_instruct["audios"] = now_row.path_file_audio
            shutil.copy(now_row.path_file_audio, f"{DESTINATION_PATH}/{split}/{filename_audio}.wav")
            temp_instruct["audio_file_name"] = f"{filename_audio}.wav"
        if not pd.isna(now_row.path_file_image):
            filename_image = now_row.path_file_image.split("/")[-1].split(".")[0]
            temp_instruct["messages"][1]['content'].append({"type": "image", "image": now_row.path_file_image.split("/")[-1]})
            #temp_instruct["images"] = now_row.path_file_image
            os.makedirs(f"{DESTINATION_PATH}/{split}/", exist_ok=True)
            shutil.copy(now_row.path_file_image, f"{DESTINATION_PATH}/{split}/{now_row.path_file_image.split('/')[-1]}")
            temp_instruct["image_file_name"] = now_row.path_file_image.split("/")[-1]

        temp_instruct["solution"] = answer
        temp_instruct["identifier"] = now_row.user_id
        instruct_array.append(temp_instruct)

    df_result = pd.DataFrame(instruct_array)
    df_result.to_csv(f"{DESTINATION_PATH}/{split}/metadata.csv", index=False)

    # train_instruct = commons.load_image_PIL(instruct_array, direct=True)  
    # train_grpo = commons.grpo_build_datasets(train_instruct, None)
    # train_dataset = Dataset.from_list(train_grpo)
    # instruct_array = []
    # train_dataset.push_to_hub(
    #     "arkiven4/medical-grpo-3modalities",
    #     split=split,
    #     commit_message=f"Upload Datasets",
    # )

  df1 = df1.groupby("user_id", group_keys=False).apply(
Processing val: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15070/15070 [04:55<00:00, 50.99it/s]


In [None]:
# df = pd.read_csv(f"{DESTINATION_PATH}/val.csv")
# df.loc[df['audio_file_name'].notna(), 'audio_file_name'] =  "val/" + df.loc[df['audio_file_name'].notna(), 'audio_file_name']
# df.loc[df['image_file_name'].notna(), 'image_file_name'] =  "val/" + df.loc[df['image_file_name'].notna(), 'image_file_name']
# df.to_csv(f"{DESTINATION_PATH}/val.csv", index=False)

In [None]:
# cd /home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets/val/
# find . -type f ! -name "val.tar.part.*" -delete

In [None]:
cd /home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets/val/
find . -type f ! -name "*.tar" -delete

In [1]:
from datasets import load_dataset
import os
os.environ["HF_HOME"] = "/home/is/dwipraseetyo-a/NAS_HAI/.cache"

val_ds = load_dataset("/home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets/omnigrpo_dataset.py", 
                      split='validation', data_dir="/home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets", 
                      trust_remote_code=True)

No config specified, defaulting to: omnigrpo_dataset/default
Using custom data configuration default-29bcee5a383d3676
Loading Dataset Infos from /home/is/dwipraseetyo-a/.cache/huggingface/modules/datasets_modules/datasets/omnigrpo_dataset/98e8f5e8368e5a8f71a921eba3c315e3148c0ae1c2e47077a99b4fb7af564e84
Repo card metadata block was not found. Setting CardData to empty.


------------------------ New Version Processor2---------------------------


Generating dataset omnigrpo_dataset (/home/is/dwipraseetyo-a/.cache/huggingface/datasets/omnigrpo_dataset/default-29bcee5a383d3676/1.0.0/98e8f5e8368e5a8f71a921eba3c315e3148c0ae1c2e47077a99b4fb7af564e84)
Downloading and preparing dataset omnigrpo_dataset/default to /home/is/dwipraseetyo-a/.cache/huggingface/datasets/omnigrpo_dataset/default-29bcee5a383d3676/1.0.0/98e8f5e8368e5a8f71a921eba3c315e3148c0ae1c2e47077a99b4fb7af564e84...
Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

KeyboardInterrupt: 

In [61]:
from datasets import load_dataset

dataset_config = {
    "LOADING_SCRIPT_FILES": "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets/my_audio_dataset.py",
    "CONFIG_NAME": "val",
    "DATA_DIR": "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets/"
}

val_ds = load_dataset(
    dataset_config["LOADING_SCRIPT_FILES"],
    dataset_config["CONFIG_NAME"],
    data_dir=dataset_config["DATA_DIR"],
    trust_remote_code=True
)


Merging tar parts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.19s/it]


Generating validation split: 0 examples [00:00, ? examples/s]

In [14]:
import pandas as pd
df = pd.read_csv("/home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets/train.csv")
# df["audio_file_name"] = "train/" + df["audio_file_name"].astype(str)
# df["image_file_name"] = "train/" + df["image_file_name"].astype(str)
# df.to_csv("/home/is/dwipraseetyo-a/NAS_HAI/Datasets/grpo_3modalities_datasets/train.csv", index=False)

In [15]:
df

Unnamed: 0,messages,audio_file_name,solution,identifier,image_file_name
0,"[{'role': 'system', 'content': [{'type': 'text...",train/l_CODA_TB_0002_2.wav,"<think>Okay, let's see. Based on the patient's...",CODA_TB_0002,train/nan
1,"[{'role': 'system', 'content': [{'type': 'text...",train/l_CODA_TB_0002_1.wav,"<think>Okay, let's see. Based on the patient's...",CODA_TB_0002,train/nan
2,"[{'role': 'system', 'content': [{'type': 'text...",train/l_CODA_TB_0003_25.wav,"<think>Okay, let's see. Based on the provided ...",CODA_TB_0003,train/nan
3,"[{'role': 'system', 'content': [{'type': 'text...",train/l_CODA_TB_0003_14.wav,"<think>Okay, let's see. Based on the provided ...",CODA_TB_0003,train/nan
4,"[{'role': 'system', 'content': [{'type': 'text...",train/l_CODA_TB_0003_9.wav,"<think>Okay, let's see. Based on the provided ...",CODA_TB_0003,train/nan
...,...,...,...,...,...
100186,"[{'role': 'system', 'content': [{'type': 'text...",train/nan,"<think>Okay, let's see. The chest X-ray findin...",1GOVQY01H4M3,train/1GOVQY01H4M3.png
100187,"[{'role': 'system', 'content': [{'type': 'text...",train/nan,"<think>Okay, let's see. The chest X-ray findin...",RZY9JRW4JK4R,train/RZY9JRW4JK4R.png
100188,"[{'role': 'system', 'content': [{'type': 'text...",train/nan,"<think>Okay, let's see. The chest X-ray findin...",BKZ6GYHYEV2Y,train/BKZ6GYHYEV2Y.png
100189,"[{'role': 'system', 'content': [{'type': 'text...",train/nan,"<think>Okay, let's see. The chest X-ray findin...",0BTN61W0Y4LY,train/0BTN61W0Y4LY.png


In [4]:
for split in ['train', 'dev']:
    df = pd.read_csv(f"{DATA_PATH}/metadata_cut_processed.csv.{split}")
    df_llm_symptoms = ( pd.read_csv(f"datas/reasoning/symptoms/gpt-4o-mini_symptoms.csv.{split}").groupby('barcode', group_keys=False).apply(lambda x: x.sample(1), include_groups=True).reset_index(drop=True) ) 
    df_llm_images = ( pd.read_csv(f"datas/reasoning/xray/medgemma_xray_formatted.csv.{split}").groupby('path_file_image', group_keys=False).apply(lambda x: x.sample(1), include_groups=True).reset_index(drop=True) )
    df = pd.merge(df, df_llm_symptoms, on='barcode', how='left')
    df = pd.merge(df, df_llm_images, on='path_file_image', how='left')
    df = df.rename(columns={'coughdur': 'cough_duration', 'ngtsweats': 'night_sweets', 'weightloss': 'weight_loss', 'body_wt': 'body_weight'})

    case_info = df['ground_truth_tb'].value_counts().reset_index()
    max_count = case_info['count'][0]
    minor_count = case_info['count'][1]
    ratio = int(max_count // minor_count)

    #df_temp = pd.DataFrame(columns=['barcode', 'question', 'answer', 'tb_status', 'audio_path', 'image_path'])
    instruct_array = []
    for now_row in tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df)):
        row_dict = now_row._asdict()
        now_audiopath = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz/" +  now_row.path_file_audio
        now_imgaepath = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz/" +  now_row.path_file_image
        current_augment = ratio if now_row.ground_truth_tb == 1 else 1

        for i in range(current_augment):
            question = ""
            array_df = [None, None]
            last_sentence_question, modalities = commons.unique_modalities_generator(const_variable.prompt_templates, now_row.path_file_audio)
            last_sentence_question += ". "
            #templates = const_variable.positive_templates if now_row.ground_truth_tb == 1 else const_variable.negative_templates
            #answer = random.choice(templates)
            answer = commons.generate_tb_response(modalities, now_row.llm_analyze_symptoms, now_row.llm_analyze_image, positive=(now_row.ground_truth_tb == 1))       
            
            array_df = [None, None]
            modalities_tags = ""
            if "symptoms" in modalities:
                row_dict = now_row._asdict()
                #selected_feats = random.sample(const_variable.columns_soundfeat, k=random.randint(3, len(const_variable.columns_soundfeat)))
                symptom_descriptions = ", ".join(
                    f"{feat.replace('_', ' ')} is {row_dict[feat]}"
                    for feat in const_variable.columns_soundfeat #selected_feats
                    if row_dict.get(feat) != "Unknown"
                )
                if symptom_descriptions:
                    question += f"The patient symptoms are {symptom_descriptions}."

            if "audio" in modalities:
                modalities_tags += "<audio>"
                array_df[0] = now_audiopath

            if "xray" in modalities:
                modalities_tags += "<image>"
                array_df[1] = now_imgaepath
                xray_descriptions = ", ".join(
                    f"{feat.replace('_', ' ')} is {row_dict[feat]}"
                    for feat in const_variable.columns_imagefeat
                    if row_dict.get(feat) != "Unknown"
                )
                if xray_descriptions:
                    question += f" The chest x-ray metadata are {xray_descriptions}."

            question = question.strip()
            question = question.rstrip(",.")
            if not question.endswith("."):
                question += ". "
            
            question = modalities_tags + "" + question
            # temp_sentence_arry = last_sentence_question.split(",")
            # if len(temp_sentence_arry) >= 2:
            #     temp_sentence_arry[-2] += " " + modalities_tags
            # question += ",".join(temp_sentence_arry)
            question += last_sentence_question

            temp_instruct = {"messages": [
                {"role": "system",
                    "content": [
                        {"type": "text", "text": system_prompt}
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                    ],
                },
                {"role": "assistant", "content": [
                    {"type": "text", "text": answer}]},
            ]}
            if array_df[0] != None:
                temp_instruct["messages"][1]['content'].append({"type": "audio", "audio": array_df[0]})
            if array_df[1] != None:
                temp_instruct["messages"][1]['content'].append({"type": "image", "image": array_df[1]})
            instruct_array.append(temp_instruct)            
            #df_temp.loc[len(df_temp)] = [now_row.barcode, question, answer, now_row.ground_truth_tb, now_audiopath, now_imgaepath]
    with open(f"datas/instruct_sft_balance.pkl.{split}", "wb") as f:
        pickle.dump(instruct_array, f)
    #pd.DataFrame(df_temp).to_csv(f'datas/grpo_balance.csv.{split}', index=False)

  df_llm_symptoms = ( pd.read_csv(f"datas/reasoning/symptoms/gpt-4o-mini_symptoms.csv.{split}").groupby('barcode', group_keys=False).apply(lambda x: x.sample(1), include_groups=True).reset_index(drop=True) )
  df_llm_images = ( pd.read_csv(f"datas/reasoning/xray/medgemma_xray_formatted.csv.{split}").groupby('path_file_image', group_keys=False).apply(lambda x: x.sample(1), include_groups=True).reset_index(drop=True) )


In [4]:
import re

In [10]:
re.search(r'<answer>(.*?)</answer>', "<answer> Negative Tuberculosis </answer>", flags=re.IGNORECASE).group(1).strip()

'Negative Tuberculosis'

# Publish To HG

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
from datasets import Dataset, concatenate_datasets

load_dotenv()
login(token=os.getenv("HG_AUTH_KEY_WRITE"))

commons.pretty_status("📦 Loading Dataset...")

with open('datas/instruct_grpo_balance.pkl.train', 'rb') as f:
    train_instruct = commons.load_image_PIL(pickle.load(f))

with open('datas/instruct_grpo_balance.pkl.dev', 'rb') as f:
    dev_instruct = commons.load_image_PIL(pickle.load(f))
    
train_dataset = Dataset.from_list(commons.grpo_build_datasets(train_instruct, None)) #
val_dataset = Dataset.from_list(commons.grpo_build_datasets(dev_instruct, None))

combined = concatenate_datasets([train_dataset, val_dataset])
combined.push_to_hub("arkiven4/cirdz-instruct")


| 📦 Loading Dataset... |



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3526/3526 [00:31<00:00, 112.05it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 439/439 [00:03<00:00, 114.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3526/3526 [00:55<00:00, 63.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 439/439 [00:04<00:00, 99.13it/s]


Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/arkiven4/cirdz-instruct/commit/8538285dc2f17ddce47fb960f49054e6c7d10419', commit_message='Upload dataset', commit_description='', oid='8538285dc2f17ddce47fb960f49054e6c7d10419', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/arkiven4/cirdz-instruct', endpoint='https://huggingface.co', repo_type='dataset', repo_id='arkiven4/cirdz-instruct'), pr_revision=None, pr_num=None)

In [None]:
df = pd.read_csv(f"{DATA_PATH}/metadata_cut_processed.csv.test")
df_llm_symptoms = ( pd.read_csv(f"datas/reasoning/symptoms/o4-mini_symptoms.csv.test").groupby('barcode', group_keys=False).apply(lambda x: x.sample(1), include_groups=True).reset_index(drop=True) ) 
df_llm_images = ( pd.read_csv(f"datas/reasoning/xray/medgemma_xray_formatted.csv.test").groupby('path_file_image', group_keys=False).apply(lambda x: x.sample(1), include_groups=True).reset_index(drop=True) )
df = pd.merge(df, df_llm_symptoms, on='barcode', how='left')
df = pd.merge(df, df_llm_images, on='path_file_image', how='left')
df = df.rename(columns={'coughdur': 'cough_duration', 'ngtsweats': 'night_sweets', 'weightloss': 'weight_loss', 'body_wt': 'body_weight'})

instruct_array_positive = []
instruct_array_negative = []

for now_row in tqdm(df.itertuples(), desc=f"Processing Datasets", total=len(df)):
    row_dict = now_row._asdict()
    now_audiopath = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz/" +  now_row.path_file_audio
    now_imgaepath = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz/" +  now_row.path_file_image

    modalities = ["audio", "xray", "symptoms"]
    key = tuple(modalities)
    if key in const_variable.prompt_templates:
        question = random.choice(const_variable.prompt_templates[key]) + ". "
    answer = commons.generate_tb_response(modalities, now_row.llm_analyze_symptoms, now_row.llm_analyze_image, positive=(now_row.ground_truth_tb == 1))

    if now_row.path_file_audio == 'Unknown':
        modalities = [m for m in modalities if m != "audio"]

    array_df = [None, None]
    if "symptoms" in modalities:
        row_dict = now_row._asdict()
        selected_feats = random.sample(const_variable.columns_soundfeat, k=random.randint(3, len(const_variable.columns_soundfeat)))
        symptom_descriptions = ", ".join(
            f"{feat.replace('_', ' ')} is {row_dict[feat]}"
            for feat in selected_feats
            if row_dict.get(feat) != "Unknown"
        )
        if symptom_descriptions:
            question += f" Also, the patient presents with: {symptom_descriptions}."

    if "audio" in modalities:
        array_df[0] = now_audiopath

    if "xray" in modalities:
        array_df[1] = now_imgaepath
        xray_descriptions = ", ".join(
            f"{feat.replace('_', ' ')} is {row_dict[feat]}"
            for feat in const_variable.columns_imagefeat
            if row_dict.get(feat) != "Unknown"
        )
        if xray_descriptions:
            question += f" Additional chest x-ray metadata include: {xray_descriptions}."

    question = question.strip()
    question = question.rstrip(",.")
    if not question.endswith("."):
        question += "."

    temp_instruct = {"messages": [
        {"role": "system",
            "content": [
                {"type": "text", "text": system_prompt}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
            ],
        },
        {"role": "assistant", "content": [
            {"type": "text", "text": answer}]},
    ]}
    if array_df[0] != None:
        temp_instruct["messages"][1]['content'].append({"type": "audio", "audio": array_df[0]})
    if array_df[1] != None:
        temp_instruct["messages"][1]['content'].append({"type": "image", "image":array_df[1]})

    if now_row.ground_truth_tb == 1:
        instruct_array_positive.append(temp_instruct)
    elif now_row.ground_truth_tb == 0:
        instruct_array_negative.append(temp_instruct)
        
with open(f"datas/positive_instruct.pkl.test", "wb") as f:
    pickle.dump(instruct_array_positive, f)

with open(f"datas/negative_instruct.pkl.test", "wb") as f:
    pickle.dump(instruct_array_negative, f)

In [None]:
def clear_memory():
    # Delete variables if they exist in the current global scope
    if "inputs" in globals():
        del globals()["inputs"]
    if "model" in globals():
        del globals()["model"]
    if "processor" in globals():
        del globals()["processor"]
    if "trainer" in globals():
        del globals()["trainer"]
    if "peft_model" in globals():
        del globals()["peft_model"]
    if "bnb_config" in globals():
        del globals()["bnb_config"]
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


clear_memory()