# Import

In [None]:
%cd /home/is/dwipraseetyo-a/NAS_HAI/Project/Qwen2.5-Omni
%pwd

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print("CUDA_VISIBLE_DEVICES:", os.environ["CUDA_VISIBLE_DEVICES"])

import transformers, torch
device_name = torch.cuda.get_device_name(0)
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"Logical index: {i}, Name: {props.name}")

import commons, const_variable
import os, librosa, random, pickle, pydicom, requests, torch, re, json
import pandas as pd
import numpy as np
from tqdm import tqdm
from openai import OpenAI

# Iterate

In [None]:
system_prompt = '''
You are a clinical reasoning assistant specialized in respiratory diseases, particularly Tuberculosis (TB).
The user will provide a label (True for TB, False for not TB) along with a list of symptoms. Treat the label as ground truth.
Your task is to perform detailed clinical reasoning by analyzing the provided symptoms and explaining why they support or contradict the TB diagnosis.
Output format: JSONL with one key 'answer'
The value should be a short paragraph of clinical reasoning, grounded in the symptoms.
'''
DATA_PATH = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz"

In [None]:
def _parse_llm_answer(content):
    """Return the value of the 'answer' key from one model reply.

    The reply is first parsed as-is (after trimming surrounding whitespace),
    so a well-formed JSON object is never altered. Only when that fails is
    the legacy clean-up applied: raw newlines are escaped so they can sit
    inside a JSON string, and a missing closing brace is appended.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON even after the
            clean-up.
        KeyError: if the parsed object has no 'answer' key.
    """
    text = content.strip()
    try:
        return json.loads(text)['answer']
    except json.JSONDecodeError:
        pass  # fall through to the tolerant path below

    safe_content = re.sub(r'(?<!\\)\n', r'\\n', text.replace("{\n", "{").replace("\n}", "}"))
    if not safe_content.endswith('}'):
        safe_content += '}'
    return json.loads(safe_content)['answer']


# For every split: read the metadata, ask the chat model for a short clinical
# reasoning paragraph per row, and write one CSV of (barcode, reasoning).
# `df`, `split` and `df_temp` are deliberately left as notebook globals and
# `df_temp` is grown row by row: the resume cell below continues from whatever
# this loop had collected if it is interrupted.
for split in ['train', 'dev', 'test']:
    df = pd.read_csv(f"{DATA_PATH}/metadata_cut_processed.csv.{split}")
    # NOTE(review): 'night_sweets' looks like a misspelling of "night sweats";
    # left unchanged because const_variable.columns_soundfeat (not visible
    # here) presumably refers to the same key - confirm before renaming.
    df = df.rename(columns={'coughdur': 'cough_duration', 'ngtsweats': 'night_sweets', 'weightloss': 'weight_loss', 'body_wt': 'body_weight'})

    df_temp = pd.DataFrame(columns=['barcode', 'llm_analyze_symptoms'])
    for now_row in tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df)):
        row_dict = now_row._asdict()
        now_barcode = now_row.barcode
        answer = "This is Tuberculosis Positive" if now_row.ground_truth_tb == 1 else "This is Tuberculosis Negative"

        # Describe every known feature; features that are absent from the row
        # or recorded as "Unknown" are left out of the question.
        question = ", ".join(
            f"{feat.replace('_', ' ')} is {row_dict[feat]}"
            for feat in const_variable.columns_soundfeat
            if feat in row_dict and row_dict[feat] != "Unknown"
        )

        response = const_variable.clientOpenAI.chat.completions.create(
            model="gpt-4o-mini",  # o4-mini gpt-4o-mini gpt-4.1
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{answer}. The Patient Symptoms Are: {question}"}
            ]
        )

        llm_answer = _parse_llm_answer(response.choices[0].message.content)
        df_temp.loc[len(df_temp)] = [now_barcode, llm_answer]

    df_temp.to_csv(f'datas/reasoning/symptoms/gpt-4o-mini_symptoms.csv.{split}', index=False)

In [None]:
# Resume cell: continues an interrupted run of the generation loop above.
# It depends on `df`, `split`, `df_temp` and `system_prompt` still being in
# the kernel from earlier cells, and appends to the existing `df_temp`.
# NOTE(review): the label sentences ("This is Tuberculosis" / "This is Not
# Tuberculosis") and the output file name ('gpt4_symptoms') differ from the
# main loop - confirm that this is intended.

# Number of leading rows of `df` to skip before resuming.
RESUME_AFTER = 620

for count, now_row in enumerate(tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df)), start=1):
    if count <= RESUME_AFTER:
        continue

    row_dict = now_row._asdict()
    now_barcode = now_row.barcode
    answer = "This is Tuberculosis" if now_row.ground_truth_tb == 1 else "This is Not Tuberculosis"

    # Features that are absent from the row or recorded as "Unknown" are
    # left out of the question.
    question = ", ".join(
        f"{feat.replace('_', ' ')} is {row_dict[feat]}"
        for feat in const_variable.columns_soundfeat
        if feat in row_dict and row_dict[feat] != "Unknown"
    )

    response = const_variable.clientOpenAI.chat.completions.create(
        model="gpt-4o-mini",  # o4-mini gpt-4o-mini
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"{answer}. The Patient Symptoms Are: {question}"}
        ]
    )

    raw_content = response.choices[0].message.content.strip()
    try:
        # A well-formed reply is parsed untouched.
        response_json = json.loads(raw_content)
    except json.JSONDecodeError:
        # Tolerant path: escape raw newlines so they can sit inside a JSON
        # string, and close the object if the brace is missing.
        safe_content = re.sub(r'(?<!\\)\n', r'\\n', raw_content.replace("{\n", "{").replace("\n}", "}"))
        if not safe_content.endswith('}'):
            safe_content += '}'
        response_json = json.loads(safe_content)
    llm_answer = response_json['answer']
    df_temp.loc[len(df_temp)] = [now_barcode, llm_answer]
df_temp.to_csv(f'datas/reasoning/symptoms/gpt4_symptoms.csv.{split}', index=False)

In [None]:
# Collect the previously generated gpt41 answer files (one per split) that
# exist on disk and stack them into a single frame, `old_all`. When none of
# the files is present, `old_all` is an empty DataFrame with no columns.
old_dfs = []
for split in ['train', 'dev', 'test']:
    old_file = f"datas/reasoning/symptoms/gpt41_symptoms.csv.{split}"
    if not os.path.exists(old_file):
        continue
    old_dfs.append(pd.read_csv(old_file))

old_all = pd.concat(old_dfs, ignore_index=True) if old_dfs else pd.DataFrame()

In [None]:
# Rebuild the per-split reasoning CSVs from answers already stored in
# `old_all` (loaded by the previous cell) instead of calling the model again.
# When a barcode has several stored answers, one is picked at random.
# NOTE(review): this writes to the same 'gpt-4o-mini_symptoms.csv.*' files as
# the generation loop, although `old_all` is read from the gpt41 files -
# confirm that overwriting them is intended.

# Fixed seed so that re-running this cell picks the same answers.
SAMPLE_SEED = 0
_sample_rng = np.random.RandomState(SAMPLE_SEED)

if 'barcode' not in old_all.columns:
    raise KeyError("old_all has no 'barcode' column - were the gpt41_symptoms.csv.* files found?")

# Group the stored answers once, rather than scanning `old_all` for every row.
cached_answers = {
    barcode: group['llm_analyze_symptoms']
    for barcode, group in old_all.groupby('barcode', sort=False)
}

for split in ['train', 'dev', 'test']:
    df = pd.read_csv(f"{DATA_PATH}/metadata_cut_processed.csv.{split}")
    df = df.rename(columns={'coughdur': 'cough_duration', 'ngtsweats': 'night_sweets', 'weightloss': 'weight_loss', 'body_wt': 'body_weight'})

    records = []
    for now_row in tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df)):
        now_barcode = now_row.barcode

        candidates = cached_answers.get(now_barcode)
        if candidates is None:
            # Fail with the offending barcode instead of pandas' generic
            # "cannot sample from an empty frame" error.
            raise ValueError(f"No stored answer for barcode {now_barcode!r} ({split} split)")
        llm_answer = candidates.sample(n=1, random_state=_sample_rng).iloc[0]
        records.append((now_barcode, llm_answer))

    df_temp = pd.DataFrame(records, columns=['barcode', 'llm_analyze_symptoms'])
    df_temp.to_csv(f'datas/reasoning/symptoms/gpt-4o-mini_symptoms.csv.{split}', index=False)

# OpenBioLLM-8B

In [None]:
%cd /home/is/dwipraseetyo-a/NAS_HAI/Project/Qwen2.5-Omni
%pwd

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print("CUDA_VISIBLE_DEVICES:", os.environ["CUDA_VISIBLE_DEVICES"])

import transformers, torch
device_name = torch.cuda.get_device_name(0)
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"Logical index: {i}, Name: {props.name}")

In [None]:
import commons, const_variable
import os, librosa, random, pickle, pydicom, requests, torch, re, json
import pandas as pd
import numpy as np
from tqdm import tqdm

# Locations of the dataset and of the local OpenBioLLM checkpoint.
DATA_PATH = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz"
model_id = "/home/is/dwipraseetyo-a/NAS_HAI/Project/pretrain/Llama3-OpenBioLLM-8B"

# System prompt used for the OpenBioLLM model in this section.
system_prompt = "You are an expert and experienced from the healthcare and biomedical domain with extensive medical knowledge and practical experience. Your name is OpenBioLLM, and you were developed by Saama AI Labs. who's willing to help answer the user's query with explanation. In your explanation, leverage your deep medical expertise such as relevant anatomical structures, physiological processes, diagnostic criteria, treatment guidelines, or other pertinent medical concepts. Use precise medical terminology while still aiming to make the explanation clear and accessible to a general audience."

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load tokenizer and weights from the local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="bfloat16", device_map="auto")

# NOTE: this rebinds `pipeline` from the transformers factory function to the
# pipeline instance; the import above restores the factory whenever this cell
# is re-run.
pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Token ids collected in `terminators`: the tokenizer's EOS id followed by the
# id of the "<|eot_id|>" token.
terminators = [pipeline.tokenizer.eos_token_id]
terminators.append(pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"))

In [None]:
"""
Tuberculosis
Pneumonia
Covid19
Healthy
Others
"""