# Import

In [6]:
%cd /home/is/dwipraseetyo-a/NAS_HAI/Project/Qwen2.5-Omni
%pwd

import commons, const_variable
import os, librosa, random, pickle, pydicom, requests, torch, re, json
import pandas as pd
import numpy as np
from tqdm import tqdm
from openai import OpenAI

/home/ldap-users-2/dwipraseetyo-a/Project/Qwen2.5-Omni


# Iterate

In [7]:
# System prompt sent as the "system" message of every chat-completion request in
# this notebook. It asks the model to treat the supplied TB label as ground truth
# and to reply with an object holding a single 'answer' key; the cells below
# parse the reply with json.loads and read ['answer'].
# NOTE(review): the prompt says "JSONL" while the parsing code expects one JSON
# object -- consider rewording if malformed replies are frequent.
system_prompt = '''
You are a clinical reasoning assistant specialized in respiratory diseases, particularly Tuberculosis (TB).
The user will provide a label (True for TB, False for not TB) along with a list of symptoms. Treat the label as ground truth.
Your task is to perform detailed clinical reasoning by analyzing the provided symptoms and explaining why they support or contradict the TB diagnosis.
Output format: JSONL with one key 'answer'
The value should be a short paragraph of clinical reasoning, grounded in the symptoms.
'''
# Directory containing the per-split metadata files, read below as
# f"{DATA_PATH}/metadata_cut_processed.csv.{split}".
# NOTE(review): hard-coded absolute path; adjust when running on another machine.
DATA_PATH = "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz"

In [8]:
# Generate a GPT-4.1 clinical-reasoning paragraph for every row of each split and
# write one CSV per split with the columns 'barcode' and 'llm_analyze_symptoms'.
#
# A failure on a single row (API error, unparsable reply) is logged and recorded
# as the sentinel "Error" instead of aborting the whole split, because the CSV
# is only written after the loop finishes.
output_dir = 'datas/reasoning/symptoms'
os.makedirs(output_dir, exist_ok=True)  # DataFrame.to_csv does not create missing directories

for split in ['train', 'dev', 'test']:
    df = pd.read_csv(f"{DATA_PATH}/metadata_cut_processed.csv.{split}")
    df = df.rename(columns={'coughdur': 'cough_duration', 'ngtsweats': 'night_sweets', 'weightloss': 'weight_loss', 'body_wt': 'body_weight'})

    # Accumulate plain tuples and build the frame once, rather than enlarging a
    # DataFrame row by row with .loc[len(df)].
    records = []
    for now_row in tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df)):
        row_dict = now_row._asdict()
        now_barcode = now_row.barcode
        answer = "This is Tuberculosis Positive" if now_row.ground_truth_tb == 1 else "This is Tuberculosis Negative"
        # Features that are absent from this row or marked "Unknown" are left out of the prompt.
        question = ", ".join(
            f"{feat.replace('_', ' ')} is {row_dict[feat]}"
            for feat in const_variable.columns_soundfeat
            if feat in row_dict and row_dict[feat] != "Unknown"
        )

        try:
            response = const_variable.clientOpenAI.chat.completions.create(
                model="gpt-4.1",  # o4-mini gpt-4o-mini gpt-4.1
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"{answer}. The Patient Symptoms Are: {question}"}
                ]
            )

            # Surrounding whitespace would otherwise end up after the closing brace
            # once the repair below escapes newlines.
            raw_content = response.choices[0].message.content.strip()
            try:
                # strict=False accepts raw newlines inside JSON string values.
                llm_answer = json.loads(raw_content, strict=False)['answer']
            except json.JSONDecodeError:
                # Fallback repair: escape bare newlines and close a missing final brace.
                safe_content = re.sub(r'(?<!\\)\n', r'\\n', raw_content.replace("{\n", "{").replace("\n}", "}"))
                if not safe_content.endswith('}'):
                    safe_content += '}'
                llm_answer = json.loads(safe_content)['answer']
        except Exception as e:
            print(f"[ERROR] {now_barcode}: {e}")
            llm_answer = "Error"

        records.append((now_barcode, llm_answer))

    df_temp = pd.DataFrame(records, columns=['barcode', 'llm_analyze_symptoms'])
    df_temp.to_csv(f'{output_dir}/gpt41_symptoms.csv.{split}', index=False)

Processing train:   0%|                                                                                                                                                                                             | 0/2151 [00:00<?, ?it/s]

Processing train: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2151/2151 [1:47:50<00:00,  3.01s/it]
Processing dev: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 269/269 [12:51<00:00,  2.87s/it]
Processing test: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 270/270 [13:04<00:00,  2.91s/it]


In [None]:
# Resume pass: iterates the `df` / `split` left in the kernel by an earlier cell,
# skips the first 620 rows, queries gpt-4o-mini for the remaining ones and appends
# each answer to the already existing `df_temp` before saving it.
count = 0
progress = tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df))
for count, now_row in enumerate(progress, start=1):
    if count <= 620:
        continue

    fields = now_row._asdict()
    if now_row.ground_truth_tb == 1:
        label_text = "This is Tuberculosis"
    else:
        label_text = "This is Not Tuberculosis"

    # One "<feature> is <value>" fragment per feature whose value is not "Unknown".
    symptom_parts = [
        f"{feat.replace('_', ' ')} is {fields[feat]}"
        for feat in const_variable.columns_soundfeat
        if fields.get(feat) != "Unknown"
    ]
    user_message = f"{label_text}. The Patient Symptoms Are: {', '.join(symptom_parts)}"

    completion = const_variable.clientOpenAI.chat.completions.create(
        model="gpt-4o-mini",  # o4-mini gpt-4o-mini
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
    )

    # Drop newlines next to the braces, escape the remaining bare newlines and
    # make sure the text ends with a closing brace before parsing.
    reply = completion.choices[0].message.content
    collapsed = reply.replace("{\n", "{").replace("\n}", "}")
    escaped = re.sub(r'(?<!\\)\n', r'\\n', collapsed)
    escaped = escaped if escaped.endswith('}') else escaped + '}'

    parsed = json.loads(escaped)
    df_temp.loc[len(df_temp)] = [now_row.barcode, parsed['answer']]

df_temp.to_csv(f'datas/reasoning/symptoms/gpt4_symptoms.csv.{split}', index=False)

In [None]:
import pandas as pd
import json
import re
from tqdm import tqdm
import concurrent.futures

def _parse_llm_answer(raw_content):
    """Return the value stored under 'answer' in a raw model reply.

    The reply is stripped of surrounding whitespace and of an enclosing
    markdown code fence, then parsed with ``json.loads(..., strict=False)`` so
    raw newlines inside string values are accepted. If that fails, the legacy
    repair is applied (escape bare newlines, append a missing closing brace).

    Raises:
        json.JSONDecodeError: the reply cannot be parsed even after repair.
        KeyError / TypeError: the parsed value has no 'answer' key.
    """
    text = raw_content.strip()
    fenced = re.match(r"^```[A-Za-z]*\s*(.*?)\s*```$", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1)

    try:
        return json.loads(text, strict=False)['answer']
    except json.JSONDecodeError:
        repaired = re.sub(r'(?<!\\)\n', r'\\n', text.replace("{\n", "{").replace("\n}", "}"))
        if not repaired.endswith('}'):
            repaired += '}'
        return json.loads(repaired)['answer']


def process_row(now_row, model="gpt-4.1", client=None, features=None, prompt=None):
    """Ask the chat model for clinical reasoning about one metadata row.

    Args:
        now_row: a named tuple (e.g. from ``DataFrame.itertuples``) exposing
            ``barcode``, ``ground_truth_tb`` and the symptom fields.
        model: chat-completion model name.
        client: object exposing ``chat.completions.create``; defaults to
            ``const_variable.clientOpenAI``.
        features: names of the row fields to list in the prompt; defaults to
            ``const_variable.columns_soundfeat``.
        prompt: system prompt text; defaults to the notebook's ``system_prompt``.

    Returns:
        ``(barcode, answer)``. ``answer`` is the string ``"Error"`` when the
        request or the parsing of the reply fails; the error is printed.
    """
    # Defaults are resolved at call time so the notebook globals are only
    # touched when the caller did not supply a replacement.
    if client is None:
        client = const_variable.clientOpenAI
    if features is None:
        features = const_variable.columns_soundfeat
    if prompt is None:
        prompt = system_prompt

    row_dict = now_row._asdict()
    now_barcode = now_row.barcode
    answer = "This is Tuberculosis" if now_row.ground_truth_tb == 1 else "This is Not Tuberculosis"

    # Features missing from the row or marked "Unknown" are left out of the
    # prompt; checking membership first avoids a KeyError outside the try block.
    question = ", ".join(
        f"{feat.replace('_', ' ')} is {row_dict[feat]}"
        for feat in features
        if feat in row_dict and row_dict[feat] != "Unknown"
    )

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": f"{answer}. The Patient Symptoms Are: {question}"}
            ]
        )
        llm_answer = _parse_llm_answer(response.choices[0].message.content)
    except Exception as e:
        print(f"[ERROR] {now_barcode}: {e}")
        llm_answer = "Error"

    return now_barcode, llm_answer


# Run process_row over every row of each split on a pool of 10 threads and save
# the (barcode, answer) pairs as one CSV per split.
for split in ['train', 'dev', 'test']:
    column_aliases = {
        'coughdur': 'cough_duration',
        'ngtsweats': 'night_sweets',
        'weightloss': 'weight_loss',
        'body_wt': 'body_weight',
    }
    df = pd.read_csv(f"{DATA_PATH}/metadata_cut_processed.csv.{split}").rename(columns=column_aliases)

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        # Executor.map yields results in input order; list() drains it inside
        # the `with` block so the tqdm bar tracks completed rows.
        mapped = pool.map(process_row, df.itertuples(index=False))
        row_outputs = list(tqdm(mapped, total=len(df), desc=f"Processing {split}"))

    # Collect results
    results = [
        {'barcode': barcode, 'llm_analyze_symptoms': llm_answer}
        for barcode, llm_answer in row_outputs
    ]

    df_temp = pd.DataFrame(results)
    df_temp.to_csv(f'datas/reasoning/symptoms/gpt41_symptoms.csv.{split}', index=False)
