# Import

In [1]:
%cd /home/is/dwipraseetyo-a/NAS_HAI/Project/Qwen2.5-Omni
%pwd

import commons, const_variable
import os, librosa, random, pickle, pydicom, requests, torch, re, json
import pandas as pd
import numpy as np
from tqdm import tqdm
from openai import OpenAI

/home/ldap-users-2/dwipraseetyo-a/Project/Qwen2.5-Omni


# Iterate

In [8]:
# System prompt passed as the "system" message of every chat-completion request
# made by the generation cells in this notebook.
# NOTE(review): the prompt describes the label as True/False, while the cells that
# build the user message send "This is Tuberculosis Positive/Negative" — confirm
# the wording mismatch is intended.
# NOTE(review): the prompt asks for "JSONL", but the reply is parsed with a single
# json.loads call, i.e. as one JSON object — confirm.
system_prompt = '''
You are a clinical reasoning assistant specialized in respiratory diseases, particularly Tuberculosis (TB).
The user will provide a label (True for TB, False for not TB) along with a list of symptoms. Treat the label as ground truth.
Your task is to perform detailed clinical reasoning by analyzing the provided symptoms and explaining why they support or contradict the TB diagnosis.
Output format: JSONL with one key 'answer'
The value should be a short paragraph of clinical reasoning, grounded in the symptoms.
'''

# Directory that holds the `metadata_cut_processed.csv.<split>` files read by the
# cells below. The original absolute path stays the default; set the
# CIDRZ_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get("CIDRZ_DATA_PATH", "/home/is/dwipraseetyo-a/NAS_HAI/Datasets/cidrz")

In [14]:
# Displays the `missing_records` frame as this cell's rich output.
# NOTE(review): `missing_records` is not assigned in any cell visible in this
# notebook, so this cell only works with leftover kernel state and will raise
# NameError after Restart & Run All — define it in an earlier cell or drop this
# display. TODO confirm where it is supposed to come from.
missing_records

Unnamed: 0,path_file_image,anon_id,Modality,DistanceSourceToDetector,FieldOfViewDimensions,ImagerPixelSpacing,TargetExposureIndex,Sensitivity,PhotometricInterpretation,Rows,...,night_sweets,weight_loss,hiv_status,tobacco_use,cigarretes_perday,body_weight,height,bmi,path_file_audio,recorder_type
0,images/1538052139.npy,1538052139,DX,1000.0,"[350, 425]","[0.15, 0.15]",876.0,1556.0,MONOCHROME1,2336,...,yes,yes,neg,stopped,Unknown,60.0,185.0,18.0,Unknown,Unknown
1,images/1047931055.npy,1047931055,DX,1000.0,"[350, 425]","[0.15, 0.15]",876.0,16675.0,MONOCHROME1,2336,...,no,yes,pos,stopped,Unknown,53.0,166.0,19.0,wavs/01-399-0885_2.wav,2.0
2,images/2708539186.npy,2708539186,DX,1000.0,"[350, 425]","[0.15, 0.15]",876.0,3820.0,MONOCHROME1,2336,...,no,no,neg,never,Unknown,78.0,168.0,28.0,Unknown,Unknown
3,images/1738893439.npy,1738893439,CR,Unknown,Unknown,"[0.175, 0.175]",Unknown,250.0,MONOCHROME2,2434,...,no,no,pos,never,Unknown,70.0,157.0,28.0,wavs/03-399-0304_1.wav,1.0
4,images/1199494881.npy,1199494881,DX,1000.0,"[350, 425]","[0.15, 0.15]",876.0,2051.0,MONOCHROME1,2336,...,yes,yes,pos,current,1-10,66.0,164.0,25.0,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2166,images/8714745935.npy,8714745935,DX,1000.0,"[350, 425]","[0.15, 0.15]",876.0,1419.0,MONOCHROME1,2336,...,no,yes,neg,never,Unknown,59.0,160.0,23.0,wavs/02-399-0689_0.wav,0.0
2167,images/2686120669.npy,2686120669,CR,Unknown,Unknown,"[0.175, 0.175]",Unknown,250.0,MONOCHROME2,2434,...,no,yes,neg,never,Unknown,55.0,174.0,18.0,Unknown,Unknown
2169,images/1050161359.npy,1050161359,DX,1000.0,"[350, 425]","[0.15, 0.15]",876.0,4700.0,MONOCHROME1,2336,...,no,yes,neg,stopped,Unknown,67.0,180.0,21.0,Unknown,Unknown
2170,images/1143433422.npy,1143433422,DX,1000.0,"[350, 425]","[0.15, 0.15]",876.0,3327.0,MONOCHROME1,2336,...,no,no,neg,never,Unknown,83.0,169.0,29.0,Unknown,Unknown


In [None]:
# Model used for the requests; it is also embedded in the output file name so the
# two cannot drift apart.  Alternatives tried: o4-mini, gpt-4o-mini, gpt-4.1
MODEL_NAME = "gpt-4o-mini"


def build_symptom_question(row_dict, features):
    """Render the known symptom fields of one metadata row as a single sentence.

    Parameters
    ----------
    row_dict : dict
        Column name -> value mapping for one row (``itertuples()._asdict()``).
    features : iterable of str
        Column names to include, in the order they should appear.

    Returns
    -------
    str
        ``"<feature with underscores as spaces> is <value>"`` fragments joined
        by ``", "``. Features that are absent from the row, missing (None/NaN)
        or equal to the ``"Unknown"`` sentinel are left out.
    """
    parts = []
    for feat in features:
        value = row_dict.get(feat)
        # The old filter used `.get()` but then indexed `row_dict[feat]`, so an
        # absent column raised KeyError; NaN values were sent as "... is nan".
        if pd.isna(value) or value == "Unknown":
            continue
        parts.append(f"{feat.replace('_', ' ')} is {value}")
    return ", ".join(parts)


def parse_llm_answer(content):
    """Return the value of the ``answer`` key from the model's JSON reply.

    The reply is cut down to the span between the first ``{`` and the last
    ``}`` so that surrounding text (for example a Markdown code fence) is
    ignored. If no closing brace follows the opening one, a ``}`` is appended,
    as the previous implementation did.

    Raises
    ------
    ValueError
        If the reply contains no ``{`` or is not valid JSON
        (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    KeyError
        If the decoded object has no ``answer`` key.
    """
    start = content.find('{')
    if start == -1:
        raise ValueError(f"No JSON object found in model reply: {content!r}")
    end = content.rfind('}')
    payload = content[start:end + 1] if end > start else content[start:] + '}'
    # strict=False accepts raw control characters (e.g. literal newlines) inside
    # JSON strings, which replaces the former regex-based newline escaping that
    # also corrupted newlines located outside of strings.
    return json.loads(payload, strict=False)['answer']


# For every split: ask the model for a reasoning paragraph per metadata row and
# store (barcode, answer) pairs as CSV.
for split in ['train', 'dev', 'test']:
    df = pd.read_csv(f"{DATA_PATH}/metadata_cut_processed.csv.{split}")
    df = df.rename(columns={'coughdur': 'cough_duration', 'ngtsweats': 'night_sweets', 'weightloss': 'weight_loss', 'body_wt': 'body_weight'})

    out_file = f'datas/reasoning/symptoms/{MODEL_NAME}_symptoms.csv.{split}'
    # Create the target directory before spending API calls, not after.
    os.makedirs(os.path.dirname(out_file), exist_ok=True)

    records = []
    try:
        for now_row in tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df)):
            row_dict = now_row._asdict()
            now_barcode = now_row.barcode
            answer = "This is Tuberculosis Positive" if now_row.ground_truth_tb == 1 else "This is Tuberculosis Negative"
            question = build_symptom_question(row_dict, const_variable.columns_soundfeat)

            response = const_variable.clientOpenAI.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"{answer}. The Patient Symptoms Are: {question}"}
                ]
            )

            llm_answer = parse_llm_answer(response.choices[0].message.content)
            records.append((now_barcode, llm_answer))
    finally:
        # Build the frame once from the collected records instead of growing it
        # row by row. Doing it in `finally` keeps the rows gathered so far in
        # `df_temp` when a request or parse fails; the exception still propagates
        # and nothing is written for the failed split.
        df_temp = pd.DataFrame(records, columns=['barcode', 'llm_analyze_symptoms'])

    df_temp.to_csv(out_file, index=False)

In [None]:
# Manual resume pass: repeats the per-row request loop, but ignores the first 620
# rows of `df` and appends the remaining answers to the existing `df_temp`.
# NOTE(review): `df`, `split` and `df_temp` are not created here; they must be
# left over from a previously executed cell, and 620 is a hard-coded offset.
# NOTE(review): the labels ("This is Tuberculosis" / "This is Not Tuberculosis")
# and the output file name (gpt4_symptoms) differ from the per-split generation
# cell even though the model is gpt-4o-mini — confirm this is intended.
count = 0
for now_row in tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df)):
    count += 1
    if count > 620:
        row_dict = now_row._asdict()
        now_barcode = now_row.barcode

        if now_row.ground_truth_tb == 1:
            answer = "This is Tuberculosis"
        else:
            answer = "This is Not Tuberculosis"

        # Keep only the features whose value is not the "Unknown" sentinel.
        known_parts = [
            f"{feat.replace('_', ' ')} is {row_dict[feat]}"
            for feat in const_variable.columns_soundfeat
            if row_dict.get(feat) != "Unknown"
        ]
        question = ", ".join(known_parts)

        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"{answer}. The Patient Symptoms Are: {question}"},
        ]
        response = const_variable.clientOpenAI.chat.completions.create(
            model="gpt-4o-mini",  # o4-mini gpt-4o-mini
            messages=chat_messages,
        )

        # Flatten the reply into one line: drop the newline right after "{" and
        # right before "}", then turn each remaining newline that is not preceded
        # by a backslash into a literal "\n" escape.
        raw_content = response.choices[0].message.content
        trimmed_content = raw_content.replace("{\n", "{").replace("\n}", "}")
        safe_content = re.sub(r'(?<!\\)\n', r'\\n', trimmed_content)
        if not safe_content.endswith('}'):
            safe_content = safe_content + '}'

        response_json = json.loads(safe_content)
        llm_answer = response_json['answer']

        next_index = len(df_temp)
        df_temp.loc[next_index] = [now_barcode, llm_answer]

df_temp.to_csv(f'datas/reasoning/symptoms/gpt4_symptoms.csv.{split}', index=False)

In [19]:
# Collect the previously generated gpt41 answer files (one per split) that exist
# on disk and stack them into a single frame, `old_all`.
old_dfs = []
for split in ('train', 'dev', 'test'):
    old_file = f"datas/reasoning/symptoms/gpt41_symptoms.csv.{split}"
    if not os.path.exists(old_file):
        # Splits without a cached file are skipped.
        continue
    old_df = pd.read_csv(old_file)
    old_dfs.append(old_df)

# With no cached file at all, fall back to an empty frame (it has no columns).
old_all = pd.concat(old_dfs, ignore_index=True) if old_dfs else pd.DataFrame()

In [28]:
# Rebuild the per-split answer files from the cached answers in `old_all`
# instead of querying the model: every metadata row receives one answer drawn
# uniformly at random from the cached answers stored for its barcode.
# NOTE(review): `old_all` is loaded from the gpt41_symptoms files, yet the result
# is written under the gpt-4o-mini file name — confirm this is intended.

# Fixed seed so that re-running the cell reproduces the same selection
# (the previous `.sample(n=1)` call was unseeded).
REUSE_SEED = 42
reuse_rng = np.random.default_rng(REUSE_SEED)

if 'barcode' not in old_all.columns:
    raise ValueError("old_all has no 'barcode' column: no cached answer files were loaded")

# barcode -> list of cached answers, built once instead of scanning `old_all`
# with a boolean mask for every row.
answers_by_barcode = {
    barcode: group.tolist()
    for barcode, group in old_all.groupby('barcode')['llm_analyze_symptoms']
}

for split in ['train', 'dev', 'test']:
    df = pd.read_csv(f"{DATA_PATH}/metadata_cut_processed.csv.{split}")
    df = df.rename(columns={'coughdur': 'cough_duration', 'ngtsweats': 'night_sweets', 'weightloss': 'weight_loss', 'body_wt': 'body_weight'})

    records = []
    for now_row in tqdm(df.itertuples(), desc=f"Processing {split}", total=len(df)):
        now_barcode = now_row.barcode

        candidates = answers_by_barcode.get(now_barcode)
        if not candidates:
            # Same exception type as the former empty `.sample(n=1)`, but with
            # a message that names the offending record.
            raise ValueError(f"No cached answer for barcode {now_barcode!r} (split {split!r})")

        llm_answer = candidates[int(reuse_rng.integers(len(candidates)))]
        records.append((now_barcode, llm_answer))

    # Build the frame once from the collected records instead of row by row.
    df_temp = pd.DataFrame(records, columns=['barcode', 'llm_analyze_symptoms'])
    df_temp.to_csv(f'datas/reasoning/symptoms/gpt-4o-mini_symptoms.csv.{split}', index=False)

Processing train: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2173/2173 [00:02<00:00, 1012.92it/s]
Processing dev: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 272/272 [00:00<00:00, 1061.59it/s]
Processing test: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 272/272 [00:00<00:00, 1060.43it/s]
