# CT-RATE label csv 가져와서 id 매칭

In [None]:
import pandas as pd

# ctrate
ct_rate_df = pd.read_csv('/workspace/7.Error/csv/CTRATE_train_predicted_labels.csv')
ct_rate_df = ct_rate_df[['VolumeName', 'Lung nodule', 'Pleural effusion']]

# radgenome
abnormal_df = pd.read_csv('/workspace/7.Error/csv/RADGENOME_train_vqa_abnormality.csv')
abnormal_df = abnormal_df[abnormal_df['Anatomy'] == 'lung']

# merge ctrate
ct_rate_df = ct_rate_df.merge(abnormal_df, left_on='VolumeName', right_on='Volumename', how='left')
ct_rate_df.drop(columns=['Volumename'], inplace=True)

# radgenome
train_df = pd.read_csv('/workspace/7.Error/csv/medregion_report_train.csv')
valid_df = pd.read_csv('/workspace/7.Error/csv/medregion_report_valid.csv')
data_df  = pd.concat([train_df, valid_df], axis=0)
data_df  = data_df[['ct_path', 'id', 'full_report', 'lung_parenchyma']]

# merge radgenome
merged_df = data_df.merge(ct_rate_df, left_on='id', right_on='VolumeName', how='left')
merged_df.drop(columns=['VolumeName'], inplace=True)


In [None]:
merged_df.head(3)

In [None]:
nodule_df = merged_df[(merged_df['Lung nodule'] == 1) 
                      & (merged_df['Pleural effusion'] == 0) 
                      & (merged_df['lung_parenchyma'].str.contains('nodule'))]

effusion_df = merged_df[(merged_df['Lung nodule'] == 0) 
                        & (merged_df['Pleural effusion'] == 1) 
                        & (merged_df['lung_parenchyma'].str.contains('effusion'))]

none_df = merged_df[(merged_df['Lung nodule'] == 0) 
                    & (merged_df['Pleural effusion'] == 0) 
                    & (merged_df['Abnormality'].str.lower()=='no findings')]

len(nodule_df), len(effusion_df), len(none_df)

In [None]:
# 중복 체크

print(set(nodule_df['id'].values) & set(none_df['id'].values))
print(set(effusion_df['id'].values) & set(none_df['id'].values))
print(set(nodule_df['id'].values) & set(effusion_df['id'].values))

In [None]:
import random
random.seed(42)

nodule_sample_df   = nodule_df.sample(1000)
effusion_sample_df = effusion_df.sample(1000)
none_sample_df     = none_df.sample(1000)


In [None]:
nodule_sample_df

In [None]:
# 저장

nodule_sample_df.to_csv('/workspace/7.Error/csv/nodule_sample_1000.csv', index=False)
effusion_sample_df.to_csv('/workspace/7.Error/csv/effusion_sample_1000.csv', index=False)
none_sample_df.to_csv('/workspace/7.Error/csv/none_sample_1000.csv', index=False)

# Error generation using dspy

In [None]:
# python code/error_generation_dspy.py --csv_path /workspace/7.Error/csv --save_path /workspace/7.Error/csv

In [None]:
# !pip install dspy pandas

# Error Pair 만들기

In [1]:
import pandas as pd

none_sample_df     = pd.read_csv('/workspace/7.Error/csv/dspy_none_1000.csv')
effusion_sample_df = pd.read_csv('/workspace/7.Error/csv/dspy_effusion_1000.csv')
nodule_sample_df   = pd.read_csv('/workspace/7.Error/csv/dspy_nodule_1000.csv')

omission_effusion   = effusion_sample_df[effusion_sample_df['omission_effusion_classification'] == 1]
insertion_nodule    = effusion_sample_df[effusion_sample_df['insertion_nodule_classification'] == 1]
direction_effusion  = effusion_sample_df[effusion_sample_df['direction_effusion_classification'] == 1]
size_effusion       = effusion_sample_df[effusion_sample_df['size_classification'] == 1]
typo_effusion       = effusion_sample_df[effusion_sample_df['typo_classification'] == 1]
unit_effusion       = effusion_sample_df[effusion_sample_df['unit_classification'] == 1]

omission_nodule    = nodule_sample_df[nodule_sample_df['omission_nodule_classification'] == 1]
insertion_effusion = nodule_sample_df[nodule_sample_df['insertion_effusion_classification'] == 1]
direction_nodule   = nodule_sample_df[nodule_sample_df['direction_nodule_classification'] == 1]
size_nodule        = nodule_sample_df[nodule_sample_df['size_classification'] == 1]
typo_nodule        = nodule_sample_df[nodule_sample_df['typo_classification'] == 1]
unit_nodule        = nodule_sample_df[nodule_sample_df['unit_classification'] == 1]

none_insert_nodule   = none_sample_df[none_sample_df['insertion_nodule_classification'] == 1]
none_insert_effusion = none_sample_df[none_sample_df['insertion_effusion_classification'] == 1]

## Question format

In [2]:
question_format = {
    "classification": [
        "Evaluate the accuracy of the provided medical report based on the given CT scan image. Respond strictly with either 0 (report accurate) or 1 (report contains inaccuracies).\nAnswer Format: '0' or '1'",
        "Check the provided medical report for correctness using the CT scan image as a reference. Your response must strictly be either 0 (report is correct) or 1 (report is incorrect).\nAnswer Format: '0' or '1'",
        "Determine if the medical report accurately describes the provided CT scan. Answer strictly 0 if the report is correct, or 1 if it contains any errors.\nAnswer Format: '0' or '1'",
        "Assess the medical report for any errors or inaccuracies using the provided CT scan image as the standard reference. Return exactly 0 if no errors exist, or exactly 1 if errors are present.\nAnswer Format: '0' or '1'",
        "Verify whether the medical report is accurate based on the provided CT scan. Return strictly 0 for correct or 1 if any inaccuracies exist.\nAnswer Format: '0' or '1'"
    ],    
    
    "detection": [
        "Identify and extract sentences from the provided medical report that contain inaccuracies or inconsistencies relative to the image. If none exist, reply: 'No errors detected.'\nAnswer Format: '{error sentence}' or 'No errors detected.'",
        "Detect report-image errors. Output erroneous sentence(s) or 'No errors detected.'\nAnswer Format: '{error sentence}' or 'No errors detected.'",
        "Check the medical report for any errors, including mismatches with the image. If any such issues are found, point out the exact sentence(s) involved. If everything is accurate and consistent, say: 'No errors detected.'\nAnswer Format: '{error sentence}' or 'No errors detected.'",
        "Scan the medical report for any errors or factual discrepancies in comparison to the image. Return erroneous sentence(s) or respond with 'No errors detected.'\nAnswer Format: '{error sentence}' or 'No errors detected.'",
        "Highlight any incorrect, mismatched, or erroneous sentences in the medical report, including those that conflict with the provided image. If no issues are found, respond clearly with 'No errors detected.'\nAnswer Format: '{error sentence}' or 'No errors detected.'"
    ],   
    
    "detection_index": [
        "Identify and extract the index numbers of sentences in the provided medical report that contain inaccuracies or inconsistencies with reference to the image. If no errors are found, output '0' to indicate no issues.\nAnswer Format: '{index}' or '0'",
        "Detect report-image errors. Output the index of each erroneous sentence. If there are no errors, respond with '0' to indicate all sentences are correct.\nAnswer Format: '{index}' or '0'",
        "Check the medical report for any errors, including mismatches with the image. If any such issues are found, return the index numbers of the sentences involved. If everything is consistent, output '0' to indicate no errors.\nAnswer Format: '{index}' or '0'",
        "Scan the medical report for any errors or factual discrepancies in comparison to the image. Return the index numbers of any erroneous sentences. If no such sentences exist, respond with '0' to indicate no issues were detected.\nAnswer Format: '{index}' or '0'",
        "Highlight any incorrect, mismatched, or erroneous sentences in the medical report, including those that conflict with the provided image. If the report is fully accurate, clearly respond with '0' to show that no errors were found.\nAnswer Format: '{index}' or '0'"
    ],    
    
    "correction": [
        "Review the medical report in comparison with the provided image. First, identify any sentence(s) that are inaccurate or any important findings from the image that are missing from the report. Then, provide a corrected version of the inaccurate sentence(s) or write the missing sentence(s) that should have been included. If no issues are found, respond: 'No errors detected.'\nAnswer Format: '{corrected or added sentence}' or 'No errors detected.'",
        "Compare the medical report with the corresponding image. First, identify any erroneous or omitted sentence(s) based on the image. Then, revise the incorrect sentence(s) or generate the sentence(s) that were missing. If the report is accurate and complete, respond: 'No errors detected.'\nAnswer Format: '{corrected or added sentence}' or 'No errors detected.'",
        "Analyze the medical report with reference to the provided image. First, detect any errors or missing descriptions. Then, return the corrected version of the inaccurate content or the missing sentence(s). If everything is accurate, respond with 'No errors detected.'\nAnswer Format: '{corrected or added sentence}' or 'No errors detected.'",
        "Evaluate the medical report with the given image. Begin by identifying any inaccurate or missing statement(s) in the report. After identifying them, present either the corrected version or the appropriate missing sentence(s). If no problems are found, respond: 'No errors detected.'\nAnswer Format: '{corrected or added sentence}' or 'No errors detected.'",
        "Examine the medical report alongside the associated image. First, identify any inaccuracies or omissions. Then, provide the corrected version of the sentence(s) or the sentence(s) that should be added. If the report is both accurate and complete, reply with: 'No errors detected.'\nAnswer Format: '{corrected or added sentence}' or 'No errors detected.'"
    ],
}

## level 1

### insertion

In [3]:
import random
from tqdm import tqdm
import json

def generate_insert_conversations(df, lesion_type, report_col, inserted_sentence_col, question_format, start_index=0):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['classification'])
        report = sample[report_col]
        distorted_sentence = sample[inserted_sentence_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = f"{distorted_sentence}"

        human_dict = {
            'type': 'insertion',
            'level': '1',
            'lesion': lesion_type,
            'from': 'human',
            'value': input_text.strip()
        }

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


# 실행 부분
full_list = []
full_list += generate_insert_conversations(none_insert_nodule, 
                                           lesion_type='nodule',
                                           report_col='insertion_nodule_distorted_report',
                                           inserted_sentence_col='insertion_nodule_classification',
                                           question_format=question_format,
                                           start_index=0)

full_list += generate_insert_conversations(none_insert_effusion, 
                                           lesion_type='effusion',
                                           report_col='insertion_effusion_distorted_report',
                                           inserted_sentence_col='insertion_effusion_classification',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_insert_conversations(insertion_effusion, 
                                           lesion_type='effusion',
                                           report_col='insertion_effusion_distorted_report',
                                           inserted_sentence_col='insertion_effusion_classification',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_insert_conversations(insertion_nodule, 
                                           lesion_type='nodule',
                                           report_col='insertion_nodule_distorted_report',
                                           inserted_sentence_col='insertion_nodule_classification',
                                           question_format=question_format,
                                           start_index=len(full_list))


100%|██████████| 1000/1000 [00:00<00:00, 18258.41it/s]
100%|██████████| 997/997 [00:00<00:00, 16288.15it/s]
100%|██████████| 992/992 [00:00<00:00, 15082.39it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15865.22it/s]


### omission

In [4]:
import random
from tqdm import tqdm
import json

def generate_omit_conversations(df, lesion_type, report_col, inserted_sentence_col, question_format, start_index=0):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['classification'])
        report = sample[report_col]
        distorted_sentence = sample[inserted_sentence_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = f"{distorted_sentence}"

        human_dict = {
            'type': 'omission',
            'level': '1',
            'lesion': lesion_type,
            'from': 'human',
            'value': input_text.strip()
        }

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list



# 실행 부분
full_list += generate_omit_conversations(omission_nodule, 
                                         lesion_type='nodule',
                                         report_col='omission_nodule_distorted_report',
                                         inserted_sentence_col='omission_nodule_classification',
                                         question_format=question_format,
                                         start_index=len(full_list))

full_list += generate_omit_conversations(omission_effusion, 
                                         lesion_type='effusion',
                                         report_col='omission_effusion_distorted_report',
                                         inserted_sentence_col='omission_effusion_classification',
                                         question_format=question_format,
                                         start_index=len(full_list))



100%|██████████| 967/967 [00:00<00:00, 16046.67it/s]
100%|██████████| 935/935 [00:00<00:00, 16463.58it/s]


### others

In [5]:
import random
from tqdm import tqdm
import json

def generate_other_conversations(df, report_col, inserted_sentence_col, question_format, type, lesion_type=None, start_index=len(full_list)):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['classification'])
        report = sample[report_col]
        distorted_sentence = sample[inserted_sentence_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = f"{distorted_sentence}"
        
        if lesion_type is not None:
            human_dict = {
                'type': type,
                'level': '1',
                'lesion': lesion_type,
                'from': 'human',
                'value': input_text.strip()
            }
        else:
            human_dict = {
                'type': type,
                'level': '1',
                'from': 'human',
                'value': input_text.strip()
            }            

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


# nodule
full_list += generate_other_conversations(direction_nodule, 
                                          lesion_type='nodule',
                                          report_col='direction_nodule_distorted_report',
                                          inserted_sentence_col='direction_nodule_classification',
                                          type='direction',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(size_nodule, 
                                          report_col='size_distorted_report',
                                          inserted_sentence_col='size_classification',
                                          type='size',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(typo_nodule, 
                                          report_col='typo_distorted_report',
                                          inserted_sentence_col='typo_classification',
                                          type='typo',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(unit_nodule, 
                                          report_col='unit_distorted_report',
                                          inserted_sentence_col='unit_classification',
                                          type='unit',
                                          question_format=question_format,
                                          start_index=len(full_list))



# effusion
full_list += generate_other_conversations(direction_effusion, 
                                          lesion_type='effusion',
                                          report_col='direction_effusion_distorted_report',
                                          inserted_sentence_col='direction_effusion_classification',
                                          type='direction',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(size_effusion, 
                                          report_col='size_distorted_report',
                                          inserted_sentence_col='size_classification',
                                          type='size',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(typo_effusion, 
                                          report_col='typo_distorted_report',
                                          inserted_sentence_col='typo_classification',
                                          type='typo',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(unit_effusion, 
                                          report_col='unit_distorted_report',
                                          inserted_sentence_col='unit_classification',
                                          type='unit',
                                          question_format=question_format,
                                          start_index=len(full_list))


100%|██████████| 955/955 [00:00<00:00, 10759.62it/s]
100%|██████████| 702/702 [00:00<00:00, 16571.93it/s]
100%|██████████| 976/976 [00:00<00:00, 19126.30it/s]
100%|██████████| 873/873 [00:00<00:00, 13085.37it/s]
100%|██████████| 987/987 [00:00<00:00, 15708.23it/s]
100%|██████████| 688/688 [00:00<00:00, 14264.37it/s]
100%|██████████| 977/977 [00:00<00:00, 13944.00it/s]
100%|██████████| 654/654 [00:00<00:00, 15718.01it/s]


### No erros detected

In [6]:
import random
from tqdm import tqdm
import json

def generate_normal_conversations(df, report_col, question_format, type, lesion_type=None, start_index=len(full_list)):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['classification'])
        report = sample[report_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = "0"
        
        if lesion_type is not None:
            human_dict = {
                'type': type,
                'level': '1',
                'lesion': lesion_type,
                'from': 'human',
                'value': input_text.strip()
            }
        else:
            human_dict = {
                'type': type,
                'level': '1',
                'from': 'human',
                'value': input_text.strip()
            }            

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list



# normal
full_list += generate_normal_conversations(nodule_sample_df, 
                                           lesion_type='nodule',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_normal_conversations(effusion_sample_df, 
                                           lesion_type='effusion',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list)) 

full_list += generate_normal_conversations(none_sample_df, 
                                           lesion_type='effusion',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list)) 



100%|██████████| 1000/1000 [00:00<00:00, 12457.98it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14079.10it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8495.34it/s]


In [7]:
len(full_list)

15703

## level 2

### insertion

In [8]:
import random
from tqdm import tqdm
import json

def generate_insert_conversations(df, lesion_type, report_col, inserted_sentence_col, question_format, start_index=0):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['detection'])
        report = sample[report_col]
        distorted_sentence = sample[inserted_sentence_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = f"{distorted_sentence}"

        human_dict = {
            'type': 'insertion',
            'level': '2',
            'lesion': lesion_type,
            'from': 'human',
            'value': input_text.strip()
        }

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


full_list += generate_insert_conversations(none_insert_nodule, 
                                           lesion_type='nodule',
                                           report_col='insertion_nodule_distorted_report',
                                           inserted_sentence_col='insertion_nodule_inserted_sentence',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_insert_conversations(none_insert_effusion, 
                                           lesion_type='effusion',
                                           report_col='insertion_effusion_distorted_report',
                                           inserted_sentence_col='insertion_effusion_inserted_sentence',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_insert_conversations(insertion_effusion, 
                                           lesion_type='effusion',
                                           report_col='insertion_effusion_distorted_report',
                                           inserted_sentence_col='insertion_effusion_inserted_sentence',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_insert_conversations(insertion_nodule, 
                                           lesion_type='nodule',
                                           report_col='insertion_nodule_distorted_report',
                                           inserted_sentence_col='insertion_nodule_inserted_sentence',
                                           question_format=question_format,
                                           start_index=len(full_list))


100%|██████████| 1000/1000 [00:00<00:00, 17455.97it/s]
100%|██████████| 997/997 [00:00<00:00, 20364.07it/s]
100%|██████████| 992/992 [00:00<00:00, 21291.10it/s]
100%|██████████| 1000/1000 [00:00<00:00, 21512.23it/s]


### others

In [9]:
import random
from tqdm import tqdm
import json

def generate_other_conversations(df, report_col, inserted_sentence_col, question_format, type, lesion_type=None, start_index=len(full_list)):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['detection'])
        report = sample[report_col]
        distorted_sentence = sample[inserted_sentence_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = f"{distorted_sentence}"
        
        if lesion_type is not None:
            human_dict = {
                'type': type,
                'level': '2',
                'lesion': lesion_type,
                'from': 'human',
                'value': input_text.strip()
            }
        else:
            human_dict = {
                'type': type,
                'level': '2',
                'from': 'human',
                'value': input_text.strip()
            }            

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


# nodule
full_list += generate_other_conversations(direction_nodule, 
                                          lesion_type='nodule',
                                          report_col='direction_nodule_distorted_report',
                                          inserted_sentence_col='direction_nodule_distorted_sentence',
                                          type='direction',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(size_nodule, 
                                          report_col='size_distorted_report',
                                          inserted_sentence_col='size_distorted_sentence',
                                          type='size',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(typo_nodule, 
                                          report_col='typo_distorted_report',
                                          inserted_sentence_col='typo_distorted_sentence',
                                          type='typo',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(unit_nodule, 
                                          report_col='unit_distorted_report',
                                          inserted_sentence_col='unit_distorted_sentence',
                                          type='unit',
                                          question_format=question_format,
                                          start_index=len(full_list))



# effusion
full_list += generate_other_conversations(direction_effusion, 
                                          lesion_type='effusion',
                                          report_col='direction_effusion_distorted_report',
                                          inserted_sentence_col='direction_effusion_distorted_sentence',
                                          type='direction',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(size_effusion, 
                                          report_col='size_distorted_report',
                                          inserted_sentence_col='size_distorted_sentence',
                                          type='size',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(typo_effusion, 
                                          report_col='typo_distorted_report',
                                          inserted_sentence_col='typo_distorted_sentence',
                                          type='typo',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(unit_effusion, 
                                          report_col='unit_distorted_report',
                                          inserted_sentence_col='unit_distorted_sentence',
                                          type='unit',
                                          question_format=question_format,
                                          start_index=len(full_list))


100%|██████████| 955/955 [00:00<00:00, 16521.18it/s]
100%|██████████| 702/702 [00:00<00:00, 16347.52it/s]
100%|██████████| 976/976 [00:00<00:00, 16548.05it/s]
100%|██████████| 873/873 [00:00<00:00, 20816.65it/s]
100%|██████████| 987/987 [00:00<00:00, 18607.75it/s]
100%|██████████| 688/688 [00:00<00:00, 20365.58it/s]
100%|██████████| 977/977 [00:00<00:00, 18407.97it/s]
100%|██████████| 654/654 [00:00<00:00, 6228.71it/s]


### No erros detected

In [10]:
import random
from tqdm import tqdm
import json

def generate_normal_conversations(df, report_col, question_format, type, lesion_type=None, start_index=len(full_list)):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['detection'])
        report = sample[report_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = "No errors detected."
        
        if lesion_type is not None:
            human_dict = {
                'type': type,
                'level': '2',
                'lesion': lesion_type,
                'from': 'human',
                'value': input_text.strip()
            }
        else:
            human_dict = {
                'type': type,
                'level': '2',
                'from': 'human',
                'value': input_text.strip()
            }            

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list



# normal
full_list += generate_normal_conversations(nodule_sample_df, 
                                           lesion_type='nodule',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_normal_conversations(effusion_sample_df, 
                                           lesion_type='effusion',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list)) 

full_list += generate_normal_conversations(none_sample_df, 
                                           lesion_type='effusion',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list)) 



100%|██████████| 1000/1000 [00:00<00:00, 21918.85it/s]
100%|██████████| 1000/1000 [00:00<00:00, 21703.24it/s]
100%|██████████| 1000/1000 [00:00<00:00, 26075.06it/s]


In [11]:
len(full_list)

29504

## level 2 - indexing

In [12]:
import re

def find_sentence_index(report: str, target_sentence: str, threshold: int = 4) -> int:
    # 문장 추출
    sentences = re.findall(r'\d+\.\s+(.*?)(?=\n\d+\.|\Z)', report, re.DOTALL)

    # 전처리된 타겟 단어 리스트
    target_words = set(target_sentence.strip().lower().rstrip('.').split())

    # 문장별로 중복 단어 수 체크
    for idx, sentence in enumerate(sentences):
        sentence_words = set(sentence.strip().lower().rstrip('.').split())
        common_words = target_words & sentence_words
        
        if len(common_words) >= threshold:
            return idx + 1  # 1-based index
    return -1



### insertion

In [13]:
import random
from tqdm import tqdm
import json
import re


def generate_insert_conversations(df, lesion_type, report_col, inserted_sentence_col, question_format, start_index=0):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['detection_index'])
        report = sample[report_col]
        sentences = re.split(r'(?<=[.!?])\s+', report.strip())
        numbered_text = '\n'.join(f"{i+1}. {sentence}" for i, sentence in enumerate(sentences))
        distorted_sentence = sample[inserted_sentence_col]
        index = find_sentence_index(numbered_text, distorted_sentence)

        input_text = f"{cls_prompt}\nHere is the medical report: \n'''\n{numbered_text}\n'''"
        answer = f"{index}"

        human_dict = {
            'type': 'insertion',
            'level': '2',
            'lesion': lesion_type,
            'from': 'human',
            'value': input_text.strip()
        }

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip(),
            'distorted_sentence': distorted_sentence.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


full_list += generate_insert_conversations(none_insert_nodule, 
                                           lesion_type='nodule',
                                           report_col='insertion_nodule_distorted_report',
                                           inserted_sentence_col='insertion_nodule_inserted_sentence',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_insert_conversations(none_insert_effusion, 
                                           lesion_type='effusion',
                                           report_col='insertion_effusion_distorted_report',
                                           inserted_sentence_col='insertion_effusion_inserted_sentence',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_insert_conversations(insertion_effusion, 
                                           lesion_type='effusion',
                                           report_col='insertion_effusion_distorted_report',
                                           inserted_sentence_col='insertion_effusion_inserted_sentence',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_insert_conversations(insertion_nodule, 
                                           lesion_type='nodule',
                                           report_col='insertion_nodule_distorted_report',
                                           inserted_sentence_col='insertion_nodule_inserted_sentence',
                                           question_format=question_format,
                                           start_index=len(full_list))


100%|██████████| 1000/1000 [00:00<00:00, 14153.50it/s]
100%|██████████| 997/997 [00:00<00:00, 13051.40it/s]
100%|██████████| 992/992 [00:00<00:00, 10014.58it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8849.47it/s]


In [14]:
print(full_list[-1]['conversations'][0]['value'])
print(full_list[-1]['conversations'][1]['value'])
print(full_list[-1]['conversations'][1]['distorted_sentence'])

Detect report-image errors. Output the index of each erroneous sentence. If there are no errors, respond with '0' to indicate all sentences are correct.
Answer Format: '{index}' or '0'
Here is the medical report: 
'''
1. Atelectasis was observed adjacent to the effusion in the lower lobes of both lungs.
2. There are emphysematous changes in both lungs.
3. There is bilateral pleural effusion.
4. The pleural effusion measured 40 mm at its thickest point.
5. A 1.7 cm nodule with a lobulated border in the left lower lobe.
'''
5
A 1.7 cm nodule with a lobulated border in the left lower lobe.


### others

In [15]:
import random
from tqdm import tqdm
import json

def generate_other_conversations(df, report_col, inserted_sentence_col, question_format, type, lesion_type=None, start_index=len(full_list)):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['detection_index'])
        report = sample[report_col]
        sentences = re.split(r'(?<=[.!?])\s+', report.strip())
        numbered_text = '\n'.join(f"{i+1}. {sentence}" for i, sentence in enumerate(sentences))
        distorted_sentence = sample[inserted_sentence_col]
        index = find_sentence_index(numbered_text, distorted_sentence)

        input_text = f"{cls_prompt}\nHere is the medical report: \n'''\n{numbered_text}\n'''"
        answer = f"{index}"
        
        if lesion_type is not None:
            human_dict = {
                'type': type,
                'level': '2',
                'lesion': lesion_type,
                'from': 'human',
                'value': input_text.strip()
            }
        else:
            human_dict = {
                'type': type,
                'level': '2',
                'from': 'human',
                'value': input_text.strip()
            }            

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip(),
            'distorted_sentence': distorted_sentence.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


# nodule
full_list += generate_other_conversations(direction_nodule, 
                                          lesion_type='nodule',
                                          report_col='direction_nodule_distorted_report',
                                          inserted_sentence_col='direction_nodule_distorted_sentence',
                                          type='direction',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(size_nodule, 
                                          report_col='size_distorted_report',
                                          inserted_sentence_col='size_distorted_sentence',
                                          type='size',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(typo_nodule, 
                                          report_col='typo_distorted_report',
                                          inserted_sentence_col='typo_distorted_sentence',
                                          type='typo',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(unit_nodule, 
                                          report_col='unit_distorted_report',
                                          inserted_sentence_col='unit_distorted_sentence',
                                          type='unit',
                                          question_format=question_format,
                                          start_index=len(full_list))



# effusion
full_list += generate_other_conversations(direction_effusion, 
                                          lesion_type='effusion',
                                          report_col='direction_effusion_distorted_report',
                                          inserted_sentence_col='direction_effusion_distorted_sentence',
                                          type='direction',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(size_effusion, 
                                          report_col='size_distorted_report',
                                          inserted_sentence_col='size_distorted_sentence',
                                          type='size',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(typo_effusion, 
                                          report_col='typo_distorted_report',
                                          inserted_sentence_col='typo_distorted_sentence',
                                          type='typo',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(unit_effusion, 
                                          report_col='unit_distorted_report',
                                          inserted_sentence_col='unit_distorted_sentence',
                                          type='unit',
                                          question_format=question_format,
                                          start_index=len(full_list))


100%|██████████| 955/955 [00:00<00:00, 10814.94it/s]
100%|██████████| 702/702 [00:00<00:00, 10101.24it/s]
100%|██████████| 976/976 [00:00<00:00, 10329.29it/s]
100%|██████████| 873/873 [00:00<00:00, 5314.18it/s]
100%|██████████| 987/987 [00:00<00:00, 9666.73it/s]
100%|██████████| 688/688 [00:00<00:00, 9878.95it/s]
100%|██████████| 977/977 [00:00<00:00, 9446.50it/s]
100%|██████████| 654/654 [00:00<00:00, 10227.00it/s]


In [16]:
print(full_list[-3]['conversations'][0]['value'])
print(full_list[-3]['conversations'][1]['value'])
print(full_list[-3]['conversations'][1]['distorted_sentence'])

Check the medical report for any errors, including mismatches with the image. If any such issues are found, return the index numbers of the sentences involved. If everything is consistent, output '0' to indicate no errors.
Answer Format: '{index}' or '0'
Here is the medical report: 
'''
1. There is compression atelectesis in the accompanying lung parenchyma.
2. There are areas of linear atelectasis in the lower lobe of the left lung, and there are areas of consolidation that may be compatible with interlober-intralobular signal thickness increases and effusion in places in the lower lobe of both lungs.
3. The volume of the lower lobe of the right lung has decreased, and a consolidation area consistent with ataelactasia is observed in the lower lobe of the right lung.
4. There are ground glass opacities in the parenchyma around the consolidation area.
5. When examined in the lung parenchyma window; Pleural effusion reaching 4 mm in the thickest part of the right lung is observed.
'''
1


### No erros detected

In [17]:
import random
from tqdm import tqdm
import json

def generate_normal_conversations(df, report_col, question_format, type, lesion_type=None, start_index=len(full_list)):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['detection_index'])
        report = sample[report_col]
        sentences = re.split(r'(?<=[.!?])\s+', report.strip())
        numbered_text = '\n'.join(f"{i+1}. {sentence}" for i, sentence in enumerate(sentences))

        input_text = f"{cls_prompt}\nHere is the medical report: \n'''\n{numbered_text}\n'''"
        answer = "0"
        
        if lesion_type is not None:
            human_dict = {
                'type': type,
                'level': '2',
                'lesion': lesion_type,
                'from': 'human',
                'value': input_text.strip()
            }
        else:
            human_dict = {
                'type': type,
                'level': '2',
                'from': 'human',
                'value': input_text.strip()
            }            

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list



# normal
full_list += generate_normal_conversations(nodule_sample_df, 
                                           lesion_type='nodule',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_normal_conversations(effusion_sample_df, 
                                           lesion_type='effusion',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list)) 

full_list += generate_normal_conversations(none_sample_df, 
                                           lesion_type='effusion',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list)) 



100%|██████████| 1000/1000 [00:00<00:00, 16879.91it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14835.54it/s]
100%|██████████| 1000/1000 [00:00<00:00, 21019.34it/s]


In [18]:
len(full_list)

43305

In [19]:
print(full_list[-1]['conversations'][0]['value'])
print(full_list[-1]['conversations'][1]['value'])
# print(full_list[-1]['conversations'][1]['distorted_sentence'])

Highlight any incorrect, mismatched, or erroneous sentences in the medical report, including those that conflict with the provided image. If the report is fully accurate, clearly respond with '0' to show that no errors were found.
Answer Format: '{index}' or '0'
Here is the medical report: 
'''
1. In the evaluation of both lung parenchyma; No mass nodule infiltration was observed in both lung parenchyma.
2. Pleural effusion-thickening was not detected in both hemithorax.
'''
0


In [20]:
cnt = 0
for i in full_list:
    if i['conversations'][1]['value'] == "-1":
        cnt += 1
        print(i['id'], i['image'])
        print(i['conversations'][0]['value'])
        print(i['conversations'][1]['value'])
        print(i['conversations'][1]['distorted_sentence'])
        print('---')

report_generation_38966 train_484_a_1.nii.gz
Highlight any incorrect, mismatched, or erroneous sentences in the medical report, including those that conflict with the provided image. If the report is fully accurate, clearly respond with '0' to show that no errors were found.
Answer Format: '{index}' or '0'
Here is the medical report: 
'''
1. Segmentary-subsegmental tubular bronchiectasis and peribronchial thickening were observed in both lungs.
2. In addition, 97x50 mm sized infected bulla formation with air-fluid leveling was observed in the right lung lower lobe basal.
3. Other findings are stable.
4. Emphysema areas are panacinar in the right lung lower lobe basal and left lung upper lobe apical segments.
5. It is stable.
6. Bula formations were observed in the left lung apex and in the left inferior lingular segment.
7. Diffuse paraseptal-centracinar emphysema areas were observed in both lungs.
8. A pleural effusion measuring 10 mm in the deepest part on the right (17.8 mm in the p

In [21]:
for i in full_list:
    if i['id'] == "report_generation_38966":
        i['conversations'][1]['value'] = "8"

    elif i['id'] == "report_generation_39112":
        i['conversations'][1]['value'] = "3"    

    elif i['id'] == "report_generation_39226":
        i['conversations'][1]['value'] = "1"        
    

In [22]:
for i in full_list:
    if i['conversations'][1]['value'] == "-1":
        print(i['id'])
        print(i['conversations'][0]['value'])
        print(i['conversations'][1]['value'])
        print(i['conversations'][1]['distorted_sentence'])
        print('---')

## level 3

### omission

In [23]:
import random
from tqdm import tqdm
import json

def generate_omit_conversations(df, lesion_type, report_col, inserted_sentence_col, question_format, start_index=0):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['correction'])
        report = sample[report_col]
        corrected_sentence = sample[inserted_sentence_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = f"{corrected_sentence}"

        human_dict = {
            'type': 'omission',
            'level': '3',
            'lesion': lesion_type,
            'from': 'human',
            'value': input_text.strip()
        }

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


# 실행 부분
full_list += generate_omit_conversations(omission_nodule, 
                                         lesion_type='nodule',
                                         report_col='omission_nodule_distorted_report',
                                         inserted_sentence_col='omission_nodule_deleted_sentence',
                                         question_format=question_format,
                                         start_index=len(full_list))

full_list += generate_omit_conversations(omission_effusion, 
                                         lesion_type='effusion',
                                         report_col='omission_effusion_distorted_report',
                                         inserted_sentence_col='omission_effusion_deleted_sentence',
                                         question_format=question_format,
                                         start_index=len(full_list))



100%|██████████| 967/967 [00:00<00:00, 11592.04it/s]
100%|██████████| 935/935 [00:00<00:00, 18851.12it/s]


### others

In [24]:
import random
from tqdm import tqdm
import json

def generate_other_conversations(df, report_col, inserted_sentence_col, corrected_sentence_col, question_format, type, lesion_type=None, start_index=len(full_list)):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['correction'])
        report = sample[report_col]
        corrected_sentence = sample[corrected_sentence_col]
        
        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = f"{corrected_sentence}"
        
        if lesion_type is not None:
            human_dict = {
                'type': type,
                'level': '3',
                'lesion': lesion_type,
                'from': 'human',
                'value': input_text.strip()
            }
        else:
            human_dict = {
                'type': type,
                'level': '3',
                'from': 'human',
                'value': input_text.strip()
            }            

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


# nodule
full_list += generate_other_conversations(direction_nodule, 
                                          lesion_type='nodule',
                                          report_col='direction_nodule_distorted_report',
                                          inserted_sentence_col='direction_nodule_distorted_sentence',
                                          corrected_sentence_col='direction_nodule_corrected_sentence',
                                          type='direction',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(size_nodule, 
                                          report_col='size_distorted_report',
                                          inserted_sentence_col='size_distorted_sentence',
                                          corrected_sentence_col='size_corrected_sentence',
                                          type='size',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(typo_nodule, 
                                          report_col='typo_distorted_report',
                                          inserted_sentence_col='typo_distorted_sentence',
                                          corrected_sentence_col='typo_corrected_sentence',
                                          type='typo',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(unit_nodule, 
                                          report_col='unit_distorted_report',
                                          inserted_sentence_col='unit_distorted_sentence',
                                          corrected_sentence_col='unit_corrected_sentence',
                                          type='unit',
                                          question_format=question_format,
                                          start_index=len(full_list))



# effusion
full_list += generate_other_conversations(direction_effusion, 
                                          lesion_type='effusion',
                                          report_col='direction_effusion_distorted_report',
                                          inserted_sentence_col='direction_effusion_distorted_sentence',
                                          corrected_sentence_col='direction_effusion_corrected_sentence',
                                          type='direction',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(size_effusion, 
                                          report_col='size_distorted_report',
                                          inserted_sentence_col='size_distorted_sentence',
                                          corrected_sentence_col='size_corrected_sentence',
                                          type='size',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(typo_effusion, 
                                          report_col='typo_distorted_report',
                                          inserted_sentence_col='typo_distorted_sentence',
                                          corrected_sentence_col='typo_corrected_sentence',
                                          type='typo',
                                          question_format=question_format,
                                          start_index=len(full_list))

full_list += generate_other_conversations(unit_effusion, 
                                          report_col='unit_distorted_report',
                                          inserted_sentence_col='unit_distorted_sentence',
                                          corrected_sentence_col='unit_corrected_sentence',
                                          type='unit',
                                          question_format=question_format,
                                          start_index=len(full_list))


100%|██████████| 955/955 [00:00<00:00, 10565.00it/s]
100%|██████████| 702/702 [00:00<00:00, 17634.31it/s]
100%|██████████| 976/976 [00:00<00:00, 19503.56it/s]
100%|██████████| 873/873 [00:00<00:00, 20474.66it/s]
100%|██████████| 987/987 [00:00<00:00, 21201.80it/s]
100%|██████████| 688/688 [00:00<00:00, 20200.21it/s]
100%|██████████| 977/977 [00:00<00:00, 21015.40it/s]
100%|██████████| 654/654 [00:00<00:00, 19859.51it/s]


### No erros detected

In [25]:
import random
from tqdm import tqdm
import json

def generate_normal_conversations(df, report_col, question_format, type, lesion_type=None, start_index=len(full_list)):
    data_list = []
    for i in tqdm(range(len(df))):
        sample = df.iloc[i]
        cls_prompt = random.choice(question_format['correction'])
        report = sample[report_col]

        input_text = f"{cls_prompt}\nHere is the medical report: '''{report}'''"
        answer = "No errors detected."
        
        if lesion_type is not None:
            human_dict = {
                'type': type,
                'level': '3',
                'lesion': lesion_type,
                'from': 'human',
                'value': input_text.strip()
            }
        else:
            human_dict = {
                'type': type,
                'level': '3',
                'from': 'human',
                'value': input_text.strip()
            }            

        gpt_dict = {
            'from': 'gpt',
            'value': answer.strip()
        }

        conversation = [human_dict, gpt_dict]

        data_dict = {
            'id': f'report_generation_{start_index + i}',
            'image': sample['id'],
            'conversations': conversation
        }

        data_list.append(data_dict)
    
    return data_list


# normal
full_list += generate_normal_conversations(nodule_sample_df, 
                                           lesion_type='nodule',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list))

full_list += generate_normal_conversations(effusion_sample_df, 
                                           lesion_type='effusion',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list)) 

full_list += generate_normal_conversations(none_sample_df, 
                                           lesion_type='effusion',
                                           report_col='lung_parenchyma',
                                           type='normal',
                                           question_format=question_format,
                                           start_index=len(full_list)) 



100%|██████████| 1000/1000 [00:00<00:00, 19506.58it/s]
100%|██████████| 1000/1000 [00:00<00:00, 21745.89it/s]
100%|██████████| 1000/1000 [00:00<00:00, 25627.69it/s]


In [26]:
len(full_list)

55019

## fixed 경로 추가

In [27]:
import glob

train_meta_list = glob.glob("/workspace/5.Lung/datasets/CT-RATE-Revised/dataset/train_fixed/*/*/*.nii.gz")

In [28]:
import pandas as pd

df = pd.DataFrame(train_meta_list, columns=['image_path'])

list_train = [i['image'] for i in full_list]

In [29]:
new_list = []
for i in range(len(list_train)):
    new_list.append(df[df['image_path'].str.contains(list_train[i])].values[0][0])

In [30]:
len(new_list), len(list_train), len(df), len(full_list)

(55019, 55019, 23887, 55019)

In [31]:
for i in range(len(full_list)):
    full_list[i]['image_path'] = new_list[i]

In [32]:
import json

with open("/workspace/7.Error/error_full_list_final_new.json", "w") as f:
    json.dump(full_list, f, indent=4)


In [34]:
# 나누기

In [36]:
level_1_list = []
level_2_list = []
level_3_list = []

for i in full_list:
    if i['conversations'][0]['level'] == "1":
        level_1_list.append(i)
    elif i['conversations'][0]['level'] == "2":
        level_2_list.append(i)
    elif i['conversations'][0]['level'] == "3":
        level_3_list.append(i)        

In [50]:
with open("/workspace/7.Error/error_full_list_final_new_level1.json", "w") as f:
    json.dump(level_1_list, f, indent=4)

with open("/workspace/7.Error/error_full_list_final_new_level2.json", "w") as f:
    json.dump(level_2_list[:-13801], f, indent=4)

with open("/workspace/7.Error/error_full_list_final_new_level2_index.json", "w") as f:
    json.dump(level_2_list[-13801:], f, indent=4)

with open("/workspace/7.Error/error_full_list_final_new_level3.json", "w") as f:
    json.dump(level_3_list, f, indent=4)            


# RadFM style

In [None]:
# nii.gz load 하고, resize 512x512로 resize, 그리고 복사해서 3 ch 만듬.
# depth 축을 64로 ndimage.zoom을 이용하여 구축.
# min max normalization 후에 torch tensor로 변환.
    

In [None]:
# !pip install einops==0.6.1
# !pip install einops-exts==0.0.4
# !pip install huggingface-hub==0.16.4
# !pip install nibabel==5.1.0
# !pip install nmslib==2.1.1
# !pip install opencv-python==4.8.0.76
# !pip install pandas==2.0.3
# !pip install Pillow==9.4.0
# !pip install pytz==2023.3
# !pip install PyYAML==6.0.1
# !pip install scikit-learn==1.3.0
# !pip install scipy==1.11.2
# !pip install scispacy
# !pip install sentencepiece==0.1.99
# !pip install SimpleITK==2.2.1
# !pip install spacy==3.6.1
# !pip install spacy-alignments==0.9.0
# !pip install spacy-legacy==3.0.12
# !pip install spacy-loggers==1.0.4
# !pip install spacy-transformers==1.2.5
# !pip install tokenizers==0.13.3
# !pip install torch==2.0.1
# !pip install torchaudio==2.0.2
# !pip install torchvision==0.15.2
# !pip install tqdm==4.66.1
# !pip install transformers==4.28.1

# M3D style

In [None]:
# 1. The image shape needs to be processed as 1*32*256*256, consider resize and other methods.
# 2. The image needs to be normalized to 0-1, consider Min-Max Normalization.
# 3. The image format needs to be converted to .npy 

In [None]:
# deepspeed==0.13.4
# einops==0.8.0
# evaluate==0.4.1
# matplotlib==3.8.4
# monai==1.3.0
# nibabel==5.2.1
# numpy==1.26.4
# opencv_python==4.9.0.80
# pandas==2.2.2
# peft==0.8.2
# Pillow==10.3.0
# pycocotools==2.0.7
# Requests==2.31.0
# rouge==1.0.1
# safetensors==0.4.3
# scipy==1.13.0
# simple_slice_viewer==0.97
# SimpleITK==2.3.1
# torch==2.2.1+cu118
# torchvision==0.17.1+cu118
# tqdm==4.66.2
# transformers==4.39.1
# tweepy==4.14.0

# CT-CHAT style

In [54]:
import json

with open("/workspace/7.Error/error_full_list_final_new.json", "r") as f:
    full_list = json.load(f)


In [55]:
for i in range(len(full_list)):
    # full_list[i]['conversations'][0]['value'] = "<image>\n" + full_list[i]['conversations'][0]['value'] + ' <report_generation>'

    if full_list[i]['conversations'][0]['level'] == '1':
        full_list[i]['conversations'][0]['value'] = "<image>\n" + full_list[i]['conversations'][0]['value'] + " <short_answer>"
    else:
        full_list[i]['conversations'][0]['value'] = "<image>\n" + full_list[i]['conversations'][0]['value'] + " <long_answer>"


In [56]:
import json

with open("/workspace/7.Error/error_full_list_final_new_ctchat.json", "w") as f:
    json.dump(full_list, f, indent=4)


In [None]:
full_list[0]

In [None]:
import json

with open("/workspace/7.Error/error_full_list_final_new_level1.json", "r") as f:
    level_1_list = json.load(f)

with open("/workspace/7.Error/error_full_list_final_new_level2.json", "r") as f:
    level_2_list = json.load(f)

with open("/workspace/7.Error/error_full_list_final_new_level2_index.json", "r") as f:
    level_2_list_index = json.load(f)

with open("/workspace/7.Error/error_full_list_final_new_level3.json", "r") as f:
    level_3_list = json.load(f)


In [52]:
for i in range(len(level_1_list)):
    if level_1_list[i]['conversations'][0]['level'] == '1':
        level_1_list[i]['conversations'][0]['value'] = "<image>\n" + level_1_list[i]['conversations'][0]['value'] + " <short_answer>"
    else:
        level_1_list[i]['conversations'][0]['value'] = "<image>\n" + level_1_list[i]['conversations'][0]['value'] + " <long_answer>"

for i in range(len(level_2_list)):
    if level_2_list[i]['conversations'][0]['level'] == '1':
        level_2_list[i]['conversations'][0]['value'] = "<image>\n" + level_2_list[i]['conversations'][0]['value'] + " <short_answer>"
    else:
        level_2_list[i]['conversations'][0]['value'] = "<image>\n" + level_2_list[i]['conversations'][0]['value'] + " <long_answer>"

for i in range(len(level_2_list_index)):
    if level_2_list_index[i]['conversations'][0]['level'] == '1':
        level_2_list_index[i]['conversations'][0]['value'] = "<image>\n" + level_2_list_index[i]['conversations'][0]['value'] + " <short_answer>"
    else:
        level_2_list_index[i]['conversations'][0]['value'] = "<image>\n" + level_2_list_index[i]['conversations'][0]['value'] + " <long_answer>"

for i in range(len(level_3_list)):
    if level_3_list[i]['conversations'][0]['level'] == '1':
        level_3_list[i]['conversations'][0]['value'] = "<image>\n" + level_3_list[i]['conversations'][0]['value'] + " <short_answer>"
    else:
        level_3_list[i]['conversations'][0]['value'] = "<image>\n" + level_3_list[i]['conversations'][0]['value'] + " <long_answer>"                        


In [53]:
with open("/workspace/7.Error/error_full_list_final_new_level1_ctchat.json", "w") as f:
    json.dump(level_1_list, f, indent=4)

with open("/workspace/7.Error/error_full_list_final_new_level2_ctchat.json", "w") as f:
    json.dump(level_2_list, f, indent=4)

with open("/workspace/7.Error/error_full_list_final_new_level2_index_ctchat.json", "w") as f:
    json.dump(level_2_list_index, f, indent=4)

with open("/workspace/7.Error/error_full_list_final_new_level3_ctchat.json", "w") as f:
    json.dump(level_3_list, f, indent=4)            


# Define 만들기

In [57]:
import json

# "/workspace/7.Error/error_full_list_final.json" 읽기

with open("/workspace/7.Error/error_full_list_final_new.json", "r") as f:
    data = json.load(f)


In [58]:
len(data)

55019

In [None]:
data[0]['conversations'][0]['type']

In [None]:
type_list = []
for i in data:
    type_list.append(i['conversations'][0]['type'])


In [None]:
import numpy as np

np.unique(type_list, return_counts=True)

In [None]:
['direction', 'insertion', 'no errors', 'omission', 'size', 'typo', 'unit']


In [None]:
import re

define = '''There are 7 possible scenarios you should consider:
1. Omission: Check if any nodule, mass, or effusion visible in the CT imaging is missing from the report. Ensure all significant findings observed in the CT scan are documented in the report.
2. Insertion: Verify that all abnormal findings described in the report actually correspond to the current patient's CT imaging. Look for sentences that may have been inappropriately inserted from templates or unrelated cases.
3. Direction: Confirm that directional terms (right/left, upper/lower, unilateral/bilateral, both) used to describe nodules, masses, or effusions match the actual locations in the CT imaging. The laterality or position may have been incorrectly switched.
4. Size: Validate that the size measurements of lesions in the report accurately reflect the actual dimensions in the CT imaging. The reported values may be incorrectly stated as 50% larger or smaller than the true measurements.
5. Unit: Check that measurement units (cm, mm, m) are appropriate and correct. Unit errors can significantly misrepresent the actual size of lesions.
6. Typo: Review medical terminology for spelling errors. A single character typo can alter or obscure the intended medical meaning.
7. No errors: The report may be completely accurate with no errors present. Not all reports contain mistakes.'''

def insert_define_after_first_sentence(question_text: str, define: str) -> str:
    """
    첫 문장 뒤에 정의문을 삽입하는 함수.
    
    Parameters:
    - question_text (str): 원본 질문 텍스트
    - define (str): 삽입할 정의문 텍스트
    
    Returns:
    - str: define이 첫 문장 뒤에 삽입된 최종 문자열
    """
    question_text = question_text.strip()

    match = re.search(r"^(.*?\.)", question_text)
    if match:
        insert_index = match.end()
        result = question_text[:insert_index].strip() + "\n" + define + "\n" + question_text[insert_index:].lstrip()
    else:
        # 마침표가 없는 경우 전체 앞에 define 추가
        result = define + "\n" + question_text

    return result


In [None]:
for i in data:
    i['conversations'][0]['value'] = insert_define_after_first_sentence(i['conversations'][0]['value'], define)



In [None]:
print(data[0]['conversations'][0]['value'])

In [None]:
# data 저장

with open("/workspace/7.Error/error_full_list_final_add_define.json", "w") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)  # ensure_ascii=False to keep Korean characters intact

# Few shot 만들기

In [None]:
import json

# "/workspace/7.Error/error_full_list_final.json" 읽기

with open("/workspace/7.Error/error_full_list_final_new.json", "r") as f:
    data = json.load(f)


In [None]:
data[0]['conversations'][0]['level'], data[0]['conversations'][0]['type']

In [None]:
# OLD

Classification_Omission = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '0'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe.'
Answer: '1'
'''
Classification_Insertion = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '0'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung. Pleural effusions in the form of minimal thin smears are observed in both hemithorax.'
Answer: '1'
'''
Classification_Direction = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '0'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the left lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '1'
'''
Classification_Size = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '0'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 1.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '1'
'''
Classification_Unit = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '0'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 mm is observed in the right lower lung.'
Answer: '1'
'''
Classification_Typo = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '0'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchya window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: '1'
'''

Detection_Insertion = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung. Pleural effusions in the form of minimal thin smears are observed in both hemithorax.'
Answer: 'Error: Pleural effusions in the form of minimal thin smears are observed in both hemithorax.'
'''
Detection_Direction = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the left lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'Error: A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the left lung upper lobe.'
'''
Detection_Size = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 1.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'Error: A few nonspecific nodules measuring 1.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe.'
'''
Detection_Unit = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 mm is observed in the right lower lung.'
Answer: 'Error: In the examination made in the lung parenchyma window; A mass measuring 3 mm is observed in the right lower lung.'
'''
Detection_Typo = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchya window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'Error: In the examination made in the lung parenchya window; A mass measuring 3 cm is observed in the right lower lung.'
'''


Correction_Omission = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe.'
Answer: 'Corrected: In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
'''
Correction_Direction = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the left lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'Corrected: A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe.'
'''
Correction_Size = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 1.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'Corrected: A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe.'
'''
Correction_Unit = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 mm is observed in the right lower lung.'
Answer: 'Corrected: In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
'''
Correction_Typo = '''
<Example #1>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'No errors detected.'
<Example #2>
Report: 'In both lungs, there is a mosaic attenuation pattern more evident in the lower lobes. Sequela parenchymal changes are observed in the left lung upper lobe lingular segment, bilateral lung lower lobe posterobasal segment and right lung middle lobe medial segment. A few nonspecific nodules measuring 5.5 mm in size are observed in the posterior and anterior segment of the right lung upper lobe. In the examination made in the lung parenchya window; A mass measuring 3 cm is observed in the right lower lung.'
Answer: 'Corrected:  In the examination made in the lung parenchyma window; A mass measuring 3 cm is observed in the right lower lung.'
'''

In [63]:
import numpy as np

a = np.load("/workspace/2.Multi_Modal/M3D/Data/data/M3D-Seg/M3D_Seg/0006/case_00168/image.npy")

In [64]:
a.max(), a.min(), a.mean(), a.std()

(2.360952, -1.5798162, -7.643021e-05, 0.9998113)