In [21]:
import csv
import os

import pandas as pd
# Every relative path in this notebook is resolved from the project root.
# The root can be overridden with the AICUP_MEDDATA_DIR environment variable;
# when it is unset, the original hard-coded location is used.
PROJECT_DIR = os.environ.get('AICUP_MEDDATA_DIR', '/home/jupyter/aicup-meddata-pp')
os.chdir(PROJECT_DIR)

# 解壓縮官網下載之資料集

In [None]:
# Unpack the downloaded archives into ./content. The .zip files are read from
# ./content itself, so they have to be placed there before this cell is run.
# `yes` is piped into unzip so that any confirmation prompt is answered with "y".
!mkdir -p content
!yes | unzip ./content/First_Phase_ReleaseCorrection.zip -d ./content
!yes | unzip ./content/Second_Phase_Dataset.zip -d ./content
!yes | unzip ./content/Validation_Dataset_Answer.zip -d ./content
!yes | unzip ./content/opendid_test.zip -d ./content

# Later cells read from ./content/First_Phase_Release, so rename the extracted folder.
!mv ./content/First_Phase_Release\(Correction\) ./content/First_Phase_Release

In [4]:
def read_file(path):
    '''
    Read a text file and return its lines as a list.

    The file is decoded with the 'utf-8-sig' codec, and every returned line
    keeps its trailing newline character (when it has one).
    '''
    with open(path, mode='r', encoding='utf-8-sig') as report:
        lines = list(report)
    return lines

In [5]:
# Special token strings, kept both as individual names and as a dictionary.
# The dictionary is handed to process_medical_report by the driver function below.
bos, eos, pad = '<|endoftext|>', '<|END|>', '<|pad|>'
ner = '\n\n####\n\n'
special_tokens_dict = dict(
    bos_token=bos,
    eos_token=eos,
    pad_token=pad,
    sep_token=ner,
)

def process_annotation_file(lines):
    '''
    Parse the lines of an answer.txt annotation file.

    Every annotation line is tab-separated and has five fields
    (file id, PHI type, start index, end index, entity text) or six
    (the same five plus a normalized time value).

    Lines with any other number of fields, such as a blank trailing line, are
    skipped and counted. Previously such a line re-used the dictionary built
    for the preceding line and filed it under the wrong file id.

    lines : iterable of raw text lines
    output : annotation dictionary mapping file id to the list of its
             annotations in file order; each annotation is a dict with the keys
             'phi', 'st_idx', 'ed_idx', 'entity' and, for six-field lines,
             'normalize_time'
    raises : ValueError when a start or end index is not an integer
    '''
    print("process annotation file...")
    entity_dict = {}
    skipped = 0
    for line in lines:
        items = line.strip('\n').split('\t')
        if len(items) not in (5, 6):
            skipped += 1
            continue
        item_dict = {
            'phi' : items[1],
            'st_idx' : int(items[2]),
            'ed_idx' : int(items[3]),
            'entity' : items[4],
        }
        if len(items) == 6:
            item_dict['normalize_time'] = items[5]
        entity_dict.setdefault(items[0], []).append(item_dict)
    if skipped:
        print(f"skipped {skipped} malformed annotation line(s)")
    print("annotation file done")
    return entity_dict

def process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict):
    '''
    Convert one medical report into tab-separated sentence/label rows.

    txt_name : report name without the '.txt' extension; also the key used to
               look up the report's annotations in annos_dict
    medical_report_folder : folder that contains '<txt_name>.txt'
    annos_dict : annotation dictionary as built by process_annotation_file.
                 Annotations are consumed in list order and matched on
                 'st_idx', so they are expected to be sorted by start index.
                 A report with no entry gets "PHI:Null" on every row.
    special_tokens_dict : kept for interface compatibility; the TSV output
                          does not use it

    output : list of rows, one per emitted line of the report. Each row holds
             four tab-separated columns and ends with a newline:
             report name, index just past the line's newline character,
             the stripped line with tabs turned into spaces, and the labels.
             Labels look like "TYPE:entity" or "TYPE:entity=>normalized" and
             are joined by a literal backslash followed by "n"; a line with
             no annotation gets "PHI:Null".
    '''
    file_name = txt_name + '.txt'
    article = "".join(read_file(os.path.join(medical_report_folder, file_name)))
    # A report whose last line has no newline is handled as if it had one;
    # otherwise that line and its annotations would never be emitted.
    if article and not article.endswith('\n'):
        article += '\n'

    annotations = annos_dict.get(txt_name, [])
    # Literal backslash + "n": keeps all labels of a row on one physical line.
    label_sep = '\\n'
    boundary, item_idx, labels, seq_pairs = 0, 0, [], []
    for w_idx, char in enumerate(article):
        if char == '\n':
            new_line_idx = w_idx + 1
            if article[boundary:new_line_idx] == '\n':
                # Blank line right at the boundary: skipped, boundary not moved.
                continue
            sentence = article[boundary:new_line_idx].strip().replace('\t' , ' ')
            # Joining a list replaces the old str.strip('\\n') call, which removed
            # the characters '\' and 'n' and therefore truncated labels ending in "n".
            label_text = label_sep.join(labels) if labels else "PHI:Null"
            seq_pairs.append(f"{txt_name}\t{new_line_idx}\t{sentence}\t{label_text}\n")
            boundary = new_line_idx
            labels = []
        if item_idx < len(annotations) and w_idx == annotations[item_idx]['st_idx']:
            anno = annotations[item_idx]
            label = f"{anno['phi']}:{anno['entity']}"
            if 'normalize_time' in anno:
                label += f"=>{anno['normalize_time']}"
            labels.append(label)
            item_idx += 1
    return seq_pairs

def generate_annotated_medical_report_parallel(anno_file_path, medical_report_folder , tsv_output_path , num_processes=4):
    '''
    Run the annotation parser and the per-report converter over a whole dataset.

    The annotation file is parsed with process_annotation_file, every report
    that appears in it is converted with process_medical_report, and all
    resulting rows are written to tsv_output_path.

    anno_file_path : path of the annotation file
    medical_report_folder : folder holding the report .txt files
    tsv_output_path : file that receives the rows (overwritten)
    num_processes : accepted but not used by this implementation

    output : nothing is returned; the rows are only written to the TSV file
    '''
    annos_dict = process_annotation_file(read_file(anno_file_path))

    print("processing each medical file")

    # Reports are handled in the order their ids first appear in the annotations.
    all_seq_pairs = [
        seq_pair
        for txt_name in annos_dict
        for seq_pair in process_medical_report(txt_name, medical_report_folder, annos_dict, special_tokens_dict)
    ]
    print(all_seq_pairs[:10])
    print("All medical file done")
    print("write out to tsv format...")
    with open(tsv_output_path , 'w' , encoding = 'utf-8') as fw:
        fw.writelines(all_seq_pairs)
    print("tsv format dataset done")


def process_valid_data(test_txts , out_file):
    '''
    Write every non-blank line of the given report files to one TSV file.

    Each output row has three tab-separated columns: the file id (the file's
    base name without a trailing '.txt'), the character offset at which the
    line starts within the text returned by read_file, and the line itself
    with tabs replaced by spaces. Lines made up only of spaces, tabs and
    newlines are not written but still advance the offset.

    test_txts : iterable of report file paths
    out_file : path of the TSV file to create (overwritten)

    output : nothing is returned; the rows are only written to out_file
    '''
    with open(out_file , 'w' , encoding = 'utf-8') as fw:
        for txt in test_txts:
            m_report = read_file(txt)
            # basename + suffix removal instead of split('/') / replace('.txt', ''):
            # works with any path separator and leaves a '.txt' in the middle
            # of a file name untouched.
            fid = os.path.basename(txt)
            if fid.endswith('.txt'):
                fid = fid[:-len('.txt')]
            boundary = 0
            for sent in m_report:
                if sent.strip(' \n\t'):
                    row_text = sent.replace('\t' , ' ')
                    # The last line of a file may lack its newline; add one so
                    # that every row ends the physical line.
                    newline = '' if row_text.endswith('\n') else '\n'
                    fw.write(f"{fid}\t{boundary}\t{row_text}{newline}")
                boundary += len(sent)

# 利用官方提供程式碼將資料集前處理成 .tsv 檔

In [6]:
# One entry per annotated dataset: (annotation file, report folder, output TSV).
dataset_jobs = [
    (r"./content/First_Phase_Release/answer.txt",
     r"./content/First_Phase_Release/First_Phase_Text_Dataset",
     './train1.tsv'),
    (r"./content/Second_Phase_Dataset/answer.txt",
     r"./content/Second_Phase_Dataset/Second_Phase_Text_Dataset",
     './train2.tsv'),
    (r"./content/answer.txt",
     r"./content/First_Phase_Release/Validation_Release/",
     './valid.tsv'),
]

# The loop variables keep the names of the original assignments, so after the
# cell has run they hold the values of the last (validation) job.
for anno_info_path, report_folder, tsv_output_path in dataset_jobs:
    generate_annotated_medical_report_parallel(anno_info_path, report_folder, tsv_output_path, num_processes=4)

process annotation file...
annotation file done
processing each medical file
['10\t25\tEpisode No:  09F016547J\tIDNUM:09F016547J\n', '10\t36\t091016.NMT\tMEDICALRECORD:091016.NMT\n', '10\t52\tSIZAR, HOWARD\tPATIENT:SIZAR, HOWARD\n', '10\t70\tLab No:  09F01654\tIDNUM:09F01654\n', '10\t78\tRunford\tSTREET:Runford\n', '10\t97\tRENMARK  TAS  5084\tCITY:RENMARK\\nSTATE:TAS\\nZIP:5084\n', '10\t114\tSpecimen: Tissue\tPHI:Null\n', '10\t132\tD.O.B:  24/8/1993\tDATE:24/8/1993=>1993-08-24\n', '10\t140\tSex:  M\tPHI:Null\n', '10\t171\tCollected: 28/08/2013 at 08:26\tTIME:28/08/2013 at 08:26=>2013-08-28T08:26\n']
All medical file done
write out to tsv format...
tsv format dataset done
process annotation file...
annotation file done
processing each medical file
['1093\t25\tEpisode No:  48B915480A\tIDNUM:48B915480A\n', '1093\t37\t4809154.WAA\tMEDICALRECORD:4809154.WAA\n', '1093\t58\tOtterbine, Laverne\tPATIENT:Otterbine, Laverne\n', '1093\t85\tLab No:  48B91548,48B91548\tIDNUM:48B91548\\nIDNUM:48B915

In [24]:
# Convert every entry of the test folder, in sorted path order, into ./test.tsv.
test_phase_path = r'./content/opendid_test'
valid_out_file_path = './test.tsv'
test_txts = sorted(
    os.path.join(test_phase_path , entry) for entry in os.listdir(test_phase_path)
)
# process_valid_data has no return statement, so valid_data ends up as None.
valid_data = process_valid_data(test_txts , valid_out_file_path)

# 處理空字串

In [25]:
# Codes written to the `source` column, identifying where each row came from:
#   1: './content/First_Phase_Release/First_Phase_Text_Dataset'
#   2: './content/Second_Phase_Dataset/Second_Phase_Text_Dataset'
#   3: './content/opendid_test/'
#   4: './content/First_Phase_Release/Validation_Release'

def _load_raw_tsv(path, source, columns):
    '''
    Load one of the header-less raw TSV files produced earlier in this notebook.

    path : TSV file to read
    source : integer code stored in the added 'source' column
    columns : names given to the positional columns, in order

    output : DataFrame with the named columns followed by 'source'; missing
             'sentence' values are replaced by empty strings
    '''
    frame = pd.read_csv(
        path,
        header=None,
        delimiter='\t',
        # The raw files are written with plain fw.write calls and no quoting,
        # so a double quote inside a sentence is data, not a field delimiter.
        quoting=csv.QUOTE_NONE,
        # Keep sentences such as "NA" or "null" as text instead of turning
        # them into NaN (which fillna below would then blank out).
        keep_default_na=False,
    ).rename(columns=dict(enumerate(columns)))
    frame['sentence'] = frame.sentence.fillna('')
    frame['source'] = source
    return frame


labelled_columns = ['file', 'start_id', 'sentence', 'label']
unlabelled_columns = ['file', 'start_id', 'sentence']

df1 = _load_raw_tsv('./train1.tsv', source=1, columns=labelled_columns)
df2 = _load_raw_tsv('./train2.tsv', source=2, columns=labelled_columns)

train = pd.concat([df1, df2], axis=0).reset_index(drop=True)
train[['source', 'file', 'start_id','sentence','label']].to_csv('./train_pp.tsv', index=False, sep='\t')

valid = _load_raw_tsv('./valid.tsv', source=4, columns=labelled_columns)
valid[['source', 'file', 'start_id','sentence','label']].to_csv('./valid_pp.tsv', index=False, sep='\t')

test = _load_raw_tsv('./test.tsv', source=3, columns=unlabelled_columns)
test[['source', 'file', 'start_id','sentence']].to_csv('./test_pp.tsv', index=False, sep='\t')

In [26]:
# Read the three post-processed files back in; they carry a header row.
# NOTE(review): read_csv's default NA handling applies here, so empty sentence
# fields presumably come back as NaN - confirm that downstream code expects that.
train_df, valid_df, test_df = (
    pd.read_csv(pp_path, delimiter='\t')
    for pp_path in ('./train_pp.tsv', './valid_pp.tsv', './test_pp.tsv')
)

In [19]:
# Show the first three rows of the reloaded training table.
train_df.head(3)

Unnamed: 0,source,file,start_id,sentence,label
0,1,10,25,Episode No: 09F016547J,IDNUM:09F016547J
1,1,10,36,091016.NMT,MEDICALRECORD:091016.NMT
2,1,10,52,"SIZAR, HOWARD","PATIENT:SIZAR, HOWARD"


In [27]:
# Show the first three rows of the reloaded validation table.
valid_df.head(3)

Unnamed: 0,source,file,start_id,sentence,label
0,4,1001,24,Episode No: 88Y206206L,IDNUM:88Y206206L
1,4,1001,36,8892062.BPL,MEDICALRECORD:8892062.BPL
2,4,1001,65,"Vatterott, Jerrie CLARENCE","PATIENT:Vatterott, Jerrie CLARENCE"


In [20]:
# Show the first three rows of the reloaded test table (it has no label column).
test_df.head(3)

Unnamed: 0,source,file,start_id,sentence
0,3,1097,0,433475.RDC
1,3,1097,12,"Timmins, ELDEN"
2,3,1097,27,"43J47561,43J47561"
