In [1]:
import random
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

random_seed = 42

# Create Annotation Files for CHEST and ENDO (Multi-label)

In [2]:
chest_train = pd.read_csv("chest_train.csv")
chest_train.drop(columns=['Unnamed: 0'], inplace=True)
chest_train.set_index('img_id', inplace=True)

endo_train = pd.read_csv("endo_train.csv") # study_id
endo_train.drop(columns=['Unnamed: 0'], inplace=True)
endo_train.set_index('img_id', inplace=True)

print(chest_train.shape)
print(endo_train.shape)
display(chest_train.head())
display(endo_train.head())

(2140, 19)
(1810, 5)


Unnamed: 0_level_0,pleural_effusion,nodule,pneumonia,cardiomegaly,hilar_enlargement,fracture_old,fibrosis,aortic_calcification,tortuous_aorta,thickened_pleura,TB,pneumothorax,emphysema,atelectasis,calcification,pulmonary_edema,increased_lung_markings,elevated_diaphragm,consolidation
img_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5B91F7409CCCE2.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5DC2824164E8B52.png,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5E0E86BB348CB90.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5E0D44C4555411F.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5C189D471208E37.png,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,study_id,ulcer,erosion,polyp,tumor
img_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13333_2021.11_0003_55200268.png,13333_2021.11_0003,0.0,0.0,0.0,0.0
13333_2021.11_0003_55199880.png,13333_2021.11_0003,0.0,0.0,0.0,0.0
13333_2021.11_0003_55200215.png,13333_2021.11_0003,1.0,1.0,0.0,0.0
13333_2021.11_0003_55199804.png,13333_2021.11_0003,0.0,0.0,0.0,0.0
13333_2021.11_0003_55199338.png,13333_2021.11_0003,0.0,0.0,0.0,0.0


In [3]:
# pd.read_csv("colon_train.csv") # slide_id

# chest_train["all_feature"]
# df['b'].astype(str) + df['c'].astype(str)

def concat_all_feature(row):
    new_col = ""
    for ele in row:
        try:
            ele = int(ele)
        except:
            pass
        new_col += ( str(ele) + '_' )
    return new_col

{
            "metainfo":
            {
              "classes":['A', 'B', 'C'....]
            },
            "data_list":
            [
              {
                "img_path": "test_img1.jpg",
                'gt_label': [0, 1],
              },
              {
                "img_path": "test_img2.jpg",
                'gt_label': [2],
              },
            ]
            ....
        }

In [4]:
def gen_export_multi_label(df, drop_col=[]):
    
    df["concat_all_feature"] = df.apply(concat_all_feature, axis=1)
    print(f"Num Unique Label: {len(df['concat_all_feature'].unique())}")
    
    try:
        df_train, df_val = train_test_split(df, test_size=0.2, shuffle=True, random_state=random_seed, stratify=df['concat_all_feature'])
        print(f"Stratified by {df['concat_all_feature'].values}")
    except Exception as error:
        df_train, df_val = train_test_split(df, test_size=0.2, shuffle=True, random_state=random_seed)
        print(f"No stratify due to {error}")
    
    drop_col = list(drop_col) + ["concat_all_feature"]
    
    print(f"Drop columns: {drop_col}")
    
    df.drop(columns=["concat_all_feature"], inplace=True)
    
    df_train.drop(columns=drop_col, inplace=True)
    df_val.drop(columns=drop_col, inplace=True)
    
    class_df = list(df_train.columns)
    print(f"Classes: {class_df}")
    
    # ===================================================================================================
    
    x_label_train, y_label_train = np.where(df_train.values==1)

    print(x_label_train, y_label_train)

    export_data_train = {"metainfo":{"classes": class_df}, "data_list": []}

    for i_x, x in enumerate(x_label_train):
        y = y_label_train[i_x]
        
        if len(export_data_train["data_list"]) == 0:
            export_data_train["data_list"].append({"img_path": df_train.index[x], 'gt_label': [y]})
        elif export_data_train["data_list"][-1]["img_path"] != df_train.index[x]:
            export_data_train["data_list"].append({"img_path": df_train.index[x], 'gt_label': [y]})
        else:
            export_data_train["data_list"][-1]['gt_label'].append(y)
    
    # ===================================================================================================
    
    x_label_val, y_label_val = np.where(df_val.values==1) # There are two dims in df_val
    # The result is [   0    0    0 ... 1711 1711 1711] and [0 2 4 ... 1 7 8] respectively
    # , so [0, 0], [0, 2], [0, 4], ..., [1711, 1], [1711, 7], [1711, 8] are all 1s.

    print(x_label_val, y_label_val)
    
    export_data_val = {"metainfo":{"classes": class_df}, "data_list": []}
    
    for i_x, x in enumerate(x_label_val):
        y = y_label_val[i_x]
        
        if len(export_data_val["data_list"]) == 0:
            export_data_val["data_list"].append({"img_path": df_val.index[x], 'gt_label': [y]})
        elif export_data_val["data_list"][-1]["img_path"] != df_val.index[x]:
            export_data_val["data_list"].append({"img_path": df_val.index[x], 'gt_label': [y]})
        else:
            export_data_val["data_list"][-1]['gt_label'].append(y)
        
        
    return export_data_train, export_data_val

In [5]:
chest_label_train, chest_label_val = gen_export_multi_label(chest_train)
chest_label_train,

Num Unique Label: 609
No stratify due to The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Drop columns: ['concat_all_feature']
Classes: ['pleural_effusion', 'nodule', 'pneumonia', 'cardiomegaly', 'hilar_enlargement', 'fracture_old', 'fibrosis', 'aortic_calcification', 'tortuous_aorta', 'thickened_pleura', 'TB', 'pneumothorax', 'emphysema', 'atelectasis', 'calcification', 'pulmonary_edema', 'increased_lung_markings', 'elevated_diaphragm', 'consolidation']
[   0    0    0 ... 1711 1711 1711] [0 2 4 ... 1 7 8]
[  0   1   2   2   2   3   5   5   5   5   5   6   6   7   7   7   9   9
   9   9  10  10  10  11  11  11  11  11  13  13  14  14  14  15  15  16
  16  18  18  18  20  20  21  21  21  22  22  23  23  24  24  25  25  26
  26  26  28  28  28  29  29  29  30  30  31  31  32  32  32  32  33  33
  33  33  34  35  35  36  36  38  38  38  38  38  39  39  39  39  40  40
  40  41  42  42  43  43  43  44  44 

({'metainfo': {'classes': ['pleural_effusion',
    'nodule',
    'pneumonia',
    'cardiomegaly',
    'hilar_enlargement',
    'fracture_old',
    'fibrosis',
    'aortic_calcification',
    'tortuous_aorta',
    'thickened_pleura',
    'TB',
    'pneumothorax',
    'emphysema',
    'atelectasis',
    'calcification',
    'pulmonary_edema',
    'increased_lung_markings',
    'elevated_diaphragm',
    'consolidation']},
  'data_list': [{'img_path': '5E02FDDE3CA09DF.png', 'gt_label': [0, 2, 4]},
   {'img_path': '5B42AF5714BC9EC.png', 'gt_label': [1, 3, 5]},
   {'img_path': 'DX.1.2.392.200036.9125.4.0.487562353.556557718.2795306561.png',
    'gt_label': [2]},
   {'img_path': '5D92A4F448B02A3.png', 'gt_label': [0, 2, 3]},
   {'img_path': '5C8619B92460648.png', 'gt_label': [0, 11, 13]},
   {'img_path': '5B11E29BF18A74.png', 'gt_label': [3, 5, 6, 8]},
   {'img_path': '5CDD10AA1C00650.png', 'gt_label': [0, 2, 4, 6]},
   {'img_path': '5E323A32E8810F.png', 'gt_label': [1, 8]},
   {'img_path': '

In [6]:
endo_label_train, endo_label_val = gen_export_multi_label(endo_train, ["study_id"])
endo_label_train

Num Unique Label: 196
No stratify due to The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Drop columns: ['study_id', 'concat_all_feature']
Classes: ['ulcer', 'erosion', 'polyp', 'tumor']
[   0    1    2    3    5    5    7    7    8    8   10   11   11   13
   14   15   17   18   20   20   21   22   23   24   25   26   27   28
   30   31   32   33   37   39   40   40   42   42   45   51   51   60
   66   67   68   70   70   71   74   74   76   77   78   79   80   83
   86   87   90   91   91   92   93   99  100  102  104  104  105  105
  106  107  107  108  109  115  116  117  120  121  122  123  124  126
  130  131  131  133  134  135  135  136  136  137  138  138  139  139
  140  140  141  142  143  144  145  145  149  150  152  153  153  154
  154  155  159  160  160  164  164  165  165  166  169  171  172  173
  176  180  181  181  182  182  183  190  194  194  195  196  196  197
  198  199  200  2

{'metainfo': {'classes': ['ulcer', 'erosion', 'polyp', 'tumor']},
 'data_list': [{'img_path': '13333_2021.08_0007_50955025.png',
   'gt_label': [2]},
  {'img_path': '13333_2021.08_0007_51343722.png', 'gt_label': [0]},
  {'img_path': '13333_2021.12_0004_57458526.png', 'gt_label': [2]},
  {'img_path': '13333_2021.12_0004_57457640.png', 'gt_label': [1]},
  {'img_path': '13333_2021.12_0009_56515469.png', 'gt_label': [1, 2]},
  {'img_path': '13333_2021.11_0000_55099537.png', 'gt_label': [0, 1]},
  {'img_path': '13333_2021.01_0000_41224617.png', 'gt_label': [0, 2]},
  {'img_path': '13333_2021.12_0001_56922631.png', 'gt_label': [0]},
  {'img_path': '13333_2021.12_0002_57427770.png', 'gt_label': [0, 2]},
  {'img_path': '13333_2021.12_0004_56719160.png', 'gt_label': [1]},
  {'img_path': '13333_2021.08_0007_50955084.png', 'gt_label': [1]},
  {'img_path': '13333_2021.12_0004_56719296.png', 'gt_label': [1]},
  {'img_path': '13333_2021.11_0000_55098893.png', 'gt_label': [1]},
  {'img_path': '13333_

5D1FF9E03D4015A.png 0
5D1FF9E03D4015A.png 3
5D1FF9E03D4015A.png 4

In [7]:
with open('./chest/chest_label_train.pkl', 'wb') as file:
    pickle.dump(chest_label_train, file)
    
with open('./chest/chest_label_val.pkl', 'wb') as file:
    pickle.dump(chest_label_val, file)

In [8]:
with open('./endo/endo_label_train.pkl', 'wb') as file:
    pickle.dump(endo_label_train, file)
    
with open('./endo/endo_label_val.pkl', 'wb') as file:
    pickle.dump(endo_label_val, file)

# Create Annotation Files for COLON (BINARY)

In [9]:
colon_train = pd.read_csv("colon_train.csv")
colon_train.set_index('img_id', inplace=True)

print(colon_train.shape)
display(colon_train.head())

(5654, 2)


Unnamed: 0_level_0,slide_id,tumor
img_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-08974-1-5-5_2019-05-28 23_17_50-lv1-33952-9759-6526-5268p0018.png,2019-08974-1-5-5_2019-05-28 23_17_50-lv1-33952...,0
2019-06-11 01_00_50-lv1-1956-12720-11095-7901p0079.png,2019-06-11 01_00_50-lv1-1956-12720-11095-7901,0
2019-06-11 00_58_36-lv1-820-6042-6546-12714p0089.png,2019-06-11 00_58_36-lv1-820-6042-6546-12714,0
2019-10327-1-1-1_2019-05-28 17_55_39-lv1-47692-11609-5464-5695p0026.png,2019-10327-1-1-1_2019-05-28 17_55_39-lv1-47692...,0
2019-06-11 01_00_50-lv1-1956-12720-11095-7901p0050.png,2019-06-11 01_00_50-lv1-1956-12720-11095-7901,0


In [10]:
def gen_export_binary_label(df, target, drop_col=[]):
    
    df["concat_all_feature"] = df.apply(concat_all_feature, axis=1)
    print(f"Num Unique Label: {len(df['concat_all_feature'].unique())}")
    
    try:
        df_train, df_val = train_test_split(df, test_size=0.2, shuffle=True, random_state=random_seed, stratify=df['concat_all_feature'])
        print(f"Stratified by {df['concat_all_feature'].values}")
    except Exception as error:
        df_train, df_val = train_test_split(df, test_size=0.2, shuffle=True, random_state=random_seed)
        print(f"No stratify due to {error}")
    
    drop_col = list(drop_col) + ["concat_all_feature"]
    
    print(f"Drop columns: {drop_col}")
    
    df.drop(columns=["concat_all_feature"], inplace=True)
    df_train.drop(columns=drop_col, inplace=True)
    df_val.drop(columns=drop_col, inplace=True)
    
    class_df = list(df_train.columns)
    print(f"Classes: {class_df}")
    
    # ===================================================================================================

    export_data_train = []

    for row in range(df_train.shape[0]):
        export_data_train.append(f"{df_train.index[row]} {df_train.loc[df_train.index[row], target]}\n")
    
    # ===================================================================================================
    
    export_data_val = []

    for row in range(df_val.shape[0]):
        export_data_val.append(f"{df_val.index[row]} {df_val.loc[df_val.index[row], target]}\n")
        
        
    return export_data_train, export_data_val

In [11]:
colon_label_train, colon_label_val = gen_export_binary_label(colon_train, "tumor", ["slide_id"])
colon_label_train, colon_label_val

Num Unique Label: 387
No stratify due to The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Drop columns: ['slide_id', 'concat_all_feature']
Classes: ['tumor']


(['1902160001_2019-06-11 12_36_32-lv1-41428-19386-2643-3166p0004.png 0\n',
  '2019-05890-1-2-2_2019-05-29 04_09_56-lv1-18641-14250-4430-2993p0010.png 0\n',
  '2019-08974-1-2-2_2019-05-28 23_45_47-lv1-47903-12182-4668-8764p0021.png 0\n',
  'D20181047302_2019-06-10 11_38_18-lv1-2363-24324-4352-3675p0018.png 0\n',
  'D20180598601_2019-06-10 13_23_14-lv1-36698-13817-4720-3742p0004.png 0\n',
  'D20190399101_2019-06-10 14_34_19-lv1-20934-28904-6720-5415p0038.png 0\n',
  '2019-06-04 17_08_52-lv1-16428-11718-4556-4938p0017.png 0\n',
  '2019-10946-1-1-1_2019-05-28 15_40_39-lv1-22021-16267-4506-6971p0033.png 0\n',
  '2019-08974-1-5-5_2019-05-28 23_17_50-lv1-33952-9759-6526-5268p0031.png 0\n',
  'D20190284802_2019-06-10 15_06_21-lv1-16310-12946-4950-3583p0009.png 0\n',
  'D20170502603_2019-05-14 11_39_00-lv1-322-6285-19314-22701p0170.png 1\n',
  'D201707788_2019-05-14 14_05_34-lv1-2777-15975-12688-9314p0024.png 1\n',
  '2019-10737-1-1-1_2019-05-28 16_25_50-lv1-52019-13596-5916-5916p0036.png 0\n',

In [12]:
with open('./colon/colon_label_train.txt', 'w') as file:
    file.writelines(colon_label_train)
    
with open('./colon/colon_label_val.txt', 'w') as file:
    file.writelines(colon_label_val)