In [32]:
import pandas as pd, numpy as np
import os
from tqdm import tqdm
tqdm.pandas()
import ast

In [33]:
train_df = pd.read_csv(f'/mnt/d/Dataset/Github/SCD/bbox/csv/train_dupicate.csv')
mapping = dict(train_df[['image_id', 'StudyInstanceUID']].values)
fold_df = pd.read_csv("/mnt/d/Dataset/Github/SCD/bbox/csv/scd_fold.csv")
fold_df['StudyInstanceUID'] = fold_df.image_id.map(mapping)
study2fold = dict(fold_df[['StudyInstanceUID', 'fold']].values)
train_df['fold'] = train_df['StudyInstanceUID'].map(study2fold)
print('original:', train_df.shape[0])
train_df.fold.value_counts()

original: 6334


1    1268
4    1267
3    1267
0    1266
2    1266
Name: fold, dtype: int64

# Class Name Mapping

In [34]:
class_names = ['Negative for Pneumonia',
              'Indeterminate Appearance',
              'Atypical Appearance',
              'Typical Appearance']
class_labels= [0, 1, 2, 3]
name2label = dict(zip(class_names, class_labels))
label2name = {v:k for k, v in name2label.items()}

# train_df
tqdm.pandas(desc="train  ")
train_df['class_name']  = train_df.progress_apply(lambda row:row[class_names].iloc[[row[class_names].values.argmax()]].index.tolist()[0], 
                             axis=1)
train_df['class_label'] = train_df.class_name.map(name2label)

train  : 100%|██████████| 6334/6334 [00:05<00:00, 1061.66it/s]


# Fix Dataset

In [35]:
def get_fix(grp):
    if grp.loc[grp.label!='none 1 0 0 1 1'].shape[0]!=0: # remove from those groups where there is img with bbox
        grp.loc[grp.label=='none 1 0 0 1 1', 'fix'] = 1
    return grp

train_df['fix'] = 0
train_df = train_df.groupby(['StudyInstanceUID']).progress_apply(get_fix)
print(train_df.fix.value_counts())
train_df = train_df[train_df.fix!=1]

train  : 100%|██████████| 6054/6054 [00:11<00:00, 544.26it/s] 

0    6117
1     217
Name: fix, dtype: int64





# Remove Duplicates

In [36]:
dup_0 = train_df.query("dup_id==0") # take all from non-duplicates
dup_1 = train_df.query("dup_id>0").groupby("StudyInstanceUID").head(1) # take one from duplicates
train_df = pd.concat((dup_0, dup_1), axis=0)
print('After removal size:',train_df.shape[0])

After removal size: 6069


In [37]:
df_ids    = []
df_data   = []
for idx in tqdm(range(train_df.shape[0])):
    row = train_df.iloc[idx]
    image_id = row.image_id+'_image'
    study_id = row.StudyInstanceUID+'_study'
    width    = row.width
    height   = row.height
    data = np.array(row.label.split(' ')).reshape(-1, 6).tolist()
    df_data.extend(data)
    df_ids.extend([[image_id,width, height]]*len(data))
image_df = pd.DataFrame(np.concatenate((df_ids,df_data), axis=1), columns=["ImageID","Width","Height","LabelName","Conf","XMin","YMin","XMax","YMax"])
image_df.head(2)
    

100%|██████████| 6069/6069 [00:01<00:00, 4986.79it/s]


Unnamed: 0,ImageID,Width,Height,LabelName,Conf,XMin,YMin,XMax,YMax
0,000a312787f2_image,4256,3488,opacity,1,789.28836,582.43035,1815.94498,2499.73327
1,000a312787f2_image,4256,3488,opacity,1,2245.91208,591.20528,3340.5737,2352.75472


In [38]:
image_df.shape[0]


9628

In [39]:
df_data   = []
for idx in tqdm(range(train_df.shape[0])):
    row = train_df.iloc[idx]
    study_id = row.StudyInstanceUID+'_study'
    width    = row.width
    height   = row.height
    df_data.append([study_id, width, height, row["class_name"], '1', '0', '0', '1','1'])
study_df = pd.DataFrame(df_data, columns=["ImageID","Width","Height","LabelName","Conf","XMin","YMin","XMax","YMax"])
study_df.head(2)
    

100%|██████████| 6069/6069 [00:00<00:00, 6444.61it/s]


Unnamed: 0,ImageID,Width,Height,LabelName,Conf,XMin,YMin,XMax,YMax
0,5776db0cec75_study,4256,3488,Typical Appearance,1,0,0,1,1
1,ff0879eb20ed_study,2832,2320,Negative for Pneumonia,1,0,0,1,1


In [40]:
study_df.shape[0]

6069

In [41]:
gt_df = pd.concat((image_df, study_df), axis=0).drop('Conf', axis=1)
gt_df[["Width","Height","XMin","YMin","XMax","YMax"]] = gt_df[["Width","Height","XMin","YMin","XMax","YMax"]].astype(float)
gt_df[["XMin", "YMin"]] = gt_df[["XMin", "YMin"]].values/gt_df[['Width', 'Height']].values
gt_df[["XMax", "YMax"]] = gt_df[["XMax", "YMax"]].values/gt_df[['Width', 'Height']].values
gt_df.head(2)

Unnamed: 0,ImageID,Width,Height,LabelName,XMin,YMin,XMax,YMax
0,000a312787f2_image,4256.0,3488.0,opacity,0.185453,0.166981,0.426679,0.716667
1,000a312787f2_image,4256.0,3488.0,opacity,0.527705,0.169497,0.784909,0.674528


In [42]:
# from pandas_profiling import ProfileReport
# profile = ProfileReport(gt_df, title="Pandas Profiling Report")
# profile

In [43]:
gt_df.XMax.min()

0.00020445716622367614

In [44]:
# def fix_study(row):
#     if 'study' in row['ImageID']:
#         row['XMin'], row['XMax'], row['YMin'], row['YMax'] = 0, 1, 0, 1
#     if row['LabelName']=='none':
#         row['XMin'], row['XMax'], row['YMin'], row['YMax'] = 0, 1, 0, 1
#     return row
# gt_df = gt_df.progress_apply(fix_study, axis=1)
# gt_df.head(2)

In [45]:
mapping = {
    'Typical Appearance':'typical',
    'Atypical Appearance':'atypical',
    'Indeterminate Appearance':'indeterminate',
    'Negative for Pneumonia':'negative',
    'none':'none',
    'opacity':'opacity',
}
gt_df['LabelName'] = gt_df['LabelName'].map(mapping)

In [46]:
gt_df.to_csv('/mnt/d/Dataset/Github/SCD/bbox/csv/gt.csv',index=False)

In [47]:
# gt_df.query("LabelName=='none'")

In [48]:
gt_df.shape

(15697, 8)

In [49]:
gt_df.drop_duplicates().shape[0]

15689

In [50]:
# pred_df = pd.read_csv('/mnt/d/Dataset/Github/SCD/bbox/csv/tmp_pred.csv')
# pred_df.shape[0]

In [51]:
# len(set.intersection(set(gt_df.ImageID), set(pred_df.ImageID)))

In [52]:
# pred2 = pd.read_csv('/mnt/d/Dataset/Github/SCD/bbox/csv/pred.csv')
# pred2.shape[0]

In [53]:
# gt_df[['ImageID', 'Width', 'Height']].drop_duplicates()

In [54]:
# pd.merge(pred_df, gt_df[['ImageID', 'Width', 'Height']], on='ImageID', how='left').drop_duplicates().shape

In [57]:
pwd

'/mnt/d/Dataset/Github/SCD/bbox/notebooks'

In [79]:
gt = pd.read_csv('csv/gt.csv')
gt.head(2)

Unnamed: 0,ImageID,Width,Height,LabelName,XMin,YMin,XMax,YMax
0,000a312787f2_image,4256.0,3488.0,opacity,0.185453,0.166981,0.426679,0.716667
1,000a312787f2_image,4256.0,3488.0,opacity,0.527705,0.169497,0.784909,0.674528
