In [14]:
import os
from tqdm import tqdm
import pandas as pd
from ensemble_boxes import *

In [15]:
DATA_DIR = '../data/csv'
CSV_FILE = 'train.csv'

In [16]:
train = pd.read_csv(os.path.join(DATA_DIR, CSV_FILE))
print(train.shape)
train.head()

(67914, 10)


Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,,2332,2580
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,,2954,3159
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,2080,2336
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,2304,2880
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,,2540,3072


In [17]:
#remove empty classes
train = train[train['class_id'] != 14]

In [18]:
print(train.shape)
train.head()

(36096, 10)


Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,2080,2336
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,2304,2880
5,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,2540,3072
6,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,2285,2555
7,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,2568,3353


In [19]:
train['class_id'].value_counts()

0     7162
3     5427
11    4842
13    4655
8     2580
7     2483
10    2476
9     2203
6     1247
5     1000
2      960
4      556
1      279
12     226
Name: class_id, dtype: int64

In [20]:
# ===============================
# Default WBF config (you can change these)
iou_thr = 0.4
skip_box_thr = 0.0001
sigma = 0.1
# ===============================


def preprocess_fusion(df, fusion_type, iou_thr=0.5, sigma=0.1, skip_box_thr=0.0001):
    # Loading the train DF
#     df.fillna(0, inplace=True)
#     df.loc[df["class_id"] == 14, ['x_max', 'y_max']] = 1.0

    results = []
    image_ids = df["image_id"].unique()

    for image_id in tqdm(image_ids, total=len(image_ids), position=0, leave=True):

        # All annotations for the current image.
        data = df[df["image_id"] == image_id]
        data = data.reset_index(drop=True)
        width = data.loc[0, 'width']
        height = data.loc[0, 'height']

        annotations = {}
        weights = []

        # WBF expects the coordinates in 0-1 range.
        max_value = data.iloc[:, 4:].values.max()
        data.loc[:, ["x_min", "y_min", "x_max", "y_max"]] = data.iloc[:, 4:8] / max_value #[4:8] denotes x_min,y_min,x_max,y_max

        # Loop through all of the annotations for single image
        for idx, row in data.iterrows():

            rad_id = row["rad_id"]

            if rad_id not in annotations:
                annotations[rad_id] = {
                    "boxes_list": [],
                    "scores_list": [],
                    "labels_list": [],
                }

                # We consider all of the radiologists as equal.
                weights.append(1.0)

            annotations[rad_id]["boxes_list"].append([row["x_min"], row["y_min"], row["x_max"], row["y_max"]])
            annotations[rad_id]["scores_list"].append(1.0)
            annotations[rad_id]["labels_list"].append(row["class_id"])

        boxes_list = []
        scores_list = []
        labels_list = []

        for annotator in annotations.keys():
            boxes_list.append(annotations[annotator]["boxes_list"])
            scores_list.append(annotations[annotator]["scores_list"])
            labels_list.append(annotations[annotator]["labels_list"])

        # Calculate Fusion
        if fusion_type == 'wbf':
            boxes, scores, labels = weighted_boxes_fusion(
                boxes_list,
                scores_list,
                labels_list,
                weights=weights,
                iou_thr=iou_thr,
                skip_box_thr=skip_box_thr)
            
        if fusion_type == 'nms':
             boxes, scores, labels = nms(
                boxes_list,
                scores_list,
                labels_list,
                weights=weights,
                iou_thr=iou_thr)
                
        if fusion_type == 'softnms':
            boxes, scores, labels = soft_nms(
                boxes_list,
                scores_list,
                labels_list,
                sigma=sigma,
                weights=weights,
                iou_thr=iou_thr)
                
        if fusion_type == 'nmw':
            boxes, scores, labels = non_maximum_weighted(
                boxes_list,
                scores_list,
                labels_list,
                weights=weights,
                iou_thr=iou_thr,
                skip_box_thr=skip_box_thr)    
                
                       

        for idx, box in enumerate(boxes):
            results.append({
                "image_id": image_id,
                "class_id": int(labels[idx]),
                "rad_id": "wbf",
                "x_min": box[0] * max_value,
                "y_min": box[1] * max_value,
                "x_max": box[2] * max_value,
                "y_max": box[3] * max_value,
                "height": height,
                "width": width
            })
            
    results = pd.DataFrame(results, columns=['image_id','class_id','rad_id','x_min','y_min','x_max','y_max','width','height'])
    return results

In [21]:
wbf_csv = preprocess_fusion(train, 'wbf')

100%|██████████| 4394/4394 [00:30<00:00, 142.55it/s]


In [22]:
print(wbf_csv.shape)
wbf_csv.head()

(23904, 9)


Unnamed: 0,image_id,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
0,9a5094b2563a1ef3ff50dc5c7ff71345,3,wbf,690.666677,1354.333319,1658.666605,1797.666771,2080,2336
1,9a5094b2563a1ef3ff50dc5c7ff71345,0,wbf,1052.0,715.0,1299.0,966.0,2080,2336
2,9a5094b2563a1ef3ff50dc5c7ff71345,11,wbf,1789.0,1729.0,1875.0,1992.0,2080,2336
3,9a5094b2563a1ef3ff50dc5c7ff71345,10,wbf,1789.0,1729.0,1875.0,1992.0,2080,2336
4,051132a778e61a86eb147c7c6f564dfe,3,wbf,953.999949,1305.0,2043.66663,1672.999935,2304,2880


In [23]:
#normalise bbox
wbf_csv['x_min'] = wbf_csv['x_min'] / wbf_csv['width']
wbf_csv['x_max'] = wbf_csv['x_max'] / wbf_csv['width']
wbf_csv['y_min'] = wbf_csv['y_min'] / wbf_csv['height']
wbf_csv['y_max'] = wbf_csv['y_max'] / wbf_csv['height']

In [24]:
# split folds
from sklearn.model_selection import GroupKFold

wbf_csv['fold'] = -1
gkf  = GroupKFold(n_splits = 5)
for fold, (train_idx, val_idx) in enumerate(gkf.split(wbf_csv, groups=wbf_csv.image_id.tolist())):
    wbf_csv.loc[val_idx, 'fold'] = fold

wbf_csv.head()

Unnamed: 0,image_id,class_id,rad_id,x_min,y_min,x_max,y_max,width,height,fold
0,9a5094b2563a1ef3ff50dc5c7ff71345,3,wbf,0.332051,0.579766,0.797436,0.769549,2080,2336,2
1,9a5094b2563a1ef3ff50dc5c7ff71345,0,wbf,0.505769,0.306079,0.624519,0.413527,2080,2336,2
2,9a5094b2563a1ef3ff50dc5c7ff71345,11,wbf,0.860096,0.740154,0.901442,0.85274,2080,2336,2
3,9a5094b2563a1ef3ff50dc5c7ff71345,10,wbf,0.860096,0.740154,0.901442,0.85274,2080,2336,2
4,051132a778e61a86eb147c7c6f564dfe,3,wbf,0.414062,0.453125,0.887008,0.580903,2304,2880,0


In [25]:
wbf_csv.groupby(['fold'])['class_id'].value_counts()

fold  class_id
0     11          811
      0           672
      13          631
      3           492
      8           404
                 ... 
4     5           191
      2           157
      4            78
      1            44
      12           36
Name: class_id, Length: 70, dtype: int64

In [21]:
wbf_csv.to_csv(os.path.join(DATA_DIR, 'train_wbf.csv'), index=False)