In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from torch.nn import functional as F
import torch
from tqdm import tqdm_notebook as tqdm

In [None]:
# Root directory under which all symlink trees produced by this notebook are created.
# Keep the trailing slash: some cells build sub-paths by plain string
# concatenation (e.g. ROOT_OUT_DIR + 'highest_loss/') rather than os.path.join.
ROOT_OUT_DIR = '/kolos/m2/ct/data/rsna/error_analysis/'

In [None]:
# Validation result files, one per trained model; all of them are concatenated
# into a single frame `df` that the rest of the notebook works on.
csv_paths = [
    '/kolos/m2/ct/models/classification/rsna/0_1_2_3/version_0/val_results.csv',
    '/kolos/m2/ct/models/classification/rsna/0_1_2_4/version_0/val_results.csv',
    '/kolos/m2/ct/models/classification/rsna/0_1_3_4/version_0/val_results.csv',
    '/kolos/m2/ct/models/classification/rsna/0_2_3_4/version_0/val_results.csv',
    '/kolos/m2/ct/models/classification/rsna/1_2_3_4/version_0/val_results.csv'
]

# ignore_index=True yields a unique 0..n-1 index for the combined frame without
# keeping each file's own row numbers around as an extra `index` column (which
# would otherwise be carried into itertuples() rows and groupby aggregations).
df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

In [None]:
# create auxiliary columns
#
# Adds to `df`:
#   loss_<class>  weighted binary cross-entropy of each class, per slice
#   loss          mean of the six weighted per-class losses, per slice
#   study_path    `path` with its last two components removed
#   any_error     absolute difference between gt_any and pred_any

# Per-class weights passed to the BCE loss; the last entry ('any') is weighted 2.
class_weights = [1, 1, 1, 1, 1, 2]
classes = ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any']
gt_columns = ['gt_' + c for c in classes]
pred_columns = ['pred_' + c for c in classes]
gt = df[gt_columns]
pred = df[pred_columns]

# Cast both operands explicitly: binary_cross_entropy requires a floating-point
# target with the same dtype as the input, and 0/1 labels read from CSV may be
# parsed as integers.
losses = F.binary_cross_entropy(
    torch.tensor(pred.to_numpy(), dtype=torch.float64),
    torch.tensor(gt.to_numpy(), dtype=torch.float64),
    weight=torch.tensor(class_weights, dtype=torch.float64),
    reduction='none',
)

# Assign plain arrays so values are matched to rows by position; wrapping them
# in pd.Series would align on index labels and silently produce NaNs if `df`
# ever had a non-default index.
for i, c in enumerate(classes):
    df['loss_' + c] = losses[:, i].numpy()

losses = losses.mean(dim=1).numpy()
df['loss'] = losses
df['study_path'] = df.path.apply(lambda path: '/'.join(path.split('/')[:-2]))

df['any_error'] = (df.gt_any - df.pred_any).abs()

In [None]:
# save top_n slices with highest overall loss
#
# For each selected slice a symlink is created in <ROOT_OUT_DIR>highest_loss/
# pointing at the `vis` directory next to the slice's own directory. The link
# name encodes the loss, the slice number and the six predictions.

out_dir = ROOT_OUT_DIR + 'highest_loss/'
top_n = 1000

os.makedirs(out_dir, exist_ok=True)

# head(top_n) selects exactly top_n rows (the former `[:top_n + 1]` slice took one extra).
worst_slices = df.sort_values(by='loss', ascending=False).head(top_n)

for row in worst_slices.itertuples():
    path = row.path
    loss = row.loss
    # Separate name so the module-level `pred` frame is not overwritten.
    slice_preds = [getattr(row, p) for p in pred_columns]

    target_path = os.path.join(os.path.dirname(path), '../vis')

    slice_num = os.path.basename(path).split('.')[0]
    pred_str = '{:.02f},{:.02f},{:.02f},{:.02f},{:.02f},{:.02f}'.format(*slice_preds)
    link_name = f'loss={loss:.04f},slice={slice_num},pred={pred_str}'
    link_name = os.path.join(out_dir, link_name)
    try:
        os.symlink(target_path, link_name)
    except FileExistsError:
        # Link left over from a previous run of this cell; keep it so the cell can be re-run.
        pass

In [None]:
# save positive and negative scans with any_error in specified ranges
#
# Every slice whose any_error falls into one of the ranges is linked into
# <ROOT_OUT_DIR>/positives/<range>/ when gt_any is truthy, otherwise into
# <ROOT_OUT_DIR>/negatives/<range>/.

error_ranges = [(0.05, 0.2), (0.2, 0.5), (0.5, 0.9), (0.9, 1.0)]

for min_error, max_error in error_ranges:
    error_range_dir = f'error={min_error:.02f}-{max_error:0.2f}'
    neg_dir = os.path.join(ROOT_OUT_DIR, 'negatives', error_range_dir)
    pos_dir = os.path.join(ROOT_OUT_DIR, 'positives', error_range_dir)
    os.makedirs(neg_dir, exist_ok=True)
    os.makedirs(pos_dir, exist_ok=True)

    # Half-open bins (min_error, max_error]: consecutive ranges neither overlap
    # nor leave gaps at the shared edges (0.2, 0.5, 0.9), and a completely wrong
    # prediction (any_error == 1.0) lands in the last bin instead of being dropped.
    in_range = (df.any_error > min_error) & (df.any_error <= max_error)

    for row in df[in_range].itertuples():
        path = row.path
        error = row.any_error
        gt_any = row.gt_any
        # Separate name so the module-level `pred` frame is not overwritten.
        slice_preds = [getattr(row, p) for p in pred_columns]

        target_path = os.path.join(os.path.dirname(path), '../vis')

        slice_num = os.path.basename(path).split('.')[0]
        pred_str = '{:.02f},{:.02f},{:.02f},{:.02f},{:.02f},{:.02f}'.format(*slice_preds)
        link_name = f'error={error:.04f},slice={slice_num},pred={pred_str}'
        link_name = os.path.join(pos_dir if gt_any else neg_dir, link_name)
        try:
            os.symlink(target_path, link_name)
        except FileExistsError:
            # Link left over from a previous run of this cell; keep it so the cell can be re-run.
            pass

In [None]:
# save top_n scans with highest loss averaged per scan
#
# One directory per selected scan is created under
# <ROOT_OUT_DIR>/highest_loss_per_scan/, containing one symlink per slice of
# that scan; each link points at the slice's .png in the `vis` tree.

top_n = 1000

# Aggregate only the `loss` column: a frame-wide .mean() would also try to
# average the string `path` column, which raises TypeError on pandas >= 2.0.
highest_loss_exams = (
    df.groupby('study_path', as_index=False)['loss']
    .mean()
    .sort_values(by='loss', ascending=False)
    .head(top_n)
)

# Use the real row count so the progress bar is correct when there are fewer than top_n scans.
for exam in tqdm(highest_loss_exams.itertuples(), total=len(highest_loss_exams)):
    study_path = exam.study_path
    loss = exam.loss

    study_id = os.path.basename(study_path)
    out_dir = os.path.join(ROOT_OUT_DIR, 'highest_loss_per_scan', f'loss={loss:.04f},{study_id}')
    os.makedirs(out_dir, exist_ok=True)

    for row in df[df.study_path == study_path].itertuples():
        path = row.path
        slice_loss = row.loss
        # Separate name so the module-level `pred` frame is not overwritten.
        slice_preds = [getattr(row, p) for p in pred_columns]

        # Map the slice's .npy path onto the matching .png in the `vis` tree.
        target_path = path.replace('npy256', 'vis').replace('.npy', '.png')
        target_path = os.path.realpath(target_path)

        slice_num = os.path.basename(path).split('.')[0]
        pred_str = '{:.02f},{:.02f},{:.02f},{:.02f},{:.02f},{:.02f}'.format(*slice_preds)
        link_name = f'slice={slice_num},loss={slice_loss:.04f},pred={pred_str}.png'
        link_name = os.path.join(out_dir, link_name)
        try:
            os.symlink(target_path, link_name)
        except FileExistsError:
            # Link left over from a previous run of this cell; keep it so the cell can be re-run.
            pass

In [None]:
# save scans with per-class loss averaged over scan (only scans containing specific class)
#
# For every class except 'any', three directory trees are written:
#   <class>_highest_loss_per_scan/  top_n scans with the highest mean class loss
#   <class>_lowest_loss_per_scan/   top_n scans with the lowest mean class loss
#   <class>_all_loss_per_scan/      every scan containing the class
# Each scan directory holds one symlink per slice, pointing at the slice's .png
# in the `vis` tree.

for class_ in classes:
    if class_ == 'any':
        continue

    gt_col = f'gt_{class_}'
    loss_col = f'loss_{class_}'

    # Per-scan means depend only on the class, so compute them once instead of
    # once per variant. Only the two needed columns are aggregated: a frame-wide
    # .mean() would also try to average the string `path` column, which raises
    # TypeError on pandas >= 2.0.
    per_scan_means = df.groupby('study_path', as_index=False)[[gt_col, loss_col]].mean()

    for variant in ['highest', 'lowest', 'all']:
        print(class_, variant)
        top_n = 100
        root_out_dir = f'{ROOT_OUT_DIR}{class_}_{variant}_loss_per_scan/'

        ranked = per_scan_means.sort_values(by=loss_col, ascending=(variant == 'lowest'))
        # A mean ground-truth label > 0 means at least one slice of the scan has the class.
        selected_exams = ranked[ranked[gt_col] > 0][['study_path', loss_col]]
        if variant != 'all':
            selected_exams = selected_exams[:top_n]

        # The 'all' variant (or a rare class) does not yield exactly top_n scans,
        # so take the progress-bar total from the actual selection.
        for exam in tqdm(selected_exams.itertuples(), total=len(selected_exams)):
            study_path = exam.study_path
            loss = getattr(exam, loss_col)

            study_id = os.path.basename(study_path)
            out_dir = f'{root_out_dir}loss={loss:.04f},{study_id}'
            os.makedirs(out_dir, exist_ok=True)

            for row in df[df.study_path == study_path].itertuples():
                path = row.path
                # Link names show the slice's overall loss, not the per-class loss.
                slice_loss = row.loss
                # Separate name so the module-level `pred` frame is not overwritten.
                slice_preds = [getattr(row, p) for p in pred_columns]

                # Map the slice's .npy path onto the matching .png in the `vis` tree.
                target_path = path.replace('npy256', 'vis').replace('.npy', '.png')
                target_path = os.path.realpath(target_path)

                slice_num = os.path.basename(path).split('.')[0]
                pred_str = '{:.02f},{:.02f},{:.02f},{:.02f},{:.02f},{:.02f}'.format(*slice_preds)
                link_name = f'slice={slice_num},loss={slice_loss:.04f},pred={pred_str}.png'
                link_name = os.path.join(out_dir, link_name)
                try:
                    os.symlink(target_path, link_name)
                except FileExistsError:
                    # Link left over from a previous run of this cell; keep it so the cell can be re-run.
                    pass