In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import cv2
from pathlib import Path
import skimage.io as io
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import os
import pickle
import pydicom
from metrics import Pfbeta

In [3]:
# Root directory of the competition data (relative path); train.csv is read
# from here in the next cell.
DATA_DIR = Path('input/rsna-breast-cancer-detection/')

In [4]:
# Metadata table for the training set. Row subsets of it (picked by fold
# indices) are handed to the per-fold datasets built further down.
train_csv = DATA_DIR.joinpath('train.csv')
train = pd.read_csv(train_csv)

# Load experiment

In [5]:
from datasets import *
from architectures import *
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torch
from transforms import *
from configs import *
from kuma_utils.utils import sigmoid

In [6]:
# def plot_color_gradients(cmap_name):
#     gradient = np.linspace(0, 1, 256)
#     gradient = np.vstack((gradient, gradient))
#     # Create figure and adjust figure height to number of colormaps
#     plt.figure(figsize=(10, 1))
#     plt.imshow(gradient, aspect='auto', cmap=plt.cm.get_cmap(cmap_name))
#     plt.axis('off')

# plot_color_gradients('jet')

In [7]:
# cfg = Dataset02v0()
# ds = PatientLevelDataset(
#     df=train, image_dir=Path('input/rsna-breast-cancer-detection/image_resized_1024W'),
#     flip_lr=False,
#     preprocess=A.Compose([AutoFlip(), CropROI(threshold=0.1, buffer=160), A.Resize(768, 384)]),
#     transforms=ToTensorV2(),
#     # preprocess=cfg.preprocess['train'],
#     # transforms=cfg.transforms['train']
# )

In [6]:
# Experiment configuration under inspection. Later cells read `cfg.name`,
# `cfg.dataset` and `cfg.preprocess` from it. The class arrives through one of
# the star imports above (presumably `configs` — confirm).
cfg = Model05v3loss0()
# model = cfg.model(**cfg.model_params)

In [7]:
# Load the saved predictions of the selected experiment. Later cells index the
# result with the keys 'folds' and 'outoffolds'.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# result files produced by your own training runs.
predictions_path = Path(f'results/{cfg.name}/predictions.pickle')
with predictions_path.open('rb') as fh:
    predictions = pickle.load(fh)

In [8]:
# Fixed set built from 50 integer literals. It is not referenced by any other
# cell visible in this notebook.
# NOTE(review): presumably ids chosen for manual inspection — confirm what they
# identify (patient? image?) or remove the cell if it is obsolete.
SAMPLE_ID = {
    42624, 48001, 48514, 2179, 31107, 23554, 13185, 53255, 
    29192, 59530, 64908, 32527, 13845, 59552, 54816, 49954, 
    55330, 59307, 21934, 63536, 23729, 61490, 61874, 16955, 
    46014, 38727, 64456, 50375, 9162, 55755, 25550, 15696, 
    50002, 58195, 10198, 13016, 25050, 31581, 26333, 29664, 
    8289, 3305, 6637, 48493, 58610, 42231, 12282, 9083, 32252, 39677}

In [9]:
# Shared metric object (class imported from the project's `metrics` module).
# Its `optimal_f1(labels, pred)` method is called in the stacking cell below.
# NOTE(review): the meaning of `binarize=True` is defined in metrics.Pfbeta and
# is not visible from this notebook.
metric = Pfbeta(binarize=True)

In [17]:
# For every fold, save one PNG per sample for the samples whose sigmoid-
# transformed prediction is furthest from the label, so the worst errors can
# be inspected by eye.
n_hard_samples = 20  # how many of the largest-error samples to plot per fold
hard_plot_dir = Path('input/plot2')
# savefig does not create missing directories; without this the first save
# raises FileNotFoundError on a fresh checkout.
hard_plot_dir.mkdir(parents=True, exist_ok=True)

for fold_i, (folds, pred_fold) in enumerate(zip(predictions['folds'], predictions['outoffolds'])):
    # folds[1] is used as positional row indices into `train`
    # (presumably the held-out split of this fold — confirm).
    ds = cfg.dataset(
        df=train.iloc[folds[1]],
        # Reuse DATA_DIR instead of repeating the dataset root as a literal.
        image_dir=DATA_DIR / 'image_resized_1024W',
        # flip_lr=False,
        is_test=True,
        preprocess=cfg.preprocess['test'],
        transforms=ToTensorV2())
    labels = ds.get_labels().reshape(-1)
    pred = sigmoid(pred_fold).reshape(-1)
    # Rank samples by |label - prediction|, largest error first.
    abs_error = np.abs(labels - pred)
    hard_samples = np.argsort(abs_error)[::-1][:n_hard_samples]
    for idx in hard_samples:
        img, label = ds[idx]
        n_views = img.shape[0]  # one subplot per entry along the first axis
        fig = plt.figure(figsize=(12, 4))
        # Title: id / label / prediction. `.5f` gives five decimals; the
        # previous `5f` was only a (no-op) minimum width.
        fig.suptitle(f'{ds.pids[idx]}/ {label[0].item()}/ {pred[idx]:.5f}')
        for i in range(n_views):
            ax = fig.add_subplot(1, n_views, i + 1)
            ax.imshow(img[i, 0], cmap='gray')
        # plt.show()
        fig.savefig(hard_plot_dir / f'fold{fold_i}_{idx}.png', facecolor='white')
        # Close explicitly so figures do not accumulate across the loop.
        plt.close(fig)
    # scores, thres = metric.optimal_f1_all(labels, pred)
    # plt.plot(scores)
    # plt.title(f'fold{fold_i} MAX: {max(scores):.5f} AUC: {sum(scores):.5f}')
    # plt.show()

In [26]:
# metric = Pfbeta(binarize=True)
# for fold_i, (folds, pred_fold) in enumerate(zip(predictions['folds'], predictions['outoffolds'])):
#     ds = cfg.dataset(
#         df=train.iloc[folds[1]], image_dir=Path('input/rsna-breast-cancer-detection/image_resized_1024W'))
#     res_df = []
#     for i in range(len(ds)):
#         record = ds.df_dict[ds.pids[i]]
#         res_df.append({
#             'pred': sigmoid(pred_fold[i][0]), 
#             'label': record['cancer'].values[0],
#             'site': record['site_id'].values[0]})
#     res_df = pd.DataFrame(res_df)
#     print(f'fold {fold_i}')
#     print('overall')
#     print(metric.optimal_f1(res_df['label'].values, res_df['pred'].values))
#     print('site 1')
#     print(metric.optimal_f1(res_df.query('site == 1')['label'].values, res_df.query('site == 1')['pred'].values))
#     print('site 2')
#     print(metric.optimal_f1(res_df.query('site == 2')['label'].values, res_df.query('site == 2')['pred'].values))

# Use metadata? Stack patient age with the image-model predictions

In [17]:
from sklearn.linear_model import LogisticRegression

In [30]:
# Per fold: print the metric for the image-model predictions alone, then for a
# logistic regression fit on [prediction, age / 100], to see whether adding
# age changes the score.
# NOTE(review): the regression is fit and scored on the same rows, so the
# second number is an in-sample estimate — treat any gain with caution.
for folds, pred_fold in zip(predictions['folds'], predictions['outoffolds']):
    ds = cfg.dataset(df=train.iloc[folds[1]], image_dir=None)
    # Flatten once so the metric and the classifier receive the same 1-D
    # target, matching the hard-sample cell; this avoids mixing an (N, 1)
    # label array with (N,) predictions.
    labels = ds.get_labels().reshape(-1)
    pred = sigmoid(pred_fold).reshape(-1)
    print(metric.optimal_f1(labels, pred))
    # First 'age' value of each id's record, divided by 100.
    age_scaled = [ds.df_dict[ds.pids[i]]['age'].values[0] / 100 for i in range(len(ds))]
    stack_x = pd.DataFrame({'pred': pred, 'age': age_scaled})
    # Missing ages are replaced by the mean age of this fold's rows.
    stack_x['age'] = stack_x['age'].fillna(stack_x['age'].mean())
    model = LogisticRegression()
    model.fit(stack_x, labels)
    pred2 = model.predict_proba(stack_x)[:, 1]
    print(metric.optimal_f1(labels, pred2))
    

(0.425, 0.64)
(0.42236024844720493, 0.22)
(0.4174757281553398, 0.13)
(0.4161849710982659, 0.05)
(0.5681818181818182, 0.31)
(0.5654450261780104, 0.05)
(0.5, 0.28)
(0.48837209302325585, 0.09)
(0.42276422764227634, 0.65)
(0.42975206611570244, 0.39)
