<a href="https://www.kaggle.com/code/rimzakhama/rsna-training-v2?scriptVersionId=143880832" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install iterative-stratification

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
print('done')

In [None]:
class Config:
    """Hyper-parameters and paths shared by every cell of the notebook."""

    # --- runtime -----------------------------------------------------------
    DEVICE = 'cuda:0'
    seed = 10
    apex = True        # passed as `enabled=` to autocast / GradScaler in train_fn
    print_freq = 100   # log every `print_freq` steps in train_fn / valid_fn

    # --- locations ---------------------------------------------------------
    INPUT_BASE = '/kaggle/input/rsna-breast-cancer-detection'
    OUTPUT_BASE = '/kaggle/working/'
    VER = '084'        # embedded in the name of the pickled predictions

    # --- model / optimisation ----------------------------------------------
    name = 'tf_efficientnetv2_s'
    batch_size = 8
    lr = 5.0e-5
    epochs = 3
    epochs_warmup = 0
    num_cycles = 0.5
    n_folds = 4

    # --- preprocessing -----------------------------------------------------
    windowing = False
    # Only referenced by a commented-out (grayscale) normalization line in
    # MammoDataset.
    MEAN = 0.3089279
    STD = 0.25053555408335154

    # parameters for elastic deformation
    a = 10
    s = 15
        
    

In [None]:
# Use the configured GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = Config.DEVICE
else:
    device = 'cpu'
print('device:', device)

In [None]:
### Setting
def seed_everything(seed: int) -> None:
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        seed: Value used for all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick a possibly different
    # algorithm on each run, which undermines the determinism requested
    # above, so it is switched off.
    torch.backends.cudnn.benchmark = False
# Seed all random number generators once, using the configured seed.
seed_everything(Config.seed)

In [None]:
# Dataset
# Breast Level
class MammoDataset(Dataset):
    """Image-level dataset serving preprocessed mammogram PNG files.

    Every item is read from the preprocessed-image directory, converted to
    RGB, resized, optionally transformed and optionally sigmoid-windowed.
    """

    def __init__(self, df, Config, train=True, tfms=None, windowing=False):
        """Store the metadata frame and the per-item processing options.

        Args:
            df: Frame with one row per image. ``image_id`` and ``patient_id``
                are always read, ``cancer`` when ``train`` is true, and
                ``min``/``max``/``rev``/``rev_max``/``y_range``/``center``/
                ``width`` when ``windowing`` is true.
            Config: Settings object; only ``INPUT_BASE`` is read here.
            train: When true, ``__getitem__`` also returns the label.
            tfms: Optional callable applied to the PIL image; its result must
                support ``.to(torch.float32)``.
            windowing: When true, the sigmoid windowing step is applied.
        """
        self.Config = Config
        self.INPUT_BASE = Path(Config.INPUT_BASE)
        self.df = df
        self.tfms = tfms
        self.train = train
        self.windowing = windowing

    def __len__(self):
        """Number of rows (images) in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` when ``train`` is true, else ``image``."""
        row = self.df.iloc[idx]

        # Files are laid out as
        # <root>/preprocessed_images_RSNA/<patient_id>/<image_id>.png
        root = Path('/kaggle/input/preprocessed-images-rsna/output')
        file_name = f"{row['image_id']}.png"
        path = str(root / "preprocessed_images_RSNA" / str(row['patient_id']) / file_name)

        # PIL's resize takes (width, height).
        img = Image.open(path).convert('RGB').resize((1024, 912))

        if self.tfms:
            img = self.tfms(img).to(torch.float32)

        if self.windowing:
            # sigmoid windowing
            img /= 255
            img = img * (row['max'] - row['min']) + row['min']
            inverted = row['rev'] == 1
            if inverted:
                img = row['rev_max'] - img
            img = row['y_range'] / (1 + np.exp(-4 * (img - row['center']) / row['width']))
            if inverted:
                img = np.amax(img) - img

        # Normalization is handled by the transform pipeline. The former
        # grayscale-only normalization line is kept here for reference:
        # img = torch.tensor((img - self.Config.MEAN)/self.Config.STD, dtype=torch.float32)

        if not self.train:
            return img
        return img, torch.tensor(row['cancer'], dtype=torch.long)


In [None]:
# Augmentations
'''
def get_aug(p=1.0, a=10, s=10):
    return Compose([
        HorizontalFlip(),
        VerticalFlip(),
        # RandomRotate90(),
#         ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=45, p=0.8, 
#                          border_mode=cv2.BORDER_REFLECT)
        # OneOf([Affine(rotate=20, translate_percent=0.1, scale=[0.8,1.2], shear=20)])
        Affine(rotate=20, translate_percent=0.1, scale=[0.8,1.2], shear=20),
        ElasticTransform(alpha=a, sigma=s)
    ], p=p)
'''


def get_aug(p=1.0, a=10, s=10):
    """Build the training transform pipeline (input is a PIL image).

    The pipeline applies a random horizontal and a random vertical flip
    (probability 0.5 each), converts the image to a tensor in [0, 1] and
    normalizes the three channels.

    ``p``, ``a`` and ``s`` are accepted for call compatibility; this
    pipeline does not use them.
    """
    steps = [
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomVerticalFlip(0.5),
        # return it as a tensor and transforms it to [0, 1]
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.1338, 0.1338, 0.1338],
                             std=[0.2068, 0.2068, 0.2068]),
    ]
    return transforms.Compose(steps)


def get_aug_valid():
    """Build the deterministic transform pipeline for validation images.

    No random augmentation is applied: the PIL image is converted to a
    tensor and normalized. The normalization statistics are the ones used
    by the training pipeline (``get_aug``), because a model must see
    evaluation data scaled with the same parameters it was trained with.
    """
    return transforms.Compose([
        transforms.ToTensor(),  # return it as a tensor
        transforms.Normalize(mean=[0.1338, 0.1338, 0.1338],
                             std=[0.2068, 0.2068, 0.2068]),
    ])





In [None]:
# Model 
def gem(x, p=3, eps=1e-6):
    """Generalized-mean pooling over the last two dimensions of ``x``.

    Values are clamped to at least ``eps``, raised to the power ``p``,
    averaged over the full spatial window and taken to the power ``1 / p``.
    """
    window = (x.size(-2), x.size(-1))
    powered = x.clamp(min=eps).pow(p)
    pooled = F.avg_pool2d(powered, window)
    return pooled.pow(1.0 / p)


class GeM(nn.Module):
    """Generalized-mean pooling layer built on the ``gem`` function.

    Args:
        p: Pooling exponent.
        eps: Lower clamp applied to the input before exponentiation.
        p_trainable: When true, ``p`` is stored as a learnable parameter;
            otherwise it is kept as the plain number that was passed in.
    """

    def __init__(self, p=3, eps=1e-6, p_trainable=False):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p) if p_trainable else p
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` over its last two dimensions."""
        return gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        # ``p`` is a tensor only when it is trainable. A plain number has no
        # ``.data`` attribute, so both cases are handled explicitly.
        if isinstance(self.p, torch.Tensor):
            p_value = self.p.item()
        else:
            p_value = float(self.p)
        return f"{self.__class__.__name__}(p={p_value:.4f}, eps={self.eps})"
    
class MammoModel(nn.Module):
    """timm backbone followed by GeM pooling and a single-logit linear head."""

    def __init__(self, name, *, pretrained=False, in_chans=3, p=3, p_trainable=False, eps=1e-6):
        """Create the backbone and replace its classifier with ``nn.Identity``.

        Args:
            name: timm model name.
            pretrained: Forwarded to ``timm.create_model``.
            in_chans: Number of input channels, forwarded to timm.
            p, p_trainable, eps: Forwarded to the ``GeM`` pooling layer.
        """
        super().__init__()
        backbone = timm.create_model(name, pretrained=pretrained, in_chans=in_chans)

        # The name of the classifier attribute comes from the model's config.
        head_name = backbone.default_cfg['classifier']
        feature_dim = backbone._modules[head_name].in_features
        backbone._modules[head_name] = nn.Identity()

        self.fc = nn.Linear(feature_dim, 1)  # cancer
        self.model = backbone
        self.pool = nn.Sequential(
            GeM(p=p, eps=eps, p_trainable=p_trainable),
            nn.Flatten(),
        )

    def forward(self, x):
        """Return the logits computed from the pooled backbone feature map."""
        features = self.model.forward_features(x)
        pooled = self.pool(features)
        return self.fc(pooled)
    
    

In [None]:
# Train function
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, device):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        fold: Not used in the body; kept for call compatibility.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        model: Network to train; switched to train mode here.
        criterion: Loss applied to ``(N, 1)`` predictions and labels.
        optimizer: Stepped once per batch through a ``GradScaler``.
        epoch: Zero-based epoch index, used only for logging.
        device: Device the batches are moved to.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=Config.apex)
    meter = AverageMeter()
    started = time.time()
    total_steps = len(train_loader)
    template = ('Epoch: [{0}][{1}/{2}] '
                'Elapsed {remain:s} '
                'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                'LR: {lr:.8f}')

    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.float().to(device)

        # Forward pass under autocast (enabled by Config.apex).
        with torch.cuda.amp.autocast(enabled=Config.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        meter.update(loss.item(), labels.size(0))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        is_last_step = step == (total_steps - 1)
        if step % Config.print_freq == 0 or is_last_step:
            progress = float(step + 1) / total_steps
            print(template.format(epoch + 1, step, total_steps,
                                  remain=timeSince(started, progress),
                                  loss=meter,
                                  lr=optimizer.param_groups[0]['lr']))
    return meter.avg

In [None]:
# Validation function
def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader``.

    Returns:
        A tuple ``(mean_loss, probabilities)``: the sample-weighted mean
        loss and the sigmoid of the model outputs for every sample,
        concatenated in loader order as a NumPy array.
    """
    meter = AverageMeter()
    model.eval()
    batch_probs = []
    started = time.time()
    total_steps = len(valid_loader)
    template = ('EVAL: [{0}/{1}] '
                'Elapsed {remain:s} '
                'Loss: {loss.val:.4f}({loss.avg:.4f}) ')

    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = inputs.to(device)
        labels = labels.float().to(device)

        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        meter.update(loss.item(), labels.size(0))
        batch_probs.append(y_preds.squeeze(1).sigmoid().cpu().numpy())

        is_last_step = step == (total_steps - 1)
        if step % Config.print_freq == 0 or is_last_step:
            progress = float(step + 1) / total_steps
            print(template.format(step, total_steps,
                                  loss=meter,
                                  remain=timeSince(started, progress)))

    return meter.avg, np.concatenate(batch_probs)


In [None]:
# https://www.kaggle.com/code/sohier/probabilistic-f-score
def pfbeta(labels, predictions, beta):
    """Probabilistic F-beta score.

    Predictions are clipped to ``[0, 1]`` and treated as soft counts: the
    prediction of a positive sample adds to the true-positive mass, the
    prediction of a negative sample to the false-positive mass.

    Args:
        labels: Sequence of truthy/falsy ground-truth labels.
        predictions: Sequence of scores, same length as ``labels``.
        beta: Weight of recall relative to precision.

    Returns:
        The score, or ``0.0`` when it is undefined (no positive label, no
        predicted mass at all) or when precision or recall is zero.
    """
    y_true_count = 0
    ctp = 0.0
    cfp = 0.0
    for label, raw_prediction in zip(labels, predictions):
        prediction = min(max(raw_prediction, 0), 1)
        if label:
            y_true_count += 1
            ctp += prediction
        else:
            cfp += prediction

    # Both divisions below are undefined in these cases; the score is 0 by
    # convention instead of a ZeroDivisionError / NaN.
    if y_true_count == 0 or ctp + cfp == 0:
        return 0.0

    beta_squared = beta * beta
    c_precision = ctp / (ctp + cfp)
    c_recall = ctp / y_true_count
    if c_precision > 0 and c_recall > 0:
        return (1 + beta_squared) * (c_precision * c_recall) / (beta_squared * c_precision + c_recall)
    return 0.0

In [None]:
def pfbeta_binarized(labels, predictions):
    """Best binarized pF1 reachable by thresholding ``predictions``.

    Every distinct score of a positive sample is tried as a threshold; the
    predictions are binarized with ``>= threshold`` and scored with
    ``pfbeta(..., beta=1)``.

    Args:
        labels: Array-like of 0/1 labels.
        predictions: Array-like of scores, same length as ``labels``.

    Returns:
        The maximum score over all thresholds, or ``0.0`` when there is no
        positive sample (no threshold to try).
    """
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)

    # Duplicate thresholds give identical binarizations, so score each
    # distinct value only once.
    thresholds = np.unique(predictions[labels == 1])
    if thresholds.size == 0:
        return 0.0

    scores = [
        pfbeta(labels, (predictions >= th).astype('int'), 1)
        for th in thresholds
    ]
    return np.max(scores)

In [None]:
def pr_auc(y_true, y_pred):
    """Area under the precision-recall curve of ``y_pred`` against ``y_true``."""
    precision, recall, _thresholds = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Keeps the latest value and the weighted running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the sum, the count and the mean."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe the time elapsed since ``since`` and the estimated remainder.

    ``percent`` is the completed fraction of the work; the total duration
    is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:

def get_custom_folds(train_df, Config):
    """Assign a cross-validation fold to every image, grouped by patient.

    Patients are split with ``MultilabelStratifiedKFold`` on per-patient
    summaries of age, implant, machine_id, cancer, biopsy, images per
    breast, BIRADS and density, so all images of one patient share a fold.
    The class balance of every fold is printed at breast level.

    Args:
        train_df: Image-level metadata. Columns read: ``patient_id``,
            ``laterality``, ``image_id``, ``age``, ``implant``,
            ``machine_id``, ``cancer``, ``biopsy``, ``BIRADS``, ``density``.
            The frame is not modified.
        Config: Settings object; ``n_folds`` and ``seed`` are read.

    Returns:
        A copy of ``train_df`` with an added ``fold`` column.
    """
    train_df_all = train_df.copy()
    # Helper columns are added below; use a private copy so they do not
    # leak into the caller's frame.
    image_df = train_df.copy()

    # count images per prediction_id (patient_id + laterality)
    image_df['prediction_id'] = image_df['patient_id'].astype(str).str.cat(image_df['laterality'], sep='_')
    num_count = image_df[['prediction_id', 'image_id']].groupby('prediction_id').count().reset_index()
    count_map = dict(zip(num_count['prediction_id'].values, num_count['image_id'].values))
    image_df['count'] = image_df['prediction_id'].map(count_map)

    # one row per prediction_id, then summarise per patient and stratify by
    # age, implant, machine_id, cancer, biopsy, BIRADS, density, and count
    breast_df = image_df.groupby('prediction_id').first().reset_index()
    dummy = breast_df[['patient_id', 'age', 'implant', 'machine_id']].groupby('patient_id').first()
    # machines are encoded by their rank in mean cancer rate
    machine_rates = breast_df[['machine_id', 'cancer']].groupby('machine_id').mean().sort_values('cancer')
    machine2int = {machine_id: n for n, machine_id in enumerate(machine_rates.index.values)}
    dummy['machine_id'] = dummy['machine_id'].apply(lambda x: machine2int[x])
    dummy2 = breast_df[['patient_id', 'cancer', 'biopsy', 'count']].groupby('patient_id').mean()
    dummy3 = breast_df[['patient_id', 'BIRADS']].groupby('patient_id').min().fillna(-1)
    dummy4 = breast_df[['patient_id', 'density']].groupby('patient_id').max().fillna('E')
    dummy4['density'] = dummy4['density'].map({'E': -1, 'D': 0, 'C': 1, 'B': 2, 'A': 3})
    dummy = pd.concat([dummy, dummy2, dummy3, dummy4], axis=1)
    dummy['age'] = dummy['age'].fillna(dummy['age'].mean())
    dummy['fold'] = -1
    mskf = MultilabelStratifiedKFold(n_splits=Config.n_folds, shuffle=True, random_state=Config.seed)
    for fold, (trn_ind, val_ind) in enumerate(mskf.split(dummy, dummy.values)):
        dummy.iloc[val_ind, -1] = fold
    dummy = dummy.reset_index()
    dummy['patient_id'] = dummy['patient_id'].astype('int')
    fold_map = dict(zip(dummy['patient_id'].values, dummy['fold'].values))

    # show some stat regarding each fold
    breast_df = breast_df.merge(dummy[['patient_id', 'fold']], on='patient_id', how='left')
    for fold in range(Config.n_folds):
        in_valid = breast_df['fold'] == fold
        print(f'=========== Fold {fold} ===========')
        for header, pad, cancer in (
            ('Train', '        ', breast_df.loc[~in_valid, 'cancer']),
            ('Validation', '    ', breast_df.loc[in_valid, 'cancer']),
        ):
            # Count each class explicitly: indexing the result of
            # np.unique(..., return_counts=True) raises IndexError when a
            # split contains a single class.
            positive = int((cancer == 1).sum())
            negative = int((cancer == 0).sum())
            print(f'{header} {len(cancer)}', end='')
            print(f'{pad}(positive {positive}, negative {negative})')

    # concat 'fold' column on train_df_all
    train_df_all['fold'] = train_df_all['patient_id'].map(fold_map)
    return train_df_all


# Train loop (using k-folds)
def train_loop(folds, fold):
    """Train and validate the model with ``fold`` held out.

    Trains for ``Config.epochs`` epochs, evaluates after each epoch on the
    held-out fold (metrics are computed on predictions averaged per
    patient and laterality) and saves a checkpoint whenever the pF score
    improves. Relies on the module-level ``Config``, ``device``,
    ``OUTPUT_BASE`` and ``model_base_name``.

    Args:
        folds: Image-level frame containing a ``fold`` column.
        fold: Value of ``fold`` used for validation; all other rows train.

    Returns:
        The validation rows with a ``prediction`` column holding the
        predictions of the epoch with the best pF score (the last epoch's
        predictions if the score never rose above 0).
    """
    print(f'================== fold: {fold} training ======================')
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)

    train_dataset = MammoDataset(train_folds, Config, tfms=get_aug(a=Config.a, s=Config.s),
                                 windowing=Config.windowing)
    # Validation uses the deterministic pipeline (no random flips) and the
    # same windowing setting as training, so scores are reproducible and
    # the inputs are preprocessed consistently.
    valid_dataset = MammoDataset(valid_folds, Config, tfms=get_aug_valid(),
                                 windowing=Config.windowing)

    train_loader = DataLoader(train_dataset,
                              batch_size=Config.batch_size,
                              shuffle=True,
                              num_workers=2, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=Config.batch_size,
                              shuffle=False,
                              num_workers=2, pin_memory=True, drop_last=False)

    model = MammoModel(Config.name, pretrained=True)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=Config.lr)
    # The scheduler is stepped once per epoch, hence epochs as "steps".
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=Config.epochs_warmup,
        num_training_steps=Config.epochs,
        num_cycles=Config.num_cycles,
    )

    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    checkpoint_path = OUTPUT_BASE.joinpath(f"{model_base_name}_seed_{Config.seed}_fold{fold}.pth")

    best_score = 0.
    best_aucroc = 0.
    best_prauc = 0.
    best_predictions = None
    for epoch in range(Config.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, device)
        scheduler.step()
        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        valid_folds['prediction'] = predictions
        valid_agg = valid_folds[['patient_id', 'laterality', 'cancer', 'prediction', 'fold']].groupby(['patient_id', 'laterality']).mean()
        score = pfbeta_binarized(valid_agg['cancer'].values, valid_agg['prediction'].values)
        prauc = pr_auc(valid_agg['cancer'].values, valid_agg['prediction'].values)
        aucroc = roc_auc_score(valid_agg['cancer'].values, valid_agg['prediction'].values)

        elapsed = time.time() - start_time

        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        print(f'Epoch {epoch+1} - pF Score: {score:.4f}, PR-AUC Score: {prauc:.4f}, AUC-ROC Score: {aucroc:.4f}')

        if best_prauc < prauc:
            best_prauc = prauc

        if best_aucroc < aucroc:
            best_aucroc = aucroc

        if best_score < score:
            best_score = score
            best_predictions = predictions
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                       checkpoint_path)

    # Keep the best predictions in memory instead of reloading the
    # checkpoint: the file does not exist when no epoch improved on 0, and
    # a file left over from an earlier run would not match this run.
    if best_predictions is None:
        best_predictions = predictions
    valid_folds['prediction'] = best_predictions
    print(f'[Fold{fold}] Best pF Score: {best_score}, PR-AUC Score: {best_prauc}, AUC-ROC Score: {best_aucroc:.4f}')
    torch.cuda.empty_cache()
    gc.collect()
    return valid_folds


In [None]:
# Train loop (without k-folds)

def train_loop_No_kfolds(df, fold = None):
    """Train and validate the model on a single stratified 85/15 split.

    Trains for ``Config.epochs`` epochs, evaluates after each epoch at
    image level (no aggregation per patient and laterality) and saves a
    checkpoint whenever the pF score improves. Relies on the module-level
    ``Config``, ``device``, ``OUTPUT_BASE`` and ``model_base_name``.

    Args:
        df: Image-level frame with a ``cancer`` column used for
            stratification.
        fold: Only forwarded to ``train_fn``.

    Returns:
        The validation rows with a ``prediction`` column holding the
        predictions of the epoch with the best pF score (the last epoch's
        predictions if the score never rose above 0).
    """
    train_df, valid_df = sklearn.model_selection.train_test_split(df,
                                                                  test_size=0.15,
                                                                  shuffle=True,
                                                                  stratify=df['cancer'].values)
    # A column is added to the validation split below; work on an explicit
    # copy rather than on a slice of ``df``.
    valid_df = valid_df.copy()

    train_dataset = MammoDataset(train_df, Config, tfms=get_aug(a=Config.a, s=Config.s),
                                 windowing=Config.windowing)
    # Validation uses the deterministic pipeline (no random flips) and the
    # same windowing setting as training, so scores are reproducible and
    # the inputs are preprocessed consistently.
    valid_dataset = MammoDataset(valid_df, Config, tfms=get_aug_valid(),
                                 windowing=Config.windowing)

    train_loader = DataLoader(train_dataset,
                              batch_size=Config.batch_size,
                              shuffle=True,
                              num_workers=2, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=Config.batch_size,
                              shuffle=False,
                              num_workers=2, pin_memory=True, drop_last=False)

    model = MammoModel(Config.name, pretrained=True)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=Config.lr)
    # The scheduler is stepped once per epoch, hence epochs as "steps".
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=Config.epochs_warmup,
        num_training_steps=Config.epochs,
        num_cycles=Config.num_cycles,
    )

    criterion = nn.BCEWithLogitsLoss(reduction='mean')
    checkpoint_path = OUTPUT_BASE.joinpath(f"{model_base_name}_seed_{Config.seed}.pth")

    best_score = 0.
    best_aucroc = 0.
    best_prauc = 0.
    best_predictions = None
    for epoch in range(Config.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, device)
        scheduler.step()
        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        valid_df['prediction'] = predictions

        # Metrics are computed per image in this version.
        valid_agg = valid_df

        score = pfbeta_binarized(valid_agg['cancer'].values, valid_agg['prediction'].values)
        prauc = pr_auc(valid_agg['cancer'].values, valid_agg['prediction'].values)
        aucroc = roc_auc_score(valid_agg['cancer'].values, valid_agg['prediction'].values)

        elapsed = time.time() - start_time

        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        print(f'Epoch {epoch+1} - pF Score: {score:.4f}, PR-AUC Score: {prauc:.4f}, AUC-ROC Score: {aucroc:.4f}')

        if best_prauc < prauc:
            best_prauc = prauc

        if best_aucroc < aucroc:
            best_aucroc = aucroc

        if best_score < score:
            best_score = score
            best_predictions = predictions
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                       checkpoint_path)

    # Keep the best predictions in memory instead of reloading the
    # checkpoint: the file does not exist when no epoch improved on 0, and
    # a file left over from an earlier run would not match this run.
    if best_predictions is None:
        best_predictions = predictions
    valid_df['prediction'] = best_predictions
    print(f'Best pF Score: {best_score}, PR-AUC Score: {best_prauc}, AUC-ROC Score: {best_aucroc:.4f}')
    torch.cuda.empty_cache()
    gc.collect()
    return valid_df


###############################
##### Train using k-folds #####
###############################

INPUT_BASE = Path(Config.INPUT_BASE)
# Path() only takes path segments; `exist_ok` is an os.makedirs option and
# is passed there.
OUTPUT_BASE = Path('/kaggle/working/models/')
os.makedirs(OUTPUT_BASE, exist_ok=True)

# NOTE(review): Config.name is 'tf_efficientnetv2_s', which does not contain
# 'efficientv2', so this resolves to 'efficientv5'. Checkpoint file names
# depend on it - confirm the intended substring before changing it.
model_base_name = 'efficientv2' if 'efficientv2' in Config.name else 'efficientv5'

### Load Train
train_df = pd.read_csv(INPUT_BASE.joinpath('train.csv'))

# cv splitting, grouped by patient_id and stratified by age, implant, machine_id, cancer, biopsy, BIRADS, density, and the num of images
# get_custom_folds returns train_df with 'fold' column
train_df = get_custom_folds(train_df, Config)

# Collect the out-of-fold predictions of every fold and concatenate once.
fold_frames = []
for fold in range(Config.n_folds):
    seed_everything(Config.seed)
    _oof_df = train_loop(train_df, fold)
    fold_frames.append(_oof_df)
oof_df = pd.concat(fold_frames).reset_index(drop=True)
oof_df_agg = oof_df[['patient_id', 'laterality', 'cancer', 'prediction', 'fold']].groupby(['patient_id', 'laterality']).mean()

print('================ CV ================')

score = pfbeta_binarized(oof_df_agg['cancer'].values, oof_df_agg['prediction'].values)
prauc = pr_auc(oof_df_agg['cancer'].values, oof_df_agg['prediction'].values)
aucroc = roc_auc_score(oof_df_agg['cancer'].values, oof_df_agg['prediction'].values)
print(f'Score: {score}, PR-AUC: {prauc}, AUC-ROC: {aucroc}')

# to_pickle does not create missing directories.
preds_dir = OUTPUT_BASE.joinpath('preds')
os.makedirs(preds_dir, exist_ok=True)
oof_df.to_pickle(preds_dir.joinpath(f'oof_df_ver{Config.VER}_seed{Config.seed}.pkl'))


import torch
from pathlib import Path
import os
# Scratch check: write a placeholder checkpoint to confirm the output
# directory is writable and the file-name pattern is as expected.
fold = 1
seed = 42

# Path() only takes path segments; `exist_ok` is an os.makedirs option.
OUTPUT_BASE = Path('/kaggle/working/models/')

os.makedirs(OUTPUT_BASE, exist_ok=True)

torch.save({'predictions': 'predictions'},
           OUTPUT_BASE.joinpath(f"efficientv2_seed_{seed}_fold{fold}.pth"))


In [None]:

#################################
##### Train without k-folds #####
#################################

INPUT_BASE = Path(Config.INPUT_BASE)
# Path() only takes path segments; `exist_ok` is an os.makedirs option and
# is passed there.
OUTPUT_BASE = Path('/kaggle/working/models/')
os.makedirs(OUTPUT_BASE, exist_ok=True)

# NOTE(review): Config.name is 'tf_efficientnetv2_s', which does not contain
# 'efficientv2', so this resolves to 'efficientv5'. Checkpoint file names
# depend on it - confirm the intended substring before changing it.
model_base_name = 'efficientv2' if 'efficientv2' in Config.name else 'efficientv5'

### Load Train
train_df = pd.read_csv(INPUT_BASE.joinpath('train.csv'))

# Single stratified train/validation split (done inside train_loop_No_kfolds).
seed_everything(Config.seed)
oof_df = train_loop_No_kfolds(train_df, fold=None)

# Scores are computed per image here (no grouping by patient_id / laterality).
oof_df_agg = oof_df

print('================ CV ================')

score = pfbeta_binarized(oof_df_agg['cancer'].values, oof_df_agg['prediction'].values)
prauc = pr_auc(oof_df_agg['cancer'].values, oof_df_agg['prediction'].values)
aucroc = roc_auc_score(oof_df_agg['cancer'].values, oof_df_agg['prediction'].values)
print(f'Score: {score}, PR-AUC: {prauc}, AUC-ROC: {aucroc}')

output_path = Path('/kaggle/working/models/preds')
os.makedirs(output_path, exist_ok=True)
oof_df.to_pickle(output_path.joinpath(f'oof_df_ver{Config.VER}_seed{Config.seed}.pkl'))

In [None]:
'''
output_path = Path('/kaggle/working/models/preds', exist_ok=True)
os.makedirs(output_path, exist_ok = True) 
output_path.joinpath(f'oof_df_ver{Config.VER}_seed{Config.seed}.pkl')
'''
#oof_df.to_pickle(OUTPUT_BASE.joinpath(f'oof_df_ver{Config.VER}_seed{Config.seed}.pkl'))

In [None]:
# In this notebook version I switched to RGB images, so image normalization is now applied
# as a transform in the augmentation pipeline. The normalization line in the Dataset class
# (now commented out) is only valid for single-channel (grayscale) images.

In [None]:
# Should normalization also be applied to the validation and test data?
# In this version it is applied to both the training and the validation data.

# see : https://discuss.pytorch.org/t/finding-mean-and-std-for-each-of-the-train-val-and-test-dataloader-to-use-for-normalize-in-data-transform/145974/2


'''
Definitely you should normalize your data. You normalize the data for the following aims:
For having different features in same scale, which is for accelerating learning process.
For caring different features fairly without caring the scale.
After training, your learning algorithm has learnt to deal with the data in scaled form, so you have 
to normalize your test data with the normalizing parameters used for training data.
reference : https://datascience.stackexchange.com/questions/27615/should-we-apply-normalization-to-test-data-as-well#:~:text=Definitely%20you%20should%20normalize%20your,fairly%20without%20caring%20the%20scale.

'''


In [None]:
# In this notebook, we don't use k-folds.
# In this version, we don't use aggregation.