In [None]:
import os
import gc          
#import cv2    
import math
import copy
import time
import random
import glob        

# Plotting
from matplotlib import pyplot as plt
from matplotlib import image as mpimg
import seaborn as sns
from PIL import Image

# Data manipulation
import numpy as np
import pandas as pd

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp      # mixed precision training
import torchvision

# Hugging Face transformers (for pretrained ResNet etc.)
from transformers import AutoImageProcessor, ResNetForImageClassification
# Hugging Face datasets
from datasets import load_dataset

# TorchEval for evaluation metrics
from torcheval.metrics.functional import binary_auroc
# Learning rate scheduler
from torch.optim import lr_scheduler

# Cross-validation
from sklearn.model_selection import KFold, GroupKFold

# Albumentations for image augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


In [None]:
# Root directory for ISIC data
ROOT_DIR = "/home/webadmin/Desktop/isic/"
TRAIN_DIR = f'{ROOT_DIR}/image'
# Configuration dictionary
CONFIG = {
    "seed": 42,
    "n_samples_train":10000,
    "n_samples_val":10000, 
    "epochs": 50,
    "img_size": 384,
    "model_name": "tf_efficientnet_b0_ns",
    "checkpoint_path" : "/home/webadmin/Desktop/ISIC24_Skin_Cancer_Detection/Fastai/efficientformerv2_s2_weights.pth",
    "train_batch_size": 400,
    "valid_batch_size": 400,
    "learning_rate": 1e-4,
    "scheduler": 'CosineAnnealingLR',
    "min_lr": 1e-6,
    "T_max": 500,
    "weight_decay": 1e-6,
    "fold" : 4,
    "n_fold": 5,
    "n_accumulate": 1,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

BEST_WEIGHT = 'v2_AUROC0.9324_Loss0.0043_epoch22_lossauroc.pth'#'v2_AUROC0.8736_Loss0.0158_epoch12_lossauroc.pth'#'v2_AUROC0.6942_Loss0.2311_epoch26.pth'


In [7]:
CONFIG['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")#torch.device("cpu")

In [8]:
seed=CONFIG['seed']
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

Random seed across NumPy and PyTorch (both CPU and GPU).

## Data

In [None]:
# Load training metadata
train_df = pd.read_csv(ROOT_DIR + "/train-metadata.csv")
# Load test metadata
test_df = pd.read_csv(ROOT_DIR + "/test-metadata.csv")
# Combine both into a single DataFrame for convenience
all_df = pd.concat([train_df, test_df]).reset_index(drop=True)
# Display the first few rows of each
display(train_df.head())
display(test_df.head())

  train_df = pd.read_csv(ROOT_DIR+"/train-metadata.csv")


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,lesion_id,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,,Benign,Benign,,,,,,,97.517282
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,IL_6727506,Benign,Benign,,,,,,,3.141455
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,,Benign,Benign,,,,,,,99.80404
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,,Benign,Benign,,,,,,,99.989998
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,,Benign,Benign,,,,,,,70.44251


Unnamed: 0,isic_id,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z,attribution,copyright_license
0,ISIC_0015657,IP_6074337,45.0,male,posterior torso,2.7,TBP tile: close-up,3D: XP,22.80433,20.00727,...,0.304827,1.281532,2.299935,0.479339,20,-155.0651,1511.222,113.9801,Memorial Sloan Kettering Cancer Center,CC-BY
1,ISIC_0015729,IP_1664139,35.0,female,lower extremity,2.52,TBP tile: close-up,3D: XP,16.64867,9.657964,...,0.0,1.27194,2.011223,0.42623,25,-112.36924,629.535889,-15.019287,"Frazer Institute, The University of Queensland...",CC-BY
2,ISIC_0015740,IP_7142616,65.0,male,posterior torso,3.16,TBP tile: close-up,3D: XP,24.25384,19.93738,...,0.230742,1.080308,2.705857,0.366071,110,-84.29282,1303.978,-28.57605,FNQH Cairns,CC-BY


loading the metadata for both the training and test sets. Then I combined them into a single DataFrame (all_df) for easier processing later. Finally, I displayed the first few rows of both training and test metadata to check the structure of the data.

In [None]:
# Get all training image file paths
train_images = sorted(glob.glob(f"{TRAIN_DIR}/*.jpg"))

In [None]:
## Images

# Function to get the full path of a training image given its ID
def get_train_file_path(image_id):
    return f"{TRAIN_DIR}/{image_id}.jpg"

In [12]:
#for i in range(10):
#    image = mpimg.imread(train_images[i])
#    print(image.shape)
    

In [13]:
df = train_df.copy()
df['image_path'] = df['isic_id'].apply(get_train_file_path)
#df['image'] = df['isic_id'].apply(show_im)
df = df[ df["image_path"].isin(train_images) ].reset_index(drop=True)


print("# of images , # of positive cases, # of negative cases, # of patients")
print(df.shape, df.target.sum(), (df["target"] == 0).sum(), df["patient_id"].unique().shape)

df_positive = df[df["target"] == 1].reset_index(drop=True)
df_negative = df[df["target"] == 0].reset_index(drop=True)

# of images , # of positive cases, # of negative cases, # of patients
(401059, 56) 393 400666 (1042,)


## Start of Deep Learning: Pytorch

In [14]:
## CHANGE THIS, GOT THIS FROM COMMUNITY MODELS

data_transforms = {
    "train": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.RandomRotate90(p=0.5),
        A.Flip(p=0.5),
        A.Downscale(p=0.25),
        A.ShiftScaleRotate(shift_limit=0.1, 
                           scale_limit=0.15, 
                           rotate_limit=60, 
                           p=0.5),
        A.HueSaturationValue(
                hue_shift_limit=0.2, 
                sat_shift_limit=0.2, 
                val_shift_limit=0.2, 
                p=0.5
            ),
        A.RandomBrightnessContrast(
                brightness_limit=(-0.1,0.1), 
                contrast_limit=(-0.1, 0.1), 
                p=0.5
            ),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.),
    
    "validation": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225], 
                max_pixel_value=255.0, 
                p=1.0
            ),
        ToTensorV2()], p=1.),
    
    "train_testing": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        #A.RandomRotate90(p=0.5),
        A.Flip(p=0.5),
        A.Downscale(p=0.25),
        A.ShiftScaleRotate(shift_limit=0.1, 
                           scale_limit=0.15, 
                           rotate_limit=60, 
                           p=0.5),
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.HueSaturationValue(
                hue_shift_limit=0.2, 
                sat_shift_limit=0.2, 
                val_shift_limit=0.2, 
                p=0.5
            ),
        A.RandomBrightnessContrast(
                brightness_limit=(-0.1,0.1), 
                contrast_limit=(-0.1, 0.1), 
                p=0.5
            ),
        ToTensorV2()], p=1.)
}

In [None]:
class ISICDataset(Dataset):
    def __init__(self, df, phase="train", transforms=None):
        # Pick how many samples to use based on phase
        if phase == 'train':
            offset = CONFIG["n_samples_val"]
            n_samples = CONFIG["n_samples_train"]
        elif phase == 'train_testing':
            offset = CONFIG["n_samples_val"]
            n_samples = CONFIG["n_samples_train"]
        elif phase == 'validation':
            offset = 0
            n_samples = CONFIG["n_samples_val"]

        # Split positives and negatives
        self.df_positive = df[df["target"] == 1].reset_index(drop=True)
        self.df_negative = df[df["target"] == 0].reset_index(drop=True)
        
        # Shuffle negatives and take a slice
        self.df_negative = self.df_negative.sample(frac=1, random_state=42).reset_index(drop=True)
        self.df_negative = self.df_negative[offset:offset + n_samples // 2]

        self.transforms = transforms
        self.n_samples = n_samples

        # Build positive samples with augmentation (multiple views per positive)
        self.positive_samples = []
        for idx in range(len(self.df_positive)):
            img_path = self.df_positive['image_path'].iloc[idx]
            img = Image.open(img_path).convert("RGB")
            img_np = np.array(img).copy()  # defensive copy
            # repeat to fill half of n_samples using positives
            for _ in range(n_samples // (2 * len(self.df_positive))):
                transformed_img = self.transforms(image=img_np.copy())["image"]
                self.positive_samples.append((transformed_img, 1))

        # Build negative samples (one aug per negative)
        self.negative_samples = []
        for idx in range(len(self.df_negative)):
            img_path = self.df_negative['image_path'].iloc[idx]
            img = Image.open(img_path).convert("RGB")
            img_np = np.array(img).copy()  # defensive copy
            transformed_img = self.transforms(image=img_np)["image"]
            self.negative_samples.append((transformed_img, 0))

    def __len__(self):
        # total samples as configured
        return self.n_samples

    def __getitem__(self, index):
        # 50/50 chance to draw from positives or negatives
        if random.random() < 0.5:
            idx = index % len(self.positive_samples)
            img, target = self.positive_samples[idx]
        else:
            idx = index % len(self.negative_samples)
            img, target = self.negative_samples[idx]

        # return batch item
        return {
            'image': img,
            'target': target
        }


In [16]:
df

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,iddx_full,iddx_1,iddx_2,iddx_3,iddx_4,iddx_5,mel_mitotic_index,mel_thick_mm,tbp_lv_dnn_lesion_confidence,image_path
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,Benign,Benign,,,,,,,97.517282,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.10,TBP tile: close-up,3D: white,31.712570,...,Benign,Benign,,,,,,,3.141455,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.40,TBP tile: close-up,3D: XP,22.575830,...,Benign,Benign,,,,,,,99.804040,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,Benign,Benign,,,,,,,99.989998,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.725520,...,Benign,Benign,,,,,,,70.442510,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401054,ISIC_9999937,0,IP_1140263,70.0,male,anterior torso,6.80,TBP tile: close-up,3D: XP,22.574335,...,Benign,Benign,,,,,,,99.999988,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...
401055,ISIC_9999951,0,IP_5678181,60.0,male,posterior torso,3.11,TBP tile: close-up,3D: white,19.977640,...,Benign,Benign,,,,,,,99.999820,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...
401056,ISIC_9999960,0,IP_0076153,65.0,female,anterior torso,2.05,TBP tile: close-up,3D: XP,17.332567,...,Benign,Benign,,,,,,,99.999416,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...
401057,ISIC_9999964,0,IP_5231513,30.0,female,anterior torso,2.80,TBP tile: close-up,3D: XP,22.288570,...,Benign,Benign,,,,,,,100.000000,/home/mccruz/isic/ISIC2024_Skin_Cancer_Detecti...


In [None]:
# Create training and validation datasets using custom ISICDataset
train_dataset = ISICDataset(df, phase="train", transforms=data_transforms["train_testing"])
valid_dataset = ISICDataset(df, phase="validation", transforms=data_transforms["train_testing"])
# DataLoader for training set
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG['train_batch_size'],  
    num_workers=3,                           
    shuffle=True,                           
    pin_memory=True,                        
    drop_last=True                         
)
# DataLoader for validation set
valid_loader = DataLoader(
    valid_dataset,
    batch_size=CONFIG['valid_batch_size'],
    num_workers=3,
    shuffle=False,                          
    pin_memory=True
)

In [None]:
## Model

# Load pretrained ResNet-50 from Hugging Face
processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

# Define a new classifier head (binary classification instead of 1000 classes)
new_classifier = nn.Sequential(
    nn.Flatten(start_dim=1, end_dim=-1),         # flatten features
    nn.Linear(in_features=2048, out_features=2, bias=True),  # map to 2 classes
    nn.Sigmoid()                                 # squash outputs between 0-1
)

# Replace the model's default classifier with the new one
model.classifier = new_classifier

# Freeze all parameters except the new classifier
for param in model.classifier.parameters():
    param.requires_grad = True

# Load best saved weights (trained checkpoint)
model.load_state_dict(torch.load(BEST_WEIGHT, map_location=torch.device('cpu')))

# Move model to GPU if available
model.to(CONFIG['device']);


Loaded a ResNet-50 model pretrained on ImageNet and replaced its default classifier (1000 classes) with a new head for binary classification (melanoma vs non-melanoma). The new classifier flattens the features, applies a linear layer to output two logits, and passes them through a sigmoid.

In [None]:
# from sklearn.metrics import roc_curve, auc, roc_auc_score

# Custom loss/metric function
def criterion(submission, solution, min_tpr: float = 0.80) -> float:
    # Use only the probability/logit for class 1 (melanoma)
    submission = submission[:, 1]  # model outputs for class 1
    solution  = solution[:, 1]     # target for class 1

    print(submission.shape, solution.shape)


    gamma = 2
    alpha = 0.8

    # Base BCE between predicted prob and target (mean over batch)
    BCE = F.binary_cross_entropy(submission, solution, reduction='mean')

    # Convert BCE to a confidence-like term via exp(-BCE)
    BCE_EXP = torch.exp(-BCE)

    # Focal modulation: focus more on harder samples
    focal_loss = alpha * (1 - BCE_EXP) ** gamma * BCE

    return focal_loss


This function defines a custom loss. It first picks the class-1 values (melanoma) from both predictions and targets. The commented block shows an alternative idea to compute a partial AUC (disabled for now). The active part computes a focal-style binary cross-entropy: it takes standard BCE.

In [21]:
#device = torch.device("cpu")

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    device = CONFIG["device"]
    model.train()  # set model to training mode

    dataset_size = 0
    running_loss = 0.0
    running_auroc = 0.0

    # Loop through batches
    for step, batch in enumerate(train_loader):
        print(step)

        # Move data to device
        images = batch['image'].to(device, dtype=torch.float)
        targets = batch['target'].to(device, dtype=torch.int64)  # targets as class indices

        batch_size = images.size(0)
        model = model.to(device)

        # Process images with pretrained processor
        inputs = processor(images, return_tensors="pt").to(device)

        # Forward pass
        logits = model(**inputs).logits

        # Compute loss (use one-hot targets for BCE/Focal-style loss)
        loss = criterion(
            logits, torch.nn.functional.one_hot(targets, num_classes=2) * 1.0
        )
        loss = loss / CONFIG['n_accumulate']  # gradient accumulation

        print(loss)

        # Backward pass
        loss.backward()

        # Update optimizer after accumulating gradients
        if (step + 1) % CONFIG['n_accumulate'] == 0:
            optimizer.step()
            optimizer.zero_grad()

            # Step scheduler if used
            if scheduler is not None:
                scheduler.step()

        predicted_label = torch.argmax(logits, dim=-1)  # predicted class indices
        auroc = binary_auroc(input=predicted_label, target=targets).item()

        # Track total loss and metric
        running_loss += (loss.item() * batch_size)
        running_auroc += (auroc * batch_size)
        dataset_size += batch_size

        # Compute average loss/metric so far
        train_epoch_loss = running_loss / dataset_size
        train_epoch_auroc = running_auroc / dataset_size

    gc.collect()  # free up memory

    return train_epoch_loss, train_epoch_auroc


This function trains the model for one epoch. For each batch:

In [23]:
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    running_auroc = 0.0
    
    for batch in valid_loader:   
        images = batch['image'].to(device, dtype=torch.float)
        targets = batch['target'].to(device, dtype=torch.int64)  # Ensure targets are int64 for class indices

        
        batch_size = images.size(0)

        model = model.to(device)

        # Ensure images are properly processed
        inputs = processor(images, return_tensors="pt").to(device)

        # No need for torch.no_grad() during training
        logits = model(**inputs).logits  # Get raw logits

        # Choose the appropriate loss function
        loss = criterion(logits, torch.nn.functional.one_hot(targets, num_classes=2) * 1.0)  # Ensure targets are not one-hot encoded
        loss = loss / CONFIG['n_accumulate']

        
        print(loss)

        predicted_label = torch.argmax(logits, dim=-1)  # Ensure to get class indices
        auroc = binary_auroc(input=predicted_label, target=targets).item()  # Ensure targets are class indices

        running_loss += (loss.item() * batch_size)
        running_auroc  += (auroc * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        epoch_auroc = running_auroc / dataset_size
        
      
    gc.collect()
    
    return epoch_loss, epoch_auroc

This function runs one validation epoch. The model is put in evaluation mode and gradients are disabled for efficiency. For each batch, it:

In [None]:
# Check if a GPU is available
if torch.cuda.is_available():
    # Print which GPU is being used
    print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

In [None]:
# Function to fetch learning rate scheduler
def fetch_scheduler(optimizer):
    if CONFIG['scheduler'] == 'CosineAnnealingLR':
        # Cosine annealing schedule with a fixed cycle length (T_max)
        scheduler = lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=CONFIG['T_max'],
            eta_min=CONFIG['min_lr']
        )
    elif CONFIG['scheduler'] == 'CosineAnnealingWarmRestarts':
        # Cosine annealing with restarts after every T_0 epochs
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer,
            T_0=CONFIG['T_0'],
            eta_min=CONFIG['min_lr']
        )
    elif CONFIG['scheduler'] == None:
        return None
    
    return scheduler


In [26]:
device = torch.device("cuda") 

In [27]:
    optimizer = optim.Adam(model.parameters(), lr=CONFIG['learning_rate'], 
                       weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)

In [None]:
# Define Adam optimizer with learning rate and weight decay from config
optimizer = optim.Adam(
    model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay']
)

# Fetch learning rate scheduler based on config
scheduler = fetch_scheduler(optimizer)


Adam optimizer for training, using the learning rate and weight decay values from the config.

In [None]:
    

    #start = time.time()
    #best_model_wts = copy.deepcopy(model.state_dict())
    #best_epoch_auroc = -np.inf
    #history = {"Train Loss": [], "Valid Loss": [], 'Train AUROC': [], 'Valid AUROC' : [], 'lr' : []}
    
    for epoch in range(13,13+CONFIG['epochs']): 
        gc.collect()
        train_epoch_loss, train_epoch_auroc = train_one_epoch(model, optimizer, scheduler, 
                                           dataloader=train_loader, 
                                           device=CONFIG['device'], epoch=epoch)
        
        val_epoch_loss, val_epoch_auroc = valid_one_epoch(model, valid_loader, device=CONFIG['device'], 
                                         epoch=epoch)
    
        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Train AUROC'].append(train_epoch_auroc)
        history['Valid AUROC'].append(val_epoch_auroc)
        history['lr'].append( scheduler.get_lr()[0] )
        
        print(history)
        # deep copy the model
        if 2>1:# best_epoch_auroc <= val_epoch_auroc:
            print(f"Validation AUROC Improved ({best_epoch_auroc} ---> {val_epoch_auroc})")
            best_epoch_auroc = val_epoch_auroc
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "/home/webadmin/Desktop/isic/v2_AUROC{:.4f}_Loss{:.4f}_epoch{:.0f}_lossauroc.pth".format(val_epoch_auroc, val_epoch_loss, epoch)
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved")
            
        print()
    
    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best AUROC: {:.4f}".format(best_epoch_auroc))
    
    # load best model weights
    model.load_state_dict(best_model_wts)
    

0
torch.Size([400]) torch.Size([400])
tensor(0.0167, grad_fn=<DivBackward0>)
1
torch.Size([400]) torch.Size([400])
tensor(0.0169, grad_fn=<DivBackward0>)
2
torch.Size([400]) torch.Size([400])
tensor(0.0119, grad_fn=<DivBackward0>)
3
torch.Size([400]) torch.Size([400])
tensor(0.0177, grad_fn=<DivBackward0>)


This is the training log per batch. The first number (0, 1, 2, 3) is the batch index. The two torch.Size([400]) show that both the model outputs and targets have batch size 400 (so shapes match). The (tensor) value is the loss for that batch (after dividing by n_accumulate). It’s hovering around 0.012–0.018, which is low — good sign. Small up/down moves between batches are normal; what matters is the overall downward trend across the epoch. If these losses keep decreasing (or stay consistently low) and AUROC stays high, the model is training well.