In [8]:
import os
print("Current working directory:", os.getcwd())
IMAGES_DIR = os.environ['SM_CHANNEL_TRAINING']

Current working directory: /home/ec2-user/SageMaker/histology-image-analysis


In [None]:
# From Perplexity.ai
# When creating your PyTorch estimator,
# specify the S3 location of your data as an input channel:
estimator = PyTorch(
    entry_point='train.py',
    role=role,
    instance_count=1,
    instance_type='m5.xlarge',
    # framework_version='1.8.1',
    # py_version='py3',
    input_mode='File',
    inputs={'training': 's3://mhist-streamlit-app/images/original/'}
)

In [9]:
# Load CSV files
import pandas as pd

# image codes are 3 letters long
# 'name' : MHIST_<code>.png
# 'label' = HP or SSA # binary, categorical label
# 'experts' = (int) 0 through 7
# 'code' = 3-letter image code

# Training set samples: 2175
# Test set samples: 977
train_df = pd.read_csv('training/trainset_info.csv')
test_df = pd.read_csv('training/testset_info.csv')
print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)
train_df.head(), test_df.head()

train_df.shape (2175, 3)
test_df.shape (977, 3)


(            name  experts  label
 0  MHIST_aaa.png        6      1
 1  MHIST_aab.png        0      0
 2  MHIST_aac.png        5      1
 3  MHIST_aae.png        1      0
 4  MHIST_aaf.png        5      1,
             name  experts  label
 0  MHIST_aag.png        2      0
 1  MHIST_aah.png        2      0
 2  MHIST_aaq.png        5      1
 3  MHIST_aar.png        0      0
 4  MHIST_aay.png        1      0)

In [10]:
import torch
from torchvision import transforms

# Don't resize nor crop. These are medical images, so we don't want to lose
# image integrity. Also, most models, like ViT expect images to be 224x224 pixels.

# ToTensor: Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a
# torch.FloatTensor with shape (C x H x W) in the range [0.0, 1.0]

# For Normalize: (calculated from the training data per channel)
train_mean = [0.738, 0.649, 0.775]
train_std =  [0.197, 0.244, 0.17]

# Flatten data for FC
DEFAULT_FC_TRANSFORMS = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(train_mean, train_std),
    transforms.Lambda(lambda x: torch.flatten(x))
])

# Don't need to flatten our 2-D, 3-channel image data for ViT
DEFAULT_VIT_TRANSFORMS = transforms.Compose([
    # transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(train_mean, train_std)
])

# We don't need to use a different transform for test sets here because
# we're only pre-processing images, not adding synthetic data


In [2]:
from PIL import Image
from torch.utils.data import Dataset

# Custom Dataset class:
class MHIST_dataset(Dataset):
    def __init__(self, df, images_dir=IMAGES_DIR, transform=None):
        self.df = df
        self.images_dir = images_dir
        if transform == None:
            print("Error: missing transform for MHIST_dataset")
            raise ValueError("Error: missing transform for MHIST_dataset")
        self.transform = transform


    def __len__(self):
        return len(self.df)


    # getitem() returns {image tensor, label int64, image filename}
    def __getitem__(self, idx):
        # df['label'] = 0 or 1 (int64)
        # df['name'] ex: MHIST_abc.png
        row = self.df.iloc[idx]
        full_path = os.path.join(self.images_dir, row['name'])
        image_PIL = Image.open(full_path).convert('RGB')
        if image_PIL is None:
            raise FileNotFoundError(full_path)
        if self.transform:
            image = self.transform(image_PIL) # includes ToTensor

        return {
            'image': image,
            'label': row['label'],
            'filename': row['name'],
        }

In [11]:
# Create a WeightedRandomSampler to balance the training data (not validation data)
# This will randomly oversample the minority class and undersample majority class during training
import numpy as np
from torch.utils.data import WeightedRandomSampler

WeightedRandomSampler len: 2175 dtype float64


In [12]:
# Use the random sampler instead of shuffle to create DataLoaders
# Use the original Dataset and Dataloader for validation
import math
from torch.utils.data import DataLoader
BATCH_SIZE = 32 #256 for A100
SHUFFLE = True
DROP_LAST_BATCH = False # The dataset size might not divisible by the batch size

TRAIN_SAMPLES = 2175
TEST_SAMPLES  =  977

over_FC_train_loader = DataLoader(MHIST_dataset(train_df, transform=DEFAULT_FC_TRANSFORMS),
                                    batch_size=BATCH_SIZE, sampler=sampler, drop_last=DROP_LAST_BATCH)

FC_val_loader = DataLoader(MHIST_dataset(test_df, transform=DEFAULT_FC_TRANSFORMS),
                        batch_size=BATCH_SIZE, shuffle=SHUFFLE, drop_last=DROP_LAST_BATCH)

over_FC_loaders = {'train':over_FC_train_loader, 'val':FC_val_loader}

num_batches     = math.floor(TRAIN_SAMPLES/BATCH_SIZE)
val_num_batches = math.floor(TEST_SAMPLES/BATCH_SIZE)
print('batch_size', BATCH_SIZE)
print('Floor of labels/batch size from annotations: ', num_batches, 'balanced train batches, ', val_num_batches, ' val batches')
print('                    Batches from dataloader: ', len(over_FC_loaders['train']), 'balanced train batches, ', len(over_FC_loaders['val']), ' val batches')


batch_size 32
Floor of labels/batch size from annotations:  67 balanced train batches,  30  val batches
                    Batches from dataloader:  68 balanced train batches,  31  val batches
