### Step 1: Convert Slide Images (.ndpi) to JPG Files

In [None]:
import openslide, os, shutil
import pandas as pd

# Export every training slide as a low-resolution JPEG, filed directly under
# the folder of its merged stage (0, I, II, III, IV).
#
# Writing straight into the final folder replaces the earlier
# "save flat -> move per sub-stage -> merge folders -> delete folders" sequence,
# which could not work: os.listdir() also returned the stage folders,
# shutil.move() into an existing folder nests the source inside it instead of
# merging, and `stages.remove['0']` subscripts a method.

# Relative root used for every path below, so the cell does not depend on the
# notebook being located under /home/ngsci.
image_root = 'slide_images'

# Sub-stage recorded in outcomes.csv -> class folder used for training.
stage_to_class = {
    '0': '0',
    'IA': 'I', 'IB': 'I',
    'IIA': 'II', 'IIB': 'II',
    'IIIA': 'III', 'IIIB': 'III', 'IIIC': 'III',
    'IV': 'IV',
}
stages = list(stage_to_class)

# exist_ok=True lets the cell be re-run without failing on existing folders.
for class_name in sorted(set(stage_to_class.values())):
    os.makedirs(os.path.join(image_root, class_name), exist_ok=True)

slides_df = pd.read_csv('./datasets/brca-psj-path/contest-phase-2/slide-manifest-train.csv')
outcomes = pd.read_csv('./datasets/brca-psj-path/contest-phase-2/csv-train/outcomes.csv')
test_df = slides_df.merge(outcomes[['biopsy_id','patient_ngsci_id','stage']], on="biopsy_id", how="left")
paths = test_df.slide_path.to_list()

# The left merge can repeat a slide when a biopsy has several outcome rows;
# keep the first row per slide so each slide is read once and gets one stage.
unique_slides = test_df.drop_duplicates('slide_path')

for path, stage in zip(unique_slides.slide_path, unique_slides.stage):
    # str() also covers NaN (no outcome row matched) and numeric stage values.
    class_name = stage_to_class.get(str(stage).strip())
    if class_name is None:
        print(f"Skipping {path}: no usable stage ({stage!r})")
        continue

    # File name without folder and extension (robust form of path[40:-5]).
    slide_name = os.path.splitext(os.path.basename(path))[0]

    # The context manager closes the slide handle even if reading fails.
    with openslide.open_slide(path) as slide:
        # Pyramid level 4 read in full, starting at the top-left corner.
        thumbnail = slide.read_region((0, 0), 4, slide.level_dimensions[4]).convert("RGB")
    thumbnail.save(os.path.join(image_root, class_name, f"{slide_name}.jpg"))

### Step 2: Image Preprocessing and Fine-Tuning the dino-vitb16 Transformer Model

In [None]:
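# The `datasets` package must be installed from source on the Nightingale instance.
# Each `!` line runs in its own subshell, so `!cd` and `!export` do not carry over
# to the next line; use the IPython magics `%cd` and `%env` instead:
# %cd datasets-main
# !pip install -e .
# %cd ..
# %env TF_ENABLE_ONEDNN_OPTS=0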

from datasets import load_dataset
from transformers import ViTImageProcessor, ViTForImageClassification

# Build the training dataset from the class folders; with the 'imagefolder'
# loader each sub-folder name of ./slide_images becomes one label.
dataset = load_dataset('imagefolder', data_dir='./slide_images')

# Index <-> class-name mappings taken from the dataset itself.
label_names = dataset['train'].features['label'].names
id2label = {index: name for index, name in enumerate(label_names)}
label2id = {name: index for index, name in id2label.items()}

# Must download model and upload to nightingale instance:
processor = ViTImageProcessor.from_pretrained("facebook/dino-vitb16")

# The label mappings are handed to the model so that model.config.id2label
# reports the stage names instead of generic LABEL_n placeholders, and the
# size of the classification head follows the number of class folders rather
# than a hard-coded 5.
model = ViTForImageClassification.from_pretrained(
    "facebook/dino-vitb16",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)


In [None]:
from torchvision.transforms import (CenterCrop, 
                                    Compose, 
                                    Normalize, 
                                    RandomHorizontalFlip,
                                    RandomResizedCrop, 
                                    Resize, 
                                    ToTensor)

# Normalisation statistics and target resolution come from the pretrained
# image processor so that the inputs match what the backbone expects.
image_mean = processor.image_mean
image_std = processor.image_std
size = processor.size["height"]  # alternatively a fixed value such as 224

normalize = Normalize(mean=image_mean, std=image_std)

# Training pipeline: random crop + horizontal flip as augmentation.
_train_steps = [RandomResizedCrop(size), RandomHorizontalFlip(), ToTensor(), normalize]
_train_transforms = Compose(_train_steps)

# Evaluation pipeline: deterministic resize followed by a centre crop.
_val_steps = [Resize(size), CenterCrop(size), ToTensor(), normalize]
_val_transforms = Compose(_val_steps)

def train_transforms(examples):
    """Add a 'pixel_values' entry with the augmented tensor of every image.

    `examples` is a batch mapping with an 'image' list; each image is
    converted to RGB and passed through `_train_transforms`. The same mapping
    is returned with the new key added.
    """
    tensors = []
    for picture in examples['image']:
        rgb_picture = picture.convert("RGB")
        tensors.append(_train_transforms(rgb_picture))
    examples['pixel_values'] = tensors
    return examples


# Apply the transform on the fly whenever training examples are accessed.
train_split = dataset['train']
train_split.set_transform(train_transforms)

In [None]:
from torch.utils.data import DataLoader
import torch
from transformers import TrainingArguments, Trainer

def collate_fn(examples):
    """Combine per-example dicts into one batch for the Trainer.

    Stacks every example's "pixel_values" tensor along a new first dimension
    and gathers the "label" values into a single tensor.

    Returns a dict with the keys "pixel_values" and "labels".
    """
    batch = {}
    batch["pixel_values"] = torch.stack([item["pixel_values"] for item in examples])
    batch["labels"] = torch.tensor([item["label"] for item in examples])
    return batch

# Fine-tuning configuration; a checkpoint is written after every epoch into
# the 'vitb16_tuned' output folder.
args = TrainingArguments(
    output_dir='vitb16_tuned',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    save_strategy="epoch",
    # Keep the raw 'image' column: the on-the-fly transform reads it.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=dataset['train'],
    tokenizer=processor,
)

trainer.train()

### Step 3: Model Predictions Output and Submission
##### Before running, set `checkpoint` below to the path of the best fine-tuned model checkpoint

In [None]:
from PIL import Image

# Extracting Holdout Image Set
# Every holdout .ndpi slide is exported as a level-4 JPEG, using the same
# read settings as the training images.
holdout_source = './datasets/brca-psj-path/ndpi-holdout'
holdout_target = './slide_images/holdout'

# The target folder is not created anywhere else, so make it here;
# exist_ok=True keeps the cell re-runnable.
# NOTE(review): this folder lives inside ./slide_images, which Step 2 loads
# as an image-folder dataset -- re-running Step 2 after this cell would pick
# up 'holdout' as an extra class. Confirm whether that is intended.
os.makedirs(holdout_target, exist_ok=True)

images = os.listdir(holdout_source)
for image in images:
    stem, extension = os.path.splitext(image)
    if extension.lower() != '.ndpi':
        # Ignore anything that is not a slide file.
        continue
    # The context manager closes the slide handle even if reading fails.
    with openslide.open_slide(os.path.join(holdout_source, image)) as slide:
        thumbnail = slide.read_region((0, 0), 4, slide.level_dimensions[4]).convert("RGB")
    thumbnail.save(os.path.join(holdout_target, f"{stem}.jpg"))

# Path of the fine-tuned checkpoint to evaluate -- must be filled in by hand
# before this cell is run.
checkpoint = ''

# Reload both the image processor and the classifier from that checkpoint.
processor = ViTImageProcessor.from_pretrained(checkpoint)
model = ViTForImageClassification.from_pretrained(checkpoint)

# Module-level slots that infer() overwrites on every call.
outputs = logits = predicted_class_id = None

# Per-slide predictions are collected in this frame.
results_df = pd.DataFrame()

def infer(slide):
    """Classify one image file and publish the result through module globals.

    Args:
        slide: path of the image file to classify.

    Side effects:
        Sets the globals `outputs` (raw model output), `logits`,
        `predicted_class_id` (index of the largest logit) and `probs`
        (softmax of the logits over the last dimension), and prints the
        predicted class name taken from `model.config.id2label`.
    """
    global logits, outputs, predicted_class_id, probs
    # Open inside a context manager so the file handle is released; the RGB
    # copy stays valid after the file is closed.
    with Image.open(slide) as handle:
        image = handle.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: skip building the autograd graph for every slide.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax(-1).item()
    probs = torch.nn.functional.softmax(logits, dim=-1)
    print("Predicted class:", model.config.id2label[predicted_class_id])
    
# Run the classifier over every extracted holdout image and collect one row
# per slide: id, the five class probabilities, and the predicted class index.
holdout_dir = 'slide_images/holdout'
imageList = os.listdir(holdout_dir)

result_rows = []
for image in imageList:
    # The images live in slide_images/holdout, so the path handed to infer()
    # is built from the same folder that was listed above.
    infer(os.path.join(holdout_dir, image))
    # infer() publishes its result through the globals `probs` and
    # `predicted_class_id`.
    class_probs = [float(probs[0][k].item()) for k in range(5)]  # prob 0, I, II, III, IV
    slide_name = str(os.path.splitext(image)[0])  # file name without '.jpg'
    result_rows.append([slide_name, *class_probs, int(predicted_class_id)])

# Column labels are the strings '0'..'6' because the submission cell renames
# '0' and selects '1'..'5' by string label. Building the frame once also
# avoids the repeated imageList.index() lookups of a cell-by-cell fill.
results_df = pd.DataFrame(result_rows, columns=['0', '1', '2', '3', '4', '5', '6'])

In [None]:
# Final submission CSV (Averaging Probabilities)
# Slides are mapped to their biopsy through the holdout manifest; the class
# probabilities of all slides of a biopsy are averaged, and the class with the
# highest mean probability becomes the prediction.
prob_cols = ['1', '2', '3', '4', '5']

# Normalise the column labels to strings first, so the rename and the
# selections below work whether the prediction frame was built with integer
# labels (0..6) or string labels ('0'..'6').
df = results_df.rename(columns=str).rename(columns={'0': 'slide_id'})
df2 = pd.read_csv('datasets/brca-psj-path/contest-phase-2/slide-manifest-holdout.csv')
final_df = df.merge(df2, on='slide_id', how='left')

# A frame filled cell by cell may carry object dtype; force real floats.
final_df[prob_cols] = final_df[prob_cols].astype(float)

# sort=False keeps the biopsies in order of first appearance. Slides without
# a matching biopsy_id in the manifest are left out by groupby.
mean_probs = final_df.groupby('biopsy_id', sort=False)[prob_cols].mean()
biopsies = mean_probs.index.to_list()

submit_df = pd.DataFrame({'0': [str(biopsy) for biopsy in biopsies]})
for col in prob_cols:
    submit_df[col] = mean_probs[col].to_numpy()
# Index (0-4) of the largest mean probability; ties resolve to the first one.
submit_df['6'] = mean_probs.to_numpy().argmax(axis=1)

# Make sure the output folder exists before writing.
os.makedirs('./project', exist_ok=True)
submit_df.to_csv('./project/submit_df.csv', index=False, header=False)

import ngsci

# Submit the file that was actually written above: the CSV is saved as
# './project/submit_df.csv', so the path must include the '.csv' extension.
ngsci.submit_contest_entry("./project/submit_df.csv", description="dino-vitb16 trained on level 4 slide images")