In [None]:
!pip install -r requirements.txt

In [None]:
# Author: Bonaventure F. P. Dossou - bonaventure.dossou@mila.quebec (bonaventuredossou.github.io)
# Data transformation, model configuration and training (more details in Solution.md)
# Check License under LICENSE.md

from __future__ import print_function 
from __future__ import division
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm import tqdm
import torchvision
from torchvision import datasets, transforms, models
from torchvision.models import resnet18, resnet50, resnet152, efficientnet_v2_m, convnext_base, wide_resnet101_2, vgg19_bn, regnet_x_32gf, swin_b, maxvit_t
import matplotlib.pyplot as plt
import time
import os
import copy
# Point TORCH_HOME at the project directory (presumably so torch/torchvision
# keep their cache of downloaded weights there — confirm).
os.environ['TORCH_HOME'] = os.path.join('/','home','ngsci','project')

In [None]:
# --- Paths -----------------------------------------------------------------
# Root folder that holds the down-sampled slide images.
data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'breast_cancer')
print(data_dir)

# --- Hyper-parameters (also embedded in checkpoint / prediction filenames) --
num_classes = 5
batch_size = 32
num_epochs = 50
feature_extract = False

# --- Hardware --------------------------------------------------------------
# NOTE: despite its name, `num_gpus` is the list of visible GPU indices.
num_gpus = list(range(torch.cuda.device_count()))
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

if len(num_gpus) > 1:
    print("Let's use", len(num_gpus), "GPUs!")
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, num_gpus))

# --- Ensemble members, in the order they are evaluated ---------------------
best_models = [
    "efficientnet",
    "maxvit",
    "swin",
    "wide_resnet101",
    "vgg",
    "convnext",
    "resnet50",
    "regnet",
]

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import ngsci
import glob

# Locate the contest hold-out data under the user's home directory and load
# the slide manifest from it.
home = os.environ.get("HOME")
contest_dir = os.path.join(home, "datasets", "brca-psj-path", "contest-phase-2")
manifest_path = os.path.join(contest_dir, "slide-manifest-holdout.csv")
slide_manifest = pd.read_csv(manifest_path)

In [None]:
import openslide
def downsample_slide(ndpi_filepath, output_dir, slide_id, size=(224, 224)):
    """Write a small PNG thumbnail of a whole-slide image.

    Args:
        ndpi_filepath: Path of the slide file opened with ``openslide.OpenSlide``.
        output_dir: Directory receiving the thumbnail; created if it is missing.
        slide_id: Identifier used as the output file name (``<slide_id>.png``).
        size: ``(width, height)`` passed to ``get_thumbnail``. Defaults to the
            previously hard-coded ``(224, 224)``.
    """
    # Saving into a missing directory would fail, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    with openslide.OpenSlide(ndpi_filepath) as openslide_obj:
        # NOTE(review): get_thumbnail presumably treats `size` as a maximum and
        # keeps the aspect ratio, so the result may not be exactly `size` — confirm.
        image = openslide_obj.get_thumbnail(size)
        image.save(os.path.join(output_dir, f"{slide_id}.png"))

def create_images_to_dir(dataset_split, data_paths):
    """Down-sample every slide in `data_paths` into the folder of one dataset split.

    Args:
        dataset_split: Sub-folder name under the ``breast_cancer`` project directory.
        data_paths: Indexable collection of ``(slide_id, slide_path)`` pairs.
    """
    split_dir = os.path.join('/', 'home', 'ngsci', 'project', 'breast_cancer', dataset_split)
    progress = tqdm(range(len(data_paths)), desc ="Data Creation Progress")
    for position in progress:
        slide_name, ndpi_path = data_paths[position]
        downsample_slide(ndpi_path, split_dir, slide_name)

# Collect every file in the evaluation folder; these are the inputs to inference.
test_data_dir = os.path.join('/', 'home', 'ngsci', 'project', 'breast_cancer', 'test')
test_slides_fp = os.path.join(test_data_dir, '*')
test_slides_list = list(glob.iglob(test_slides_fp))
print('Eval Images: {}'.format(len(test_slides_list)))

In [None]:
from PIL import Image
from tqdm import tqdm
import csv
import pandas as pd
import numpy as np
from ast import literal_eval

def run_inference_image(path, model):
    """Classify a single image file with `model`.

    Args:
        path: Path of the image; the file name up to its first ``.`` is used
            as the slide id.
        model: Network already placed on the global `device`.

    Returns:
        ``(slide_id, values)`` where `values` is a 1-D numpy array holding the
        softmax probabilities followed by the index of the most likely class.
    """
    model.eval()
    slide_id = os.path.basename(path).split('.')[0]

    transform_data = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.CenterCrop((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

    # Close the file handle deterministically, and force three channels so the
    # 3-channel normalisation above also works for RGBA / grayscale files.
    with Image.open(path) as image:
        img_t = transform_data(image.convert('RGB'))

    # Add the batch dimension expected by the model.
    img_t = img_t.float().unsqueeze(0)
    with torch.no_grad():
        output = model(img_t.to(device))

    prediction = output.squeeze(0).softmax(0)
    class_id = prediction.argmax().item()
    # Probabilities and the predicted class share one array so that they can be
    # written out together downstream.
    all_proba_class_id = prediction.cpu().numpy().tolist() + [class_id]
    return slide_id, np.array(all_proba_class_id)

def run_inference(paths, model):
    """Run `model` on every image path and collect the results by slide id.

    Args:
        paths: Indexable collection of image file paths.
        model: Network handed unchanged to `run_inference_image`.

    Returns:
        Dict mapping each slide id to the array returned by
        `run_inference_image`; a later duplicate id overwrites an earlier one.
    """
    pred_dict = {}
    for position in tqdm(range(len(paths)), desc ="Evaluation Progress"):
        biopsy, preds = run_inference_image(paths[position], model)
        pred_dict[biopsy] = preds
    return pred_dict

def build_model(model_name, n_classes=5):
    """Rebuild a fine-tuned torchvision architecture and load its saved weights.

    The checkpoint is read from
    ``best_final_weights/breast_cancer_<model_name>_<batch_size>_<num_epochs>_<lr>.pt``
    where `batch_size` and `num_epochs` are the notebook-level settings and
    `lr` is the per-model learning rate listed below.

    Args:
        model_name: Key identifying the architecture (see the table below).
        n_classes: Output size of the replaced classification layer.
            Defaults to 5, the value previously hard-coded.

    Returns:
        The model with its final linear layer replaced and the checkpoint's
        state dict loaded. It is not moved to any device here.

    Raises:
        ValueError: If `model_name` is not a known architecture (the previous
            implementation silently returned ``None``).
    """
    # model_name -> (constructor name in torchvision.models,
    #                learning rate embedded in the checkpoint filename,
    #                attribute holding the classifier,
    #                index of the final layer inside that attribute, or None
    #                when the attribute itself is the final layer)
    specs = {
        "resnet18": ("resnet18", 1e-5, "fc", None),
        "resnet50": ("resnet50", 1e-4, "fc", None),
        "resnet152": ("resnet152", 1e-5, "fc", None),
        "wide_resnet101": ("wide_resnet101_2", 1e-4, "fc", None),
        "vgg": ("vgg19_bn", 1e-4, "classifier", 6),
        "efficientnet": ("efficientnet_v2_m", 4e-4, "classifier", 1),
        "efficientnet_l": ("efficientnet_v2_l", 4e-4, "classifier", 1),
        "convnext": ("convnext_base", 1e-5, "classifier", 2),
        "regnet": ("regnet_x_32gf", 1e-5, "fc", None),
        "swin": ("swin_b", 1e-5, "head", None),
        "swin_v2_b": ("swin_v2_b", 1e-5, "head", None),
        "maxvit": ("maxvit_t", 1e-4, "classifier", 5),
    }

    if model_name not in specs:
        raise ValueError(
            "Unknown model name {!r}; expected one of {}".format(model_name, sorted(specs)))

    constructor_name, lr, head_attr, head_index = specs[model_name]

    # Resolve the constructor through the already-imported `models` package so
    # that architectures missing from the explicit import list
    # (efficientnet_v2_l, swin_v2_b) no longer raise NameError.
    model_ft = getattr(models, constructor_name)(weights=None)

    # map_location="cpu" lets GPU-saved checkpoints load on CPU-only hosts; the
    # model is moved to the target device by the caller.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    checkpoints = torch.load(
        'best_final_weights/breast_cancer_{}_{}_{}_{}.pt'.format(model_name, batch_size, num_epochs, lr),
        map_location="cpu")

    # Swap the final linear layer for one with `n_classes` outputs so the
    # parameter shapes match the fine-tuned checkpoint.
    head_container = getattr(model_ft, head_attr)
    if head_index is None:
        setattr(model_ft, head_attr, nn.Linear(head_container.in_features, n_classes))
    else:
        num_ftrs = head_container[head_index].in_features
        head_container[head_index] = nn.Linear(num_ftrs, n_classes)

    model_ft.load_state_dict(checkpoints)
    return model_ft

def to_device(model):
    """Move `model` to the globally selected `device` and return it.

    The previous implementation wrapped the model in ``torch.nn.DataParallel``
    when several GPUs were present and then immediately unwrapped it again via
    ``.module``, so the wrapper never took part in the forward pass. That dead
    wrap/unwrap step is removed; the returned object is the same model on the
    same device as before.

    Args:
        model: The network to move.

    Returns:
        The same model instance, placed on `device`.
    """
    return model.to(device)

def save_predictions(pred_dict, name_model):
    """Write the slide-level predictions of one model to a CSV file.

    The file is ``predictions/predictions_<name_model>_<batch_size>_<num_epochs>.csv``
    with columns ``slide_id``, ``prob_stage_0`` … ``prob_stage_4`` and
    ``stage_pred``.

    Args:
        pred_dict: Mapping from slide id to a length-6 array holding the five
            class probabilities followed by the predicted class index.
        name_model: Model key used in the output file name.
    """
    value_columns = ['prob_stage_{}'.format(stage) for stage in range(5)] + ['stage_pred']

    # reshape(-1, 6) keeps the usual (n, 6) layout and turns an empty mapping
    # into a (0, 6) array instead of failing on column indexing.
    preds = np.array(list(pred_dict.values())).reshape(-1, len(value_columns))

    frame = pd.DataFrame(preds, columns=value_columns)
    frame.insert(0, 'slide_id', list(pred_dict.keys()))

    out_path = 'predictions/predictions_{}_{}_{}.csv'.format(name_model, batch_size, num_epochs)
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    frame.to_csv(out_path, index=False)

# Run every ensemble member over the evaluation images and write one
# slide-level prediction CSV per model.
for model_ in best_models:
    print('Predicting for {}'.format(model_))
    # The model is built and moved to the device inline, so no extra name keeps
    # a reference to it once inference for this architecture has finished.
    predictions_dict = run_inference(test_slides_list, to_device(build_model(model_)))
    save_predictions(predictions_dict, model_)

In [None]:
# Collapse slide-level predictions to one row per biopsy for every model by
# averaging all remaining columns over the slides of a biopsy.
for model_ in best_models:
    file_ = 'predictions/predictions_{}_{}_{}.csv'.format(model_, batch_size, num_epochs)
    predictions_model = pd.read_csv(file_)

    # Attach the manifest information, then drop the identifiers that must not
    # take part in the per-biopsy average.
    with_manifest = predictions_model.merge(slide_manifest)
    numeric_only = with_manifest.drop(columns=['slide_id','slide_path','patient_ngsci_id'])

    per_biopsy = numeric_only.groupby("biopsy_id").mean()
    biopsy_stage_prediction = per_biopsy.reset_index()

    target = 'predictions/final_predictions_{}_{}_{}.csv'.format(model_, batch_size, num_epochs)
    biopsy_stage_prediction.to_csv(target, index=False)

In [None]:
from scipy.stats import gmean, tmean
import numpy as np

In [None]:
# Use the efficientnet biopsy-level file as the reference for the list (and
# order) of biopsy ids in the ensemble output.
reference_csv = 'predictions/final_predictions_{}_{}_{}.csv'.format("efficientnet", batch_size, num_epochs)
data = pd.read_csv(reference_csv)
biopsy_ids = data['biopsy_id'].tolist()
final_frame = pd.DataFrame({'biopsy_id': biopsy_ids})

def get_model_column(model_name, column):
    """Return one column of a model's biopsy-level prediction file as a list.

    Args:
        model_name: Model key used in the ``final_predictions_*`` file name.
        column: Name of the column to extract.

    Returns:
        The column's values as a plain Python list, in file order.
    """
    csv_path = 'predictions/final_predictions_{}_{}_{}.csv'.format(model_name, batch_size, num_epochs)
    model_frame = pd.read_csv(csv_path)
    selected = model_frame[column]
    return selected.tolist()

columns = ['prob_stage_0', 'prob_stage_1', 'prob_stage_2', 'prob_stage_3', 'prob_stage_4']

# For each class, combine the models' per-biopsy probabilities as
# (geometric mean) * (arithmetic mean).
for column in columns:
    # One list of per-biopsy values per model, in `best_models` order.
    # NOTE(review): this pairs rows by position, so every model's file is
    # assumed to list the biopsies in the same order — confirm.
    columns_geo_mean = [get_model_column(name_model, column) for name_model in best_models]

    # zip(*...) yields, per biopsy, the tuple of all models' values; this works
    # for any number of ensemble members instead of exactly eight.
    geo_mean = [gmean(per_model) * tmean(per_model) for per_model in zip(*columns_geo_mean)]

    final_frame[column] = geo_mean

# normalize to get sum of proba -> 1
# Sum only the probability columns: `biopsy_id` must not take part in the total.
row_totals = final_frame[columns].sum(axis=1)
final_frame = final_frame[columns].div(row_totals, axis=0)

def column_to_index(x):
    """Return the position of column name `x` within the global `columns` list."""
    position = columns.index(x)
    return position

# Re-attach the ids (the normalisation step kept only the probability columns)
# and put them first.
final_frame['biopsy_id'] = biopsy_ids
cols = ['biopsy_id'] + columns
final_frame = final_frame.loc[:, cols]

# Predicted stage = index of the largest normalised probability in each row.
# Computed in one vectorised call on the numeric columns instead of a
# row-by-row loop that rebinds `_`.
stage_pred = final_frame[columns].to_numpy().argmax(axis=1).tolist()

final_frame['stage_pred'] = stage_pred
# The submission file is written without a header row and without the index.
final_frame.to_csv('predictions/preds_all_geo_times_arith_mean.csv', index=False, header=False)
final_frame.head()

In [None]:
import ngsci
# Submit the ensemble predictions written by the previous cell.
submission_description = "preds_all_geo_times_arith_mean"
submission_file = 'predictions/preds_all_geo_times_arith_mean.csv'
ngsci.submit_contest_entry(submission_file, description=submission_description)