### Dependencies

In [1]:
# Base Dependencies
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

# LinAlg / Stats / Plotting Dependencies
import numpy as np
import pandas as pd
pd.set_option("display.precision", 3)
from tqdm import tqdm

# Scikit-Learn Imports
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# Utils
from patch_evaluation_utils import kendalltau_bpq

### How To Use
1. Create the "embeddings_patch_library" directory using "patch_extraction.py".
2. Run this notebook!

### CRC-100K (Without SN)

In [4]:
# CRC-100K (without SN): fit a default KNeighborsClassifier on each encoder's
# `crc100knonorm_train_*` embeddings and report one-vs-rest AUCs on `crc100k_val_*`.
# NOTE(review): the validation file has no "nonorm" in its name (it is the same
# file the "With SN" cell reads) - confirm this pairing is intended.
crc100k_nonorm_aucs_all = {}
models = ['beit',
          'beit_90',
          'beit_imagenet',
          'resnet50_trunc',
          'resnet50_tcga_brca_simclr',
          'vits_tcga_brca_dino',
         ]
model_names = ['beit',
               'beit_90',
               'beit_imagenet',
               'resnet50_trunc',
               'SimCLR (BRCA)',
               'DINO (BRCA)',
              ]

for enc in models:
    # Skip encoders that already have a result (e.g. a name listed twice in
    # `models`) before paying for the pickle loads and the KNN fit.
    if enc in crc100k_nonorm_aucs_all:
        continue

    # NOTE(security): pickle.load can execute arbitrary code; only open
    # embedding files that you generated yourself.
    train_fname = os.path.join('./embeddings_patch_library/', 'crc100knonorm_train_%s.pkl' % enc)
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = os.path.join('./embeddings_patch_library/', 'crc100k_val_%s.pkl' % enc)
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    # Relabel MUS as STR so both are scored as a single class, then map the
    # string labels to integer ids (encoder fitted on the training labels).
    train_labels[train_labels=='MUS'] = 'STR'
    val_labels[val_labels=='MUS'] = 'STR'
    le = LabelEncoder().fit(train_labels)
    train_labels = le.transform(train_labels)
    val_labels = le.transform(val_labels)

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)
    y_pred = clf.predict(val_embeddings)

    # Column i of predict_proba belongs to clf.classes_[i], so iterate over the
    # classifier's own class order instead of the labels found in the val split.
    aucs = []
    for i, label in enumerate(clf.classes_):
        label_class = np.array(val_labels == label, int)
        aucs.append(sklearn.metrics.roc_auc_score(label_class, y_score[:, i]))
    # The last entry is the macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    crc100k_nonorm_aucs_all[enc] = aucs
    print('ACC',np.mean(val_labels == y_pred))

# One row per encoder (in `models` order), one column per class plus the macro AUC.
aucs_df = pd.DataFrame(crc100k_nonorm_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = ['ADI', 'BACK', 'DEB', 'LYM', 'MUC', 'NORM', 'STR', 'TUM', 'All']
crc100kr = aucs_df.copy()
crc100kr

ACC 0.7080779944289693
ACC 0.6844011142061281
ACC 0.873816155988858
ACC 0.7615598885793872
ACC 0.8002785515320334
ACC 0.8194986072423398


Unnamed: 0,ADI,BACK,DEB,LYM,MUC,NORM,STR,TUM,All
beit,0.963,0.648,0.93,0.905,0.842,0.942,0.943,0.96,0.892
beit_90,0.96,0.537,0.916,0.927,0.792,0.949,0.939,0.963,0.873
beit_imagenet,0.991,0.908,0.985,0.985,0.967,0.944,0.987,0.974,0.968
resnet50_trunc,0.988,0.909,0.9,0.87,0.886,0.988,0.963,0.978,0.935
SimCLR (BRCA),0.981,0.765,0.955,0.951,0.926,0.976,0.979,0.973,0.938
DINO (BRCA),0.991,0.729,0.961,0.95,0.978,0.957,0.99,0.973,0.941


In [3]:
# Display the encoder display names currently bound to `model_names`.
model_names

['beit',
 'beit_90',
 'beit_imagenet',
 'resnet50_trunc',
 'SimCLR (BRCA)',
 'DINO (BRCA)']

In [None]:
# Reload the `crc100knonorm_train_*` embeddings for whichever encoder `enc`
# currently names; `enc` must already be defined by an earlier cell.
train_fname = os.path.join('./embeddings_patch_library/', 'crc100knonorm_train_%s.pkl' % enc)
with open(train_fname, 'rb') as pkl_file:
    asset_dict = pickle.load(pkl_file)
train_embeddings = asset_dict['embeddings']
train_labels = asset_dict['labels']

### CRC-100K (With SN)

In [51]:
# CRC-100K (with SN): fit a default KNeighborsClassifier on each encoder's
# `crc100k_train_*` embeddings and report one-vs-rest AUCs on `crc100k_val_*`.
crc100k_aucs_all = {}
models = ['beit',
          'beit_90',
          'beit_imagenet',
          'resnet50_trunc',
          'resnet50_tcga_brca_simclr',
          'vits_tcga_brca_dino',
         ]
model_names = ['beit',
               'beit_90',
               'beit_imagenet',
               'resnet50_trunc',
               'SimCLR (BRCA)',
               'DINO (BRCA)',
              ]

for enc in models:
    # Skip encoders that already have a result (e.g. a name listed twice in
    # `models`) before paying for the pickle loads and the KNN fit.
    if enc in crc100k_aucs_all:
        continue

    # NOTE(security): pickle.load can execute arbitrary code; only open
    # embedding files that you generated yourself.
    train_fname = os.path.join('./embeddings_patch_library/', 'crc100k_train_%s.pkl' % enc)
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = os.path.join('./embeddings_patch_library/', 'crc100k_val_%s.pkl' % enc)
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    # Relabel MUS as STR so both are scored as a single class, then map the
    # string labels to integer ids (encoder fitted on the training labels).
    train_labels[train_labels=='MUS'] = 'STR'
    val_labels[val_labels=='MUS'] = 'STR'
    le = LabelEncoder().fit(train_labels)
    train_labels = le.transform(train_labels)
    val_labels = le.transform(val_labels)

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)
    y_pred = clf.predict(val_embeddings)

    # Column i of predict_proba belongs to clf.classes_[i], so iterate over the
    # classifier's own class order instead of the labels found in the val split.
    aucs = []
    for i, label in enumerate(clf.classes_):
        label_class = np.array(val_labels == label, int)
        aucs.append(sklearn.metrics.roc_auc_score(label_class, y_score[:, i]))
    # The last entry is the macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    crc100k_aucs_all[enc] = aucs
    print('ACC',np.mean(val_labels == y_pred))

# One row per encoder (in `models` order), one column per class plus the macro AUC.
aucs_df = pd.DataFrame(crc100k_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = ['ADI', 'BACK', 'DEB', 'LYM', 'MUC', 'NORM', 'STR', 'TUM', 'All']
crc100kn = aucs_df.copy()
crc100kn

ACC 0.8881615598885794
ACC 0.9338440111420613
ACC 0.8771587743732591
ACC 0.9071030640668524
ACC 0.8991643454038997
ACC 0.9363509749303621


Unnamed: 0,ADI,BACK,DEB,LYM,MUC,NORM,STR,TUM,All
beit,0.986,1.0,0.991,0.979,0.923,0.972,0.959,0.974,0.973
beit_90,0.996,1.0,0.999,0.993,0.983,0.957,0.991,0.977,0.987
beit_imagenet,0.989,1.0,0.997,0.993,0.899,0.967,0.96,0.973,0.972
resnet50_trunc,0.983,1.0,0.997,0.974,0.963,0.988,0.982,0.978,0.983
SimCLR (BRCA),0.988,1.0,0.994,0.98,0.969,0.973,0.979,0.969,0.981
DINO (BRCA),0.999,1.0,0.999,0.985,0.992,0.96,0.992,0.967,0.987


In [44]:
import torch

# Load two checkpoint files (both named beitv2_*) so their contents can be inspected.
# map_location='cpu' keeps the tensors on the CPU, so looking at the keys does not
# require (or occupy) whatever device the checkpoints were saved from.
# NOTE(review): these are absolute, machine-specific paths; the cell only runs on
# the host where they exist.
weight1 = torch.load('/dssg/home/acct-medftn/medftn/BEPT/Model/mmselfsup/TCGA_Checkpoints/beitv2_backbone_imagenet1k.pth', map_location='cpu')
weight2 = torch.load('/dssg/home/acct-medftn/medftn/BEPT/Model/mmselfsup/TCGA_Checkpoints/beitv2_vit-base_imagenet.pth', map_location='cpu')

In [49]:
# List the entry names stored under the 'state_dict' key of the second checkpoint.
weight2['state_dict'].keys()

dict_keys(['backbone.cls_token', 'backbone.mask_token', 'backbone.patch_embed.projection.weight', 'backbone.patch_embed.projection.bias', 'backbone.rel_pos_bias.relative_position_bias_table', 'backbone.rel_pos_bias.relative_position_index', 'backbone.layers.0.gamma_1', 'backbone.layers.0.gamma_2', 'backbone.layers.0.ln1.weight', 'backbone.layers.0.ln1.bias', 'backbone.layers.0.attn.q_bias', 'backbone.layers.0.attn.v_bias', 'backbone.layers.0.attn.qkv.weight', 'backbone.layers.0.attn.proj.weight', 'backbone.layers.0.attn.proj.bias', 'backbone.layers.0.ln2.weight', 'backbone.layers.0.ln2.bias', 'backbone.layers.0.ffn.layers.0.0.weight', 'backbone.layers.0.ffn.layers.0.0.bias', 'backbone.layers.0.ffn.layers.1.weight', 'backbone.layers.0.ffn.layers.1.bias', 'backbone.layers.1.gamma_1', 'backbone.layers.1.gamma_2', 'backbone.layers.1.ln1.weight', 'backbone.layers.1.ln1.bias', 'backbone.layers.1.attn.q_bias', 'backbone.layers.1.attn.v_bias', 'backbone.layers.1.attn.qkv.weight', 'backbone.lay

### 线性分类协议

In [41]:
# Inputs for the linear classifier: the `crc100k_train_*` / `crc100k_val_*`
# embeddings of the 'beit' encoder. Unlike the KNN cells, MUS is not relabelled
# as STR here, so every label present in the training file keeps its own class.
enc = 'beit'

train_fname = os.path.join('./embeddings_patch_library/', 'crc100k_train_%s.pkl' % enc)
with open(train_fname, 'rb') as pkl_file:
    asset_dict = pickle.load(pkl_file)
train_embeddings = asset_dict['embeddings']
train_labels = asset_dict['labels']

val_fname = os.path.join('./embeddings_patch_library/', 'crc100k_val_%s.pkl' % enc)
with open(val_fname, 'rb') as pkl_file:
    asset_dict = pickle.load(pkl_file)
val_embeddings = asset_dict['embeddings']
val_labels = asset_dict['labels']

# Map the labels to integer ids using an encoder fitted on the training labels.
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
val_labels = le.transform(val_labels)


In [42]:
# Check the shape of the loaded training embeddings.
train_embeddings.shape


(100000, 768)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Linear classifier network applied to pre-computed embeddings
class LinearClassifier(nn.Module):
    """Two stacked linear layers that map an embedding to per-class scores.

    Args:
        input_size: Number of features in each input embedding.
        num_classes: Number of output classes (width of the final layer).
        hidden_size: Width of the intermediate linear layer (default 1000).

    ``forward`` returns raw, unnormalised scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it already-softmaxed values would
    normalise twice. Use ``self.softmax(model(x))`` when probabilities are needed.
    """

    def __init__(self, input_size, num_classes, hidden_size=1000):
        super(LinearClassifier, self).__init__()

        # Linear layers; the output width follows `num_classes` instead of a fixed 9
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_classes)
        # Softmax over the class (last) dimension, kept for callers that want
        # probabilities; an explicit dim avoids the implicit-dimension warning.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class logits of shape (..., num_classes) for input `x`."""
        x = self.linear1(x)
        x = self.linear2(x)
        return x
# import torch
# import torch.nn as nn
# import torchvision
# # Define the linear-classification-protocol network
# class LinearProtocolNet(nn.Module):
#     def __init__(self, pretrained_model, num_classes):
#         super(LinearProtocolNet, self).__init__()
        
#         # Freeze the parameters of the pretrained model
#         for param in pretrained_model.parameters():
#             param.requires_grad = False
        
#         # Get the output feature dimension of the pretrained model
#         pretrained_features = pretrained_model.fc.in_features
        
#         # Add the linear classifier layer
#         self.classifier = nn.Linear(pretrained_features, num_classes)
        
#     def forward(self, x):
#         # Forward pass
#         x = self.classifier(x)
#         return x

# Train LinearClassifier on the pre-computed training embeddings and print the
# validation accuracy after every epoch. Expects train_embeddings / train_labels /
# val_embeddings / val_labels to have been loaded and label-encoded by an earlier cell.

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed the weight initialisation so repeated runs are comparable.
torch.manual_seed(0)

# Move the data to the device with explicit dtypes: float32 features to match the
# nn.Linear weights, int64 targets as required by nn.CrossEntropyLoss.
input_data = torch.as_tensor(train_embeddings, dtype=torch.float32).to(device)
labels = torch.as_tensor(train_labels, dtype=torch.long).to(device)
valinput_data = torch.as_tensor(val_embeddings, dtype=torch.float32).to(device)
labels_test = torch.as_tensor(val_labels, dtype=torch.long).to(device)

# Create the linear classifier; the input width is read from the embeddings
input_size = input_data.shape[1]
num_classes = 9
linear_classifier = LinearClassifier(input_size, num_classes).to(device)

# Loss function and optimizer.
# NOTE: nn.CrossEntropyLoss applies log-softmax internally and expects
# unnormalised scores from the model.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(linear_classifier.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

# Full-batch training
num_epochs = 100
for epoch in range(num_epochs):
    # eval() is called at the end of every epoch, so switch back to training mode
    linear_classifier.train()

    # Forward pass
    outputs = linear_classifier(input_data)

    # Loss and training accuracy; predicted_train keeps the (values, indices)
    # result of torch.max so it can be inspected afterwards.
    loss = criterion(outputs, labels)
    predicted_train = torch.max(outputs.detach(), 1)
    accuracy_train = (predicted_train.indices == labels).sum().item() / labels.shape[0]

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report training progress
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()},acc_train:{accuracy_train}')

    # Validation accuracy for this epoch
    linear_classifier.eval()
    with torch.no_grad():
        outputs_test = linear_classifier(valinput_data)
        _, predicted_test = torch.max(outputs_test, 1)
        accuracy_test = (predicted_test == labels_test).sum().item() / valinput_data.shape[0]
        print(accuracy_test)

In [None]:
# Inspect the torch.max result (values and indices) kept from the last training epoch.
predicted_train

### BreastPathQ

In [None]:
# BreastPathQ: fit a LinearRegression on each encoder's training embeddings and
# score its validation predictions with mean squared error and kendalltau_bpq.
models = [
    'resnet50_trunc',
    'resnet50_tcga_brca_simclr',
    'vits_tcga_brca_dino',
]
model_names = [
    'ImageNet',
    'SimCLR (BRCA)',
    'DINO (BRCA)',
]

bpq_mse_all = []
for enc in models:
    train_fname = os.path.join('./embeddings_patch_library/', 'breastpathq_train_%s.pkl' % enc)
    with open(train_fname, 'rb') as pkl_file:
        asset_dict = pickle.load(pkl_file)
    train_embeddings = asset_dict['embeddings']
    train_labels = asset_dict['labels']

    val_fname = os.path.join('./embeddings_patch_library/', 'breastpathq_val_%s.pkl' % enc)
    with open(val_fname, 'rb') as pkl_file:
        asset_dict = pickle.load(pkl_file)
    val_embeddings = asset_dict['embeddings']
    val_labels = asset_dict['labels']

    clf = LinearRegression()
    clf.fit(train_embeddings, train_labels)
    y_score = clf.predict(val_embeddings)

    # One [MSE, Tau] row per encoder, in `models` order.
    mse = sklearn.metrics.mean_squared_error(val_labels, y_score)
    tau = kendalltau_bpq(val_labels, y_score)
    bpq_mse_all.append([mse, tau])

mse_df = pd.DataFrame(bpq_mse_all)
mse_df.columns = ['MSE', 'Tau']
mse_df.index = model_names
bpq = mse_df.copy()
bpq

### BCSS

In [None]:
# BCSS: fit a default KNeighborsClassifier on each encoder's `bcss_train_*`
# embeddings and report one-vs-rest AUCs on the matching `bcss_val_*` file.
bcss_aucs_all = {}
models = ['resnet50_trunc',
          'resnet50_tcga_brca_simclr',
          'vits_tcga_brca_dino',
         ]
model_names = ['ImageNet',
               'SimCLR (BRCA)',
               'DINO (BRCA)',
              ]

for enc in models:
    # Skip encoders that already have a result (e.g. a name listed twice in
    # `models`) before paying for the pickle loads and the KNN fit.
    if enc in bcss_aucs_all:
        continue

    # NOTE(security): pickle.load can execute arbitrary code; only open
    # embedding files that you generated yourself.
    train_fname = './embeddings_patch_library/bcss_train_%s.pkl' % enc
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = './embeddings_patch_library/bcss_val_%s.pkl' % enc
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)
    y_pred = clf.predict(val_embeddings)

    # Column i of predict_proba belongs to clf.classes_[i], so iterate over the
    # classifier's own class order instead of the labels found in the val split.
    aucs = []
    for i, label in enumerate(clf.classes_):
        label_class = np.array(val_labels == label, int)
        aucs.append(sklearn.metrics.roc_auc_score(label_class, y_score[:, i]))
    # The last entry is the macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    bcss_aucs_all[enc] = aucs

# One row per encoder (in `models` order); columns are the sorted training labels
# (taken from the last encoder loaded) followed by the macro AUC.
aucs_df = pd.DataFrame(bcss_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = list(np.unique(train_labels)) + ['All']
bcss = aucs_df.copy()
bcss