# Packages

In [1]:
# Internal Packages
from core.parameters import *
from core.net_list import NET_LIST
from core.scheduler_list import SCHEDULER_LIST
from core.optimizer_list import OPTIMIZER_LIST
from core.loss_list import LOSS_LIST
from nets.ResNet50Attention import ResNet50Attention
from nets.ResNet101Attention import ResNet101Attention
from common.myfunctions import plot_confusion_matrix
from common.customloss import QuadraticKappa, WeightedMultiLabelLogLoss, WeightedMultiLabelFocalLogLoss
import common.weights_initialization as w_init
import preprocess.preprocess as prep

# Base Packages
import os
import glob
import copy
import time
import pandas as pd
import numpy as np
from PIL import Image
#import pydicom

# Torch Packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Torchvision Packages
import torchvision.transforms.functional as TF
from torchvision import transforms, utils, datasets
from torchvision.models import densenet121, vgg16, resnet50, resnet101, inception_v3

# Miscellaneous Packages
from efficientnet_pytorch import EfficientNet
from skimage import io, transform
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.utils import class_weight
from tensorboardX import SummaryWriter
import matplotlib.pyplot as plt
%matplotlib inline

  'Matplotlib is building the font cache using fc-list. '


# Summary

In [2]:
comb = len(INPUT_SIZES) * len(SAMPLE_FRACS) * len(BATCH_SIZES) * len(MODELS) * len(OPTIMIZERS) * len(SCHEDULERS) * len(LOSSES)
print('Total Combinations:', comb)
print()
i=1

for inp in INPUT_SIZES:
    for frac in SAMPLE_FRACS:
        for bch in BATCH_SIZES:
            for m in MODELS:
                for o in OPTIMIZERS:
                    for s in SCHEDULERS:
                        for l in LOSSES:
                            model_name = f'{i}\n Input Size: {str(inp)}\n Dataset Frac.: {str(frac)}\n Batch Size: {str(bch)}\n Model: {m}\n Scheduler: {s}\n Optimizer: {o}\n Loss: {l}\n'
                            print(model_name)
                            i += 1

Total Combinations: 2

1
 Input Size: 512
 Dataset Frac.: 0.5
 Batch Size: 64
 Model: ResNet101AttentionPre
 Scheduler: None
 Optimizer: Adam0001
 Loss: SmoothL1Loss

2
 Input Size: 512
 Dataset Frac.: 0.5
 Batch Size: 64
 Model: ResNet50AttentionPre
 Scheduler: None
 Optimizer: Adam0001
 Loss: SmoothL1Loss



# Cuda

In [3]:
if torch.cuda.is_available(): #GPU
    is_cuda = True
    
    if CUDA_DEVICES[0] == -1: # All GPUs
        CUDA_DEVICES = list(range(0, torch.cuda.device_count()))
    
    cuda_list = ','.join([str(c) for c in CUDA_DEVICES])
    
    device = torch.device("cuda:{}".format(cuda_list))
    
    print("Total GPU is", torch.cuda.device_count())
    
else: #CPU
    is_cuda = False
    device = "cpu"

# Set seed for CUDA (all GPU)    
#torch.cuda.manual_seed_all(SEED)    
    
print('Cuda:', is_cuda, ', Device:', device)

Total GPU is 1
Cuda: True , Device: cuda:0


# Custom Dataset

In [4]:
class CustomDataset(Dataset):

    def __init__(self, data_dir, test_split, sample_frac, input_size, transform=None, phase='train', clear_cache=False):

        self.input_size = input_size
        self.transform = transform
        self.x = []
        self.y = []

        ids = []
        labels = []
        
        # Load IDs and Labels from directories
        for d in os.listdir(data_dir):
            
            img_list = os.listdir(os.path.join(data_dir, d))
            ids.extend(img_list)
            labels.extend([d] * len(img_list))
            
        x_train, x_test, y_train, y_test = train_test_split(ids, labels, test_size = test_split, random_state = SEED)
        
        # Sample Train Dataset
        if sample_frac < 1.0:
            
            df = pd.DataFrame({'x': x_train, 'y': y_train})
            
            df_sample = df.sample(frac = sample_frac, random_state=SEED)
            
            x_train = df_sample['x'].tolist()
            y_train = df_sample['y'].tolist()

        # Check Object Phase
        if phase == 'train':
            self.x = x_train
            self.y = y_train
        elif phase == 'test':
            self.x = x_test
            self.y = y_test
        
        # Check for Preprocess Images
        prep.Preprocess(data_dir, self.x, self.y, input_size, clear_cache)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        
        img_name = os.path.join(DST_DATA_DIR, str(self.input_size), str(self.y[idx]), self.x[idx].split('.')[0] + '.npy')
        
        image = np.load(img_name)
        
        label = int(self.y[idx])
        
        if self.transform:
       
           image = self.transform(TF.to_pil_image(image))

        return (image,label)

# Data Loader

In [5]:
def getDataLoaders(input_size, sample_frac, batch_size):
    
    train_transf = transforms.Compose(TRAIN_AUGMENTATION)
    test_transf = transforms.Compose(TEST_AUGMENTATION)

    train_dataset = CustomDataset(DATA_DIR, 
                                  TEST_SPLIT, 
                                  sample_frac, 
                                  input_size, 
                                  transform=train_transf, 
                                  phase='train', 
                                  clear_cache=CLEAR_ALL_DATA_BEFORE_PREPROCESS)


    test_dataset = CustomDataset(DATA_DIR, 
                                  TEST_SPLIT, 
                                  sample_frac, 
                                  input_size, 
                                  transform=test_transf, 
                                  phase='test', 
                                  clear_cache=CLEAR_ALL_DATA_BEFORE_PREPROCESS)

    # Garregando os dados
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Make a dict to pass though train function
    dataloaders_dict = {'train': train_loader, 'val': test_loader}
    
    return dataloaders_dict

# Calc Classes Weight

In [6]:
if NUM_CLASSES > 1:

    distrib_freq = train_dataset.y.sum().to_numpy()

    w_classes = distrib_freq.sum() / (NUM_CLASSES * distrib_freq)

    for l in LOSSES:
        if 'weight' in LOSS_LIST[l]:
            LOSS_LIST[l]['weight'] = torch.from_numpy(w_classes).to(device)


# Model

In [7]:
def getModel(model_name, num_classes):
    
    model_parameters = NET_LIST[model_name]
    base_model = model_parameters['base_model']
    pretrained = model_parameters['pretrained']
    
    if base_model=='densenet121':
        
        model = densenet121(pretrained = pretrained)
        model.classifier = nn.Linear(1024, num_classes)   
            
    elif base_model=='densenet121multitask':
        
        model = densenet121multitask(pretrained = pretrained)
        model.classifier = nn.Linear(1024, num_classes)   
        model.aux_classifier = nn.Linear(1024, 1)   
            
    elif base_model=='vgg16':
        
        model = vgg16(pretrained = pretrained)
        model.classifier[6] = nn.Linear(4096, num_classes) 
    
    elif base_model=='resnet50':
        
        model = resnet50(pretrained = pretrained)
        model.fc = nn.Linear(2048, num_classes) 

    elif base_model=='resnet101':
        
        model = resnet101(pretrained = pretrained)
        model.fc = nn.Linear(2048, num_classes) 
        
    elif base_model=='ResNet50Attention':
        model = ResNet50Attention(num_classes, 
                                  attention=True, 
                                  pretrained = pretrained)

    elif base_model=='ResNet101Attention':
        model = ResNet101Attention(num_classes, 
                                  attention=True, 
                                  pretrained = pretrained)
       
    elif base_model=='ResNet50AttentionMultiTask':
        model = ResNet50AttentionMultiTask(num_classes, 
                                  attention=True, 
                                  pretrained = pretrained)
        
    elif base_model=='inception_v3':
        
        model = inception_v3(pretrained = pretrained)
        model.fc = nn.Linear(2048, num_classes) 
        model.AuxLogits.fc = nn.Linear(768, num_classes)
        
    elif base_model=='efficientnetb7':
        
        model = EfficientNet.from_pretrained('efficientnet-b7')
        model._fc = nn.Linear(2560, NUM_CLASSES) 
        
    # Parallel    
    # Obs.: when load model, the DataParallel is already in the model.
    if is_cuda & (torch.cuda.device_count() > 1) & (not model_parameters['is_inception']):
        
        if not CUDA_DEVICES:
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            model = nn.DataParallel(model) 
        else:
            print("Let's use", CUDA_DEVICES, "GPUs!")
            model = nn.DataParallel(model, device_ids = CUDA_DEVICES) # When load checkpoint, the DataParallel is already in the model.
    
    # Frozen Layers
    for name, param in model.named_parameters():
        for l in model_parameters['layers_to_frozen']:
            if l in name:
                param.requires_grad = False

    if LOAD_CHECKPOINT:

        # Get lastest model file
        list_of_files = glob.glob(MODEL_DIR + f'/{base_model}_*.pt') # * means all if need specific format then *.csv
        
        if len(list_of_files) > 0:
            
            latest_file = max(list_of_files, key=os.path.getctime)

            print(f'Loading state dict from checkpoint \n\t {latest_file}')

            model.load_state_dict(torch.load(latest_file, map_location=device))
    else:
        
        if not pretrained:
            model.apply(w_init.weight_init) #Custom weight initialization
                
    if is_cuda:
        model = model.to(device)
        
    return model

# Scheduler

In [8]:
def getScheduler(scheduler_name, optimizer):
    
    if not scheduler_name:
        return None

    scheduler_parameters = SCHEDULER_LIST[scheduler_name]

    if scheduler_parameters['function'] == 'ReduceLROnPlateau':

        scheduler = ReduceLROnPlateau(optimizer, 
                                      mode = scheduler_parameters['mode'], 
                                      factor = scheduler_parameters['factor'], 
                                      patience = scheduler_parameters['patience'], 
                                      verbose = scheduler_parameters['verbose'], 
                                      threshold = scheduler_parameters['threshold'], 
                                      threshold_mode = scheduler_parameters['threshold_mode'], 
                                      cooldown = scheduler_parameters['cooldown'], 
                                      min_lr = scheduler_parameters['min_lr'], 
                                      eps = scheduler_parameters['eps'])

    return scheduler

# Optimizer

In [9]:
def getOptimizer(optimizer_name, model):

    params_to_update = []
    
    for name, param in model.named_parameters():
    
        if param.requires_grad == True:
        
            params_to_update.append(param)
            
            #print("\t",name)
            
    opt_parameters = OPTIMIZER_LIST[optimizer_name]

    if opt_parameters['function'] == 'Adam':
        
        optimizer = torch.optim.Adam(params_to_update, 
                                     lr = opt_parameters['lr'],
                                     betas = opt_parameters['betas'],
                                     eps = opt_parameters['eps'],
                                     weight_decay = opt_parameters['weight_decay'],
                                     amsgrad = opt_parameters['amsgrad']
                                    )
    elif opt_parameters['function'] == 'SGD':
        
        optimizer = torch.optim.SGD(params_to_update, 
                                     lr = opt_parameters['lr'],
                                     weight_decay = opt_parameters['weight_decay'],
                                     momentum = opt_parameters['momentum']
                                    )

    return optimizer

# Loss Function

In [10]:
def getLossFunction(loss_nme):
    
    loss_parameters = LOSS_LIST[loss_nme]

    if loss_parameters['function'] == 'SmoothL1Loss':
        criterion = nn.SmoothL1Loss(
            reduction = loss_parameters['reduction']
        )

    elif loss_parameters['function'] == 'CrossEntropyLoss':
        criterion = nn.CrossEntropyLoss(
            weight = loss_parameters['weight'],
            size_average = loss_parameters['size_average'],
            ignore_index = loss_parameters['ignore_index'],
            reduce = loss_parameters['reduce'],
            reduction = loss_parameters['reduction']
        )

    elif loss_parameters['function'] == 'NLLLoss':

        criterion = nn.NLLLoss(
            weight = loss_parameters['weight'],
            size_average = loss_parameters['size_average'],
            ignore_index = loss_parameters['ignore_index'],
            reduce = loss_parameters['reduce'],
            reduction = loss_parameters['reduction']
        )

    elif loss_parameters['function'] == 'QuadraticKappa':
        criterion = QuadraticKappa(
            n_classes = loss_parameters['n_classes']
        )
        
    elif loss_parameters['function'] == 'WeightedMultiLabelLogLoss':

        criterion = WeightedMultiLabelLogLoss(
            n_classes = loss_parameters['n_classes'],
            weight = loss_parameters['weight']
        )
    elif loss_parameters['function'] == 'WeightedMultiLabelFocalLogLoss':

        criterion = WeightedMultiLabelFocalLogLoss(
            n_classes = loss_parameters['n_classes'],
            weight = loss_parameters['weight'],
            gamma = loss_parameters['gamma']
        )
        
    return criterion

def onehot(labels, num_classes):
    return torch.zeros(len(labels), num_classes).scatter_(1, labels.unsqueeze(1).cpu(), 1.).cuda()


def calcLoss(criterion, loss_name, outputs, labels):
    
    loss_parameters = LOSS_LIST[loss_name]
    last_layer = loss_parameters['last_layer']
    
    if last_layer == 'softmax':
        outputs = torch.softmax(outputs, dim=1)
        preds_loss = torch.argmax(outputs, 1)
        preds_metric = torch.argmax(outputs, 1)
        
    elif last_layer == 'logsoftmax':
        logsoftmax = nn.LogSoftmax(dim=1)
        outputs = logsoftmax(outputs)
        preds_loss = outputs
        preds_metric = torch.argmax(torch.exp(outputs),  1) ### AINDA NÃO TESTADO.
        
        #OBS.: torch.exp(outputs) revert log
        
    elif last_layer == 'sigmoid':        
        outputs = torch.sigmoid(outputs)
        preds_loss = outputs > 0.5
        preds_metric = torch.argmax(outputs, 1)
        
    elif last_layer == 'linear':        
        preds_loss = outputs
        preds_metric = outputs
        labels = labels.type(torch.float)

    # Transform label from shape 1 to (1, n_classes)
    if loss_parameters['onehotlabel']:
        labels = onehot(labels, NUM_CLASSES)
        
    loss = criterion(preds_loss, labels)
            
    return loss, preds_metric

# Metric Function

In [11]:
def calcMetric(preds, labels):
    
    if METRIC == 'KAPPA':
        preds = np.round(preds)
        score = cohen_kappa_score(preds, labels, weights='quadratic')
    
    elif METRIC == 'ACC':
        score = sum(preds == labels)
        
    return score

# Train Function

In [12]:
def train_model(model, model_name, loss_name, dataloaders, criterion, optimizer, scheduler, num_epochs=25, is_inception=False):

    since = time.time()

    best_score = 0.0 if SAVE_BEST == 'metric' else float("inf")
    epoch_metric = 0.0
    
    print(model_name)
    print('-' * 100)

    for epoch in range(num_epochs):
        
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        epoch_since = time.time()
        lr = optimizer.param_groups[0]['lr']
                
        print('Learning Rate:', lr)
        tensorboard.add_scalar('LR', lr, epoch)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_preds = []
            running_labels = []

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs, aux_outputs = model(inputs)
                        
                        loss1, preds = calcLoss(criterion, loss_name, outputs, labels)
                        loss2, preds = calcLoss(criterion, loss_name, aux_outputs, labels)
                        
                        loss = loss1 + 0.4*loss2
                        
                    else:
                        
                        outputs = model(inputs)
                        
                        outputs = outputs.squeeze()
                        
                        loss, preds = calcLoss(criterion, loss_name, outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                
                # statistics
                running_loss += loss.item() * inputs.size(0)
                
                running_preds = np.append(running_preds, preds.squeeze().cpu().detach().numpy())
                running_labels = np.append(running_labels, labels.squeeze().cpu().detach().numpy())
                
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            
            if METRIC:
                epoch_metric = calcMetric(running_preds, running_labels)
                tensorboard.add_scalar('{} {}'.format(METRIC, phase), epoch_metric, epoch)
            
            print('{} Loss: {:.4f} {}: {:.4f}'.format(phase, epoch_loss, METRIC, epoch_metric))
            
            # Write loss into Tensorboard
            tensorboard.add_scalar('Loss {}'.format(phase), epoch_loss, epoch)

            # Save the best model
            if phase == 'val':
                
                if scheduler:
                    scheduler.step(epoch_loss)
                
                save_flag = False
                
                if SAVE_BEST == 'metric' and epoch_metric > best_score:
                    
                    best_score = epoch_metric
                    save_flag = True
                    
                elif SAVE_BEST == 'loss' and epoch_loss < best_score:
                    
                    best_score = epoch_loss
                    save_flag = True
                
                if save_flag:
                    print('Saving the best model at {}'.format(MODEL_DIR))
                    torch.save(model.state_dict(), MODEL_DIR + '/' + model_name + '_' + SAVE_BEST + str(best_score) + '.pt')
            
                epoch_time_elapsed = time.time() - epoch_since
                print('Epoch time elapsed: {:.0f}m {:.0f}s'.format(epoch_time_elapsed // 60, epoch_time_elapsed % 60))
            
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val {}: {:4f}'.format(SAVE_BEST, best_score))

    return best_score


# Grid Search

In [13]:
model_name_list = []
metric_list = []

for inp in INPUT_SIZES:
    
    for frac in SAMPLE_FRACS:
        
        for bch in BATCH_SIZES:
        
            dataloaders_dict = getDataLoaders(inp, frac, bch)

            for m in MODELS:

                model_parameters = NET_LIST[m]
                base_model = model_parameters['base_model']
                model = getModel(m, NUM_CLASSES)

                for o in OPTIMIZERS:

                    optimizer = getOptimizer(o, model)

                    for s in SCHEDULERS:

                        scheduler = getScheduler(s, optimizer)

                        for l in LOSSES:

                            criterion = getLossFunction(l)

                            model_name = f'{base_model}_Inp{str(inp)}-{AUGMENTATION_TAG}-Data{str(frac)}-Bch{str(bch)}-{m}-{s}-{o}-{l}'

                            tensorboard = SummaryWriter(comment = model_name)

                            #summary(model, input_size=(CHANNELS, inp, inp))

                            # Train and evaluate
                            best_score = train_model(
                                model, 
                                model_name, 
                                l,
                                dataloaders_dict, 
                                criterion, 
                                optimizer, 
                                scheduler,
                                num_epochs=NUM_EPOCH, 
                                is_inception=NET_LIST[m]['is_inception'])

                            model_name_list.append(model_name)
                            metric_list.append(best_score)


Downloading: "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth" to /home/ubuntu/.cache/torch/checkpoints/resnet101-5d3b4d8f.pth
100.0%


Loading state dict from checkpoint 
	 /mnt/diabetic_retinopathy/models/ResNet101Attention_Inp512-DatasetMeanStd-Data0.5-Bch64-ResNet101AttentionPre-None-Adam0001-SmoothL1Loss_loss0.16344905125054568.pt


RuntimeError: Error(s) in loading state_dict for ResNet101Attention:
	Missing key(s) in state_dict: "conv1.weight", "bn1.weight", "bn1.bias", "bn1.running_mean", "bn1.running_var", "layer1.0.conv1.weight", "layer1.0.bn1.weight", "layer1.0.bn1.bias", "layer1.0.bn1.running_mean", "layer1.0.bn1.running_var", "layer1.0.conv2.weight", "layer1.0.bn2.weight", "layer1.0.bn2.bias", "layer1.0.bn2.running_mean", "layer1.0.bn2.running_var", "layer1.0.conv3.weight", "layer1.0.bn3.weight", "layer1.0.bn3.bias", "layer1.0.bn3.running_mean", "layer1.0.bn3.running_var", "layer1.0.downsample.0.weight", "layer1.0.downsample.1.weight", "layer1.0.downsample.1.bias", "layer1.0.downsample.1.running_mean", "layer1.0.downsample.1.running_var", "layer1.1.conv1.weight", "layer1.1.bn1.weight", "layer1.1.bn1.bias", "layer1.1.bn1.running_mean", "layer1.1.bn1.running_var", "layer1.1.conv2.weight", "layer1.1.bn2.weight", "layer1.1.bn2.bias", "layer1.1.bn2.running_mean", "layer1.1.bn2.running_var", "layer1.1.conv3.weight", "layer1.1.bn3.weight", "layer1.1.bn3.bias", "layer1.1.bn3.running_mean", "layer1.1.bn3.running_var", "layer1.2.conv1.weight", "layer1.2.bn1.weight", "layer1.2.bn1.bias", "layer1.2.bn1.running_mean", "layer1.2.bn1.running_var", "layer1.2.conv2.weight", "layer1.2.bn2.weight", "layer1.2.bn2.bias", "layer1.2.bn2.running_mean", "layer1.2.bn2.running_var", "layer1.2.conv3.weight", "layer1.2.bn3.weight", "layer1.2.bn3.bias", "layer1.2.bn3.running_mean", "layer1.2.bn3.running_var", "layer2.0.conv1.weight", "layer2.0.bn1.weight", "layer2.0.bn1.bias", "layer2.0.bn1.running_mean", "layer2.0.bn1.running_var", "layer2.0.conv2.weight", "layer2.0.bn2.weight", "layer2.0.bn2.bias", "layer2.0.bn2.running_mean", "layer2.0.bn2.running_var", "layer2.0.conv3.weight", "layer2.0.bn3.weight", "layer2.0.bn3.bias", "layer2.0.bn3.running_mean", "layer2.0.bn3.running_var", "layer2.0.downsample.0.weight", "layer2.0.downsample.1.weight", "layer2.0.downsample.1.bias", "layer2.0.downsample.1.running_mean", "layer2.0.downsample.1.running_var", "layer2.1.conv1.weight", "layer2.1.bn1.weight", "layer2.1.bn1.bias", "layer2.1.bn1.running_mean", "layer2.1.bn1.running_var", "layer2.1.conv2.weight", "layer2.1.bn2.weight", "layer2.1.bn2.bias", "layer2.1.bn2.running_mean", "layer2.1.bn2.running_var", "layer2.1.conv3.weight", "layer2.1.bn3.weight", "layer2.1.bn3.bias", "layer2.1.bn3.running_mean", "layer2.1.bn3.running_var", "layer2.2.conv1.weight", "layer2.2.bn1.weight", "layer2.2.bn1.bias", "layer2.2.bn1.running_mean", "layer2.2.bn1.running_var", "layer2.2.conv2.weight", "layer2.2.bn2.weight", "layer2.2.bn2.bias", "layer2.2.bn2.running_mean", "layer2.2.bn2.running_var", "layer2.2.conv3.weight", "layer2.2.bn3.weight", "layer2.2.bn3.bias", "layer2.2.bn3.running_mean", "layer2.2.bn3.running_var", "layer2.3.conv1.weight", "layer2.3.bn1.weight", "layer2.3.bn1.bias", "layer2.3.bn1.running_mean", "layer2.3.bn1.running_var", "layer2.3.conv2.weight", "layer2.3.bn2.weight", "layer2.3.bn2.bias", "layer2.3.bn2.running_mean", "layer2.3.bn2.running_var", "layer2.3.conv3.weight", "layer2.3.bn3.weight", "layer2.3.bn3.bias", "layer2.3.bn3.running_mean", "layer2.3.bn3.running_var", "layer3.0.conv1.weight", "layer3.0.bn1.weight", "layer3.0.bn1.bias", "layer3.0.bn1.running_mean", "layer3.0.bn1.running_var", "layer3.0.conv2.weight", "layer3.0.bn2.weight", "layer3.0.bn2.bias", "layer3.0.bn2.running_mean", "layer3.0.bn2.running_var", "layer3.0.conv3.weight", "layer3.0.bn3.weight", "layer3.0.bn3.bias", "layer3.0.bn3.running_mean", "layer3.0.bn3.running_var", "layer3.0.downsample.0.weight", "layer3.0.downsample.1.weight", "layer3.0.downsample.1.bias", "layer3.0.downsample.1.running_mean", "layer3.0.downsample.1.running_var", "layer3.1.conv1.weight", "layer3.1.bn1.weight", "layer3.1.bn1.bias", "layer3.1.bn1.running_mean", "layer3.1.bn1.running_var", "layer3.1.conv2.weight", "layer3.1.bn2.weight", "layer3.1.bn2.bias", "layer3.1.bn2.running_mean", "layer3.1.bn2.running_var", "layer3.1.conv3.weight", "layer3.1.bn3.weight", "layer3.1.bn3.bias", "layer3.1.bn3.running_mean", "layer3.1.bn3.running_var", "layer3.2.conv1.weight", "layer3.2.bn1.weight", "layer3.2.bn1.bias", "layer3.2.bn1.running_mean", "layer3.2.bn1.running_var", "layer3.2.conv2.weight", "layer3.2.bn2.weight", "layer3.2.bn2.bias", "layer3.2.bn2.running_mean", "layer3.2.bn2.running_var", "layer3.2.conv3.weight", "layer3.2.bn3.weight", "layer3.2.bn3.bias", "layer3.2.bn3.running_mean", "layer3.2.bn3.running_var", "layer3.3.conv1.weight", "layer3.3.bn1.weight", "layer3.3.bn1.bias", "layer3.3.bn1.running_mean", "layer3.3.bn1.running_var", "layer3.3.conv2.weight", "layer3.3.bn2.weight", "layer3.3.bn2.bias", "layer3.3.bn2.running_mean", "layer3.3.bn2.running_var", "layer3.3.conv3.weight", "layer3.3.bn3.weight", "layer3.3.bn3.bias", "layer3.3.bn3.running_mean", "layer3.3.bn3.running_var", "layer3.4.conv1.weight", "layer3.4.bn1.weight", "layer3.4.bn1.bias", "layer3.4.bn1.running_mean", "layer3.4.bn1.running_var", "layer3.4.conv2.weight", "layer3.4.bn2.weight", "layer3.4.bn2.bias", "layer3.4.bn2.running_mean", "layer3.4.bn2.running_var", "layer3.4.conv3.weight", "layer3.4.bn3.weight", "layer3.4.bn3.bias", "layer3.4.bn3.running_mean", "layer3.4.bn3.running_var", "layer3.5.conv1.weight", "layer3.5.bn1.weight", "layer3.5.bn1.bias", "layer3.5.bn1.running_mean", "layer3.5.bn1.running_var", "layer3.5.conv2.weight", "layer3.5.bn2.weight", "layer3.5.bn2.bias", "layer3.5.bn2.running_mean", "layer3.5.bn2.running_var", "layer3.5.conv3.weight", "layer3.5.bn3.weight", "layer3.5.bn3.bias", "layer3.5.bn3.running_mean", "layer3.5.bn3.running_var", "layer3.6.conv1.weight", "layer3.6.bn1.weight", "layer3.6.bn1.bias", "layer3.6.bn1.running_mean", "layer3.6.bn1.running_var", "layer3.6.conv2.weight", "layer3.6.bn2.weight", "layer3.6.bn2.bias", "layer3.6.bn2.running_mean", "layer3.6.bn2.running_var", "layer3.6.conv3.weight", "layer3.6.bn3.weight", "layer3.6.bn3.bias", "layer3.6.bn3.running_mean", "layer3.6.bn3.running_var", "layer3.7.conv1.weight", "layer3.7.bn1.weight", "layer3.7.bn1.bias", "layer3.7.bn1.running_mean", "layer3.7.bn1.running_var", "layer3.7.conv2.weight", "layer3.7.bn2.weight", "layer3.7.bn2.bias", "layer3.7.bn2.running_mean", "layer3.7.bn2.running_var", "layer3.7.conv3.weight", "layer3.7.bn3.weight", "layer3.7.bn3.bias", "layer3.7.bn3.running_mean", "layer3.7.bn3.running_var", "layer3.8.conv1.weight", "layer3.8.bn1.weight", "layer3.8.bn1.bias", "layer3.8.bn1.running_mean", "layer3.8.bn1.running_var", "layer3.8.conv2.weight", "layer3.8.bn2.weight", "layer3.8.bn2.bias", "layer3.8.bn2.running_mean", "layer3.8.bn2.running_var", "layer3.8.conv3.weight", "layer3.8.bn3.weight", "layer3.8.bn3.bias", "layer3.8.bn3.running_mean", "layer3.8.bn3.running_var", "layer3.9.conv1.weight", "layer3.9.bn1.weight", "layer3.9.bn1.bias", "layer3.9.bn1.running_mean", "layer3.9.bn1.running_var", "layer3.9.conv2.weight", "layer3.9.bn2.weight", "layer3.9.bn2.bias", "layer3.9.bn2.running_mean", "layer3.9.bn2.running_var", "layer3.9.conv3.weight", "layer3.9.bn3.weight", "layer3.9.bn3.bias", "layer3.9.bn3.running_mean", "layer3.9.bn3.running_var", "layer3.10.conv1.weight", "layer3.10.bn1.weight", "layer3.10.bn1.bias", "layer3.10.bn1.running_mean", "layer3.10.bn1.running_var", "layer3.10.conv2.weight", "layer3.10.bn2.weight", "layer3.10.bn2.bias", "layer3.10.bn2.running_mean", "layer3.10.bn2.running_var", "layer3.10.conv3.weight", "layer3.10.bn3.weight", "layer3.10.bn3.bias", "layer3.10.bn3.running_mean", "layer3.10.bn3.running_var", "layer3.11.conv1.weight", "layer3.11.bn1.weight", "layer3.11.bn1.bias", "layer3.11.bn1.running_mean", "layer3.11.bn1.running_var", "layer3.11.conv2.weight", "layer3.11.bn2.weight", "layer3.11.bn2.bias", "layer3.11.bn2.running_mean", "layer3.11.bn2.running_var", "layer3.11.conv3.weight", "layer3.11.bn3.weight", "layer3.11.bn3.bias", "layer3.11.bn3.running_mean", "layer3.11.bn3.running_var", "layer3.12.conv1.weight", "layer3.12.bn1.weight", "layer3.12.bn1.bias", "layer3.12.bn1.running_mean", "layer3.12.bn1.running_var", "layer3.12.conv2.weight", "layer3.12.bn2.weight", "layer3.12.bn2.bias", "layer3.12.bn2.running_mean", "layer3.12.bn2.running_var", "layer3.12.conv3.weight", "layer3.12.bn3.weight", "layer3.12.bn3.bias", "layer3.12.bn3.running_mean", "layer3.12.bn3.running_var", "layer3.13.conv1.weight", "layer3.13.bn1.weight", "layer3.13.bn1.bias", "layer3.13.bn1.running_mean", "layer3.13.bn1.running_var", "layer3.13.conv2.weight", "layer3.13.bn2.weight", "layer3.13.bn2.bias", "layer3.13.bn2.running_mean", "layer3.13.bn2.running_var", "layer3.13.conv3.weight", "layer3.13.bn3.weight", "layer3.13.bn3.bias", "layer3.13.bn3.running_mean", "layer3.13.bn3.running_var", "layer3.14.conv1.weight", "layer3.14.bn1.weight", "layer3.14.bn1.bias", "layer3.14.bn1.running_mean", "layer3.14.bn1.running_var", "layer3.14.conv2.weight", "layer3.14.bn2.weight", "layer3.14.bn2.bias", "layer3.14.bn2.running_mean", "layer3.14.bn2.running_var", "layer3.14.conv3.weight", "layer3.14.bn3.weight", "layer3.14.bn3.bias", "layer3.14.bn3.running_mean", "layer3.14.bn3.running_var", "layer3.15.conv1.weight", "layer3.15.bn1.weight", "layer3.15.bn1.bias", "layer3.15.bn1.running_mean", "layer3.15.bn1.running_var", "layer3.15.conv2.weight", "layer3.15.bn2.weight", "layer3.15.bn2.bias", "layer3.15.bn2.running_mean", "layer3.15.bn2.running_var", "layer3.15.conv3.weight", "layer3.15.bn3.weight", "layer3.15.bn3.bias", "layer3.15.bn3.running_mean", "layer3.15.bn3.running_var", "layer3.16.conv1.weight", "layer3.16.bn1.weight", "layer3.16.bn1.bias", "layer3.16.bn1.running_mean", "layer3.16.bn1.running_var", "layer3.16.conv2.weight", "layer3.16.bn2.weight", "layer3.16.bn2.bias", "layer3.16.bn2.running_mean", "layer3.16.bn2.running_var", "layer3.16.conv3.weight", "layer3.16.bn3.weight", "layer3.16.bn3.bias", "layer3.16.bn3.running_mean", "layer3.16.bn3.running_var", "layer3.17.conv1.weight", "layer3.17.bn1.weight", "layer3.17.bn1.bias", "layer3.17.bn1.running_mean", "layer3.17.bn1.running_var", "layer3.17.conv2.weight", "layer3.17.bn2.weight", "layer3.17.bn2.bias", "layer3.17.bn2.running_mean", "layer3.17.bn2.running_var", "layer3.17.conv3.weight", "layer3.17.bn3.weight", "layer3.17.bn3.bias", "layer3.17.bn3.running_mean", "layer3.17.bn3.running_var", "layer3.18.conv1.weight", "layer3.18.bn1.weight", "layer3.18.bn1.bias", "layer3.18.bn1.running_mean", "layer3.18.bn1.running_var", "layer3.18.conv2.weight", "layer3.18.bn2.weight", "layer3.18.bn2.bias", "layer3.18.bn2.running_mean", "layer3.18.bn2.running_var", "layer3.18.conv3.weight", "layer3.18.bn3.weight", "layer3.18.bn3.bias", "layer3.18.bn3.running_mean", "layer3.18.bn3.running_var", "layer3.19.conv1.weight", "layer3.19.bn1.weight", "layer3.19.bn1.bias", "layer3.19.bn1.running_mean", "layer3.19.bn1.running_var", "layer3.19.conv2.weight", "layer3.19.bn2.weight", "layer3.19.bn2.bias", "layer3.19.bn2.running_mean", "layer3.19.bn2.running_var", "layer3.19.conv3.weight", "layer3.19.bn3.weight", "layer3.19.bn3.bias", "layer3.19.bn3.running_mean", "layer3.19.bn3.running_var", "layer3.20.conv1.weight", "layer3.20.bn1.weight", "layer3.20.bn1.bias", "layer3.20.bn1.running_mean", "layer3.20.bn1.running_var", "layer3.20.conv2.weight", "layer3.20.bn2.weight", "layer3.20.bn2.bias", "layer3.20.bn2.running_mean", "layer3.20.bn2.running_var", "layer3.20.conv3.weight", "layer3.20.bn3.weight", "layer3.20.bn3.bias", "layer3.20.bn3.running_mean", "layer3.20.bn3.running_var", "layer3.21.conv1.weight", "layer3.21.bn1.weight", "layer3.21.bn1.bias", "layer3.21.bn1.running_mean", "layer3.21.bn1.running_var", "layer3.21.conv2.weight", "layer3.21.bn2.weight", "layer3.21.bn2.bias", "layer3.21.bn2.running_mean", "layer3.21.bn2.running_var", "layer3.21.conv3.weight", "layer3.21.bn3.weight", "layer3.21.bn3.bias", "layer3.21.bn3.running_mean", "layer3.21.bn3.running_var", "layer3.22.conv1.weight", "layer3.22.bn1.weight", "layer3.22.bn1.bias", "layer3.22.bn1.running_mean", "layer3.22.bn1.running_var", "layer3.22.conv2.weight", "layer3.22.bn2.weight", "layer3.22.bn2.bias", "layer3.22.bn2.running_mean", "layer3.22.bn2.running_var", "layer3.22.conv3.weight", "layer3.22.bn3.weight", "layer3.22.bn3.bias", "layer3.22.bn3.running_mean", "layer3.22.bn3.running_var", "layer4.0.conv1.weight", "layer4.0.bn1.weight", "layer4.0.bn1.bias", "layer4.0.bn1.running_mean", "layer4.0.bn1.running_var", "layer4.0.conv2.weight", "layer4.0.bn2.weight", "layer4.0.bn2.bias", "layer4.0.bn2.running_mean", "layer4.0.bn2.running_var", "layer4.0.conv3.weight", "layer4.0.bn3.weight", "layer4.0.bn3.bias", "layer4.0.bn3.running_mean", "layer4.0.bn3.running_var", "layer4.0.downsample.0.weight", "layer4.0.downsample.1.weight", "layer4.0.downsample.1.bias", "layer4.0.downsample.1.running_mean", "layer4.0.downsample.1.running_var", "layer4.1.conv1.weight", "layer4.1.bn1.weight", "layer4.1.bn1.bias", "layer4.1.bn1.running_mean", "layer4.1.bn1.running_var", "layer4.1.conv2.weight", "layer4.1.bn2.weight", "layer4.1.bn2.bias", "layer4.1.bn2.running_mean", "layer4.1.bn2.running_var", "layer4.1.conv3.weight", "layer4.1.bn3.weight", "layer4.1.bn3.bias", "layer4.1.bn3.running_mean", "layer4.1.bn3.running_var", "layer4.2.conv1.weight", "layer4.2.bn1.weight", "layer4.2.bn1.bias", "layer4.2.bn1.running_mean", "layer4.2.bn1.running_var", "layer4.2.conv2.weight", "layer4.2.bn2.weight", "layer4.2.bn2.bias", "layer4.2.bn2.running_mean", "layer4.2.bn2.running_var", "layer4.2.conv3.weight", "layer4.2.bn3.weight", "layer4.2.bn3.bias", "layer4.2.bn3.running_mean", "layer4.2.bn3.running_var", "fc.weight", "fc.bias", "projector1.op.weight", "projector2.op.weight", "projector3.op.weight", "attn1.op.weight", "attn2.op.weight", "attn3.op.weight". 
	Unexpected key(s) in state_dict: "module.conv1.weight", "module.bn1.weight", "module.bn1.bias", "module.bn1.running_mean", "module.bn1.running_var", "module.bn1.num_batches_tracked", "module.layer1.0.conv1.weight", "module.layer1.0.bn1.weight", "module.layer1.0.bn1.bias", "module.layer1.0.bn1.running_mean", "module.layer1.0.bn1.running_var", "module.layer1.0.bn1.num_batches_tracked", "module.layer1.0.conv2.weight", "module.layer1.0.bn2.weight", "module.layer1.0.bn2.bias", "module.layer1.0.bn2.running_mean", "module.layer1.0.bn2.running_var", "module.layer1.0.bn2.num_batches_tracked", "module.layer1.0.conv3.weight", "module.layer1.0.bn3.weight", "module.layer1.0.bn3.bias", "module.layer1.0.bn3.running_mean", "module.layer1.0.bn3.running_var", "module.layer1.0.bn3.num_batches_tracked", "module.layer1.0.downsample.0.weight", "module.layer1.0.downsample.1.weight", "module.layer1.0.downsample.1.bias", "module.layer1.0.downsample.1.running_mean", "module.layer1.0.downsample.1.running_var", "module.layer1.0.downsample.1.num_batches_tracked", "module.layer1.1.conv1.weight", "module.layer1.1.bn1.weight", "module.layer1.1.bn1.bias", "module.layer1.1.bn1.running_mean", "module.layer1.1.bn1.running_var", "module.layer1.1.bn1.num_batches_tracked", "module.layer1.1.conv2.weight", "module.layer1.1.bn2.weight", "module.layer1.1.bn2.bias", "module.layer1.1.bn2.running_mean", "module.layer1.1.bn2.running_var", "module.layer1.1.bn2.num_batches_tracked", "module.layer1.1.conv3.weight", "module.layer1.1.bn3.weight", "module.layer1.1.bn3.bias", "module.layer1.1.bn3.running_mean", "module.layer1.1.bn3.running_var", "module.layer1.1.bn3.num_batches_tracked", "module.layer1.2.conv1.weight", "module.layer1.2.bn1.weight", "module.layer1.2.bn1.bias", "module.layer1.2.bn1.running_mean", "module.layer1.2.bn1.running_var", "module.layer1.2.bn1.num_batches_tracked", "module.layer1.2.conv2.weight", "module.layer1.2.bn2.weight", "module.layer1.2.bn2.bias", "module.layer1.2.bn2.running_mean", "module.layer1.2.bn2.running_var", "module.layer1.2.bn2.num_batches_tracked", "module.layer1.2.conv3.weight", "module.layer1.2.bn3.weight", "module.layer1.2.bn3.bias", "module.layer1.2.bn3.running_mean", "module.layer1.2.bn3.running_var", "module.layer1.2.bn3.num_batches_tracked", "module.layer2.0.conv1.weight", "module.layer2.0.bn1.weight", "module.layer2.0.bn1.bias", "module.layer2.0.bn1.running_mean", "module.layer2.0.bn1.running_var", "module.layer2.0.bn1.num_batches_tracked", "module.layer2.0.conv2.weight", "module.layer2.0.bn2.weight", "module.layer2.0.bn2.bias", "module.layer2.0.bn2.running_mean", "module.layer2.0.bn2.running_var", "module.layer2.0.bn2.num_batches_tracked", "module.layer2.0.conv3.weight", "module.layer2.0.bn3.weight", "module.layer2.0.bn3.bias", "module.layer2.0.bn3.running_mean", "module.layer2.0.bn3.running_var", "module.layer2.0.bn3.num_batches_tracked", "module.layer2.0.downsample.0.weight", "module.layer2.0.downsample.1.weight", "module.layer2.0.downsample.1.bias", "module.layer2.0.downsample.1.running_mean", "module.layer2.0.downsample.1.running_var", "module.layer2.0.downsample.1.num_batches_tracked", "module.layer2.1.conv1.weight", "module.layer2.1.bn1.weight", "module.layer2.1.bn1.bias", "module.layer2.1.bn1.running_mean", "module.layer2.1.bn1.running_var", "module.layer2.1.bn1.num_batches_tracked", "module.layer2.1.conv2.weight", "module.layer2.1.bn2.weight", "module.layer2.1.bn2.bias", "module.layer2.1.bn2.running_mean", "module.layer2.1.bn2.running_var", "module.layer2.1.bn2.num_batches_tracked", "module.layer2.1.conv3.weight", "module.layer2.1.bn3.weight", "module.layer2.1.bn3.bias", "module.layer2.1.bn3.running_mean", "module.layer2.1.bn3.running_var", "module.layer2.1.bn3.num_batches_tracked", "module.layer2.2.conv1.weight", "module.layer2.2.bn1.weight", "module.layer2.2.bn1.bias", "module.layer2.2.bn1.running_mean", "module.layer2.2.bn1.running_var", "module.layer2.2.bn1.num_batches_tracked", "module.layer2.2.conv2.weight", "module.layer2.2.bn2.weight", "module.layer2.2.bn2.bias", "module.layer2.2.bn2.running_mean", "module.layer2.2.bn2.running_var", "module.layer2.2.bn2.num_batches_tracked", "module.layer2.2.conv3.weight", "module.layer2.2.bn3.weight", "module.layer2.2.bn3.bias", "module.layer2.2.bn3.running_mean", "module.layer2.2.bn3.running_var", "module.layer2.2.bn3.num_batches_tracked", "module.layer2.3.conv1.weight", "module.layer2.3.bn1.weight", "module.layer2.3.bn1.bias", "module.layer2.3.bn1.running_mean", "module.layer2.3.bn1.running_var", "module.layer2.3.bn1.num_batches_tracked", "module.layer2.3.conv2.weight", "module.layer2.3.bn2.weight", "module.layer2.3.bn2.bias", "module.layer2.3.bn2.running_mean", "module.layer2.3.bn2.running_var", "module.layer2.3.bn2.num_batches_tracked", "module.layer2.3.conv3.weight", "module.layer2.3.bn3.weight", "module.layer2.3.bn3.bias", "module.layer2.3.bn3.running_mean", "module.layer2.3.bn3.running_var", "module.layer2.3.bn3.num_batches_tracked", "module.layer3.0.conv1.weight", "module.layer3.0.bn1.weight", "module.layer3.0.bn1.bias", "module.layer3.0.bn1.running_mean", "module.layer3.0.bn1.running_var", "module.layer3.0.bn1.num_batches_tracked", "module.layer3.0.conv2.weight", "module.layer3.0.bn2.weight", "module.layer3.0.bn2.bias", "module.layer3.0.bn2.running_mean", "module.layer3.0.bn2.running_var", "module.layer3.0.bn2.num_batches_tracked", "module.layer3.0.conv3.weight", "module.layer3.0.bn3.weight", "module.layer3.0.bn3.bias", "module.layer3.0.bn3.running_mean", "module.layer3.0.bn3.running_var", "module.layer3.0.bn3.num_batches_tracked", "module.layer3.0.downsample.0.weight", "module.layer3.0.downsample.1.weight", "module.layer3.0.downsample.1.bias", "module.layer3.0.downsample.1.running_mean", "module.layer3.0.downsample.1.running_var", "module.layer3.0.downsample.1.num_batches_tracked", "module.layer3.1.conv1.weight", "module.layer3.1.bn1.weight", "module.layer3.1.bn1.bias", "module.layer3.1.bn1.running_mean", "module.layer3.1.bn1.running_var", "module.layer3.1.bn1.num_batches_tracked", "module.layer3.1.conv2.weight", "module.layer3.1.bn2.weight", "module.layer3.1.bn2.bias", "module.layer3.1.bn2.running_mean", "module.layer3.1.bn2.running_var", "module.layer3.1.bn2.num_batches_tracked", "module.layer3.1.conv3.weight", "module.layer3.1.bn3.weight", "module.layer3.1.bn3.bias", "module.layer3.1.bn3.running_mean", "module.layer3.1.bn3.running_var", "module.layer3.1.bn3.num_batches_tracked", "module.layer3.2.conv1.weight", "module.layer3.2.bn1.weight", "module.layer3.2.bn1.bias", "module.layer3.2.bn1.running_mean", "module.layer3.2.bn1.running_var", "module.layer3.2.bn1.num_batches_tracked", "module.layer3.2.conv2.weight", "module.layer3.2.bn2.weight", "module.layer3.2.bn2.bias", "module.layer3.2.bn2.running_mean", "module.layer3.2.bn2.running_var", "module.layer3.2.bn2.num_batches_tracked", "module.layer3.2.conv3.weight", "module.layer3.2.bn3.weight", "module.layer3.2.bn3.bias", "module.layer3.2.bn3.running_mean", "module.layer3.2.bn3.running_var", "module.layer3.2.bn3.num_batches_tracked", "module.layer3.3.conv1.weight", "module.layer3.3.bn1.weight", "module.layer3.3.bn1.bias", "module.layer3.3.bn1.running_mean", "module.layer3.3.bn1.running_var", "module.layer3.3.bn1.num_batches_tracked", "module.layer3.3.conv2.weight", "module.layer3.3.bn2.weight", "module.layer3.3.bn2.bias", "module.layer3.3.bn2.running_mean", "module.layer3.3.bn2.running_var", "module.layer3.3.bn2.num_batches_tracked", "module.layer3.3.conv3.weight", "module.layer3.3.bn3.weight", "module.layer3.3.bn3.bias", "module.layer3.3.bn3.running_mean", "module.layer3.3.bn3.running_var", "module.layer3.3.bn3.num_batches_tracked", "module.layer3.4.conv1.weight", "module.layer3.4.bn1.weight", "module.layer3.4.bn1.bias", "module.layer3.4.bn1.running_mean", "module.layer3.4.bn1.running_var", "module.layer3.4.bn1.num_batches_tracked", "module.layer3.4.conv2.weight", "module.layer3.4.bn2.weight", "module.layer3.4.bn2.bias", "module.layer3.4.bn2.running_mean", "module.layer3.4.bn2.running_var", "module.layer3.4.bn2.num_batches_tracked", "module.layer3.4.conv3.weight", "module.layer3.4.bn3.weight", "module.layer3.4.bn3.bias", "module.layer3.4.bn3.running_mean", "module.layer3.4.bn3.running_var", "module.layer3.4.bn3.num_batches_tracked", "module.layer3.5.conv1.weight", "module.layer3.5.bn1.weight", "module.layer3.5.bn1.bias", "module.layer3.5.bn1.running_mean", "module.layer3.5.bn1.running_var", "module.layer3.5.bn1.num_batches_tracked", "module.layer3.5.conv2.weight", "module.layer3.5.bn2.weight", "module.layer3.5.bn2.bias", "module.layer3.5.bn2.running_mean", "module.layer3.5.bn2.running_var", "module.layer3.5.bn2.num_batches_tracked", "module.layer3.5.conv3.weight", "module.layer3.5.bn3.weight", "module.layer3.5.bn3.bias", "module.layer3.5.bn3.running_mean", "module.layer3.5.bn3.running_var", "module.layer3.5.bn3.num_batches_tracked", "module.layer3.6.conv1.weight", "module.layer3.6.bn1.weight", "module.layer3.6.bn1.bias", "module.layer3.6.bn1.running_mean", "module.layer3.6.bn1.running_var", "module.layer3.6.bn1.num_batches_tracked", "module.layer3.6.conv2.weight", "module.layer3.6.bn2.weight", "module.layer3.6.bn2.bias", "module.layer3.6.bn2.running_mean", "module.layer3.6.bn2.running_var", "module.layer3.6.bn2.num_batches_tracked", "module.layer3.6.conv3.weight", "module.layer3.6.bn3.weight", "module.layer3.6.bn3.bias", "module.layer3.6.bn3.running_mean", "module.layer3.6.bn3.running_var", "module.layer3.6.bn3.num_batches_tracked", "module.layer3.7.conv1.weight", "module.layer3.7.bn1.weight", "module.layer3.7.bn1.bias", "module.layer3.7.bn1.running_mean", "module.layer3.7.bn1.running_var", "module.layer3.7.bn1.num_batches_tracked", "module.layer3.7.conv2.weight", "module.layer3.7.bn2.weight", "module.layer3.7.bn2.bias", "module.layer3.7.bn2.running_mean", "module.layer3.7.bn2.running_var", "module.layer3.7.bn2.num_batches_tracked", "module.layer3.7.conv3.weight", "module.layer3.7.bn3.weight", "module.layer3.7.bn3.bias", "module.layer3.7.bn3.running_mean", "module.layer3.7.bn3.running_var", "module.layer3.7.bn3.num_batches_tracked", "module.layer3.8.conv1.weight", "module.layer3.8.bn1.weight", "module.layer3.8.bn1.bias", "module.layer3.8.bn1.running_mean", "module.layer3.8.bn1.running_var", "module.layer3.8.bn1.num_batches_tracked", "module.layer3.8.conv2.weight", "module.layer3.8.bn2.weight", "module.layer3.8.bn2.bias", "module.layer3.8.bn2.running_mean", "module.layer3.8.bn2.running_var", "module.layer3.8.bn2.num_batches_tracked", "module.layer3.8.conv3.weight", "module.layer3.8.bn3.weight", "module.layer3.8.bn3.bias", "module.layer3.8.bn3.running_mean", "module.layer3.8.bn3.running_var", "module.layer3.8.bn3.num_batches_tracked", "module.layer3.9.conv1.weight", "module.layer3.9.bn1.weight", "module.layer3.9.bn1.bias", "module.layer3.9.bn1.running_mean", "module.layer3.9.bn1.running_var", "module.layer3.9.bn1.num_batches_tracked", "module.layer3.9.conv2.weight", "module.layer3.9.bn2.weight", "module.layer3.9.bn2.bias", "module.layer3.9.bn2.running_mean", "module.layer3.9.bn2.running_var", "module.layer3.9.bn2.num_batches_tracked", "module.layer3.9.conv3.weight", "module.layer3.9.bn3.weight", "module.layer3.9.bn3.bias", "module.layer3.9.bn3.running_mean", "module.layer3.9.bn3.running_var", "module.layer3.9.bn3.num_batches_tracked", "module.layer3.10.conv1.weight", "module.layer3.10.bn1.weight", "module.layer3.10.bn1.bias", "module.layer3.10.bn1.running_mean", "module.layer3.10.bn1.running_var", "module.layer3.10.bn1.num_batches_tracked", "module.layer3.10.conv2.weight", "module.layer3.10.bn2.weight", "module.layer3.10.bn2.bias", "module.layer3.10.bn2.running_mean", "module.layer3.10.bn2.running_var", "module.layer3.10.bn2.num_batches_tracked", "module.layer3.10.conv3.weight", "module.layer3.10.bn3.weight", "module.layer3.10.bn3.bias", "module.layer3.10.bn3.running_mean", "module.layer3.10.bn3.running_var", "module.layer3.10.bn3.num_batches_tracked", "module.layer3.11.conv1.weight", "module.layer3.11.bn1.weight", "module.layer3.11.bn1.bias", "module.layer3.11.bn1.running_mean", "module.layer3.11.bn1.running_var", "module.layer3.11.bn1.num_batches_tracked", "module.layer3.11.conv2.weight", "module.layer3.11.bn2.weight", "module.layer3.11.bn2.bias", "module.layer3.11.bn2.running_mean", "module.layer3.11.bn2.running_var", "module.layer3.11.bn2.num_batches_tracked", "module.layer3.11.conv3.weight", "module.layer3.11.bn3.weight", "module.layer3.11.bn3.bias", "module.layer3.11.bn3.running_mean", "module.layer3.11.bn3.running_var", "module.layer3.11.bn3.num_batches_tracked", "module.layer3.12.conv1.weight", "module.layer3.12.bn1.weight", "module.layer3.12.bn1.bias", "module.layer3.12.bn1.running_mean", "module.layer3.12.bn1.running_var", "module.layer3.12.bn1.num_batches_tracked", "module.layer3.12.conv2.weight", "module.layer3.12.bn2.weight", "module.layer3.12.bn2.bias", "module.layer3.12.bn2.running_mean", "module.layer3.12.bn2.running_var", "module.layer3.12.bn2.num_batches_tracked", "module.layer3.12.conv3.weight", "module.layer3.12.bn3.weight", "module.layer3.12.bn3.bias", "module.layer3.12.bn3.running_mean", "module.layer3.12.bn3.running_var", "module.layer3.12.bn3.num_batches_tracked", "module.layer3.13.conv1.weight", "module.layer3.13.bn1.weight", "module.layer3.13.bn1.bias", "module.layer3.13.bn1.running_mean", "module.layer3.13.bn1.running_var", "module.layer3.13.bn1.num_batches_tracked", "module.layer3.13.conv2.weight", "module.layer3.13.bn2.weight", "module.layer3.13.bn2.bias", "module.layer3.13.bn2.running_mean", "module.layer3.13.bn2.running_var", "module.layer3.13.bn2.num_batches_tracked", "module.layer3.13.conv3.weight", "module.layer3.13.bn3.weight", "module.layer3.13.bn3.bias", "module.layer3.13.bn3.running_mean", "module.layer3.13.bn3.running_var", "module.layer3.13.bn3.num_batches_tracked", "module.layer3.14.conv1.weight", "module.layer3.14.bn1.weight", "module.layer3.14.bn1.bias", "module.layer3.14.bn1.running_mean", "module.layer3.14.bn1.running_var", "module.layer3.14.bn1.num_batches_tracked", "module.layer3.14.conv2.weight", "module.layer3.14.bn2.weight", "module.layer3.14.bn2.bias", "module.layer3.14.bn2.running_mean", "module.layer3.14.bn2.running_var", "module.layer3.14.bn2.num_batches_tracked", "module.layer3.14.conv3.weight", "module.layer3.14.bn3.weight", "module.layer3.14.bn3.bias", "module.layer3.14.bn3.running_mean", "module.layer3.14.bn3.running_var", "module.layer3.14.bn3.num_batches_tracked", "module.layer3.15.conv1.weight", "module.layer3.15.bn1.weight", "module.layer3.15.bn1.bias", "module.layer3.15.bn1.running_mean", "module.layer3.15.bn1.running_var", "module.layer3.15.bn1.num_batches_tracked", "module.layer3.15.conv2.weight", "module.layer3.15.bn2.weight", "module.layer3.15.bn2.bias", "module.layer3.15.bn2.running_mean", "module.layer3.15.bn2.running_var", "module.layer3.15.bn2.num_batches_tracked", "module.layer3.15.conv3.weight", "module.layer3.15.bn3.weight", "module.layer3.15.bn3.bias", "module.layer3.15.bn3.running_mean", "module.layer3.15.bn3.running_var", "module.layer3.15.bn3.num_batches_tracked", "module.layer3.16.conv1.weight", "module.layer3.16.bn1.weight", "module.layer3.16.bn1.bias", "module.layer3.16.bn1.running_mean", "module.layer3.16.bn1.running_var", "module.layer3.16.bn1.num_batches_tracked", "module.layer3.16.conv2.weight", "module.layer3.16.bn2.weight", "module.layer3.16.bn2.bias", "module.layer3.16.bn2.running_mean", "module.layer3.16.bn2.running_var", "module.layer3.16.bn2.num_batches_tracked", "module.layer3.16.conv3.weight", "module.layer3.16.bn3.weight", "module.layer3.16.bn3.bias", "module.layer3.16.bn3.running_mean", "module.layer3.16.bn3.running_var", "module.layer3.16.bn3.num_batches_tracked", "module.layer3.17.conv1.weight", "module.layer3.17.bn1.weight", "module.layer3.17.bn1.bias", "module.layer3.17.bn1.running_mean", "module.layer3.17.bn1.running_var", "module.layer3.17.bn1.num_batches_tracked", "module.layer3.17.conv2.weight", "module.layer3.17.bn2.weight", "module.layer3.17.bn2.bias", "module.layer3.17.bn2.running_mean", "module.layer3.17.bn2.running_var", "module.layer3.17.bn2.num_batches_tracked", "module.layer3.17.conv3.weight", "module.layer3.17.bn3.weight", "module.layer3.17.bn3.bias", "module.layer3.17.bn3.running_mean", "module.layer3.17.bn3.running_var", "module.layer3.17.bn3.num_batches_tracked", "module.layer3.18.conv1.weight", "module.layer3.18.bn1.weight", "module.layer3.18.bn1.bias", "module.layer3.18.bn1.running_mean", "module.layer3.18.bn1.running_var", "module.layer3.18.bn1.num_batches_tracked", "module.layer3.18.conv2.weight", "module.layer3.18.bn2.weight", "module.layer3.18.bn2.bias", "module.layer3.18.bn2.running_mean", "module.layer3.18.bn2.running_var", "module.layer3.18.bn2.num_batches_tracked", "module.layer3.18.conv3.weight", "module.layer3.18.bn3.weight", "module.layer3.18.bn3.bias", "module.layer3.18.bn3.running_mean", "module.layer3.18.bn3.running_var", "module.layer3.18.bn3.num_batches_tracked", "module.layer3.19.conv1.weight", "module.layer3.19.bn1.weight", "module.layer3.19.bn1.bias", "module.layer3.19.bn1.running_mean", "module.layer3.19.bn1.running_var", "module.layer3.19.bn1.num_batches_tracked", "module.layer3.19.conv2.weight", "module.layer3.19.bn2.weight", "module.layer3.19.bn2.bias", "module.layer3.19.bn2.running_mean", "module.layer3.19.bn2.running_var", "module.layer3.19.bn2.num_batches_tracked", "module.layer3.19.conv3.weight", "module.layer3.19.bn3.weight", "module.layer3.19.bn3.bias", "module.layer3.19.bn3.running_mean", "module.layer3.19.bn3.running_var", "module.layer3.19.bn3.num_batches_tracked", "module.layer3.20.conv1.weight", "module.layer3.20.bn1.weight", "module.layer3.20.bn1.bias", "module.layer3.20.bn1.running_mean", "module.layer3.20.bn1.running_var", "module.layer3.20.bn1.num_batches_tracked", "module.layer3.20.conv2.weight", "module.layer3.20.bn2.weight", "module.layer3.20.bn2.bias", "module.layer3.20.bn2.running_mean", "module.layer3.20.bn2.running_var", "module.layer3.20.bn2.num_batches_tracked", "module.layer3.20.conv3.weight", "module.layer3.20.bn3.weight", "module.layer3.20.bn3.bias", "module.layer3.20.bn3.running_mean", "module.layer3.20.bn3.running_var", "module.layer3.20.bn3.num_batches_tracked", "module.layer3.21.conv1.weight", "module.layer3.21.bn1.weight", "module.layer3.21.bn1.bias", "module.layer3.21.bn1.running_mean", "module.layer3.21.bn1.running_var", "module.layer3.21.bn1.num_batches_tracked", "module.layer3.21.conv2.weight", "module.layer3.21.bn2.weight", "module.layer3.21.bn2.bias", "module.layer3.21.bn2.running_mean", "module.layer3.21.bn2.running_var", "module.layer3.21.bn2.num_batches_tracked", "module.layer3.21.conv3.weight", "module.layer3.21.bn3.weight", "module.layer3.21.bn3.bias", "module.layer3.21.bn3.running_mean", "module.layer3.21.bn3.running_var", "module.layer3.21.bn3.num_batches_tracked", "module.layer3.22.conv1.weight", "module.layer3.22.bn1.weight", "module.layer3.22.bn1.bias", "module.layer3.22.bn1.running_mean", "module.layer3.22.bn1.running_var", "module.layer3.22.bn1.num_batches_tracked", "module.layer3.22.conv2.weight", "module.layer3.22.bn2.weight", "module.layer3.22.bn2.bias", "module.layer3.22.bn2.running_mean", "module.layer3.22.bn2.running_var", "module.layer3.22.bn2.num_batches_tracked", "module.layer3.22.conv3.weight", "module.layer3.22.bn3.weight", "module.layer3.22.bn3.bias", "module.layer3.22.bn3.running_mean", "module.layer3.22.bn3.running_var", "module.layer3.22.bn3.num_batches_tracked", "module.layer4.0.conv1.weight", "module.layer4.0.bn1.weight", "module.layer4.0.bn1.bias", "module.layer4.0.bn1.running_mean", "module.layer4.0.bn1.running_var", "module.layer4.0.bn1.num_batches_tracked", "module.layer4.0.conv2.weight", "module.layer4.0.bn2.weight", "module.layer4.0.bn2.bias", "module.layer4.0.bn2.running_mean", "module.layer4.0.bn2.running_var", "module.layer4.0.bn2.num_batches_tracked", "module.layer4.0.conv3.weight", "module.layer4.0.bn3.weight", "module.layer4.0.bn3.bias", "module.layer4.0.bn3.running_mean", "module.layer4.0.bn3.running_var", "module.layer4.0.bn3.num_batches_tracked", "module.layer4.0.downsample.0.weight", "module.layer4.0.downsample.1.weight", "module.layer4.0.downsample.1.bias", "module.layer4.0.downsample.1.running_mean", "module.layer4.0.downsample.1.running_var", "module.layer4.0.downsample.1.num_batches_tracked", "module.layer4.1.conv1.weight", "module.layer4.1.bn1.weight", "module.layer4.1.bn1.bias", "module.layer4.1.bn1.running_mean", "module.layer4.1.bn1.running_var", "module.layer4.1.bn1.num_batches_tracked", "module.layer4.1.conv2.weight", "module.layer4.1.bn2.weight", "module.layer4.1.bn2.bias", "module.layer4.1.bn2.running_mean", "module.layer4.1.bn2.running_var", "module.layer4.1.bn2.num_batches_tracked", "module.layer4.1.conv3.weight", "module.layer4.1.bn3.weight", "module.layer4.1.bn3.bias", "module.layer4.1.bn3.running_mean", "module.layer4.1.bn3.running_var", "module.layer4.1.bn3.num_batches_tracked", "module.layer4.2.conv1.weight", "module.layer4.2.bn1.weight", "module.layer4.2.bn1.bias", "module.layer4.2.bn1.running_mean", "module.layer4.2.bn1.running_var", "module.layer4.2.bn1.num_batches_tracked", "module.layer4.2.conv2.weight", "module.layer4.2.bn2.weight", "module.layer4.2.bn2.bias", "module.layer4.2.bn2.running_mean", "module.layer4.2.bn2.running_var", "module.layer4.2.bn2.num_batches_tracked", "module.layer4.2.conv3.weight", "module.layer4.2.bn3.weight", "module.layer4.2.bn3.bias", "module.layer4.2.bn3.running_mean", "module.layer4.2.bn3.running_var", "module.layer4.2.bn3.num_batches_tracked", "module.fc.weight", "module.fc.bias", "module.projector1.op.weight", "module.projector2.op.weight", "module.projector3.op.weight", "module.attn1.op.weight", "module.attn2.op.weight", "module.attn3.op.weight". 

# The Best Model Metrics

In [None]:
fig, ax = plt.subplots()    
width = 0.75 # the width of the bars 
ind = np.arange(len(metric_list))  # the x locations for the groups
ax.barh(ind, metric_list, width)
ax.set_yticks(ind+width/2)
ax.set_yticklabels(model_name_list, minor=False)
plt.xlabel('Loss')
for i, v in enumerate(metric_list):
    ax.text(v, i, str(v))