# Running Stats Analysis

We ran this section to get the statistical signficance numbers. We provide the already extraced csv files which can be used in the summary section.  

In [80]:
import argparse
from tqdm import tqdm

from train import init_model
import torch
import torchvision.transforms as transforms
from torch import nn
from torchvision.models import resnet50, vgg16
from PIL import Image
import scipy.stats as stats
import numpy as np
from argparser import get_args_parser
import utils
from data import get_data_loaders
from scipy.stats import wilcoxon, mannwhitneyu, kruskal
from sklearn.utils import resample

import itertools
import os

Manually change the dataset path. 

In [95]:
parser = argparse.ArgumentParser()
parser.add_argument('arch')
args = get_args_parser().parse_known_args()[0]
data_name = 'BUSI'
# args.data_path = f'dataset/{data_name}'
args.data_path = f'dataset/BUSI/'
num_classes = 3
args.data_path

'dataset/BUSI/'

In [96]:
models = ["resnet50", "vgg16", "vit-ti16", "vit-s16", "vit-s32", "vit-b16", "vit-b32", "vim-s", "vssm-ti", "vssm-s", "vssm-b"]
model_pairs = list(itertools.permutations(models, 2))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Set seeds for reproducibility
utils.set_seed(args.seed)

results = {}
for model_pair in tqdm(model_pairs):
    model1_preds = []
    model2_preds = []
    for fold_index in range(args.k_folds):
        train_loader, val_loader, test_loader = get_data_loaders(args, seed=fold_index + args.seed)
 
        args.arch = model_pair[0]
        model1 = init_model(args, device, num_classes)
        state_dict = torch.load(f'checkpoints/results_{data_name}/{args.arch}/fold_{fold_index}/best_checkpoint.pth')['model']
        model1.load_state_dict(state_dict)

        args.arch = model_pair[1]
        model2 = init_model(args, device, num_classes)
        state_dict = torch.load(f'checkpoints/results_{data_name}/{args.arch}/fold_{fold_index}/best_checkpoint.pth')['model']
        model2.load_state_dict(state_dict)

        model1.eval()
        model2.eval()
        fold_preds1 = []
        fold_preds2 = []
        for data, label in test_loader:
            data = data.to(device)
            label = label.to(device)
            with torch.no_grad():
                pred1 = model1(data)
                pred2 = model2(data)
            pred1 = (torch.argmax(pred1, axis=1) == label).to(torch.int32)
            pred2 = (torch.argmax(pred2, axis=1) == label).to(torch.int32)
            fold_preds1.append(pred1.cpu().numpy())
            fold_preds2.append(pred2.cpu().numpy())
        model1_preds.append(np.concatenate(fold_preds1))
        model2_preds.append(np.concatenate(fold_preds2))
    model1_preds = np.concatenate(model1_preds)
    model2_preds = np.concatenate(model2_preds)
    diff = model1_preds-model2_preds
    t_statistic, p_value_t = stats.ttest_1samp(diff, 0)
    stat_wilcoxon, p_value_wilcoxon = wilcoxon(model1_preds, model2_preds)
    stat_mannwhitney, p_value_mannwhitney = mannwhitneyu(model1_preds, model2_preds)
    stat_kruskal, p_value_kruskal = kruskal(model1_preds, model2_preds)
    
    # Bootstrap sampling for mean difference confidence interval
    mean_diffs = []
    for _ in range(1000):  # Number of bootstrap samples
        sample1 = resample(model1_preds)
        sample2 = resample(model2_preds)
        mean_diffs.append(np.mean(sample1 - sample2))
    ci_lower = np.percentile(mean_diffs, 2.5)
    ci_upper = np.percentile(mean_diffs, 97.5)
    
    print(f"Testing for {model_pair}")

    alpha = 0.05  # commonly used significance level
    # T-test results
    if p_value_t < alpha:
        print(f"  T-test: T={t_statistic:.2f}, p={p_value_t:.3f}. Result is statistically significant.")
    else:
        print(f"  T-test: T={t_statistic:.2f}, p={p_value_t:.3f}. Result is not statistically significant.")

    # Wilcoxon test results
    if p_value_wilcoxon < alpha:
        print(f"  Wilcoxon Signed-Rank Test: Statistic={stat_wilcoxon}, p={p_value_wilcoxon:.3f}. Result is statistically significant.")
    else:
        print(f"  Wilcoxon Signed-Rank Test: Statistic={stat_wilcoxon}, p={p_value_wilcoxon:.3f}. Result is not statistically significant.")

    # Mann-Whitney U test results
    if p_value_mannwhitney < alpha:
        print(f"  Mann-Whitney U Test: U={stat_mannwhitney}, p={p_value_mannwhitney:.3f}. Result is statistically significant.")
    else:
        print(f"  Mann-Whitney U Test: U={stat_mannwhitney}, p={p_value_mannwhitney:.3f}. Result is not statistically significant.")

    # Kruskal-Wallis test results
    if p_value_kruskal < alpha:
        print(f"  Kruskal-Wallis Test: H={stat_kruskal}, p={p_value_kruskal:.3f}. Result is statistically significant.")
    else:
        print(f"  Kruskal-Wallis Test: H={stat_kruskal}, p={p_value_kruskal:.3f}. Result is not statistically significant.")

    # Bootstrap confidence interval
    print(f"  95% CI for mean difference: {ci_lower:.2f} to {ci_upper:.2f}")
    
    results[model_pair] = {
        'T-test': (t_statistic, p_value_t),
        'Wilcoxon': (stat_wilcoxon, p_value_wilcoxon),
        'Mann-Whitney': (stat_mannwhitney, p_value_mannwhitney),
        'Kruskal-Wallis': (stat_kruskal, p_value_kruskal),
        'Bootstrap CI': (ci_lower, ci_upper)
    }

  1%|██▏                                                                                                                                                                                                                                           | 1/110 [00:24<43:54, 24.17s/it]

Testing for ('resnet50', 'vgg16')
  T-test: T=0.12, p=0.901. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1056.0, p=0.901. Result is not statistically significant.
  Mann-Whitney U Test: U=171405.0, p=0.934. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.006910249514144428, p=0.934. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


  2%|████▎                                                                                                                                                                                                                                         | 2/110 [00:32<26:51, 14.92s/it]

Testing for ('resnet50', 'vit-ti16')
  T-test: T=-0.24, p=0.811. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1207.0, p=0.811. Result is not statistically significant.
  Mann-Whitney U Test: U=170527.5, p=0.867. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.02805644890333662, p=0.867. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


  3%|██████▍                                                                                                                                                                                                                                       | 3/110 [00:42<22:15, 12.48s/it]

Testing for ('resnet50', 'vit-s16')
  T-test: T=-0.58, p=0.559. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1258.0, p=0.558. Result is not statistically significant.
  Mann-Whitney U Test: U=169650.0, p=0.673. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.1780481415381438, p=0.673. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.03


  4%|████████▋                                                                                                                                                                                                                                     | 4/110 [00:50<19:17, 10.92s/it]

Testing for ('resnet50', 'vit-s32')
  T-test: T=0.96, p=0.335. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1716.0, p=0.335. Result is not statistically significant.
  Mann-Whitney U Test: U=173745.0, p=0.463. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.538737262533538, p=0.463. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


  5%|██████████▊                                                                                                                                                                                                                                   | 5/110 [01:08<23:24, 13.38s/it]

Testing for ('resnet50', 'vit-b16')
  T-test: T=-1.10, p=0.272. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=986.0, p=0.272. Result is not statistically significant.
  Mann-Whitney U Test: U=168480.0, p=0.443. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.5890487654676368, p=0.443. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.02


  5%|████████████▉                                                                                                                                                                                                                                 | 6/110 [01:23<24:10, 13.94s/it]

Testing for ('resnet50', 'vit-b32')
  T-test: T=-0.23, p=0.821. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1501.0, p=0.821. Result is not statistically significant.
  Mann-Whitney U Test: U=170527.5, p=0.867. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.02805644890333662, p=0.867. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


  6%|███████████████▏                                                                                                                                                                                                                              | 7/110 [01:35<22:46, 13.26s/it]

Testing for ('resnet50', 'vim-s')
  T-test: T=-1.70, p=0.091. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=690.0, p=0.091. Result is not statistically significant.
  Mann-Whitney U Test: U=167310.0, p=0.263. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.2557508342609243, p=0.262. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


  7%|█████████████████▎                                                                                                                                                                                                                            | 8/110 [01:49<23:02, 13.55s/it]

Testing for ('resnet50', 'vssm-ti')
  T-test: T=-2.15, p=0.032. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=736.0, p=0.032. Result is statistically significant.
  Mann-Whitney U Test: U=166140.0, p=0.138. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.195640447393827, p=0.138. Result is not statistically significant.
  95% CI for mean difference: -0.07 to 0.01


  8%|███████████████████▍                                                                                                                                                                                                                          | 9/110 [02:06<24:40, 14.66s/it]

Testing for ('resnet50', 'vssm-s')
  T-test: T=-1.10, p=0.272. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=986.0, p=0.272. Result is not statistically significant.
  Mann-Whitney U Test: U=168480.0, p=0.443. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.5890487654676368, p=0.443. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.03


  9%|█████████████████████▌                                                                                                                                                                                                                       | 10/110 [02:30<29:00, 17.40s/it]

Testing for ('resnet50', 'vssm-b')
  T-test: T=-2.44, p=0.015. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=828.0, p=0.015. Result is statistically significant.
  Mann-Whitney U Test: U=165262.5, p=0.079. Result is not statistically significant.
  Kruskal-Wallis Test: H=3.0914476119960814, p=0.079. Result is not statistically significant.
  95% CI for mean difference: -0.07 to 0.00


 10%|███████████████████████▋                                                                                                                                                                                                                     | 11/110 [02:50<29:56, 18.15s/it]

Testing for ('vgg16', 'resnet50')
  T-test: T=-0.12, p=0.901. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1056.0, p=0.901. Result is not statistically significant.
  Mann-Whitney U Test: U=170820.0, p=0.934. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.006910249514144428, p=0.934. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


 11%|█████████████████████████▊                                                                                                                                                                                                                   | 12/110 [03:09<30:07, 18.44s/it]

Testing for ('vgg16', 'vit-ti16')
  T-test: T=-0.36, p=0.718. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1155.0, p=0.718. Result is not statistically significant.
  Mann-Whitney U Test: U=170235.0, p=0.802. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.06281156530474258, p=0.802. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


 12%|████████████████████████████                                                                                                                                                                                                                 | 13/110 [03:27<29:36, 18.31s/it]

Testing for ('vgg16', 'vit-s16')
  T-test: T=-0.79, p=0.431. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=767.0, p=0.431. Result is not statistically significant.
  Mann-Whitney U Test: U=169357.5, p=0.614. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.2550792804152145, p=0.614. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.03


 13%|██████████████████████████████▏                                                                                                                                                                                                              | 14/110 [03:44<28:41, 17.94s/it]

Testing for ('vgg16', 'vit-s32')
  T-test: T=0.92, p=0.359. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1309.0, p=0.359. Result is not statistically significant.
  Mann-Whitney U Test: U=173452.5, p=0.515. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.42370424066806134, p=0.515. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 14%|████████████████████████████████▎                                                                                                                                                                                                            | 15/110 [04:10<32:27, 20.50s/it]

Testing for ('vgg16', 'vit-b16')
  T-test: T=-1.27, p=0.204. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=819.0, p=0.204. Result is not statistically significant.
  Mann-Whitney U Test: U=168187.5, p=0.395. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.7233910891090336, p=0.395. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


 15%|██████████████████████████████████▍                                                                                                                                                                                                          | 16/110 [04:34<33:54, 21.64s/it]

Testing for ('vgg16', 'vit-b32')
  T-test: T=-0.33, p=0.742. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1680.0, p=0.742. Result is not statistically significant.
  Mann-Whitney U Test: U=170235.0, p=0.802. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.06281156530474258, p=0.802. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


 15%|████████████████████████████████████▋                                                                                                                                                                                                        | 17/110 [04:55<33:11, 21.42s/it]

Testing for ('vgg16', 'vim-s')
  T-test: T=-1.78, p=0.075. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=756.0, p=0.075. Result is not statistically significant.
  Mann-Whitney U Test: U=167017.5, p=0.229. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.4484650786426818, p=0.229. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


 16%|██████████████████████████████████████▊                                                                                                                                                                                                      | 18/110 [05:17<32:46, 21.37s/it]

Testing for ('vgg16', 'vssm-ti')
  T-test: T=-2.26, p=0.024. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=747.5, p=0.024. Result is statistically significant.
  Mann-Whitney U Test: U=165847.5, p=0.118. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.447756178266297, p=0.118. Result is not statistically significant.
  95% CI for mean difference: -0.07 to 0.01


 17%|████████████████████████████████████████▉                                                                                                                                                                                                    | 19/110 [05:43<34:33, 22.79s/it]

Testing for ('vgg16', 'vssm-s')
  T-test: T=-1.27, p=0.204. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=819.0, p=0.204. Result is not statistically significant.
  Mann-Whitney U Test: U=168187.5, p=0.395. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.7233910891090336, p=0.395. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


 18%|███████████████████████████████████████████                                                                                                                                                                                                  | 20/110 [06:13<37:26, 24.96s/it]

Testing for ('vgg16', 'vssm-b')
  T-test: T=-2.75, p=0.006. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=570.0, p=0.006. Result is statistically significant.
  Mann-Whitney U Test: U=164970.0, p=0.066. Result is not statistically significant.
  Kruskal-Wallis Test: H=3.388762168949951, p=0.066. Result is not statistically significant.
  95% CI for mean difference: -0.08 to 0.00


 19%|█████████████████████████████████████████████▏                                                                                                                                                                                               | 21/110 [06:21<29:38, 19.99s/it]

Testing for ('vit-ti16', 'resnet50')
  T-test: T=0.24, p=0.811. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1207.0, p=0.811. Result is not statistically significant.
  Mann-Whitney U Test: U=171697.5, p=0.867. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.02805644890333662, p=0.867. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 20%|███████████████████████████████████████████████▍                                                                                                                                                                                             | 22/110 [06:37<27:26, 18.71s/it]

Testing for ('vit-ti16', 'vgg16')
  T-test: T=0.36, p=0.718. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1155.0, p=0.718. Result is not statistically significant.
  Mann-Whitney U Test: U=171990.0, p=0.802. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.06281156530474258, p=0.802. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.05


 21%|█████████████████████████████████████████████████▌                                                                                                                                                                                           | 23/110 [06:45<22:29, 15.51s/it]

Testing for ('vit-ti16', 'vit-s16')
  T-test: T=-0.38, p=0.701. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=899.0, p=0.701. Result is not statistically significant.
  Mann-Whitney U Test: U=170235.0, p=0.799. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.06476494161311973, p=0.799. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


 22%|███████████████████████████████████████████████████▋                                                                                                                                                                                         | 24/110 [06:52<18:38, 13.00s/it]

Testing for ('vit-ti16', 'vit-s32')
  T-test: T=1.24, p=0.216. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1360.0, p=0.216. Result is not statistically significant.
  Mann-Whitney U Test: U=174330.0, p=0.368. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.8123417085439001, p=0.367. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06


 23%|█████████████████████████████████████████████████████▊                                                                                                                                                                                       | 25/110 [07:08<19:48, 13.98s/it]

Testing for ('vit-ti16', 'vit-b16')
  T-test: T=-0.87, p=0.386. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=957.0, p=0.385. Result is not statistically significant.
  Mann-Whitney U Test: U=169065.0, p=0.549. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.3601649889032333, p=0.548. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.02


 24%|████████████████████████████████████████████████████████                                                                                                                                                                                     | 26/110 [07:22<19:25, 13.88s/it]

Testing for ('vit-ti16', 'vit-b32')
  T-test: T=0.00, p=1.000. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1387.5, p=1.000. Result is not statistically significant.
  Mann-Whitney U Test: U=171112.5, p=1.000. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.257704169053019e-12, p=1.000. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


 25%|██████████████████████████████████████████████████████████▏                                                                                                                                                                                  | 27/110 [07:32<17:47, 12.86s/it]

Testing for ('vit-ti16', 'vim-s')
  T-test: T=-1.43, p=0.152. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=720.0, p=0.152. Result is not statistically significant.
  Mann-Whitney U Test: U=167895.0, p=0.340. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.9090494277042347, p=0.340. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


 25%|████████████████████████████████████████████████████████████▎                                                                                                                                                                                | 28/110 [07:44<16:56, 12.40s/it]

Testing for ('vit-ti16', 'vssm-ti')
  T-test: T=-1.84, p=0.067. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=884.0, p=0.067. Result is not statistically significant.
  Mann-Whitney U Test: U=166725.0, p=0.189. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.7289602902802532, p=0.189. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.01


 26%|██████████████████████████████████████████████████████████████▍                                                                                                                                                                              | 29/110 [08:00<18:06, 13.42s/it]

Testing for ('vit-ti16', 'vssm-s')
  T-test: T=-0.87, p=0.386. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=957.0, p=0.385. Result is not statistically significant.
  Mann-Whitney U Test: U=169065.0, p=0.549. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.3601649889032333, p=0.548. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.03


 27%|████████████████████████████████████████████████████████████████▋                                                                                                                                                                            | 30/110 [08:19<20:25, 15.32s/it]

Testing for ('vit-ti16', 'vssm-b')
  T-test: T=-2.37, p=0.018. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=590.0, p=0.018. Result is statistically significant.
  Mann-Whitney U Test: U=165847.5, p=0.111. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.5334171660973195, p=0.111. Result is not statistically significant.
  95% CI for mean difference: -0.07 to 0.01


 28%|██████████████████████████████████████████████████████████████████▊                                                                                                                                                                          | 31/110 [08:30<18:12, 13.83s/it]

Testing for ('vit-s16', 'resnet50')
  T-test: T=0.58, p=0.559. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1258.0, p=0.558. Result is not statistically significant.
  Mann-Whitney U Test: U=172575.0, p=0.673. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.1780481415381438, p=0.673. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 29%|████████████████████████████████████████████████████████████████████▉                                                                                                                                                                        | 32/110 [08:47<19:32, 15.03s/it]

Testing for ('vit-s16', 'vgg16')
  T-test: T=0.79, p=0.431. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=767.0, p=0.431. Result is not statistically significant.
  Mann-Whitney U Test: U=172867.5, p=0.614. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.2550792804152145, p=0.614. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 30%|███████████████████████████████████████████████████████████████████████                                                                                                                                                                      | 33/110 [08:55<16:21, 12.74s/it]

Testing for ('vit-s16', 'vit-ti16')
  T-test: T=0.38, p=0.701. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=899.0, p=0.701. Result is not statistically significant.
  Mann-Whitney U Test: U=171990.0, p=0.799. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.06476494161311973, p=0.799. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 31%|█████████████████████████████████████████████████████████████████████████▎                                                                                                                                                                   | 34/110 [09:04<14:35, 11.53s/it]

Testing for ('vit-s16', 'vit-s32')
  T-test: T=1.75, p=0.080. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=812.5, p=0.080. Result is not statistically significant.
  Mann-Whitney U Test: U=175207.5, p=0.248. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.3347858507713914, p=0.248. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.07


 32%|███████████████████████████████████████████████████████████████████████████▍                                                                                                                                                                 | 35/110 [09:22<17:01, 13.62s/it]

Testing for ('vit-s16', 'vit-b16')
  T-test: T=-0.54, p=0.587. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=687.5, p=0.586. Result is not statistically significant.
  Mann-Whitney U Test: U=169942.5, p=0.730. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.11954187544804414, p=0.730. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


 33%|█████████████████████████████████████████████████████████████████████████████▌                                                                                                                                                               | 36/110 [09:38<17:39, 14.32s/it]

Testing for ('vit-s16', 'vit-b32')
  T-test: T=0.36, p=0.722. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1224.0, p=0.722. Result is not statistically significant.
  Mann-Whitney U Test: U=171990.0, p=0.799. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.06476494161311973, p=0.799. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.04


 34%|███████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                             | 37/110 [09:50<16:36, 13.65s/it]

Testing for ('vit-s16', 'vim-s')
  T-test: T=-1.07, p=0.285. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=684.0, p=0.285. Result is not statistically significant.
  Mann-Whitney U Test: U=168772.5, p=0.484. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.48899346405361827, p=0.484. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.03


 35%|█████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                                           | 38/110 [10:03<16:08, 13.45s/it]

Testing for ('vit-s16', 'vssm-ti')
  T-test: T=-1.67, p=0.096. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=530.0, p=0.096. Result is not statistically significant.
  Mann-Whitney U Test: U=167602.5, p=0.289. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.125963184932142, p=0.289. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


 35%|████████████████████████████████████████████████████████████████████████████████████                                                                                                                                                         | 39/110 [10:20<17:17, 14.61s/it]

Testing for ('vit-s16', 'vssm-s')
  T-test: T=-0.59, p=0.556. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=493.5, p=0.555. Result is not statistically significant.
  Mann-Whitney U Test: U=169942.5, p=0.730. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.11954187544804414, p=0.730. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


 36%|██████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                      | 40/110 [10:42<19:27, 16.68s/it]

Testing for ('vit-s16', 'vssm-b')
  T-test: T=-2.30, p=0.022. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=308.0, p=0.022. Result is statistically significant.
  Mann-Whitney U Test: U=166725.0, p=0.181. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.7909792252541157, p=0.181. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.01


 37%|████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                    | 41/110 [10:51<16:40, 14.50s/it]

Testing for ('vit-s32', 'resnet50')
  T-test: T=-0.96, p=0.335. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1716.0, p=0.335. Result is not statistically significant.
  Mann-Whitney U Test: U=168480.0, p=0.463. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.538737262533538, p=0.463. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.03


 38%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                                  | 42/110 [11:09<17:25, 15.38s/it]

Testing for ('vit-s32', 'vgg16')
  T-test: T=-0.92, p=0.359. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1309.0, p=0.359. Result is not statistically significant.
  Mann-Whitney U Test: U=168772.5, p=0.515. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.42370424066806134, p=0.515. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.03


 39%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                                | 43/110 [11:15<14:12, 12.73s/it]

Testing for ('vit-s32', 'vit-ti16')
  T-test: T=-1.24, p=0.216. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1360.0, p=0.216. Result is not statistically significant.
  Mann-Whitney U Test: U=167895.0, p=0.368. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.8123417085439001, p=0.367. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


 40%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                              | 44/110 [11:23<12:26, 11.32s/it]

Testing for ('vit-s32', 'vit-s16')
  T-test: T=-1.75, p=0.080. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=812.5, p=0.080. Result is not statistically significant.
  Mann-Whitney U Test: U=167017.5, p=0.248. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.3347858507713914, p=0.248. Result is not statistically significant.
  95% CI for mean difference: -0.07 to 0.02


 41%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                            | 45/110 [11:41<14:22, 13.27s/it]

Testing for ('vit-s32', 'vit-b16')
  T-test: T=-2.10, p=0.036. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1050.0, p=0.036. Result is statistically significant.
  Mann-Whitney U Test: U=165847.5, p=0.134. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.2500000000012697, p=0.134. Result is not statistically significant.
  95% CI for mean difference: -0.07 to 0.01


 42%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                          | 46/110 [11:56<14:40, 13.76s/it]

Testing for ('vit-s32', 'vit-b32')
  T-test: T=-1.37, p=0.173. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=891.0, p=0.172. Result is not statistically significant.
  Mann-Whitney U Test: U=167895.0, p=0.368. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.8123417085439001, p=0.367. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


 43%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                       | 47/110 [12:07<13:40, 13.03s/it]

Testing for ('vit-s32', 'vim-s')
  T-test: T=-2.54, p=0.011. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1039.5, p=0.012. Result is statistically significant.
  Mann-Whitney U Test: U=164677.5, p=0.064. Result is not statistically significant.
  Kruskal-Wallis Test: H=3.4293992144705183, p=0.064. Result is not statistically significant.
  95% CI for mean difference: -0.08 to 0.01


 44%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                                     | 48/110 [12:20<13:11, 12.77s/it]

Testing for ('vit-s32', 'vssm-ti')
  T-test: T=-3.09, p=0.002. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=839.5, p=0.002. Result is statistically significant.
  Mann-Whitney U Test: U=163507.5, p=0.027. Result is statistically significant.
  Kruskal-Wallis Test: H=4.890123762377427, p=0.027. Result is statistically significant.
  95% CI for mean difference: -0.08 to -0.01


 45%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                                   | 49/110 [12:36<14:08, 13.91s/it]

Testing for ('vit-s32', 'vssm-s')
  T-test: T=-2.22, p=0.027. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=804.0, p=0.027. Result is statistically significant.
  Mann-Whitney U Test: U=165847.5, p=0.134. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.2500000000012697, p=0.134. Result is not statistically significant.
  95% CI for mean difference: -0.07 to 0.01


 45%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                                 | 50/110 [12:57<15:59, 15.99s/it]

Testing for ('vit-s32', 'vssm-b')
  T-test: T=-3.82, p=0.000. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=450.0, p=0.000. Result is statistically significant.
  Mann-Whitney U Test: U=162630.0, p=0.013. Result is statistically significant.
  Kruskal-Wallis Test: H=6.181607258506039, p=0.013. Result is statistically significant.
  95% CI for mean difference: -0.09 to -0.01


 46%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                               | 51/110 [13:16<16:40, 16.96s/it]

Testing for ('vit-b16', 'resnet50')
  T-test: T=1.10, p=0.272. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=986.0, p=0.272. Result is not statistically significant.
  Mann-Whitney U Test: U=173745.0, p=0.443. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.5890487654676368, p=0.443. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 47%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                             | 52/110 [13:44<19:32, 20.21s/it]

Testing for ('vit-b16', 'vgg16')
  T-test: T=1.27, p=0.204. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=819.0, p=0.204. Result is not statistically significant.
  Mann-Whitney U Test: U=174037.5, p=0.395. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.7233910891090336, p=0.395. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06


 48%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                          | 53/110 [13:59<17:40, 18.61s/it]

Testing for ('vit-b16', 'vit-ti16')
  T-test: T=0.87, p=0.386. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=957.0, p=0.385. Result is not statistically significant.
  Mann-Whitney U Test: U=173160.0, p=0.549. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.3601649889032333, p=0.548. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 49%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                        | 54/110 [14:15<16:44, 17.93s/it]

Testing for ('vit-b16', 'vit-s16')
  T-test: T=0.54, p=0.587. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=687.5, p=0.586. Result is not statistically significant.
  Mann-Whitney U Test: U=172282.5, p=0.730. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.11954187544804414, p=0.730. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 50%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                                      | 55/110 [14:31<15:44, 17.17s/it]

Testing for ('vit-b16', 'vit-s32')
  T-test: T=2.10, p=0.036. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1050.0, p=0.036. Result is statistically significant.
  Mann-Whitney U Test: U=176377.5, p=0.134. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.2500000000012697, p=0.134. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.07


 51%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                    | 56/110 [14:54<17:11, 19.09s/it]

Testing for ('vit-b16', 'vit-b32')
  T-test: T=0.90, p=0.371. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=837.0, p=0.370. Result is not statistically significant.
  Mann-Whitney U Test: U=173160.0, p=0.549. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.3601649889032333, p=0.548. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 52%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                  | 57/110 [15:15<17:11, 19.46s/it]

Testing for ('vit-b16', 'vim-s')
  T-test: T=-0.52, p=0.600. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=796.5, p=0.599. Result is not statistically significant.
  Mann-Whitney U Test: U=169942.5, p=0.724. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.12510702054863262, p=0.724. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


 53%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                | 58/110 [15:35<17:03, 19.68s/it]

Testing for ('vit-b16', 'vssm-ti')
  T-test: T=-1.18, p=0.239. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=446.5, p=0.238. Result is not statistically significant.
  Mann-Whitney U Test: U=168772.5, p=0.474. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.5125226064572506, p=0.474. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.02


 54%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                              | 59/110 [16:00<18:02, 21.22s/it]

Testing for ('vit-b16', 'vssm-s')
  T-test: T=0.00, p=1.000. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=588.0, p=1.000. Result is not statistically significant.
  Mann-Whitney U Test: U=171112.5, p=1.000. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.3562161086734853e-12, p=1.000. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


 55%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 60/110 [16:29<19:37, 23.55s/it]

Testing for ('vit-b16', 'vssm-b')
  T-test: T=-1.57, p=0.116. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=475.0, p=0.116. Result is not statistically significant.
  Mann-Whitney U Test: U=167895.0, p=0.321. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.987021052412665, p=0.320. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.02


 55%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 61/110 [16:45<17:32, 21.48s/it]

Testing for ('vit-b32', 'resnet50')
  T-test: T=0.23, p=0.821. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1501.0, p=0.821. Result is not statistically significant.
  Mann-Whitney U Test: U=171697.5, p=0.867. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.02805644890333662, p=0.867. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.04


 56%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                       | 62/110 [17:12<18:31, 23.15s/it]

Testing for ('vit-b32', 'vgg16')
  T-test: T=0.33, p=0.742. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1680.0, p=0.742. Result is not statistically significant.
  Mann-Whitney U Test: U=171990.0, p=0.802. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.06281156530474258, p=0.802. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.05


 57%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 63/110 [17:25<15:36, 19.92s/it]

Testing for ('vit-b32', 'vit-ti16')
  T-test: T=0.00, p=1.000. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1387.5, p=1.000. Result is not statistically significant.
  Mann-Whitney U Test: U=171112.5, p=1.000. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.257704169053019e-12, p=1.000. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


 58%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                   | 64/110 [17:39<13:59, 18.25s/it]

Testing for ('vit-b32', 'vit-s16')
  T-test: T=-0.36, p=0.722. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1224.0, p=0.722. Result is not statistically significant.
  Mann-Whitney U Test: U=170235.0, p=0.799. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.06476494161311973, p=0.799. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                 | 65/110 [17:53<12:41, 16.91s/it]

Testing for ('vit-b32', 'vit-s32')
  T-test: T=1.37, p=0.173. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=891.0, p=0.172. Result is not statistically significant.
  Mann-Whitney U Test: U=174330.0, p=0.368. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.8123417085439001, p=0.367. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06


 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 66/110 [18:15<13:37, 18.57s/it]

Testing for ('vit-b32', 'vit-b16')
  T-test: T=-0.90, p=0.371. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=837.0, p=0.370. Result is not statistically significant.
  Mann-Whitney U Test: U=169065.0, p=0.549. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.3601649889032333, p=0.548. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.03


 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                            | 67/110 [18:32<12:56, 18.06s/it]

Testing for ('vit-b32', 'vim-s')
  T-test: T=-1.31, p=0.192. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1080.0, p=0.192. Result is not statistically significant.
  Mann-Whitney U Test: U=167895.0, p=0.340. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.9090494277042347, p=0.340. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.02


 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 68/110 [18:50<12:33, 17.94s/it]

Testing for ('vit-b32', 'vssm-ti')
  T-test: T=-1.89, p=0.059. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=768.0, p=0.059. Result is not statistically significant.
  Mann-Whitney U Test: U=166725.0, p=0.189. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.7289602902802532, p=0.189. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.01


 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 69/110 [19:12<13:05, 19.16s/it]

Testing for ('vit-b32', 'vssm-s')
  T-test: T=-0.84, p=0.400. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1085.0, p=0.399. Result is not statistically significant.
  Mann-Whitney U Test: U=169065.0, p=0.549. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.3601649889032333, p=0.548. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.03


 64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 70/110 [19:38<14:16, 21.40s/it]

Testing for ('vit-b32', 'vssm-b')
  T-test: T=-2.19, p=0.029. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=862.5, p=0.029. Result is statistically significant.
  Mann-Whitney U Test: U=165847.5, p=0.111. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.5334171660973195, p=0.111. Result is not statistically significant.
  95% CI for mean difference: -0.07 to 0.01


 65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 71/110 [19:51<12:17, 18.90s/it]

Testing for ('vim-s', 'resnet50')
  T-test: T=1.70, p=0.091. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=690.0, p=0.091. Result is not statistically significant.
  Mann-Whitney U Test: U=174915.0, p=0.263. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.2557508342609243, p=0.262. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06


 65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 72/110 [20:15<12:53, 20.36s/it]

Testing for ('vim-s', 'vgg16')
  T-test: T=1.78, p=0.075. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=756.0, p=0.075. Result is not statistically significant.
  Mann-Whitney U Test: U=175207.5, p=0.229. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.4484650786426818, p=0.229. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06


 66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 73/110 [20:25<10:34, 17.16s/it]

Testing for ('vim-s', 'vit-ti16')
  T-test: T=1.43, p=0.152. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=720.0, p=0.152. Result is not statistically significant.
  Mann-Whitney U Test: U=174330.0, p=0.340. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.9090494277042347, p=0.340. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06


 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 74/110 [20:37<09:21, 15.61s/it]

Testing for ('vim-s', 'vit-s16')
  T-test: T=1.07, p=0.285. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=684.0, p=0.285. Result is not statistically significant.
  Mann-Whitney U Test: U=173452.5, p=0.484. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.48899346405361827, p=0.484. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 75/110 [20:48<08:17, 14.22s/it]

Testing for ('vim-s', 'vit-s32')
  T-test: T=2.54, p=0.011. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1039.5, p=0.012. Result is statistically significant.
  Mann-Whitney U Test: U=177547.5, p=0.064. Result is not statistically significant.
  Kruskal-Wallis Test: H=3.4293992144705183, p=0.064. Result is not statistically significant.
  95% CI for mean difference: -0.00 to 0.08


 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                         | 76/110 [21:08<09:05, 16.05s/it]

Testing for ('vim-s', 'vit-b16')
  T-test: T=0.52, p=0.600. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=796.5, p=0.599. Result is not statistically significant.
  Mann-Whitney U Test: U=172282.5, p=0.724. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.12510702054863262, p=0.724. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 77/110 [21:24<08:47, 16.00s/it]

Testing for ('vim-s', 'vit-b32')
  T-test: T=1.31, p=0.192. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1080.0, p=0.192. Result is not statistically significant.
  Mann-Whitney U Test: U=174330.0, p=0.340. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.9090494277042347, p=0.340. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06


 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 78/110 [21:39<08:22, 15.70s/it]

Testing for ('vim-s', 'vssm-ti')
  T-test: T=-0.58, p=0.564. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=539.0, p=0.564. Result is not statistically significant.
  Mann-Whitney U Test: U=169942.5, p=0.717. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.1313335580279096, p=0.717. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


 72%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 79/110 [21:59<08:42, 16.87s/it]

Testing for ('vim-s', 'vssm-s')
  T-test: T=0.58, p=0.564. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=539.0, p=0.564. Result is not statistically significant.
  Mann-Whitney U Test: U=172282.5, p=0.724. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.12510702054863262, p=0.724. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.04


 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                | 80/110 [22:22<09:25, 18.84s/it]

Testing for ('vim-s', 'vssm-b')
  T-test: T=-1.02, p=0.308. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=480.0, p=0.307. Result is not statistically significant.
  Mann-Whitney U Test: U=169065.0, p=0.522. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.40995526928007964, p=0.522. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.02


 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 81/110 [22:36<08:21, 17.28s/it]

Testing for ('vssm-ti', 'resnet50')
  T-test: T=2.15, p=0.032. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=736.0, p=0.032. Result is statistically significant.
  Mann-Whitney U Test: U=176085.0, p=0.138. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.195640447393827, p=0.138. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.07


 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 82/110 [22:59<08:51, 18.98s/it]

Testing for ('vssm-ti', 'vgg16')
  T-test: T=2.26, p=0.024. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=747.5, p=0.024. Result is statistically significant.
  Mann-Whitney U Test: U=176377.5, p=0.118. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.447756178266297, p=0.118. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.07


 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 83/110 [23:09<07:23, 16.43s/it]

Testing for ('vssm-ti', 'vit-ti16')
  T-test: T=1.84, p=0.067. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=884.0, p=0.067. Result is not statistically significant.
  Mann-Whitney U Test: U=175500.0, p=0.189. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.7289602902802532, p=0.189. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.06


 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 84/110 [23:22<06:37, 15.28s/it]

Testing for ('vssm-ti', 'vit-s16')
  T-test: T=1.67, p=0.096. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=530.0, p=0.096. Result is not statistically significant.
  Mann-Whitney U Test: U=174622.5, p=0.289. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.125963184932142, p=0.289. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                     | 85/110 [23:33<05:53, 14.14s/it]

Testing for ('vssm-ti', 'vit-s32')
  T-test: T=3.09, p=0.002. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=839.5, p=0.002. Result is statistically significant.
  Mann-Whitney U Test: U=178717.5, p=0.027. Result is statistically significant.
  Kruskal-Wallis Test: H=4.890123762377427, p=0.027. Result is statistically significant.
  95% CI for mean difference: 0.01 to 0.09


 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 86/110 [23:54<06:28, 16.18s/it]

Testing for ('vssm-ti', 'vit-b16')
  T-test: T=1.18, p=0.239. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=446.5, p=0.238. Result is not statistically significant.
  Mann-Whitney U Test: U=173452.5, p=0.474. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.5125226064572506, p=0.474. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 87/110 [24:11<06:18, 16.44s/it]

Testing for ('vssm-ti', 'vit-b32')
  T-test: T=1.89, p=0.059. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=768.0, p=0.059. Result is not statistically significant.
  Mann-Whitney U Test: U=175500.0, p=0.189. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.7289602902802532, p=0.189. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.06


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 88/110 [24:25<05:43, 15.63s/it]

Testing for ('vssm-ti', 'vim-s')
  T-test: T=0.58, p=0.564. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=539.0, p=0.564. Result is not statistically significant.
  Mann-Whitney U Test: U=172282.5, p=0.717. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.1313335580279096, p=0.717. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.04


 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                             | 89/110 [24:45<05:57, 17.02s/it]

Testing for ('vssm-ti', 'vssm-s')
  T-test: T=1.27, p=0.206. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=328.0, p=0.206. Result is not statistically significant.
  Mann-Whitney U Test: U=173452.5, p=0.474. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.5125226064572506, p=0.474. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 90/110 [25:10<06:24, 19.24s/it]

Testing for ('vssm-ti', 'vssm-b')
  T-test: T=-0.45, p=0.655. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=483.0, p=0.655. Result is not statistically significant.
  Mann-Whitney U Test: U=170235.0, p=0.781. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.07729834177100475, p=0.781. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


 83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 91/110 [25:28<05:59, 18.90s/it]

Testing for ('vssm-s', 'resnet50')
  T-test: T=1.10, p=0.272. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=986.0, p=0.272. Result is not statistically significant.
  Mann-Whitney U Test: U=173745.0, p=0.443. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.5890487654676368, p=0.443. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 92/110 [25:55<06:24, 21.33s/it]

Testing for ('vssm-s', 'vgg16')
  T-test: T=1.27, p=0.204. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=819.0, p=0.204. Result is not statistically significant.
  Mann-Whitney U Test: U=174037.5, p=0.395. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.7233910891090336, p=0.395. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 93/110 [26:09<05:27, 19.29s/it]

Testing for ('vssm-s', 'vit-ti16')
  T-test: T=0.87, p=0.386. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=957.0, p=0.385. Result is not statistically significant.
  Mann-Whitney U Test: U=173160.0, p=0.549. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.3601649889032333, p=0.548. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.05


 85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 94/110 [26:26<04:55, 18.48s/it]

Testing for ('vssm-s', 'vit-s16')
  T-test: T=0.59, p=0.556. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=493.5, p=0.555. Result is not statistically significant.
  Mann-Whitney U Test: U=172282.5, p=0.730. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.11954187544804414, p=0.730. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.04


 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 95/110 [26:41<04:24, 17.61s/it]

Testing for ('vssm-s', 'vit-s32')
  T-test: T=2.22, p=0.027. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=804.0, p=0.027. Result is statistically significant.
  Mann-Whitney U Test: U=176377.5, p=0.134. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.2500000000012697, p=0.134. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.07


 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 96/110 [27:06<04:36, 19.73s/it]

Testing for ('vssm-s', 'vit-b16')
  T-test: T=0.00, p=1.000. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=588.0, p=1.000. Result is not statistically significant.
  Mann-Whitney U Test: U=171112.5, p=1.000. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.3562161086734853e-12, p=1.000. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.04


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 97/110 [27:28<04:24, 20.36s/it]

Testing for ('vssm-s', 'vit-b32')
  T-test: T=0.84, p=0.400. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=1085.0, p=0.399. Result is not statistically significant.
  Mann-Whitney U Test: U=173160.0, p=0.549. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.3601649889032333, p=0.548. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 98/110 [27:47<03:57, 19.83s/it]

Testing for ('vssm-s', 'vim-s')
  T-test: T=-0.58, p=0.564. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=539.0, p=0.564. Result is not statistically significant.
  Mann-Whitney U Test: U=169942.5, p=0.724. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.12510702054863262, p=0.724. Result is not statistically significant.
  95% CI for mean difference: -0.04 to 0.03


 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 99/110 [28:05<03:32, 19.33s/it]

Testing for ('vssm-s', 'vssm-ti')
  T-test: T=-1.27, p=0.206. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=328.0, p=0.206. Result is not statistically significant.
  Mann-Whitney U Test: U=168772.5, p=0.474. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.5125226064572506, p=0.474. Result is not statistically significant.
  95% CI for mean difference: -0.05 to 0.02


 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 100/110 [28:33<03:39, 21.94s/it]

Testing for ('vssm-s', 'vssm-b')
  T-test: T=-1.57, p=0.116. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=475.0, p=0.116. Result is not statistically significant.
  Mann-Whitney U Test: U=167895.0, p=0.321. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.987021052412665, p=0.320. Result is not statistically significant.
  95% CI for mean difference: -0.06 to 0.02


 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                   | 101/110 [28:54<03:14, 21.64s/it]

Testing for ('vssm-b', 'resnet50')
  T-test: T=2.44, p=0.015. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=828.0, p=0.015. Result is statistically significant.
  Mann-Whitney U Test: U=176962.5, p=0.079. Result is not statistically significant.
  Kruskal-Wallis Test: H=3.0914476119960814, p=0.079. Result is not statistically significant.
  95% CI for mean difference: -0.00 to 0.07


 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 102/110 [29:24<03:13, 24.17s/it]

Testing for ('vssm-b', 'vgg16')
  T-test: T=2.75, p=0.006. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=570.0, p=0.006. Result is statistically significant.
  Mann-Whitney U Test: U=177255.0, p=0.066. Result is not statistically significant.
  Kruskal-Wallis Test: H=3.388762168949951, p=0.066. Result is not statistically significant.
  95% CI for mean difference: -0.00 to 0.07


 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 103/110 [29:41<02:35, 22.19s/it]

Testing for ('vssm-b', 'vit-ti16')
  T-test: T=2.37, p=0.018. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=590.0, p=0.018. Result is statistically significant.
  Mann-Whitney U Test: U=176377.5, p=0.111. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.5334171660973195, p=0.111. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.07


 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 104/110 [30:01<02:08, 21.42s/it]

Testing for ('vssm-b', 'vit-s16')
  T-test: T=2.30, p=0.022. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=308.0, p=0.022. Result is statistically significant.
  Mann-Whitney U Test: U=175500.0, p=0.181. Result is not statistically significant.
  Kruskal-Wallis Test: H=1.7909792252541157, p=0.181. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.06


 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 105/110 [30:19<01:42, 20.56s/it]

Testing for ('vssm-b', 'vit-s32')
  T-test: T=3.82, p=0.000. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=450.0, p=0.000. Result is statistically significant.
  Mann-Whitney U Test: U=179595.0, p=0.013. Result is statistically significant.
  Kruskal-Wallis Test: H=6.181607258506039, p=0.013. Result is statistically significant.
  95% CI for mean difference: 0.01 to 0.09


 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 106/110 [30:48<01:31, 22.81s/it]

Testing for ('vssm-b', 'vit-b16')
  T-test: T=1.57, p=0.116. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=475.0, p=0.116. Result is not statistically significant.
  Mann-Whitney U Test: U=174330.0, p=0.321. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.987021052412665, p=0.320. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06


 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 107/110 [31:13<01:11, 23.69s/it]

Testing for ('vssm-b', 'vit-b32')
  T-test: T=2.19, p=0.029. Result is statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=862.5, p=0.029. Result is statistically significant.
  Mann-Whitney U Test: U=176377.5, p=0.111. Result is not statistically significant.
  Kruskal-Wallis Test: H=2.5334171660973195, p=0.111. Result is not statistically significant.
  95% CI for mean difference: -0.01 to 0.07


 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 108/110 [31:36<00:46, 23.27s/it]

Testing for ('vssm-b', 'vim-s')
  T-test: T=1.02, p=0.308. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=480.0, p=0.307. Result is not statistically significant.
  Mann-Whitney U Test: U=173160.0, p=0.522. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.40995526928007964, p=0.522. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.05


 99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 109/110 [31:58<00:23, 23.12s/it]

Testing for ('vssm-b', 'vssm-ti')
  T-test: T=0.45, p=0.655. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=483.0, p=0.655. Result is not statistically significant.
  Mann-Whitney U Test: U=171990.0, p=0.781. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.07729834177100475, p=0.781. Result is not statistically significant.
  95% CI for mean difference: -0.03 to 0.04


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [32:26<00:00, 17.69s/it]

Testing for ('vssm-b', 'vssm-s')
  T-test: T=1.57, p=0.116. Result is not statistically significant.
  Wilcoxon Signed-Rank Test: Statistic=475.0, p=0.116. Result is not statistically significant.
  Mann-Whitney U Test: U=174330.0, p=0.321. Result is not statistically significant.
  Kruskal-Wallis Test: H=0.987021052412665, p=0.320. Result is not statistically significant.
  95% CI for mean difference: -0.02 to 0.06





In [97]:
import csv
test_names = list(next(iter(results.values())).keys())
final_names = []

for name in test_names:
    if name != 'Bootstrap CI':
        final_names.extend([name+'_stats', name+'_p-value'])
    else:
        final_names.extend([name+'_lower', name+'_upper'])
        
test_names = next(iter(results.values())).keys()
# Open a new CSV file to write the results
with open('results_BUSI.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    
    # Write the header
    writer.writerow(['Model 1', 'Model 2']+final_names)
    
    # Write the data rows
    for (model1, model2), tests in results.items():
        test_values = []
        for test_name in tests:
            test_values.extend(tests[test_name])
        writer.writerow([model1, model2]+test_values)

In [130]:
model_pair = ('resnet50', 'vit-s16')
model1_preds = []
model2_preds = []
args.data_path = f'dataset/Combined/'
for fold_index in range(args.k_folds):
    train_loader, val_loader, test_loader = get_data_loaders(args, seed=fold_index + args.seed)

    args.arch = model_pair[0]
    model1 = init_model(args, device, 3)
    state_dict = torch.load(f'checkpoints/results_Combined/{args.arch}/fold_{fold_index}/best_checkpoint.pth')['model']
    model1.load_state_dict(state_dict)

    args.arch = model_pair[1]
    model2 = init_model(args, device, 3)
    state_dict = torch.load(f'checkpoints/results_Combined/{args.arch}/fold_{fold_index}/best_checkpoint.pth')['model']
    model2.load_state_dict(state_dict)

    model1.eval()
    model2.eval()
    fold_preds1 = []
    fold_preds2 = []
    for data, label in test_loader:
        data = data.to(device)
        label = label.to(device)
        with torch.no_grad():
            pred1 = model1(data)
            pred2 = model2(data)
        pred1 = (torch.argmax(pred1, axis=1) == label).to(torch.int32)
        pred2 = (torch.argmax(pred2, axis=1) == label).to(torch.int32)
        fold_preds1.append(pred1.cpu().numpy())
        fold_preds2.append(pred2.cpu().numpy())
    model1_preds.append(np.concatenate(fold_preds1))
    model2_preds.append(np.concatenate(fold_preds2))
# model1_preds = np.concatenate(model1_preds)
# model2_preds = np.concatenate(model2_preds)

# Summarize

You can use the following code to summarize the  provided csv files for the statistical signficance analysis.

In [1]:
import pandas as pd

In [2]:
p_val_threshold = 0.05
df = pd.read_csv('results_Combined.csv')
model_scores = {i:0 for i in list(df['Model 1'].unique())}
for i in range(df.shape[0]):
    test_stat = df.iloc[i]
    model_name = test_stat['Model 1']
    other_model_name = test_stat['Model 2']
    p_value = test_stat['T-test_p-value']
    t_test = test_stat['T-test_stats']
    if p_value<p_val_threshold:
        if t_test>0:
            model_scores[model_name] += 1  
            print(f"Model (p-value:{p_value:0.3f})-(t_test:{t_test:0.3f})    {model_name}  \t beats '{other_model_name}'")
        else:
            model_scores[model_name] -= 1     
model_scores

Model (p-value:0.044)-(t_test:2.018)    vit-b16  	 beats 'vgg16'
Model (p-value:0.013)-(t_test:2.500)    vit-b16  	 beats 'vit-ti16'
Model (p-value:0.004)-(t_test:2.902)    vssm-ti  	 beats 'vgg16'
Model (p-value:0.003)-(t_test:3.002)    vssm-ti  	 beats 'vit-ti16'
Model (p-value:0.014)-(t_test:2.469)    vssm-ti  	 beats 'vit-s32'
Model (p-value:0.022)-(t_test:2.301)    vssm-ti  	 beats 'vit-b32'
Model (p-value:0.037)-(t_test:2.090)    vssm-b  	 beats 'vgg16'
Model (p-value:0.015)-(t_test:2.438)    vssm-b  	 beats 'vit-ti16'


{'resnet50': 0,
 'vgg16': -3,
 'vit-ti16': -3,
 'vit-s16': 0,
 'vit-s32': -1,
 'vit-b16': 2,
 'vit-b32': -1,
 'vim-s': 0,
 'vssm-ti': 4,
 'vssm-s': 0,
 'vssm-b': 2}

In [4]:
p_val_threshold = 0.05
df = pd.read_csv('results_BUSI.csv')
model_scores = {i:0 for i in list(df['Model 1'].unique())}
for i in range(df.shape[0]):
    test_stat = df.iloc[i]
    model_name = test_stat['Model 1']
    other_model_name = test_stat['Model 2']
    p_value = test_stat['T-test_p-value']
    t_test = test_stat['T-test_stats']
    if p_value<p_val_threshold:
        if t_test>0:
            model_scores[model_name] += 1  
            print(f"Model (p-value:{p_value:0.3f})-(t_test:{t_test:0.3f})    {model_name}  \t beats '{other_model_name}'")
        else:
            model_scores[model_name] -= 1  
model_scores

Model (p-value:0.036)-(t_test:2.099)    vit-b16  	 beats 'vit-s32'
Model (p-value:0.011)-(t_test:2.535)    vim-s  	 beats 'vit-s32'
Model (p-value:0.032)-(t_test:2.148)    vssm-ti  	 beats 'resnet50'
Model (p-value:0.024)-(t_test:2.258)    vssm-ti  	 beats 'vgg16'
Model (p-value:0.002)-(t_test:3.086)    vssm-ti  	 beats 'vit-s32'
Model (p-value:0.027)-(t_test:2.223)    vssm-s  	 beats 'vit-s32'
Model (p-value:0.015)-(t_test:2.436)    vssm-b  	 beats 'resnet50'
Model (p-value:0.006)-(t_test:2.749)    vssm-b  	 beats 'vgg16'
Model (p-value:0.018)-(t_test:2.373)    vssm-b  	 beats 'vit-ti16'
Model (p-value:0.022)-(t_test:2.296)    vssm-b  	 beats 'vit-s16'
Model (p-value:0.000)-(t_test:3.819)    vssm-b  	 beats 'vit-s32'
Model (p-value:0.029)-(t_test:2.190)    vssm-b  	 beats 'vit-b32'


{'resnet50': -2,
 'vgg16': -2,
 'vit-ti16': -1,
 'vit-s16': -1,
 'vit-s32': -5,
 'vit-b16': 1,
 'vit-b32': -1,
 'vim-s': 1,
 'vssm-ti': 3,
 'vssm-s': 1,
 'vssm-b': 6}

In [5]:
p_val_threshold = 0.05
df = pd.read_csv('results_B.csv')
model_scores = {i:0 for i in list(df['Model 1'].unique())}
for i in range(df.shape[0]):
    test_stat = df.iloc[i]
    model_name = test_stat['Model 1']
    other_model_name = test_stat['Model 2']
    p_value = test_stat['T-test_p-value']
    t_test = test_stat['T-test_stats']
    if p_value<p_val_threshold:
        if t_test>0:
            model_scores[model_name] += 1  
            print(f"Model (p-value:{p_value:0.3f})-(t_test:{t_test:0.3f})    {model_name}  \t beats '{other_model_name}'")
        else:
            model_scores[model_name] -= 1  
model_scores

Model (p-value:0.034)-(t_test:2.142)    vit-s16  	 beats 'vit-b16'
Model (p-value:0.032)-(t_test:2.171)    vim-s  	 beats 'vit-ti16'
Model (p-value:0.019)-(t_test:2.368)    vim-s  	 beats 'vit-b16'
Model (p-value:0.011)-(t_test:2.582)    vssm-ti  	 beats 'resnet50'
Model (p-value:0.028)-(t_test:2.218)    vssm-ti  	 beats 'vgg16'
Model (p-value:0.004)-(t_test:2.915)    vssm-ti  	 beats 'vit-ti16'
Model (p-value:0.012)-(t_test:2.557)    vssm-ti  	 beats 'vit-s32'
Model (p-value:0.003)-(t_test:3.087)    vssm-ti  	 beats 'vit-b16'
Model (p-value:0.007)-(t_test:2.756)    vssm-ti  	 beats 'vit-b32'
Model (p-value:0.034)-(t_test:2.142)    vssm-ti  	 beats 'vssm-b'


{'resnet50': -1,
 'vgg16': -1,
 'vit-ti16': -2,
 'vit-s16': 1,
 'vit-s32': -1,
 'vit-b16': -3,
 'vit-b32': -1,
 'vim-s': 2,
 'vssm-ti': 7,
 'vssm-s': 0,
 'vssm-b': -1}