In [1]:
learning_rate = 0.003305526571026246
batch_size = 230
dropout_rate = 0.30806216013664534
num_classes = 4

In [2]:
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from utils_cells import get_images_list, transform_image, transform_target, resize_with_padding
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import numpy as np
import torchvision.transforms.functional as F
import torch
from torchvision import transforms
from torchvision.transforms import functional as F
import cv2
from sklearn.model_selection import train_test_split
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet18
from torchmetrics import Precision, Recall
import numpy as np
import datetime
import random
import time
import torchvision.models as models
import wandb

optuna.samplers.TPESampler(seed=2233)

import random

class ImageDataset(Dataset):
    def __init__(self, data_path, transform=None, target_transform=None, reduce=False):
        self.transform = transform
        self.target_transform = target_transform
        self.dataset = shuffle(self.load_dataset(data_path))

    def load_dataset(self, path):
        files = os.listdir(path)
        dataset_final = pd.DataFrame()
        dataset_final['filename'] = []
        dataset_final['class'] = []
        for filename in files:
            dataset = pd.DataFrame()
            if filename.endswith('.txt'):
                files = get_images_list(f'{path}/{filename}')
                dataset['filename'] = files
                dataset['class'] = filename.split('_')[1][:-3]
                dataset_final = pd.concat([dataset_final, dataset], ignore_index=True)
        return dataset_final                
                          
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        image = cv2.imread(f'{self.dataset["filename"].loc[idx]}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (32, 32), interpolation=cv2.INTER_CUBIC)
        #image = resize_with_padding(image, (32, 32))
        image = image.astype(np.float32)
        image = self.transform(image = image)['image'] if self.transform is not None else image

        target = self.dataset["class"].loc[idx]

        if target == 'normal.':
            target_ = [1, 0, 0, 0]
        elif target == 'inflamatory.':
            target_ = [0, 1, 0, 0]
        elif target == 'tumor.':
            target_ = [0, 0, 1, 0]
        elif target == 'other.':
            target_ = [0, 0, 0, 1]
        else:
            print(target)
        
        image = F.to_tensor(image)
        
       
     

        """To see transorms use:
            image, target = trainset[15]
            image = image.numpy()
            image=np.swapaxes(image,0,1)
            image=np.swapaxes(image,1,2)
            plt.imshow(image)"""

        return image.float(), torch.Tensor(np.array(target_, dtype=np.float32))





  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(2233)



from albumentations import (
    Compose,
    Resize,
    OneOf,
    RandomBrightness,
    RandomContrast,
    MotionBlur,
    MedianBlur,
    GaussianBlur,
    VerticalFlip,
    HorizontalFlip,
    ShiftScaleRotate,
    Normalize,
)

transform = Compose(
    [
        Normalize(mean=0, std=1),
        OneOf([RandomBrightness(limit=0.1, p=1), RandomContrast(limit=0.1, p=0.8)]),
        OneOf([MotionBlur(blur_limit=3), MedianBlur(blur_limit=3), GaussianBlur(blur_limit=3),], p=0.7,),
        VerticalFlip(p=0.5),
        HorizontalFlip(p=0.5),
    ]
)

transform_test = Compose(
    [Normalize(mean=0, std=1)]
)



In [4]:
trainset = ImageDataset(data_path='train_data', transform=transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=3)

testset = ImageDataset(data_path='validation_data', transform=transform_test)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

class EfficientNetB0(nn.Module):
    def __init__(self, num_classes=4, dropout_rate=dropout_rate):
        super(EfficientNetB0, self).__init__()
        self.base_model = models.efficientnet_b0(pretrained=False)
        num_ftrs = self.base_model.classifier[1].in_features
        self.base_model.classifier = nn.Sequential(
            nn.Dropout(p=dropout_rate),  # Add dropout layer
            nn.Linear(num_ftrs, num_classes)
        )
    
    def forward(self, x):
        return self.base_model(x)
    
model = EfficientNetB0(num_classes=4, dropout_rate=dropout_rate)
total_params = sum(p.numel() for p in model.parameters())
total_params



4012672

In [5]:

run_name = f'efficient_net1__nopad_norm{datetime.datetime.now()}'
run_path = f'training_checkpoints/{run_name}'
wandb.init(project="cells", 
           entity="adamsoja",
          name=run_name)
set_seed(2233)
class MyModel(nn.Module):
    def __init__(self, model, learning_rate):
        super(MyModel, self).__init__()
        self.model = model
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode="min", factor=0.1, patience=7, min_lr=5e-6, verbose=True)
        self.step = 0
        self.metric_precision = Precision(task="multiclass", num_classes=num_classes, average=None).to('cuda')
        self.metric_recall = Recall(task="multiclass", num_classes=num_classes, average=None).to('cuda')
        self.train_loss = []
        self.valid_loss = []
        self.precision_per_epochs = []
        self.recall_per_epochs = []

    def forward(self, x):
        return self.model(x)

    def train_one_epoch(self, trainloader):
        self.step += 1
        self.train()
        for batch_idx, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to('cuda'), labels.to('cuda')
            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()
            _, preds = torch.max(outputs, 1)
            _, labels = torch.max(labels, 1)
            self.metric_precision(preds, labels)
            self.metric_recall(preds, labels)
            self.train_loss.append(loss.item())


        

        
        avg_loss = np.mean(self.train_loss)
        self.train_loss.clear()
        precision = self.metric_precision.compute()
        recall = self.metric_recall.compute()
        self.precision_per_epochs.append(precision)
        self.recall_per_epochs.append(recall)
        print(f'train_loss: {avg_loss}')
        print(f'train_precision: {precision}')
        print(f'train_recall: {recall}')

        wandb.log({'loss': avg_loss}, step=self.step)
        
        # Logowanie precision dla każdej klasy
        wandb.log({'Normal precision': precision[0].item()}, step=self.step)
        wandb.log({'Inflamatory precision': precision[1].item()}, step=self.step)
        wandb.log({'Tumor precision': precision[2].item()}, step=self.step)
        wandb.log({'Other precision': precision[3].item()}, step=self.step)
        
        # Logowanie recall dla każdej klasy
        wandb.log({'Normal recall': recall[0].item()}, step=self.step)
        wandb.log({'Inflamatory recall': recall[1].item()}, step=self.step)
        wandb.log({'Tumor recall': recall[2].item()}, step=self.step)
        wandb.log({'Other recall': recall[3].item()}, step=self.step)
        
        # Obliczanie głównych metryk
        main_metrics_precision = (precision[0].item() + precision[1].item() + precision[2].item() + precision[3].item()) / 4
        main_metrics_recall = (recall[0].item() + recall[1].item() + recall[2].item() + recall[3].item()) / 4
        
        # Logowanie głównych metryk
        wandb.log({'main_metrics_precision': main_metrics_precision}, step=self.step)
        wandb.log({'main_metrics_recall': main_metrics_recall}, step=self.step)

        precision_ = main_metrics_precision
        recall_ = main_metrics_recall
        
        if (precision_ + recall_) > 0:
            f1_score_val = 2 * (precision_ * recall_) / (precision_ + recall_)
        else:
            f1_score_val = 0
        
        wandb.log({'f1_score_val': f1_score_val}, step=self.step)

        
        
        self.metric_precision.reset()
        self.metric_recall.reset()


    

    def evaluate(self, testloader):
        self.eval()
        with torch.no_grad():
            for batch_idx, (inputs, labels) in enumerate(testloader):
                inputs, labels = inputs.to('cuda'), labels.to('cuda')
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                _, preds = torch.max(outputs, 1)
                _, labels = torch.max(labels, 1)
                self.metric_precision(preds, labels)
                self.metric_recall(preds, labels)
                self.valid_loss.append(loss.item())
    
        avg_loss = np.mean(self.valid_loss)
        self.scheduler.step(avg_loss)
        self.valid_loss.clear()
        precision = self.metric_precision.compute()
        recall = self.metric_recall.compute()
        print(f'val_loss: {avg_loss}')
        print(f'val_precision: {precision}')
        print(f'val_recall: {recall}')
        self.metric_precision.reset()
        self.metric_recall.reset()
    
        main_metrics_precision = (precision[0].item() + precision[1].item() + precision[2].item() + precision[3].item()) / 4
        
        main_metrics_recall = (recall[0].item() + recall[1].item() + recall[2].item() + recall[3].item()) / 4
        
        wandb.log({'val_loss': avg_loss}, step=self.step)
        
        wandb.log({'val_Normal precision': precision[0].item()}, step=self.step)
        wandb.log({'val_Inflamatory precision': precision[1].item()}, step=self.step)
        wandb.log({'val_Tumor precision': precision[2].item()}, step=self.step)
        wandb.log({'val_Other precision': precision[3].item()}, step=self.step)
        
        wandb.log({'val_Normal recall': recall[0].item()}, step=self.step)
        wandb.log({'val_Inflamatory recall': recall[1].item()}, step=self.step)
        wandb.log({'val_Tumor recall': recall[2].item()}, step=self.step)
        wandb.log({'val_Other recall': recall[3].item()}, step=self.step)
        
        wandb.log({'val_main_metrics_precision': main_metrics_precision}, step=self.step)
        wandb.log({'val_main_metrics_recall': main_metrics_recall}, step=self.step)

        precision_ = main_metrics_precision
        recall_ = main_metrics_recall
        
        if (precision_ + recall_) > 0:
            f1_score_val = 2 * (precision_ * recall_) / (precision_ + recall_)
        else:
            f1_score_val = 0
        
        wandb.log({'f1_score_val': f1_score_val}, step=self.step)
        
        

        for param_group in self.optimizer.param_groups:
            print(f"Learning rate: {param_group['lr']}")
        return avg_loss

my_model = MyModel(model=model, learning_rate=learning_rate)
my_model = my_model.to('cuda')
early_stop_patience = 15
num_epochs = 100
best_val_loss = float('inf')
for epoch in range(num_epochs):
    my_model.train_one_epoch(trainloader)
    val_loss = my_model.evaluate(testloader)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(my_model.state_dict(), f"{run_path}.pt")

    else:
        patience_counter += 1
    if patience_counter >= early_stop_patience:
        print(f"Early stopping at epoch {epoch} with best validation loss {best_val_loss}")
        break
my_model.load_state_dict(torch.load(f'{run_path}.pt'))


[34m[1mwandb[0m: Currently logged in as: [33madamsoja[0m. Use [1m`wandb login --relogin`[0m to force relogin




train_loss: 0.9516421472325044
train_precision: tensor([0.5269, 0.6703, 0.6219, 0.0539], device='cuda:0')
train_recall: tensor([0.5211, 0.7515, 0.6140, 0.0013], device='cuda:0')
val_loss: 1.006547550459455
val_precision: tensor([0.6281, 0.5653, 0.6566, 0.0000], device='cuda:0')
val_recall: tensor([0.3411, 0.9011, 0.6323, 0.0000], device='cuda:0')
Learning rate: 0.003305526571026246
train_loss: 0.8986515983540744
train_precision: tensor([0.5645, 0.6820, 0.6619, 0.5122], device='cuda:0')
train_recall: tensor([0.5551, 0.7696, 0.6529, 0.0091], device='cuda:0')
val_loss: 0.8553745480546927
val_precision: tensor([0.5912, 0.6712, 0.7064, 0.7684], device='cuda:0')
val_recall: tensor([0.5667, 0.8342, 0.6305, 0.0458], device='cuda:0')
Learning rate: 0.003305526571026246
train_loss: 0.8361220604595654
train_precision: tensor([0.5904, 0.7065, 0.6810, 0.6213], device='cuda:0')
train_recall: tensor([0.5769, 0.7816, 0.6857, 0.0599], device='cuda:0')
val_loss: 0.9351788194399522
val_precision: tensor(

<All keys matched successfully>

In [6]:
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import DataLoader
import numpy as np
import torch 
import torch.nn as nn

def test_report(model, dataloader):
    """Prints confusion matrix for testing dataset
    dataloader should be of batch_size=1."""

    y_pred = []
    y_test = []
    model.eval()
    with torch.no_grad():
        for data, label in dataloader:
            output = model(data)
            label = label.numpy()
            output = output.numpy()
            y_pred.append(np.argmax(output))
            y_test.append(np.argmax(label))
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

testset =ImageDataset(data_path='test_data', transform=transform_test, reduce=True)
dataloader = DataLoader(testset, batch_size=1, shuffle=True)

test_report(my_model.to('cpu'), dataloader)

[[716  99 180   5]
 [ 88 862  46   4]
 [118  42 832   8]
 [  6  10  11  73]]
              precision    recall  f1-score   support

           0       0.77      0.72      0.74      1000
           1       0.85      0.86      0.86      1000
           2       0.78      0.83      0.80      1000
           3       0.81      0.73      0.77       100

    accuracy                           0.80      3100
   macro avg       0.80      0.78      0.79      3100
weighted avg       0.80      0.80      0.80      3100



In [1]:
from scipy import stats
import numpy as np

# Example data: precision values for Class 0 under different approaches
precision_no_padding_no_norm = [0.78, 0.87, 0.80, 0.79]
precision_padding_norm = [0.77, 0.86, 0.81, 0.76]
precision_padding_no_norm = [0.77, 0.85, 0.79, 0.76]
precision_no_padding_norm = [0.77, 0.85, 0.78, 0.81]

# Paired t-test between two approaches
t_stat, p_value = stats.ttest_rel(precision_no_padding_no_norm, precision_padding_norm)
print(f"Paired t-test: t-statistic = {t_stat}, p-value = {p_value}")

# Wilcoxon signed-rank test between two approaches
w_stat, p_value = stats.wilcoxon(precision_no_padding_no_norm, precision_padding_norm)
print(f"Wilcoxon signed-rank test: statistic = {w_stat}, p-value = {p_value}")

# ANOVA across all approaches
data = np.array([precision_no_padding_no_norm, precision_padding_norm, precision_padding_no_norm, precision_no_padding_norm])
f_stat, p_value = stats.f_oneway(*data)
print(f"ANOVA: F-statistic = {f_stat}, p-value = {p_value}")

# Friedman test across all approaches
f_stat, p_value = stats.friedmanchisquare(precision_no_padding_no_norm, precision_padding_norm, precision_padding_no_norm, precision_no_padding_norm)
print(f"Friedman test: chi-squared = {f_stat}, p-value = {p_value}")


Paired t-test: t-statistic = 1.2247448713915892, p-value = 0.308068009250357
Wilcoxon signed-rank test: statistic = 2.0, p-value = 0.375
ANOVA: F-statistic = 0.12531328320802032, p-value = 0.9433148818773759
Friedman test: chi-squared = 4.852941176470589, p-value = 0.18288770966116996


In [2]:
import numpy as np
from scipy import stats

# Define the metrics for each class and approach
precision_no_padding_no_norm = [0.78, 0.87, 0.80, 0.79]
precision_padding_norm = [0.77, 0.86, 0.81, 0.76]
precision_padding_no_norm = [0.77, 0.85, 0.79, 0.76]
precision_no_padding_norm = [0.77, 0.85, 0.78, 0.81]

recall_no_padding_no_norm = [0.74, 0.88, 0.83, 0.81]
recall_padding_norm = [0.74, 0.87, 0.82, 0.75]
recall_padding_no_norm = [0.72, 0.87, 0.83, 0.68]
recall_no_padding_norm = [0.72, 0.86, 0.83, 0.73]

f1_no_padding_no_norm = [0.74, 0.86, 0.80, 0.77]
f1_padding_norm = [0.75, 0.87, 0.81, 0.75]
f1_padding_no_norm = [0.74, 0.86, 0.81, 0.72]
f1_no_padding_norm = [0.74, 0.86, 0.80, 0.77]

# Function to perform statistical tests
def perform_tests(data1, data2, data3, data4):
    results = {}

    # Paired t-tests
    t_stat, p_value = stats.ttest_rel(data1, data2)
    results['t-test (1 vs 2)'] = (t_stat, p_value)

    t_stat, p_value = stats.ttest_rel(data1, data3)
    results['t-test (1 vs 3)'] = (t_stat, p_value)

    t_stat, p_value = stats.ttest_rel(data1, data4)
    results['t-test (1 vs 4)'] = (t_stat, p_value)

    t_stat, p_value = stats.ttest_rel(data2, data3)
    results['t-test (2 vs 3)'] = (t_stat, p_value)

    t_stat, p_value = stats.ttest_rel(data2, data4)
    results['t-test (2 vs 4)'] = (t_stat, p_value)

    t_stat, p_value = stats.ttest_rel(data3, data4)
    results['t-test (3 vs 4)'] = (t_stat, p_value)

    # Wilcoxon signed-rank tests
    w_stat, p_value = stats.wilcoxon(data1, data2)
    results['Wilcoxon (1 vs 2)'] = (w_stat, p_value)

    w_stat, p_value = stats.wilcoxon(data1, data3)
    results['Wilcoxon (1 vs 3)'] = (w_stat, p_value)

    w_stat, p_value = stats.wilcoxon(data1, data4)
    results['Wilcoxon (1 vs 4)'] = (w_stat, p_value)

    w_stat, p_value = stats.wilcoxon(data2, data3)
    results['Wilcoxon (2 vs 3)'] = (w_stat, p_value)

    w_stat, p_value = stats.wilcoxon(data2, data4)
    results['Wilcoxon (2 vs 4)'] = (w_stat, p_value)

    w_stat, p_value = stats.wilcoxon(data3, data4)
    results['Wilcoxon (3 vs 4)'] = (w_stat, p_value)

    # ANOVA
    f_stat, p_value = stats.f_oneway(data1, data2, data3, data4)
    results['ANOVA'] = (f_stat, p_value)

    # Friedman Test
    f_stat, p_value = stats.friedmanchisquare(data1, data2, data3, data4)
    results['Friedman'] = (f_stat, p_value)

    return results

# Perform tests for each metric
print("Precision Tests:")
print(perform_tests(precision_no_padding_no_norm, precision_padding_norm, precision_padding_no_norm, precision_no_padding_norm))

print("\nRecall Tests:")
print(perform_tests(recall_no_padding_no_norm, recall_padding_norm, recall_padding_no_norm, recall_no_padding_norm))

print("\nF1-Score Tests:")
print(perform_tests(f1_no_padding_no_norm, f1_padding_norm, f1_padding_no_norm, f1_no_padding_norm))


Precision Tests:
{'t-test (1 vs 2)': (1.2247448713915892, 0.308068009250357), 't-test (1 vs 3)': (3.655630775069654, 0.03535284700251738), 't-test (1 vs 4)': (0.7924058156930613, 0.4860036297301704), 't-test (2 vs 3)': (1.5666989036012804, 0.21516994256954994), 't-test (2 vs 4)': (-0.14691063206231755, 0.8925198609712901), 't-test (3 vs 4)': (-0.7385489458759964, 0.5137127113616159), 'Wilcoxon (1 vs 2)': (2.0, 0.375), 'Wilcoxon (1 vs 3)': (0.0, 0.125), 'Wilcoxon (1 vs 4)': (3.0, 0.625), 'Wilcoxon (2 vs 3)': (0.0, 0.17971249487899976), 'Wilcoxon (2 vs 4)': (3.0, 1.0), 'Wilcoxon (3 vs 4)': (1.0, 0.6547208460185769), 'ANOVA': (0.12531328320802032, 0.9433148818773759), 'Friedman': (4.852941176470589, 0.18288770966116996)}

Recall Tests:
{'t-test (1 vs 2)': (1.4770978917519928, 0.23615367738664056), 't-test (1 vs 3)': (1.3211565181516332, 0.27817825276726316), 't-test (1 vs 4)': (1.7320508075688774, 0.18169011381620923), 't-test (2 vs 3)': (1.123902973898033, 0.3428716928579315), 't-test (2

  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)


ValueError: zero_method 'wilcox' and 'pratt' do not work if x - y is zero for all elements.

In [5]:
import numpy as np
from scipy.stats import friedmanchisquare
from scikit_posthocs import posthoc_nemenyi_friedman

# Define the confusion matrices for each approach
conf_matrix_no_padding_no_norm = np.array([[740,  89, 159,  12],
                                          [ 77, 875,  45,   3],
                                          [128,  37, 828,   7],
                                          [  8,   5,   6,  81]])

conf_matrix_padding_norm = np.array([[742,  94, 156,   8],
                                     [ 87, 873,  34,   6],
                                     [130,  42, 818,  10],
                                     [  9,   9,   7,  75]])

conf_matrix_padding_no_norm = np.array([[718, 102, 170,  10],
                                        [ 84, 872,  40,   4],
                                        [123,  44, 826,   7],
                                        [ 13,  10,   9,  68]])

conf_matrix_no_padding_norm = np.array([[716,  99, 180,   5],
                                        [ 88, 862,  46,   4],
                                        [118,  42, 832,   8],
                                        [  6,  10,  11,  73]])

# Flatten the confusion matrices
flattened_matrices = {
    'No Padding, No Norm': conf_matrix_no_padding_no_norm.flatten(),
    'Padding & Norm': conf_matrix_padding_norm.flatten(),
    'Padding, No Norm': conf_matrix_padding_no_norm.flatten(),
    'No Padding, Norm': conf_matrix_no_padding_norm.flatten()
}

# Stack the flattened matrices for Friedman test
data = np.vstack(list(flattened_matrices.values()))

# Perform Friedman test
f_stat, p_value = friedmanchisquare(*data)
print(f"Friedman Test: chi-squared = {f_stat}, p-value = {p_value}")

# If Friedman test is significant, perform pairwise comparisons
if p_value < 0.05:
    print("\nPairwise Comparisons (Nemenyi test):")
    posthoc_results = posthoc_nemenyi_friedman(data.T)
    print(posthoc_results)


Friedman Test: chi-squared = 1.0384615384615268, p-value = 0.7919465161444407


In [6]:
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Define confusion matrices
conf_matrix_no_padding_no_norm = np.array([[740,  89, 159,  12],
                                          [ 77, 875,  45,   3],
                                          [128,  37, 828,   7],
                                          [  8,   5,   6,  81]])

conf_matrix_padding_norm = np.array([[742,  94, 156,   8],
                                     [ 87, 873,  34,   6],
                                     [130,  42, 818,  10],
                                     [  9,   9,   7,  75]])

conf_matrix_padding_no_norm = np.array([[718, 102, 170,  10],
                                        [ 84, 872,  40,   4],
                                        [123,  44, 826,   7],
                                        [ 13,  10,   9,  68]])

conf_matrix_no_padding_norm = np.array([[716,  99, 180,   5],
                                        [ 88, 862,  46,   4],
                                        [118,  42, 832,   8],
                                        [  6,  10,  11,  73]])

# Flatten the confusion matrices
flattened_matrices = {
    'No Padding, No Norm': conf_matrix_no_padding_no_norm.flatten(),
    'Padding & Norm': conf_matrix_padding_norm.flatten(),
    'Padding, No Norm': conf_matrix_padding_no_norm.flatten(),
    'No Padding, Norm': conf_matrix_no_padding_norm.flatten()
}

# Compute Cohen's Kappa for each pair of matrices
kappa_scores = {}
matrix_names = list(flattened_matrices.keys())

for i, name1 in enumerate(matrix_names):
    for j, name2 in enumerate(matrix_names):
        if i < j:  # Avoid redundant pairs
            kappa = cohen_kappa_score(flattened_matrices[name1], flattened_matrices[name2])
            kappa_scores[f"{name1} vs {name2}"] = kappa

# Print the results
for pair, kappa in kappa_scores.items():
    print(f"Cohen's Kappa for {pair}: {kappa:.4f}")


Cohen's Kappa for No Padding, No Norm vs Padding & Norm: -0.0119
Cohen's Kappa for No Padding, No Norm vs Padding, No Norm: 0.0588
Cohen's Kappa for No Padding, No Norm vs No Padding, Norm: -0.0119
Cohen's Kappa for Padding & Norm vs Padding, No Norm: -0.0199
Cohen's Kappa for Padding & Norm vs No Padding, Norm: 0.0476
Cohen's Kappa for Padding, No Norm vs No Padding, Norm: 0.1146


In [7]:
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.utils import resample

def bootstrap_kappa(x, y, n_iterations=1000):
    kappa_values = []
    for _ in range(n_iterations):
        x_resampled, y_resampled = resample(x, y)
        kappa = cohen_kappa_score(x_resampled, y_resampled)
        kappa_values.append(kappa)
    return np.array(kappa_values)

def calculate_p_value(observed_kappa, bootstrap_kappas):
    return np.mean(np.abs(bootstrap_kappas) >= np.abs(observed_kappa))

# Flattened matrices as per previous example
flattened_matrices = {
    'No Padding, No Norm': conf_matrix_no_padding_no_norm.flatten(),
    'Padding & Norm': conf_matrix_padding_norm.flatten(),
    'Padding, No Norm': conf_matrix_padding_no_norm.flatten(),
    'No Padding, Norm': conf_matrix_no_padding_norm.flatten()
}

# Compute Cohen’s Kappa and bootstrap for each pair of matrices
results = {}
for name1, matrix1 in flattened_matrices.items():
    for name2, matrix2 in flattened_matrices.items():
        if name1 < name2:  # avoid redundant pairs
            kappa = cohen_kappa_score(matrix1, matrix2)
            bootstrap_kappas = bootstrap_kappa(matrix1, matrix2)
            p_value = calculate_p_value(kappa, bootstrap_kappas)
            results[f"{name1} vs {name2}"] = (kappa, p_value)

# Print the results
for pair, (kappa, p_value) in results.items():
    print(f"Cohen's Kappa for {pair}: {kappa:.4f}, p-value: {p_value:.4f}")


Cohen's Kappa for No Padding, No Norm vs Padding & Norm: -0.0119, p-value: 0.4510
Cohen's Kappa for No Padding, No Norm vs Padding, No Norm: 0.0588, p-value: 0.6220
Cohen's Kappa for No Padding, No Norm vs No Padding, Norm: -0.0119, p-value: 0.4390
Cohen's Kappa for Padding & Norm vs Padding, No Norm: -0.0199, p-value: 0.4410
Cohen's Kappa for No Padding, Norm vs Padding & Norm: 0.0476, p-value: 0.4860
Cohen's Kappa for No Padding, Norm vs Padding, No Norm: 0.1146, p-value: 0.4240
