In [20]:
import os
from PIL import Image
import json

def load_images_from_directory(root_path: str):
    """
    Load images from a directory with subfolders named after ImageNet labels.
    Return a list of (image, label, filename) triples.
    """
    dataset = []
    
    # Iterate over each subfolder
    for label in os.listdir(root_path):
        label_path = os.path.join(root_path, label)
        
        # Check if it's indeed a folder
        if os.path.isdir(label_path):
            
            # Iterate over each image in the subfolder
            for image_file in os.listdir(label_path):
                image_path = os.path.join(label_path, image_file)
                
                # Check if it's an image file
                if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):
                    img = Image.open(image_path)
                    dataset.append((img, label, image_file))  # Add image filename here
    
    return dataset


current_dir = "/home/workstation/code/XAImethods/CAIN"

# Load ImageNet class index
with open(f"{current_dir}/imagenet/imagenet_class_index.json", "r") as f:
    imagenet_class_index = json.load(f)


label_to_index_description = {v[0]: (k, v[1]) for k, v in imagenet_class_index.items()}

#all
from tqdm import tqdm
import os
import numpy as np
from PIL import Image
import torch
from torchvision import transforms
from transformers import ViTForImageClassification
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from scipy.special import kl_div as scipy_kl_div
from scipy.special import softmax
from sklearn.metrics import precision_score, recall_score, f1_score

from difflib import get_close_matches

def closest_match(description: str, possible_descriptions: list) -> str:
    """
    Find the closest match for the given description in the list of possible descriptions.
    """
    matches = get_close_matches(description, possible_descriptions, n=1, cutoff=0.5)
    return matches[0] if matches else description

def predict_scores_for_classes(model, img_path):
    if not os.path.exists(img_path):
        return None
    img = Image.open(img_path)
    img = img.resize((384, 384))
    img_tensor = transforms.ToTensor()(img).unsqueeze(0).to(device)
    logits = model(img_tensor).logits[0].cpu().detach().numpy()
    return logits

def get_top_n_classes(scores, n=5):
    return set(np.argsort(scores)[-n:])

def calculate_jaccard_similarity(set1, set2):
    return len(set1.intersection(set2)) / len(set1.union(set2))

def calculate_dice_similarity(set1, set2):
    return 2 * len(set1.intersection(set2)) / (len(set1) + len(set2))

def calculate_cosine_similarity(vec1, vec2):
    similarity = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    normalized_similarity = (similarity + 1) / 2
    return normalized_similarity

def calculate_euclidean_distance(vec1, vec2):
    distance = pairwise_distances(vec1.reshape(1, -1), vec2.reshape(1, -1), metric='euclidean')[0][0]
    normalized_distance = distance / np.sqrt(len(vec1))
    return normalized_distance

def calculate_kl_divergence(vec1, vec2):
    prob1 = softmax(vec1)
    prob2 = softmax(vec2)
    
    kl_div_value = scipy_kl_div(prob1, prob2).sum()
    normalized_kl_div = kl_div_value / np.log(len(vec1))
    return normalized_kl_div

def calculate_weighted_jaccard_similarity(vec1, vec2):
    min_sum = sum([min(a, b) for a, b in zip(vec1, vec2)])
    max_sum = sum([max(a, b) for a, b in zip(vec1, vec2)])
    return min_sum / max_sum if max_sum != 0 else 0

def calculate_prediction_changes(original_scores, masked_scores, top_n_indices):
    # Assuming top_n_indices contains only one index for top1
    idx = top_n_indices[0]
    
    original_score = original_scores[idx]
    masked_score = masked_scores[idx]
    
    change = max(0, original_score - masked_score)  # Set to 0 if negative
    
    # Calculate the percentage change
    if original_score > 0:
        percentage = change / original_score
    elif original_score < 0:
        percentage = change / abs(original_score)
    else:
        percentage = 0
    
    return change, percentage




def calculate_similarities(original_scores, masked_scores, top_n_values=[1, 5, 10, 50], metrics=["jaccard", "dice", "cosine", "euclidean", "kl", "weighted_jaccard", "prediction_change"]):
    results = {}
    
    for n in top_n_values:
        top_n_original = get_top_n_classes(original_scores, n)
        top_n_masked = get_top_n_classes(masked_scores, n)
        top_n_original_indices = np.argsort(original_scores)[-n:]

        for metric in metrics:
            if metric == "jaccard":
                jaccard_similarity = calculate_jaccard_similarity(top_n_original, top_n_masked)
                results[f"Jaccard_Top_{n}"] = jaccard_similarity
            
            if metric == "dice":
                dice_similarity = calculate_dice_similarity(top_n_original, top_n_masked)
                results[f"Dice_Top_{n}"] = dice_similarity
                
            if metric == "cosine":
                cosine_sim = calculate_cosine_similarity(original_scores, masked_scores)
                results[f"Cosine_Top_{n}"] = cosine_sim

            if metric == "euclidean":
                euclidean_dist = calculate_euclidean_distance(original_scores, masked_scores)
                results[f"Euclidean_Top_{n}"] = euclidean_dist

            if metric == "kl":
                kl_div = calculate_kl_divergence(original_scores, masked_scores)
                results[f"KL_Top_{n}"] = kl_div

            if metric == "weighted_jaccard":
                for n in top_n_values:
                    original_top_n_indices = np.argsort(original_scores)[-n:]
                    original_top_n_values = original_scores[original_top_n_indices]
                    
                    masked_top_n_indices = np.argsort(masked_scores)[-n:]
                    masked_top_n_values = masked_scores[masked_top_n_indices]
                    
                    weighted_jaccard_sim = calculate_weighted_jaccard_similarity(original_top_n_values, masked_top_n_values)
                    results[f"Weighted_Jaccard_Top_{n}"] = weighted_jaccard_sim

            if metric == "prediction_change":
                prediction_change, prediction_change_percentage = calculate_prediction_changes(original_scores, masked_scores, top_n_original_indices)
                results["Prediction_Change_Top1"] = prediction_change
                results["Prediction_Change_Percentage_Top1"] = prediction_change_percentage

                
    return results


In [21]:
import os
from PIL import Image
import json
from torchvision import transforms
import torch
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from tqdm import tqdm
from transformers import CvtForImageClassification
import shutil

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CvtForImageClassification.from_pretrained('microsoft/cvt-13').to(device)
model.eval()

masking_root_folder = "/home/workstation/code/XAImethods/CAIN/evaluation_results/imagenet/val_images10k_attack/defocus_blur/1/facebook/convnext-tiny-224/GradCAM"

# Metrics to calculate
selected_metrics = ["kl", "prediction_change"]

true_labels = []
original_predictions = []
masked_predictions = []

possible_descriptions = list(label_to_index_description.keys())

for subfolder in tqdm(os.listdir(masking_root_folder)[:100]):
    subfolder_path = os.path.join(masking_root_folder, subfolder)
    if os.path.isdir(subfolder_path):
        original_image_path = os.path.join(subfolder_path, 'original.jpg')
        masked_image_path = os.path.join(subfolder_path, 'masked_image.jpg')
        
        # Extract true label from subfolder name
        true_label = subfolder.split('_')[-1]
        true_labels.append(label_to_index_description[true_label][0])
        
        original_scores = predict_scores_for_classes(model, original_image_path)
        masked_scores = predict_scores_for_classes(model, masked_image_path)

        original_pred = np.argmax(original_scores)
        masked_pred = np.argmax(masked_scores)
        
        # Convert description to closest matching label
        original_desc = model.config.id2label[original_pred]
        closest_original_desc = closest_match(original_desc, possible_descriptions)
        if closest_original_desc in label_to_index_description:
            original_predictions.append(label_to_index_description[closest_original_desc][0])
        else:
            original_predictions.append(-1)  # Placeholder value indicating an unmatched description
        
        masked_desc = model.config.id2label[masked_pred]
        closest_masked_desc = closest_match(masked_desc, possible_descriptions)
        if closest_masked_desc in label_to_index_description:
            masked_predictions.append(label_to_index_description[closest_masked_desc][0])
        else:
            masked_predictions.append(-1)  # Placeholder value indicating an unmatched description
        
        similarities = calculate_similarities(original_scores, masked_scores, metrics=selected_metrics)
        np.save(os.path.join(subfolder_path, 'similarity_metrics.npy'), similarities)

original_precision = precision_score(true_labels, original_predictions, average='micro', zero_division=1, labels=[x for x in range(len(label_to_index_description))])
original_recall = recall_score(true_labels, original_predictions, average='micro', zero_division=1)
original_f1 = f1_score(true_labels, original_predictions, average='micro')

masked_precision = precision_score(true_labels, masked_predictions, average='micro', zero_division=1, labels=[x for x in range(len(label_to_index_description))])
masked_recall = recall_score(true_labels, masked_predictions, average='micro', zero_division=1)
masked_f1 = f1_score(true_labels, masked_predictions, average='micro')

print(f"Original images - Precision: {original_precision}, Recall: {original_recall}, F1: {original_f1}")
print(f"Masked images - Precision: {masked_precision}, Recall: {masked_recall}, F1: {masked_f1}")


100%|██████████| 100/100 [00:05<00:00, 16.82it/s]


ValueError: Mix of label input types (string and number)

In [None]:
print(f"Original images - Precision: {original_precision:.5f}, Recall: {original_recall:.5f}, F1: {original_f1:.5f}")
print(f"Masked images - Precision: {masked_precision:.5f}, Recall: {masked_recall:.5f}, F1: {masked_f1:.5f}")


NameError: name 'original_precision' is not defined

In [None]:
print(original_predictions)

[]


In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # 这是一个用于数据可视化的Python库

# 初始化一个用于存储所有度量数据的字典
all_metrics = {}

# 指定保存.npy文件的根目录
root_folder = masking_root_folder

# 遍历根目录下的所有子目录
for subfolder in os.listdir(root_folder):
    subfolder_path = os.path.join(root_folder, subfolder)
    
    # 检查是否为目录
    if os.path.isdir(subfolder_path):
        npy_file_path = os.path.join(subfolder_path, 'similarity_metrics.npy')
        
        # 检查.npy文件是否存在
        if os.path.exists(npy_file_path):
            # 加载.npy文件
            metrics = np.load(npy_file_path, allow_pickle=True).item()
            
            # 将这些度量值添加到all_metrics字典中
            for key, value in metrics.items():
                if key not in all_metrics:
                    all_metrics[key] = []
                all_metrics[key].append(value)

# 绘制直方图和进行统计分析
for metric, values in all_metrics.items():
    if 'Prediction_Change_Top_1' not in metric and 'Prediction_Change_Percentage_Top_1' not in metric:
        plt.figure(figsize=(10, 6))
        
        # 使用Seaborn库绘制带有KDE的直方图
        sns.histplot(values, bins=20, kde=True)
        
        plt.title(f'Distribution of {metric}')
        plt.xlabel(metric)
        plt.ylabel('Frequency')
        
        # 显示图像
        plt.show()
        
        # 统计分析
        mean_value = np.mean(values)
        std_dev = np.std(values)
        median_value = np.median(values)
        quartiles = np.percentile(values, [25, 75])
        
        print(f"=== Statistical Summary for {metric} ===")
        print(f"Mean: {mean_value}")
        print(f"Standard Deviation: {std_dev}")
        print(f"Median: {median_value}")
        print(f"1st Quartile: {quartiles[0]}")
        print(f"3rd Quartile: {quartiles[1]}")
        print("\n")




In [None]:
# # Importing required libraries
# import matplotlib.pyplot as plt
# import seaborn as sns
# import numpy as np
# import os
# import pandas as pd

# # Helper function to read all similarity metrics saved as .npy files into a list of dictionaries
# def read_saved_metrics(root_folder):
#     all_similarities = []
#     for subfolder in os.listdir(root_folder):
#         subfolder_path = os.path.join(root_folder, subfolder)
#         if os.path.isdir(subfolder_path):
#             similarity_file_path = os.path.join(subfolder_path, 'similarity_metrics.npy')
#             if os.path.exists(similarity_file_path):
#                 similarities = np.load(similarity_file_path, allow_pickle=True).item()
#                 all_similarities.append(similarities)
#     return all_similarities

# # Root folder where all the similarity metrics are saved
# root_folder = "/home/workstation/code/XAImethods/hf_cam_dev/results/masked"  # Replace with your directory

# # Read the saved metrics
# all_similarities = read_saved_metrics(root_folder)

# # Convert the list of dictionaries to a DataFrame for easier manipulation
# df = pd.DataFrame(all_similarities)

# # Visualization and Analysis
# metrics_to_analyze = ["Jaccard_Top_1", "Dice_Top_1", "Cosine_Top_1", "Euclidean_Top_1", "KL_Top_1"]

# # Histograms
# for metric in metrics_to_analyze:
#     plt.figure(figsize=(10, 6))
#     sns.histplot(df[metric], bins=20, kde=True)
#     plt.title(f'Distribution of {metric}')
#     plt.xlabel(metric)
#     plt.ylabel('Frequency')
#     plt.show()

# # Boxplots/Violin plots
# for metric in metrics_to_analyze:
#     plt.figure(figsize=(10, 6))
#     sns.boxplot(x=df[metric])
#     plt.title(f'Boxplot of {metric}')
#     plt.xlabel(metric)
#     plt.show()

# # Heatmap for correlation
# correlation_matrix = df[metrics_to_analyze].corr()
# plt.figure(figsize=(10, 6))
# sns.heatmap(correlation_matrix, annot=True)
# plt.title('Heatmap of Correlations Between Metrics')
# plt.show()

# # Statistical Analysis
# for metric in metrics_to_analyze:
#     mean = df[metric].mean()
#     std_dev = df[metric].std()
#     median = df[metric].median()
#     q1 = df[metric].quantile(0.25)
#     q3 = df[metric].quantile(0.75)
    
#     print(f"{metric}: Mean = {mean}, Std Dev = {std_dev}, Median = {median}, Q1 = {q1}, Q3 = {q3}")

# # For further analysis like clustering or anomaly detection, you can proceed with df DataFrame
# # df now contains all your similarity metrics and can be used for advanced statistical methods.


In [None]:
# # Importing the necessary libraries to read and display .npy file contents
# import numpy as np

# def display_npy_file_content(npy_file_path):
#     """
#     Display the content of a .npy file.
#     """
#     if not os.path.exists(npy_file_path):
#         return f"File {npy_file_path} doesn't exist."
#     npy_content = np.load(npy_file_path, allow_pickle=True).item()
#     return npy_content

# # Sample usage
# # Assuming the file path is "/path/to/your/file.npy"
# file_path = "/home/workstation/code/XAImethods/hf_cam_dev/results/masked/ILSVRC2012_val_00000171/original_scores.npy"

# # Display the content
# content = display_npy_file_content(file_path)
# content
