# Import Dependencies

In [1]:
import os 
import numpy as np 
import cv2 as cv
import matplotlib.pyplot as plt

from pygments.formatters import img
from tqdm import tqdm

from skimage.metrics import structural_similarity

from preprocessing.edge_extraction import *
from feature_extraction import * 
from preprocessing.fourier_transform import * 
from preprocessing.image_conversion import * 
from clustering import *
from preprocessing.contrast_enhancement import *

# Pre-processing

To reduce noise in images of whole artworks and fragments, we initially considered using the Fourier transform to process the images in the frequency domain.

While converting an image from RGBA to grayscale simplifies processing, it results in the loss of RGB color and alpha channel data, which can be problematic if that information is needed later. Therefore, we chose to split the image into its primary color channels (excluding the alpha channel) and process each channel separately in the frequency domain. After filtering, we planned to reconstruct the filtered image by recombining the processed channels.

However, after several trials, we found that processing the channels separately led to significant information loss in one or more channels. Consequently, we decided to use the NLMeansDenoising filter instead.

Since our goal is to cluster fragments that belong to the same image, we focus on maintaining "continuity" along the fragment borders. Therefore, our process emphasizes the information present along these edges.

Steps:
1. Extract a working region from the borders of the fragment.
2. Filter out the transparent pixels from the working region.
3. Denoise the working region.

**CONSIDERATION**: Contrast enhancement.

# Feature Extraction

To extract relevant features from the fragments, we employ two methods:
- Color Histograms
- Gradient Jacobians

## Color Histograms

Color histograms are graphical representations of the distribution of colors in an image. They quantify the number of pixels that have specific color values, effectively capturing the color composition of the image. By analyzing the color histograms of image fragments, we can compare and cluster similar fragments based on their color distributions.

**This technique is particularly useful for identifying and matching regions of images that share similar color patterns**.

In [None]:
restore_data("optimal_data", "data")

# Structural Similarity (example)

In [None]:
from skimage.metrics import structural_similarity
import cv2
import numpy as np

before = cv2.imread('data/5.38.35.png')
after = cv2.imread('references/5.37.jpg')

max_w = max(before.shape[0], after.shape[0])
max_h = max(before.shape[1], after.shape[1])

before = cv2.resize(before, (max_w, max_h))
after = cv2.resize(after, (max_w, max_h))

# Convert images to grayscale
before_gray = cv2.cvtColor(before, cv2.COLOR_BGR2GRAY)
after_gray = cv2.cvtColor(after, cv2.COLOR_BGR2GRAY)

# Compute SSIM between two images
(score, diff) = structural_similarity(before_gray, after_gray, full=True)
print("Image similarity", score)

# The diff image contains the actual image differences between the two images
# and is represented as a floating point data type in the range [0,1]
# so we must convert the array to 8-bit unsigned integers in the range
# [0,255] before we can use it with OpenCV
diff = (diff * 255).astype("uint8")

# Threshold the difference image, followed by finding contours to
# obtain the regions of the two input images that differ
thresh = cv2.threshold(diff, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
contours = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = contours[0] if len(contours) == 2 else contours[1]

mask = np.zeros(before.shape, dtype='uint8')
filled_after = after.copy()

for c in contours:
    area = cv2.contourArea(c)
    if area > 40:
        x,y,w,h = cv2.boundingRect(c)
        cv2.rectangle(before, (x, y), (x + w, y + h), (36,255,12), 2)
        cv2.rectangle(after, (x, y), (x + w, y + h), (36,255,12), 2)
        cv2.drawContours(mask, [c], 0, (0,255,0), -1)
        cv2.drawContours(filled_after, [c], 0, (0,255,0), -1)

cv2.imshow('before', before)
cv2.imshow('after', after)
cv2.imshow('diff',diff)
cv2.imshow('mask',mask)
cv2.imshow('filled after',filled_after)
cv2.waitKey(0)

In [2]:
threshold = 5
references_path = "references"
data_dir = "./data"
optimal_data_dir = "./optimal_data"

In [17]:
working_region_fragments_dataset = create_dataset(img_dir=data_dir, threshold=5)
original_fragments_dataset = create_dataset(img_dir=data_dir, extract_borders=False)
reference_images_ids = [reference.split(".")[1] for reference in tqdm(os.listdir(references_path))]
reference_images = [cv.imread(os.path.join(references_path, reference), cv.IMREAD_UNCHANGED) for reference in tqdm(os.listdir(references_path))]

Creating dataset: 100%|██████████| 328/328 [00:03<00:00, 93.87it/s] 
Creating dataset: 100%|██████████| 328/328 [00:12<00:00, 25.75it/s]
100%|██████████| 8/8 [00:00<00:00, 284359.59it/s]
100%|██████████| 8/8 [00:00<00:00, 69.18it/s]


In [5]:
working_region_color_histograms = compute_color_histograms(working_region_fragments_dataset)
ssim_scores = []

for fragment in tqdm(original_fragments_dataset):
    max_w = max(fragment.shape[0], reference_images[0].shape[0])
    max_h = max(fragment.shape[1], reference_images[0].shape[1])

    fragment = cv.resize(fragment, (max_w, max_h))
    reference = cv.resize(reference_images[0], (max_w, max_h))

    # Convert images to grayscale
    fragment_gray = cv.cvtColor(fragment, cv.COLOR_BGR2GRAY)
    reference_gray = cv.cvtColor(reference, cv.COLOR_BGR2GRAY)
    (score, diff) = structural_similarity(fragment_gray, reference_gray, full=True)
    ssim_scores.append(score)


X = []

for idx, histogram in enumerate(working_region_color_histograms):
    X.append(np.concatenate((histogram, np.array([ssim_scores[idx]]))))
X

Computing color histograms: 100%|██████████| 328/328 [00:00<00:00, 39024.53it/s]
100%|██████████| 328/328 [00:02<00:00, 122.70it/s]


[array([0.00835655, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.00278552,
        0.00835655, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.04735376, 0.00278552,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.00557103, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.32

In [18]:
def f12(precision_score: float, recall_score: float) -> float:
    """
    Calculates F1 score given precision and recall.

    Args:
        precision_score (float): Precision score.
        recall_score (float): Recall score.

    Returns:
        float: F1 score.
    """
    return 2 * (precision_score * recall_score) / (precision_score + recall_score) if (
            precision_score + recall_score) else 0


def recall2(reference_image_id: int, root_dir: str, cluster_dirs_exp: List[str], ext: str = ".png") -> float:
    """
    Calculates recall given a reference image ID, root directory, and an excluded cluster directory.

    Args:
        reference_image_id (int): The ID of the reference image.
        root_dir (str): Root directory containing subdirectories.
        cluster_dirs_exp (List[str]): Names of the excluded cluster directories.
        ext (str, optional): File extension to filter images (default is ".png").

    Returns:
        float: Recall score (true positives / (true positives + false negatives)).
    """
    tp = 0
    for cluster_dir in cluster_dirs_exp:
        for root, _, files in os.walk(os.path.join(root_dir, cluster_dir)):
            for filename in files:
                if not filename.endswith(ext):
                    continue
                if filename.split(".")[1] == str(reference_image_id):
                    tp += 1

    fn = 0
    for dirpath, dirnames, filenames in os.walk(root_dir):
        dirnames[:] = [d for d in dirnames if d not in cluster_dirs_exp]
        for filename in filenames:
            if not filename.endswith(ext):
                continue
            if filename.split(".")[1] == str(reference_image_id):
                fn += 1
    total = tp + fn

    return tp / total if total else 0


def compute_metrics2(reference_image_id: int, root_dir: str, ext: str = ".png", output_file: str = None) -> dict:
    """
    Calculates precision scores for each cluster directory given a reference image ID and a root directory containing images.
    Also calculates the recall for the cluster directory with the highest precision.

    Args:
        reference_image_id (int): The ID of the reference image.
        root_dir (str): Path to the directory containing the clusters.
        ext (str, optional): File extension to filter images (default is ".png").
        output_file (str, optional): Path to the file where metrics will be saved (default is None).

    Returns:
        dict: A dictionary containing:
            - "max_item": A tuple with the directory having the highest precision score and the score itself.
            - "precision_scores": A dictionary with precision scores for each cluster directory.
            - "recall": The recall score for the cluster directory with the highest precision.
    """
    f1_scores = {}
    first_dir = True

    for dirpath, dirnames, filenames in os.walk(root_dir):
        if first_dir:
            first_dir = False
            continue
        tp = 0
        for filename in filenames:
            if not filename.endswith(ext):
                continue
            if filename.split(".")[1] == str(reference_image_id):
                tp += 1
        precision_score = tp / len(filenames) if filenames else 0
        recall_score = recall2(reference_image_id=reference_image_id, root_dir=root_dir,
                              cluster_dirs_exp=[dirpath.split(os.path.sep)[-1]], ext=ext)
        f1_scores[dirpath.split(os.path.sep)[-1]] = recall_score

    max_value = max(f1_scores.values())
    cluster_dirs = [dirpath for dirpath, score in f1_scores.items() if score == max_value]
    max_items = [(dirpath, score) for dirpath, score in f1_scores.items() if score == max_value]

    metrics = {
        "max_items": max_items,
        "f1_scores": f1_scores,
    }

    if output_file is not None:
        json.dump(metrics, open(output_file, "w"))
    return metrics

In [25]:
import pickle
from sklearn.cluster import KMeans


os.makedirs(optimal_data_dir, exist_ok=True)
c = 0
while len(reference_images_ids) > 0:
    kmeans = KMeans(n_clusters=2, random_state=42)
    fit_kmeans = kmeans.fit(X)
    
    create_cluster_dirs(data_dir="./data", output_dir="clusters/kmeans/colors_ssim", labels=fit_kmeans.labels_)
    scores = {}
    scores[reference_images_ids[0]] = compute_metrics2(reference_images_ids[0], "clusters/kmeans/colors_ssim")
    print(scores)
    threshold = 0.45
    opt_clusters = {}
    for reference_id, d in scores.items():
        max_items = d["max_items"]
        for max_item in max_items:
            if max_item[1] >= threshold:
                if reference_id in opt_clusters:
                    opt_clusters[reference_id].append(max_item[0])
                else:
                    opt_clusters[reference_id] = [max_item[0]]

    print(opt_clusters)
    if len(opt_clusters) == 0 or c != 0:
        break
    
    # # move the optimal clusters to another path and reinitiate the clustering process without those fragments
    opt_dir = "optimal_clusters/kmeans/colors_ssim"
    os.makedirs(opt_dir, exist_ok=True)
    
    for reference_id, cluster_dirs in opt_clusters.items():
        reference_dir = os.path.join(opt_dir, reference_id)
        os.makedirs(reference_dir, exist_ok=True)
        for cluster_dir in cluster_dirs:
            img_dir = os.path.join("clusters/kmeans/colors_ssim", cluster_dir)
            for filename in os.listdir(img_dir):
                shutil.copy(os.path.join(img_dir, filename), os.path.join(reference_dir, filename))
                shutil.move(os.path.join(data_dir, filename), os.path.join(optimal_data_dir, filename))
            shutil.rmtree(img_dir)
        reference_images_ids.remove(reference_id)
    c = 1

Creating cluster dirs: 100%|██████████| 328/328 [00:00<00:00, 2280.67it/s]


{'34': {'max_items': [('cluster_1', 1.0)], 'f1_scores': {'cluster_1': 1.0, 'cluster_0': 0.0}}}
{'34': ['cluster_1']}


Creating cluster dirs:  41%|████▏     | 136/328 [00:00<00:00, 1578.10it/s]


IndexError: list index out of range

In [24]:
restore_data(optimal_data_dir, data_dir)