In [None]:
import cv2
import numpy as np
import pandas as pd
import os
from collections import Counter
from tqdm import tqdm

In [None]:
# Root directory that the dataset paths below are joined onto. With an empty
# string, os.path.join returns the second argument unchanged, so paths resolve
# relative to the current working directory.
BASE_PATH = ""

# Index CSV for the training split; the sampling cell reads its 'filepath' column.
train_csv_path = os.path.join(BASE_PATH, "Dataset/Index/Train.csv")
train_df = pd.read_csv(train_csv_path)

In [None]:
def estimate_background_color_top(image, height=10, width=2):
    """
    Estimate the background color from the two top corners of an image.

    A patch of ``height`` rows by ``width`` columns is taken from the top-left
    and from the top-right corner; the most frequent pixel value across both
    patches is returned. On a tie, the pixel encountered first wins (left
    patch first, row-major order).

    Args:
        image: Array of shape (H, W, C). Any channel count is accepted
            (e.g. 3 for BGR, 4 for BGRA).
        height: Number of rows sampled from the top edge.
        width: Number of columns sampled from each side.

    Returns:
        np.ndarray of shape (C,) and dtype uint8 holding the modal pixel.

    Raises:
        ValueError: If ``image`` does not have exactly three dimensions.
    """
    _, w, channels = image.shape
    # Clamp the start column: with width > w a negative index would wrap
    # around and silently shrink the right-hand patch.
    right_start = max(w - width, 0)
    patch_left = image[0:height, 0:width, :]
    patch_right = image[0:height, right_start:w, :]
    # Flatten using the real channel count so pixel boundaries are preserved
    # for images that are not 3-channel.
    combined = np.concatenate([patch_left.reshape(-1, channels),
                               patch_right.reshape(-1, channels)], axis=0)
    # Tuples are hashable, which lets Counter tally whole pixels.
    pixel_list = [tuple(pixel) for pixel in combined]
    mode_color = Counter(pixel_list).most_common(1)[0][0]
    return np.array(mode_color, dtype=np.uint8)

def remove_background(image, tolerance=30, height=10, width=2):
    """
    Build a boolean foreground mask for ``image``.

    The background color is obtained from ``estimate_background_color_top``
    using the given ``height`` and ``width``. A pixel is foreground when its
    Euclidean distance to that color is greater than ``tolerance``. The raw
    mask is then cleaned with a 3x3 morphological opening followed by a
    closing, and returned as a boolean array (True = foreground).
    """
    background = estimate_background_color_top(image, height, width)
    offset = image.astype(np.float32) - background.astype(np.float32)
    distance = np.linalg.norm(offset, axis=2)
    # Morphology runs on an 8-bit image: 255 marks foreground, 0 background.
    mask_u8 = np.where(distance > tolerance, 255, 0).astype(np.uint8)
    structuring = np.ones((3, 3), np.uint8)
    # Opening first, then closing, each applied once.
    for operation in (cv2.MORPH_OPEN, cv2.MORPH_CLOSE):
        mask_u8 = cv2.morphologyEx(mask_u8, operation, structuring, iterations=1)
    return mask_u8.astype(bool)

In [None]:
def compute_image_stats(image_path, tolerance=30):
    """
    Compute foreground brightness statistics for a single image file.

    Returns a ``(brightness, hist)`` pair where ``brightness`` is the mean
    gray level of the foreground pixels and ``hist`` is their 256-bin
    gray-level histogram as float32, normalized to sum to 1.
    Returns ``(None, None)`` when the file cannot be read or when the
    foreground mask from ``remove_background`` is empty.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        # cv2.imread reports an unreadable file by returning None.
        return None, None

    foreground = remove_background(bgr, tolerance)
    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    if not foreground.any():
        return None, None

    # Only foreground pixels contribute to the statistics.
    gray_values = grayscale[foreground]
    brightness = np.mean(gray_values)
    counts, _ = np.histogram(gray_values, bins=256, range=(0,256))
    hist = counts.astype(np.float32)
    total = hist.sum()
    if total > 0:
        hist = hist / total
    return brightness, hist

In [None]:
# Accumulators filled by the processing loop: one brightness value per usable
# image, and the running sum of the per-image normalized histograms.
brightness_values = []
histograms = np.zeros((256,), dtype=np.float32)

# Cap the workload at 50,000 images (or the whole index if it is smaller).
num_images = min(50000, len(train_df))

# Fixed seed so the same subset is drawn on every Restart & Run All; without
# it the saved statistics change from run to run.
SAMPLE_SEED = 42
image_paths = train_df['filepath'].sample(num_images, random_state=SAMPLE_SEED).tolist()
image_paths = [os.path.join(BASE_PATH, p) for p in image_paths]

In [None]:
# Start from fresh accumulators so re-running this cell (or resuming after an
# interrupted run) never double-counts images.
brightness_values = []
# Accumulate in float64: summing tens of thousands of float32 histograms
# loses precision as the running total grows.
histograms = np.zeros((256,), dtype=np.float64)

for image_path in tqdm(image_paths, desc="Processing images"):
    brightness, hist = compute_image_stats(image_path, tolerance=30)
    # (None, None) means the image was unreadable or had no foreground.
    if brightness is not None:
        brightness_values.append(brightness)
        histograms += hist

# Fail loudly instead of producing NaN statistics from a division by zero.
if not brightness_values:
    raise RuntimeError(
        "No image produced usable statistics; check BASE_PATH and the "
        "'filepath' column of the training index."
    )

mean_brightness = np.mean(brightness_values)
mean_histogram = histograms / len(brightness_values)

In [None]:
# Store plain Python numbers. list(ndarray) yields NumPy scalar objects, and
# under NumPy 2 their repr is "np.float32(...)", which would be written
# verbatim into the CSV cell and break naive parsing of the histogram.
stats_df = pd.DataFrame({
    "mean_brightness": [float(mean_brightness)],
    "histogram": [np.asarray(mean_histogram).tolist()]
})
stats_save_path = os.path.join(BASE_PATH, "morphii_train_stats.csv")
stats_df.to_csv(stats_save_path, index=False)
print(f"Saved statistics to {stats_save_path}")