## Notebook: Extracting Default Image and Metadata Features

This notebook builds on the dataset demo you provided. It extracts two features per instance:

- **Image Feature**: Computes the average color (RGB) of the image.
- **Metadata Feature**: Encodes the primary type (Type1) as a numeric value.

Both features are saved to dedicated directories for later use.

In [6]:
# --- Imports ---
import os
import time
import json
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image as keras_image

# --- Path Definitions ---
# Base directories as per your Docker mount configuration
NOTEBOOK_DIR = Path(os.getcwd())
REPO_ROOT = NOTEBOOK_DIR.parent
DATASET_PATH = REPO_ROOT / "data"

# Image and metadata directories
IMAGE_DIR = DATASET_PATH / "images"
METADATA_PATH = DATASET_PATH / "metadata" / "images.json"

# Create directories if they don't exist
IMAGE_DIR.mkdir(parents=True, exist_ok=True)
METADATA_PATH.parent.mkdir(parents=True, exist_ok=True)

# Directories for storing extracted features
FEATURE_DIR = DATASET_PATH / "features"

# Create feature directories if they don't exist
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

print("Imports and path configurations are set up:")
print(f" - Notebook Dir: {NOTEBOOK_DIR}")
print(f" - Repository Root: {REPO_ROOT}")
print(f" - Dataset Path: {DATASET_PATH}")
print(f" - Image Dir: {IMAGE_DIR}")
print(f" - Metadata Path: {METADATA_PATH}")
print(f" - Feature Dir: {FEATURE_DIR}")

Imports and path configurations are set up:
 - Notebook Dir: /app/notebooks
 - Repository Root: /app
 - Dataset Path: /app/data
 - Image Dir: /app/data/images
 - Metadata Path: /app/data/metadata/images.json
 - Feature Dir: /app/data/features


### Define Default Feature Extraction Functions

**Image Feature Extraction:** We open each image using PIL and compute the average pixel values across all pixels.

**Metadata Feature Extraction:** We extract the primary type (`Type1`) from the metadata and encode it using a simple mapping.

Feel free to modify these functions to compute more complex features later.

In [2]:
# Load a pre-trained ResNet50 model without the final classification layer.
# Global average pooling ensures we get a fixed-size embedding vector.
base_model = ResNet50(weights="imagenet", include_top=False, pooling="avg")


def extract_image_feature(image_path, target_size=(224, 224)):
    """
    Uses a pre-trained ResNet50 to compute a robust embedding for the image.

    Args:
        image_path (str): Path to the image file.
        target_size (tuple): Target size for resizing the image before feature extraction.

    Returns:
        A numpy array representing the extracted image feature.
    """
    try:
        with Image.open(image_path) as img:
            # Ensure image is in RGB
            if img.mode != "RGB":
                img = img.convert("RGB")
            # Resize image to target size (ResNet50 expects 224x224)
            img = img.resize(target_size)
            # Convert image to array and preprocess
            x = keras_image.img_to_array(img)
            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)
            # Extract features
            features = base_model.predict(x)
            return features.flatten()
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return np.full(base_model.output_shape[1], np.nan)  # return a vector of NaNs


def extract_metadata_feature(instance_name, metadata):
    """
    Upgraded metadata feature extractor for generic datasets.
    Uses simple heuristics to extract features from the metadata record.

    For each key in the record (sorted for consistency):
      - Numeric values are used directly.
      - Strings are encoded via a simple hash (modulo operation, normalized to [0, 1]).
      - Booleans are encoded as 1.0 or 0.0.
      - Other types are ignored.

    If no features are found, returns a single NaN.

    Args:
        instance_name (str): The key used to find the metadata record (assumed to be lower-cased).
        metadata (dict): The metadata dictionary.

    Returns:
        A numpy array of extracted metadata features.
    """
    record = metadata.get(instance_name.lower(), {})
    features = []
    for key in sorted(record.keys()):
        value = record[key]
        if isinstance(value, (int, float)):
            features.append(float(value))
        elif isinstance(value, bool):
            features.append(1.0 if value else 0.0)
        elif isinstance(value, str):
            # Normalize a hash value of the string to [0, 1]
            features.append((hash(value) % 1000) / 1000.0)
        # Ignore other types for simplicity
    if not features:
        features = [np.nan]
    return np.array(features)

### Extract Features for All Images

**Image features**: Saved in `data/features/image_features.npz`

In [3]:
# Get image file names and paths
image_files = [
    f for f in os.listdir(IMAGE_DIR) if f.lower().endswith((".png", ".jpg", ".jpeg"))
]
image_paths = [os.path.join(IMAGE_DIR, f) for f in image_files]
# Preserve image names (without extension)
image_names = [os.path.splitext(f)[0] for f in image_files]
num_images = len(image_paths)
batch_size = 256
total_batches = int(np.ceil(num_images / batch_size))
print(
    f"Found {num_images} images. Processing in {total_batches} batches of up to {batch_size} images each."
)


def process_image_with_name(file_path):
    """
    Reads, decodes, resizes, and preprocesses an image.
    Also extracts the file name from the path.
    """
    img = tf.io.read_file(file_path)
    img = tf.image.decode_image(img, channels=3)
    # Set the shape explicitly so that tf.image.resize works properly
    img.set_shape([None, None, 3])
    img = tf.image.resize(img, (224, 224))
    img = preprocess_input(img)
    # Extract filename from path (remove directory)
    file_name = tf.strings.regex_replace(file_path, r".*[\\/]", "")
    return img, file_name


# Build the dataset using tf.data API
ds = tf.data.Dataset.from_tensor_slices(image_paths)
ds = ds.map(process_image_with_name, num_parallel_calls=tf.data.AUTOTUNE)
ds = ds.batch(batch_size)
ds = ds.prefetch(tf.data.AUTOTUNE)

# Load pre-trained ResNet50 model without the classification head.
print("Loading pre-trained ResNet50 model...")
base_model = ResNet50(weights="imagenet", include_top=False, pooling="avg")

all_image_features = {}
batch_index = 0
start_total = time.time()

# Process images in batches
for batch_imgs, batch_file_names in ds:
    start_batch = time.time()
    features_batch = base_model.predict(batch_imgs)
    batch_time = time.time() - start_batch
    # Convert file names tensor to numpy array of byte strings
    batch_file_names = batch_file_names.numpy()
    for i, file_name in enumerate(batch_file_names):
        name = file_name.decode("utf-8")
        name = os.path.splitext(name)[0]
        all_image_features[name] = features_batch[i]
    batch_index += 1
    print(f"Batch {batch_index}/{total_batches} processed in {batch_time:.2f} sec.")

total_time = time.time() - start_total
print(f"Extracted image features for {num_images} images in {total_time:.2f} sec.")

# Reorder features to match original ordering
image_feature_list = [all_image_features[name] for name in image_names]

# Save image features as a compressed NumPy file (.npz)
img_features_array = np.vstack(image_feature_list)
img_features_path = FEATURE_DIR / "image_features.npz"
np.savez_compressed(
    img_features_path, image_names=image_names, features=img_features_array
)
print(f"Saved image features to {img_features_path}")

Found 809 images. Processing in 4 batches of up to 256 images each.
Loading pre-trained ResNet50 model...
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2s/step
Batch 1/4 processed in 16.10 sec.
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2s/step
Batch 2/4 processed in 14.80 sec.
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2s/step
Batch 3/4 processed in 14.99 sec.
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2s/step
Batch 4/4 processed in 3.62 sec.
Extracted image features for 809 images in 49.73 sec.


2025-02-22 14:54:33.248593: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Saved image features to /app/data/features/image_features.npz


# Metadata Feature Extraction Cell

**Metadata features**: Saved in `features/features/metadata_features.csv`

In [4]:
# Assume metadata is loaded from the JSON file
import json

with open(METADATA_PATH, "r") as f:
    metadata = json.load(f)


# Assume extract_metadata_feature() is defined as follows:
def extract_metadata_feature(instance_name, metadata):
    """
    Upgraded metadata feature extractor for generic datasets.
    Uses simple heuristics to extract features from the metadata record.

    For each key in the record (sorted for consistency):
      - Numeric values are used directly.
      - Strings are encoded via a simple hash (modulo operation, normalized to [0, 1]).
      - Booleans are encoded as 1.0 or 0.0.
      - Other types are ignored.

    If no features are found, returns a single NaN.

    Args:
        instance_name (str): The key used to find the metadata record (assumed lower-cased).
        metadata (dict): The metadata dictionary.

    Returns:
        A numpy array of extracted metadata features.
    """
    record = metadata.get(instance_name.lower(), {})
    features = []
    for key in sorted(record.keys()):
        value = record[key]
        if isinstance(value, (int, float)):
            features.append(float(value))
        elif isinstance(value, bool):
            features.append(1.0 if value else 0.0)
        elif isinstance(value, str):
            # Normalize hash value to [0, 1]
            features.append((hash(value) % 1000) / 1000.0)
    if not features:
        features = [np.nan]
    return np.array(features)


# --- Metadata Feature Extraction ---
metadata_feature_list = []
for name in image_names:
    meta_feature = extract_metadata_feature(name, metadata)
    metadata_feature_list.append(meta_feature)

feature_lengths = [len(f) for f in metadata_feature_list]

# Determine the maximum feature length and pad if necessary
max_len = max(feature_lengths)
print(f"Maximum metadata feature length: {max_len}")

padded_metadata_features = [
    np.pad(f, (0, max_len - len(f)), mode="constant", constant_values=np.nan)
    for f in metadata_feature_list
]

# Create a DataFrame and save as CSV
df_meta = pd.DataFrame(np.vstack(padded_metadata_features), index=image_names)
meta_features_path = FEATURE_DIR / "metadata_features.csv"
df_meta.to_csv(meta_features_path)
print(f"Saved metadata features to {meta_features_path}")

Maximum metadata feature length: 4
Saved metadata features to /app/data/features/metadata_features.csv
