Progress Report 2: Image Processing and CNN for Bounding Box Prediction
This notebook covers three steps:
1. Converting images to a simulated infrared version.
2. Extracting and consolidating annotation data from XML files.
3. Setting up a Convolutional Neural Network (CNN) to predict bounding boxes for specific objects (e.g., faces) in an image.

In [1]:
import os
import cv2
import numpy as np


Image Conversion to Simulated Infrared

In [None]:
import os
import cv2
import numpy as np

# Parameters for the infrared conversion step (paths are relative to the working directory)
input_folder = './train_data'  # Folder containing the original images; scanned with os.listdir below
output_folder = './train_data_infrared'  # Folder to save infrared images; same path as INFRARED_DIR used when loading data

# Function to convert an image to simulated infrared
def convert_to_infrared(image):
    """Return a simulated-infrared version of a BGR image.

    The input is expected in OpenCV channel order (blue, green, red), which is
    what ``cv2.imread`` returns. The result keeps that same order, so it can be
    passed straight to ``cv2.imwrite`` and the boosted channel is really stored
    as red in the saved file.

    The effect boosts red by 1.5x, halves green and scales blue down to 0.2x.
    Values are rounded to the nearest integer and clipped to [0, 255].

    Args:
        image: Array-like of shape (height, width, channels) with at least
            three channels and pixel values on the 0-255 scale.

    Returns:
        ``np.uint8`` array with the same shape as ``image``. Any channel beyond
        the first three is left at zero.

    Raises:
        ValueError: If ``image`` is not 3-dimensional or has fewer than three
            channels.
    """
    pixels = np.asarray(image, dtype=np.float64)
    if pixels.ndim != 3 or pixels.shape[2] < 3:
        raise ValueError(
            f'Expected an image of shape (height, width, >=3), got {pixels.shape}'
        )

    infrared_image = np.zeros_like(pixels)

    # Index 0/1/2 is blue/green/red on both sides: each gain is applied to a
    # channel and written back to the same channel, so the order stays BGR.
    infrared_image[:, :, 0] = pixels[:, :, 0] * 0.2  # Blue channel strongly reduced
    infrared_image[:, :, 1] = pixels[:, :, 1] * 0.5  # Green channel reduced
    infrared_image[:, :, 2] = pixels[:, :, 2] * 1.5  # Red channel enhanced

    # Round before casting: a plain astype(np.uint8) truncates, so a value such
    # as 19.999999 caused by floating-point error would become 19 instead of 20.
    infrared_image_uint8 = np.clip(np.rint(infrared_image), 0, 255).astype(np.uint8)

    return infrared_image_uint8

# Create the output directory for infrared images (no error if it already exists)
os.makedirs(output_folder, exist_ok=True)

# Apply the infrared conversion to every image file in the input folder.
# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and the printed log) reproducible.
converted_count = 0
failed_count = 0
for filename in sorted(os.listdir(input_folder)):
    # Skip anything that is not an image (the same folder also holds the XML
    # annotation files and the augmented-image sub-folders)
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    input_path = os.path.join(input_folder, filename)
    image = cv2.imread(input_path)

    # cv2.imread returns None instead of raising when a file cannot be read
    if image is None:
        print(f'Failed to load image: {filename}')
        failed_count += 1
        continue

    infrared_image = convert_to_infrared(image)
    output_path = os.path.join(output_folder, filename)

    # cv2.imwrite reports a failed write through its boolean return value
    if cv2.imwrite(output_path, infrared_image):
        print(f'Saved infrared image: {output_path}')
        converted_count += 1
    else:
        print(f'Failed to save infrared image: {output_path}')
        failed_count += 1

# Only claim full success when every image was actually loaded and written
if failed_count == 0:
    print('All images have been converted to simulated infrared and saved.')
else:
    print(f'Converted {converted_count} images; {failed_count} could not be processed.')


Extracting Annotations from XML Files

In [3]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

# Directory containing the XML files
xml_dir = './train_data'
# List to store the combined data: one row per annotated object
data = []

# Loop through all annotation files in the directory. Sorting keeps the CSV row
# order reproducible, since os.listdir returns entries in arbitrary order.
for xml_file in sorted(os.listdir(xml_dir)):
    # Case-insensitive match so files named '*.XML' are not silently skipped
    if not xml_file.lower().endswith('.xml'):
        continue

    tree = ET.parse(os.path.join(xml_dir, xml_file))
    root = tree.getroot()

    # findtext returns None when the element is missing, so a malformed file is
    # reported and skipped instead of raising AttributeError on `.text`
    filename = root.findtext('filename')
    if not filename:
        print(f'Skipping {xml_file}: missing <filename> element')
        continue

    # Extract one row per bounding box
    for obj in root.findall('object'):
        label = obj.findtext('name')
        coords = [
            obj.findtext(f'bndbox/{tag}')
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        ]
        if not label or not all(coords):
            print(f'Skipping an object in {xml_file}: incomplete <name> or <bndbox>')
            continue

        # Parse through float so coordinates written as "12.0" are accepted,
        # then round to the nearest whole pixel
        xmin, ymin, xmax, ymax = (int(round(float(value))) for value in coords)

        data.append([filename, label, xmin, ymin, xmax, ymax])

# Convert the data to a DataFrame
df = pd.DataFrame(data, columns=['filename', 'label', 'xmin', 'ymin', 'xmax', 'ymax'])

# Save to CSV
df.to_csv('./train_data/combined_annotations.csv', index=False)

Loading and Preparing Data for CNN

In [7]:
# Data-loading configuration
AUGMENTED_FOLDERS = ['high_noise', 'little_noise', 'moderate_noise']  # Augmented folders inside 'train_data'; each is joined onto IMAGE_DIR when loading
IMAGE_DIR = './train_data'  # Directory for original and augmented images
INFRARED_DIR = './train_data_infrared'  # Directory for infrared images (the output folder of the infrared conversion step)
IMAGE_SIZE = (512, 512)  # Resize images to this size; passed to cv2.resize and matches create_model's default input_shape of (512, 512, 3)


def load_image_and_label(image_folder, row, base_dir):
    """Load one annotated image, resize it, and return it with its bounding box.

    The bounding box is rescaled by the same factors as the image, so the
    returned coordinates refer to the resized image rather than to the
    original pixel grid.

    Args:
        image_folder: Sub-folder of ``base_dir`` holding the image; pass ``''``
            to read directly from ``base_dir``.
        row: Annotation record providing ``'filename'``, ``'xmin'``, ``'ymin'``,
            ``'xmax'`` and ``'ymax'``.
        base_dir: Root directory the image path is built from.

    Returns:
        ``(image, label)`` where ``image`` is the image resized to
        ``IMAGE_SIZE`` and ``label`` is a float32 array
        ``[xmin, ymin, xmax, ymax]`` in resized-image coordinates, or
        ``(None, None)`` when the image could not be read.
    """
    image_path = os.path.join(base_dir, image_folder, row['filename'])  # Construct the full path to the image
    image = cv2.imread(image_path)  # Load the image
    if image is None:
        print(f"Image {image_path} not found!")
        return None, None

    # Remember the original size before resizing; shape is (height, width, ...)
    original_height, original_width = image.shape[:2]
    target_width, target_height = IMAGE_SIZE  # cv2.resize takes (width, height)
    image = cv2.resize(image, IMAGE_SIZE)  # Resize the image to IMAGE_SIZE

    # Scale the box with the image; without this the coordinates would still
    # point at positions in the original, un-resized image
    scale_x = target_width / original_width
    scale_y = target_height / original_height
    label = np.array(
        [
            row['xmin'] * scale_x,
            row['ymin'] * scale_y,
            row['xmax'] * scale_x,
            row['ymax'] * scale_y,
        ],
        dtype='float32',
    )
    return image, label

# Initialize lists to store images and labels
images = []
labels = []

# Load the annotations from the CSV file
annotations = pd.read_csv('./train_data/combined_annotations.csv')

# Every annotation row is looked up in each of these (sub-folder, base directory)
# locations, in this order: original image, noise-augmented copies, infrared copy
image_sources = [('', IMAGE_DIR)]
image_sources += [(folder, IMAGE_DIR) for folder in AUGMENTED_FOLDERS]
image_sources.append(('', INFRARED_DIR))

# Load original, augmented, and infrared images
for _, row in annotations.iterrows():
    for folder, base_dir in image_sources:
        image, label = load_image_and_label(folder, row, base_dir=base_dir)
        # Take the label from the same call as the image. Reusing the label of
        # the original image would append None whenever the original file is
        # missing but an augmented or infrared copy exists.
        if image is not None:
            images.append(image)
            labels.append(label)

# Convert lists to numpy arrays and normalize images
images = np.array(images, dtype='float32')
images /= 255.0  # Normalize images to [0, 1] in place to avoid a second full-size copy
labels = np.array(labels, dtype='float32')

print(f"Total images loaded: {len(images)}")
print(f"Total labels loaded: {len(labels)}")

Total images loaded: 100
Total labels loaded: 100


Creating a CNN Model for Bounding Box Prediction

In [8]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense

# Function to create the CNN model
def create_model(input_shape=(512, 512, 3)):
    """Build the CNN that regresses four bounding-box coordinates.

    The network stacks four convolution + max-pooling stages (32, 64, 128 and
    256 filters), flattens the result, and passes it through two ReLU dense
    layers (512 and 256 units) before a 4-unit linear output layer.

    Args:
        input_shape: Shape of one input image as (height, width, channels).

    Returns:
        An uncompiled Keras ``Model`` mapping an image to 4 linear outputs.
    """
    inputs = Input(shape=input_shape)

    # Feature extractor: each stage is a 3x3 same-padded convolution followed
    # by a 2x2 max-pool
    features = inputs
    for n_filters in (32, 64, 128, 256):
        features = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(features)
        features = MaxPooling2D((2, 2))(features)

    # Regression head
    features = Flatten()(features)
    for n_units in (512, 256):
        features = Dense(n_units, activation='relu')(features)
    box_output = Dense(4, activation='linear')(features)  # Bounding box coordinates

    return Model(inputs=inputs, outputs=box_output)

# Create and compile the model
model = create_model()

# Bounding-box prediction is a regression task, so mean absolute error ('mae')
# is the metric to read: it is the average error per predicted coordinate.
# NOTE(review): 'accuracy' is not meaningful for continuous targets; it is kept
# only so that any code reading that history key keeps working.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'accuracy'])

# Display model architecture
model.summary()


2024-10-06 14:20:29.768238: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
