In [1]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import MeanIoU


2024-11-02 12:35:55.128833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730547355.206512   25230 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730547355.227323   25230 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-02 12:35:55.385213: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import tensorflow as tf

# Check if GPU is available
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import tensorflow as tf
import os
import matplotlib.pyplot as plt


pixel_to_class = {0: 0,
 15: 1,
 19: 2,
 34: 3,
 38: 4,
 52: 5,
 53: 6,
 57: 7,
 72: 8,
 75: 9,
 76: 10,
 90: 11,
 94: 12,
 109: 13,
 113: 14,
 128: 15,
 133: 16,
 147: 17,
 151: 18,
 220: 19}

def convert_to_class_ids(image, mapping):
    # Create an output array of the same shape as the image, initialized to zero
    class_ids = np.zeros_like(image, dtype=np.int32)
    
    # Iterate through the dictionary and map pixel values to class IDs
    for pixel_value, class_id in mapping.items():
        class_ids[image == pixel_value] = class_id
    
    return class_ids

    
def load_pascal_voc_images(image_dir, mask_dir, target_size=(128, 128), num_classes=20):
    image_files = sorted(os.listdir(image_dir))
    mask_files = sorted(os.listdir(mask_dir))
    
    images = []
    masks = []
    values = set()
    for img_file, mask_file in zip(image_files, mask_files):
        # Load and preprocess the image
        img = load_img(os.path.join(image_dir, img_file), target_size=target_size)
        img = img_to_array(img) / 255.0  # Normalize image
        images.append(img)
        
        # Load and preprocess the mask
        mask = load_img(os.path.join(mask_dir, mask_file), target_size=target_size, color_mode='grayscale')
        mask = img_to_array(mask).astype(np.int32)  # Convert mask to integer format
        mask = np.squeeze(mask, axis=-1)  # Remove last channel dimension if it's (height, width, 1)

        # print(tf.reduce_sum(mask))
        # Replace out-of-bound values with 0 (background class)
        mask = convert_to_class_ids(mask, pixel_to_class)

        # print(np.unique(mask))
        mask[mask > 20] = 0
        
        # One-hot encode the mask
        mask = tf.keras.utils.to_categorical(mask, num_classes=num_classes)
        for v in np.unique(mask):
            values.add(v)
        # print(tf.reduce_sum(mask))
        masks.append(mask)
        
    print(values)
    return np.array(images), np.array(masks)

In [None]:
# Load data
image_dir = '../data/trainval/VOCdevkit/VOC2012/JPEGImages'
mask_dir = '../data/trainval/VOCdevkit/VOC2012/SegmentationClass'
images, masks = load_pascal_voc_images(image_dir, mask_dir)

set()


In [6]:
import tensorflow as tf
from tensorflow.keras import layers, Model

def unet_model(input_shape=(128, 128, 3), num_classes=20):
    inputs = layers.Input(shape=input_shape)
    
    # Encoder
    conv1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    conv1 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(conv1)
    pool1 = layers.MaxPooling2D((2, 2))(conv1)
    
    conv2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(pool1)
    conv2 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(conv2)
    pool2 = layers.MaxPooling2D((2, 2))(conv2)
    
    conv3 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(pool2)
    conv3 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(conv3)
    pool3 = layers.MaxPooling2D((2, 2))(conv3)
    
    conv4 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(pool3)
    conv4 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(conv4)
    pool4 = layers.MaxPooling2D((2, 2))(conv4)
    
    # Bottleneck
    conv5 = layers.Conv2D(1024, (3, 3), activation='relu', padding='same')(pool4)
    conv5 = layers.Conv2D(1024, (3, 3), activation='relu', padding='same')(conv5)
    
    # Decoder
    up6 = layers.Conv2DTranspose(512, (2, 2), strides=(2, 2), padding='same')(conv5)
    up6 = layers.concatenate([up6, conv4])
    conv6 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(up6)
    conv6 = layers.Conv2D(512, (3, 3), activation='relu', padding='same')(conv6)
    
    up7 = layers.Conv2DTranspose(256, (2, 2), strides=(2, 2), padding='same')(conv6)
    up7 = layers.concatenate([up7, conv3])
    conv7 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(up7)
    conv7 = layers.Conv2D(256, (3, 3), activation='relu', padding='same')(conv7)
    
    up8 = layers.Conv2DTranspose(128, (2, 2), strides=(2, 2), padding='same')(conv7)
    up8 = layers.concatenate([up8, conv2])
    conv8 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(up8)
    conv8 = layers.Conv2D(128, (3, 3), activation='relu', padding='same')(conv8)
    
    up9 = layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(conv8)
    up9 = layers.concatenate([up9, conv1])
    conv9 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(up9)
    conv9 = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(conv9)
    
    outputs = layers.Conv2D(num_classes, (1, 1), activation='softmax')(conv9)
    
    model = Model(inputs, outputs)
    return model

In [7]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import MeanIoU

# Instantiate the model
model = unet_model(input_shape=(128, 128, 3), num_classes=20)

# Compile the model
model.compile(optimizer=Adam(),
              loss=CategoricalCrossentropy(),
              metrics=[MeanIoU(num_classes=20)])

# Train the model
history = model.fit(images, masks, epochs=50, batch_size=8, validation_split=0.2)

I0000 00:00:1730547367.831858   25230 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10202 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


Epoch 1/50


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None, 128, 128), output.shape=(None, 128, 128, 20)

In [None]:
# Evaluate model performance
loss, iou = model.evaluate(images, masks)
print(f"Test Loss: {loss}, Test IoU: {iou}")

# Save model
model.save('unet_model.h5')