<a href="https://colab.research.google.com/github/aml7hawaiiedu/CCAPLandCoverProject/blob/main/CCAP_UNET_Fall2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install rasterio scikit-image tensorflow keras gdown
!pip install transformers

In [3]:
import os
import glob
import gdown
import zipfile
import cv2
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shutil
import rasterio
import rasterio.plot
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import to_categorical
from keras.models import load_model
from keras.utils import Sequence, to_categorical
from sklearn.model_selection import train_test_split
from skimage.transform import resize
from skimage.util import random_noise
from scipy import ndimage
from scipy.ndimage import label as nd_label
from scipy.ndimage import generic_filter
from scipy.stats import mode

# Additional code can be added here if needed

In [4]:
directory_path = '/content/image_subsets'
os.makedirs(directory_path, exist_ok=True)

In [5]:
zip_files = glob.glob('/content/drive/MyDrive/wetland_unet/UNET_Image_Chips/imagechip_trainingdata/*.zip')
extract_dir = '/content/image_subsets' # destination directory
for zip_file in zip_files:
    base_name = os.path.basename(zip_file)[:-4]
    unzip_dir = os.path.join(extract_dir, base_name)
    os.makedirs(unzip_dir, exist_ok=True)
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(unzip_dir)

In [6]:
csv_files = glob.glob('/content/drive/MyDrive/wetland_unet/UNET_Image_Chips/imagechip_trainingdata/*.csv')
csv_list = []
for csv_file in csv_files:
    base_name = os.path.basename(csv_file)[:-4]
    csv_dir = os.path.join(extract_dir, base_name)
    base_csv = pd.read_csv(csv_file)
    base_csv['subset'] = base_name
    csv_list.append(base_csv)
    index_csv = pd.concat(csv_list, ignore_index=True)

In [None]:
# csv_files

In [None]:
# index_csv

In [83]:
unique_rows_df = index_csv.groupby(['tif_name', 'label', 'subset']).size().reset_index(name='Count')
unique_rows_df = index_csv.groupby(['tif_name', 'label', 'subset']).agg({'percent': 'mean'}).reset_index()

In [None]:
# unique_rows_df

In [85]:
pivot_df = unique_rows_df.pivot(index=['tif_name', 'subset'], columns='label', values='percent').reset_index().fillna(0)

In [None]:
# pivot_df

In [None]:
# index_csv

In [86]:
merged_df = index_csv.merge(pivot_df, on=['tif_name', 'subset'], how='left')
merged_df.drop(columns=['label'], inplace=True)

In [None]:
# merged_df

In [87]:
sum_df = merged_df.drop_duplicates(subset=['tif_name', 'subset'])

In [None]:
# sum_df

In [None]:
total_rows = len(sum_df)
train_fraction = 0.9 # modify this to set the training percentage
train_rows = int(total_rows * train_fraction)
val_rows = total_rows - train_rows

random_assignment = np.array([0] * train_rows + [1] * val_rows)
np.random.seed(42)
np.random.shuffle(random_assignment)
sum_df['random_split'] = random_assignment

In [None]:
agg_df = sum_df.groupby(['random_split']).agg({13: 'mean',14:'mean',15: 'mean',16:'mean',17: 'mean',18:'mean'}).reset_index()
print(agg_df)

In [None]:
# sum_df

In [None]:
sum_df['Images_path']="/content/image_subsets/"+sum_df['subset']+'/Images/'+sum_df['tif_name']
sum_df['Labels_path']="/content/image_subsets/"+sum_df['subset']+'/Labels/'+sum_df['tif_name']

In [95]:
# Loading data from geotiff files
def load_data(files):
    data = []
    for file in files:
        with rasterio.open(file) as src:
            band_data = []
            for band in src.read():
                band_data.append(band)
            data.append(np.dstack(band_data))
    return np.array(data)

In [None]:
# load_data([i for i in  sum_df['Images_path'][0:5]])

In [None]:
# # example to show the width, height and bands of the images
# def get_image_shapes_in_folders(folder_paths):
#     image_shapes = []
#     for folder_path in folder_paths:
#         for root, dirs, files in os.walk(folder_path):
#             # Sort the files alphabetically
#             files = sorted(files)
#             for file in files:
#                 if file.endswith('.tif') or file.endswith('.jpg') or file.endswith('.png'):
#                     image_path = os.path.join(root, file)
#                     try:
#                         with rasterio.open(image_path) as src:
#                             width, height = src.width, src.height
#                             band_count = src.count  # Number of bands in the image
#                             image_shapes.append((file, width, height, band_count))
#                     except Exception as e:
#                         print(f"Error getting shape of image '{file}': {e}")
#     return image_shapes

# folder_paths = ["/content/image_subsets/Hawaii_2005_005_subset/Images"]

# shapes = get_image_shapes_in_folders(folder_paths)
# for shape in shapes:
#     file, width, height, band_count = shape
#     print(f" {file[:-4]}: {width}, {height}, {band_count}")

In [120]:
def load_and_reshape_image(image_path, img_height, img_width):
    with rasterio.open(image_path) as src:
        image = src.read()
        return image

In [None]:
# [load_and_reshape_image(i,512,512) for i in sum_df['Labels_path'][0:5]]

In [None]:
def load_images_and_labels(image_files, label_files, img_height, img_width, num_classes):
  images = []
  labels = []

  # image_files = glob.glob(os.path.join(image_files, "*.tif"))
  for image_file in image_files:
      image = load_and_reshape_image(image_file, img_height, img_width)
      images.append(image)

  # label_files = glob.glob(os.path.join(label_files, "*.tif"))
  for label_file in label_files:
      label = load_and_reshape_image(label_file, img_height, img_width)
      label -= 1  # adjust labels to be in the range 0-8 instead of 1-9
      label = to_categorical(label, num_classes=num_classes)   # one-hot encode the labels
      labels.append(label)

  return np.array(images), np.array(labels)

In [None]:
# load_images_and_labels(sum_df['Images_path'][0:5], sum_df['Labels_path'][0:5],512,512,21)

In [None]:
class DataGenerator(Sequence):
    def __init__(self, image_files, label_files, img_height, img_width, batch_size, num_classes):
        self.image_files = image_files
        self.label_files = label_files
        self.img_height = img_height
        self.img_width = img_width
        self.batch_size = batch_size
        self.num_classes = num_classes
        # self.noise = noise

    def __len__(self):
        return int(np.ceil(len(self.image_files) / self.batch_size))

    def __getitem__(self, index):
        batch_files = self.image_files[index * self.batch_size : (index + 1) * self.batch_size]
        batch_images, batch_labels = self.load_images_and_labels(batch_files)
        return batch_images, batch_labels

    def load_and_reshape_image(self, image_path):
        with rasterio.open(image_path) as src:
            image = src.read()
            image = image.transpose((1, 2, 0))
            if image.shape[0] != self.img_height or image.shape[1] != self.img_width:
                image = cv2.resize(image, (self.img_width, self.img_height), interpolation=cv2.INTER_NEAREST)
            if len(image.shape) == 3 and image.shape[2] == 1:
                image = np.squeeze(image, axis=2)

            return image

    def load_images_and_labels(self, image_files):
        images = []
        labels = []

        for image_file in image_files:
            image = self.load_and_reshape_image(image_file)
            image[image <= -3e+38] = np.nan

            # # Replace NaN values with the mean of the non-NaN pixels
            if np.any(np.isnan(image)):
                nan_mask = np.isnan(image)
                image[nan_mask] = np.nanmean(image)

            # # Replace Inf values with the mean of the non-Inf pixels
            if np.any(np.isinf(image)):
                inf_mask = np.isinf(image)
                image[inf_mask] = np.nanmean(image)

            # Convert to float
            image = image.astype(np.float32)

            # # Z-score normalization
            # mean = np.mean(image, axis=(0, 1), keepdims=True)
            # std = np.std(image, axis=(0, 1), keepdims=True)
            # # mean[mean < 0]
            # std[std < 0] = 0

            # # Normalize with epsilon to prevent divide by zero
            # epsilon = 1e-7

            # image = (image - mean) / (std + epsilon)

            images.append(image)

        for image_file in image_files:
            label_file = image_file.replace("Images", "Labels")
            label = self.load_and_reshape_image(label_file)
            label -= 1
            label = to_categorical(label, num_classes=self.num_classes)
            labels.append(label)

        return np.array(images), np.array(labels)

In [None]:
# import TenserFlow classes and functions
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import models
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras.layers import BatchNormalization

# from tensorflow.python.keras.layers.normalization import BatchNormalization
# from tensorflow.python.keras.layers.normalization import BatchNormalization

# from tensorflow.keras.layers import BatchNormalization
# from keras.layers.normalization.batch_normalization import BatchNormalization
# from tensorflow.python.keras.layers import BatchNormalization

# U-Net model for image segmentation.
# Encoder and decoder conncted by a center block.
# Encoder downsamples the input image while capturing its features.
# Decoder upsamples the encoded image to generate a segmentation map.
def conv_block(input_tensor, num_filters):
	encoder = layers.Conv2D(num_filters, (3, 3), padding='same')(input_tensor)
	encoder = layers.BatchNormalization()(encoder)
	encoder = layers.Activation('relu')(encoder)
	encoder = layers.Conv2D(num_filters, (3, 3), padding='same')(encoder)
	encoder = layers.BatchNormalization()(encoder)
	encoder = layers.Activation('relu')(encoder)
	return encoder

def encoder_block(input_tensor, num_filters):
	encoder = conv_block(input_tensor, num_filters)
	encoder_pool = layers.MaxPooling2D((2, 2), strides=(2, 2))(encoder)
	return encoder_pool, encoder

def decoder_block(input_tensor, concat_tensor, num_filters):
	decoder = layers.Conv2DTranspose(num_filters, (2, 2), strides=(2, 2), padding='same')(input_tensor)
	decoder = layers.concatenate([concat_tensor, decoder], axis=-1)
	decoder = layers.BatchNormalization()(decoder)
	decoder = layers.Activation('relu')(decoder)
	decoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)
	decoder = layers.BatchNormalization()(decoder)
	decoder = layers.Activation('relu')(decoder)
	decoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)
	decoder = layers.BatchNormalization()(decoder)
	decoder = layers.Activation('relu')(decoder)
	return decoder

def get_model():
	inputs = layers.Input(shape=[KERNEL_SIZE, KERNEL_SIZE, len(BANDS)]) # 256
	encoder0_pool, encoder0 = encoder_block(inputs, 32) # 128
	encoder1_pool, encoder1 = encoder_block(encoder0_pool, 64) # 64
	encoder2_pool, encoder2 = encoder_block(encoder1_pool, 128) # 32
	encoder3_pool, encoder3 = encoder_block(encoder2_pool, 256) # 16
	encoder4_pool, encoder4 = encoder_block(encoder3_pool, 512) # 8
	center = conv_block(encoder4_pool, 1024) # center
	decoder4 = decoder_block(center, encoder4, 512) # 16
	decoder3 = decoder_block(decoder4, encoder3, 256) # 32
	decoder2 = decoder_block(decoder3, encoder2, 128) # 64
	decoder1 = decoder_block(decoder2, encoder1, 64) # 128
	decoder0 = decoder_block(decoder1, encoder0, 32) # 256
	outputs = layers.Conv2D(25, (1, 1), activation='softmax')(decoder0)

	model = models.Model(inputs=[inputs], outputs=[outputs])

	model.compile(
		optimizer=optimizers.get(OPTIMIZER),
		loss=losses.get(LOSS),
		metrics=[metrics.get(metric) for metric in METRICS])

	return model

In [None]:
def create_model(img_size, num_classes):
    inputs = keras.Input(shape=(img_size[0], img_size[1], 4))  # Change the number of channels to 4

    # Entry block
    x = layers.Conv2D(32, 3, strides=2, padding="same")(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)

    previous_block_activation = x  # Set aside residual

    # Blocks 1, 2, 3 are identical apart from the feature depth.
    for filters in [64, 128, 256]:
        x = layers.Activation("relu")(x)
        x = layers.SeparableConv2D(filters, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.Activation("relu")(x)
        x = layers.SeparableConv2D(filters, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.MaxPooling2D(3, strides=2, padding="same")(x)

        # Project residual
        residual = layers.Conv2D(filters, 1, strides=2, padding="same")(
            previous_block_activation
        )
        x = layers.add([x, residual])  # Add back residual
        previous_block_activation = x  # Set aside next residual

    for filters in [256, 128, 64, 32]:
        x = layers.Activation("relu")(x)
        x = layers.Conv2DTranspose(filters, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.Activation("relu")(x)
        x = layers.Conv2DTranspose(filters, 3, padding="same")(x)
        x = layers.BatchNormalization()(x)

        x = layers.UpSampling2D(2)(x)

        residual = layers.UpSampling2D(2)(previous_block_activation)
        residual = layers.Conv2D(filters, 1, padding="same")(residual)
        x = layers.add([x, residual])  # Add back residual
        previous_block_activation = x  # Set aside next residual

    # Add a per-pixel classification layer
    outputs = layers.Conv2D(num_classes, 3, activation="softmax", padding="same")(x)

    # Define the model
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="adam", loss='sparse_categorical_crossentropy', metrics=['accuracy'])  # updated to sparse categorical cross-entropy loss
    return model

In [None]:
def predict_and_save_segments(input_folder, output_folder, model, img_height, img_width):
      # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Get a list of input files
    input_files = [f for f in os.listdir(input_folder) if f.endswith('.tif')]

    for filename in input_files:
              # Read input image
        image_path = os.path.join(input_folder, filename)

        with rasterio.open(image_path) as src:
            # Read image data and reshape
            image = src.read()  # Read all bands

        image = load_and_reshape_image(image_path,img_height, img_width)
        image = image.astype(np.uint8)
        # print(image.shape)
        masks = mask_generator.generate(image)

        flat_mask = show_anns(masks)
        imagery_file = rasterio.open(image_path)
        imagery_transform = imagery_file.transform
        reshaped_image = rasterio.plot.reshape_as_raster(flat_mask)
        reshaped_image = reshaped_image[0]
        # Get metadata from the input image
        # print(reshaped_image.shape)
        meta = src.meta

        # Update metadata for the output image
        meta.update(count=1, dtype=reshaped_image.dtype)

        # Create output path
        output_path = os.path.join(output_folder, filename)

        # Write all 9 prediction channels as separate bands
        with rasterio.open(output_path, 'w', **meta) as dst:
            # for i in range(9):
            dst.write(reshaped_image,1)  # Write each channel as a separate band

        print(f"Saved prediction for {filename}")

    print("Prediction and saving completed.")

In [None]:
def predict_and_save(input_folder, output_folder, model, img_height, img_width):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Get a list of input files
    input_files = [f for f in os.listdir(input_folder) if f.endswith('.tif')]

    for filename in input_files:
        # Read input image
        input_path = os.path.join(input_folder, filename)
        with rasterio.open(input_path) as src:
            # Read image data and reshape
            image = src.read()  # Read all bands

            image[image <= -3e+38] = np.nan

            # # Replace NaN values with the mean of the non-NaN pixels
            if np.any(np.isnan(image)):
                nan_mask = np.isnan(image)
                image[nan_mask] = np.nanmean(image)

            # # Replace Inf values with the mean of the non-Inf pixels
            if np.any(np.isinf(image)):
                inf_mask = np.isinf(image)
                image[inf_mask] = np.nanmean(image)

            # Convert to float
            image = image.astype(np.float32)

            # # Z-score normalization
            # mean = np.mean(image, axis=(0, 1), keepdims=True)
            # std = np.std(image, axis=(0, 1), keepdims=True)
            # # mean[mean < 0]
            # std[std < 0] = 0

            # # Normalize with epsilon to prevent divide by zero
            # epsilon = 1e-7

            # image = (image - mean) / (std + epsilon)

            image = np.transpose(image, (1, 2, 0))  # Transpose to (height, width, bands)
            image = cv2.resize(image, (img_width, img_height), interpolation=cv2.INTER_NEAREST)
            image = np.expand_dims(image, axis=0)  # Add batch dimension
            # # Perform prediction
            prediction = model.predict(image)
            prediction[prediction <= 0] = np.nan
            prediction = prediction*255
            prediction = prediction.astype(np.uint8)

        # Get metadata from the input image
        meta = src.meta

        # Update metadata for the output image
        meta.update(count=26, dtype=prediction.dtype,nodata = 0)
        # meta.
        # Create output path
        output_path = os.path.join(output_folder, filename)

        # Write all 9 prediction channels as separate bands
        with rasterio.open(output_path, 'w', **meta) as dst:
            for i in range(25):
                dst.write(prediction[0, :, :, i], i + 1)  # Write each channel as a separate band

            # Add a 10th band containing the argmax of the 9 channels
            argmax_band = np.argmax(prediction[0], axis=-1)
            dst.write(argmax_band, 26)

        print(f"Saved prediction for {filename}")

    print("Prediction and saving completed.")

In [None]:
KERNEL_SIZE = 512
BANDS = range(7)
KERNEL_SHAPE = [KERNEL_SIZE, KERNEL_SIZE]
OPTIMIZER = 'adam'
LOSS = 'categorical_crossentropy'
METRICS = ['categorical_accuracy']



In [None]:
model = get_model()
# print(model.summary())

In [None]:
# predict_and_save(input_folder, output_folder, model, img_height, img_width)
predict_and_save('/content/image_subsets/Kauai_subset/Images/', '/content/Kauai_Predicts/', model, 512, 512)

Saved prediction for 10241_1537.tif
Saved prediction for 17921_1537.tif
Saved prediction for 9217_6145.tif
Saved prediction for 9217_8193.tif
Saved prediction for 13825_3073.tif
Saved prediction for 18433_8705.tif
Saved prediction for 8193_10241.tif
Saved prediction for 10753_6145.tif
Saved prediction for 16897_1025.tif
Saved prediction for 20481_7169.tif
Saved prediction for 3585_12289.tif
Saved prediction for 10753_7169.tif
Saved prediction for 8193_9729.tif
Saved prediction for 13825_11265.tif
Saved prediction for 10753_2561.tif
Saved prediction for 5633_12801.tif
Saved prediction for 8193_3585.tif
Saved prediction for 18945_2561.tif
Saved prediction for 6657_10753.tif
Saved prediction for 14849_8705.tif
Saved prediction for 10753_6657.tif
Saved prediction for 9217_513.tif
Saved prediction for 10753_1025.tif
Saved prediction for 12801_4609.tif
Saved prediction for 14337_8193.tif
Saved prediction for 11265_4097.tif
Saved prediction for 8193_7169.tif
Saved prediction for 16385_8193.ti

In [None]:
# sum_df

In [None]:
# data_gen_test = DataGenerator(sum_df['Images_path'][0:5], sum_df['Labels_path'][0:5], 512, 512, 2, 22)
# Create the data generator
training_data_generator = DataGenerator(sum_df['Images_path'][sum_df['random_split']==0], sum_df['Labels_path'][sum_df['random_split']==0], 512, 512, 4, 25)
validation_data_generator = DataGenerator(sum_df['Images_path'][sum_df['random_split']==1], sum_df['Labels_path'][sum_df['random_split']==1], 512, 512, 4, 25)


In [None]:
batch_images, batch_labels = training_data_generator.__getitem__(0)

In [None]:
batch_images.shape

(4, 512, 512, 7)

In [None]:
batch_labels.shape

(4, 512, 512, 25)

In [None]:
# Train the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy','categorical_accuracy'])
callbacks = [
    keras.callbacks.ModelCheckpoint("landcover_segmentation.h5", save_best_only=True)
]

model.fit(training_data_generator, validation_data=validation_data_generator, epochs=100, callbacks=callbacks,shuffle=True)
# model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=16, epochs=10)



Epoch 1/100

  saving_api.save_model(


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

KeyboardInterrupt: ignored

In [None]:
# Train the model
# history = model.fit(training_data_generator, validation_data=validation_data_generator, epochs=10)

# Evaluate the model
loss, accuracy = model.evaluate(validation_data_generator)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)



ValueError: ignored