We first import the required libraries for tiling generation and footprint overlaying.

In [18]:
#Suppress warnings
import warnings
warnings.filterwarnings('ignore')

#For GeoTIFF images
import rasterio
from rasterio.plot import show
from osgeo import gdal, ogr, osr
import geopandas as gpd

#For Visualisation
from matplotlib import pyplot as plt
import matplotlib.image as img
from matplotlib.pyplot import figure
from PIL import Image
%matplotlib inline

# Others
import os
import shutil
import zipfile
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import csv

Here, we sample tiles around a particular ROI in both pre and post event images

Sampling regions were selected to avoid cloud cover and coastal areas, as these are largely absent from the validation set.

We ensured a balanced mix of residential and commercial buildings in our training set, including images that are mostly residential, mostly commercial, and a mix of both. We also tried to include a good mix of images with and without vegetation cover in the nearby areas, because nearby damaged vegetation can bias either of the two models: the no-damage-pre-event-model will likely fail to predict any buildings, and the damaged-inclusive-post-event-model will likely fail to predict some damage in buildings that are near seemingly, or fairly, undamaged green areas.

Selected sampling regions are available in the git folder (**TO DO!!!**)

Manually selected sampling cells from the post event image have the following x and y axis values, measured in pixels on a screenshot of the image. The top left corner of the image is the origin, positive x points to the right and positive y points down. After dividing by the screenshot width and height (done in the next cell), the bottom right corner is (1,1).
1. xmin: 382, xmax: 452, ymin: 278, ymax: 349
2. xmin: 259, xmax: 331, ymin: 435, ymax: 506
3. xmin: 276, xmax: 347, ymin: 596, ymax: 667
4. xmin: 39, xmax: 78, ymin: 766, ymax: 799
5. xmin: 192, xmax: 230, ymin: 637, ymax: 670

In [2]:
# Extent of the post event image screenshot, in screenshot pixels.
x_limit = 642 #inferred from the post event image screenshot
y_limit = 1234 #inferred from the post event image screenshot

# Each row below is the (xmin, xmax, ymin, ymax) of one sampling cell listed
# above; dividing by the screenshot extent turns it into a fraction of the image.
selected_cells = [
    [left / x_limit, right / x_limit, top / y_limit, bottom / y_limit]
    for left, right, top, bottom in (
        (382, 452, 278, 349),
        (259, 331, 435, 506),
        (276, 347, 596, 667),
        (39, 78, 766, 799),
        (192, 230, 637, 670),
    )
]

We now modify the tile-generation function that was given to us so that it takes the above selected_cells matrix, ensuring that we only generate the tiles that fall within our selected regions of interest.

In [3]:
def sample_tiles(input_file,output_dir,grid_x,grid_y,roi_matrix):
    """Write the grid tiles that fall inside the given regions as GeoTIFFs.

    The raster is divided into a grid of ``grid_x`` x ``grid_y`` pixel cells
    (only whole cells are counted: ``width // grid_x`` by ``height // grid_y``).
    For every region in ``roi_matrix`` the covered grid cells are written to
    ``output_dir`` as ``tile_{i}_{j}.tif`` where ``i`` is the column index and
    ``j`` the row index of the cell. Each tile keeps the source projection,
    band count and data type, and gets a geotransform shifted to its own
    top-left corner.

    Parameters
    ----------
    input_file : str
        Path of the raster to tile (opened with ``gdal.Open``).
    output_dir : str
        Directory receiving the tiles; created if it does not exist.
    grid_x, grid_y : int
        Tile width and height in pixels.
    roi_matrix : iterable of (xmin, xmax, ymin, ymax)
        Regions of interest. Each bound is multiplied by the number of tiles
        along its axis and truncated with ``int`` to obtain the tile index
        range, so bounds are expected as fractions of the image extent.

    Raises
    ------
    RuntimeError
        If the input raster cannot be opened or a tile cannot be created.
    """
    ds = gdal.Open(input_file)
    # Without GDAL exceptions enabled, a failed open returns None instead of raising.
    if ds is None:
        raise RuntimeError(f"GDAL could not open raster: {input_file}")

    # Get image size and number of bands
    width = ds.RasterXSize
    height = ds.RasterYSize
    num_bands = ds.RasterCount

    # Calculate number of whole tiles in each dimension
    num_tiles_x = (width // grid_x)
    num_tiles_y = (height // grid_y)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Everything below is identical for every tile, so look it up once.
    driver = gdal.GetDriverByName("GTiff")
    options = ['COMPRESS=DEFLATE', 'PREDICTOR=2', 'TILED=YES']
    data_type = ds.GetRasterBand(1).DataType
    projection = ds.GetProjection()
    src_gt = ds.GetGeoTransform()

    # Iterate over each tile and save as a separate TIFF image
    for xmin, xmax, ymin, ymax in roi_matrix:
        for i in tqdm(range(int(xmin*num_tiles_x),int(xmax*num_tiles_x))):
            for j in range(int(ymin*num_tiles_y),int(ymax*num_tiles_y)):
                x_offset = i *  grid_x
                y_offset = j *  grid_y

                tile_width = min(grid_x, width - x_offset)
                tile_height = min(grid_y, height - y_offset)

                # Read every band of the tile window before creating the output
                tile = [
                    ds.GetRasterBand(band).ReadAsArray(x_offset, y_offset, tile_width, tile_height)
                    for band in range(1, num_bands + 1)
                ]

                # Create an output TIFF file with same CRS and band values range
                output_file = os.path.join(output_dir, f"tile_{i}_{j}.tif")
                out_ds = driver.Create(output_file, tile_width, tile_height, num_bands,
                                       data_type, options=options)
                if out_ds is None:
                    raise RuntimeError(f"GDAL could not create tile: {output_file}")

                # Move the origin to the tile's top-left pixel. The full affine
                # form is used so rasters with rotation terms are handled too;
                # for north-up rasters (terms 2 and 4 are zero) this equals the
                # plain offset * pixel-size shift.
                geotransform = list(src_gt)
                geotransform[0] = src_gt[0] + x_offset * src_gt[1] + y_offset * src_gt[2]
                geotransform[3] = src_gt[3] + x_offset * src_gt[4] + y_offset * src_gt[5]
                out_ds.SetGeoTransform(tuple(geotransform))

                # Set the projection
                out_ds.SetProjection(projection)

                # Write each band to the output file
                for band, band_data in enumerate(tile, start=1):
                    out_ds.GetRasterBand(band).WriteArray(band_data)

                # Dropping the reference closes the output file and flushes it to disk
                out_ds = None

    # Release the source dataset
    ds = None

    print("Tiles generation completed.")

In [4]:
# Square 512 px tiles cut from the post event mosaic.
grid_x = grid_y = 512
input_file = "./Post_Event_San_Juan.tif"
output_dir = "./Post_Event_Sampling_In_TIFF"
# Tiling is left disabled here; uncomment to regenerate the tiles.
# sample_tiles(input_file, output_dir,grid_x,grid_y,selected_cells)

In [5]:
def visualize_samples(input_file,output_dir,grid_x,grid_y,roi):
    """Plot, in a single column, the already generated tiles of one region.

    The tile index range is derived from ``roi`` exactly as in the tiling
    step: each bound is multiplied by the number of whole tiles along its
    axis (``raster size // grid size``) and truncated with ``int``. The
    matching ``tile_{i}_{j}.tif`` files are then read from ``output_dir``.

    Parameters
    ----------
    input_file : str
        Path of the source raster; only its size is read.
    output_dir : str
        Directory that holds the ``tile_{i}_{j}.tif`` files.
    grid_x, grid_y : int
        Tile width and height in pixels used when the tiles were generated.
    roi : (xmin, xmax, ymin, ymax)
        A single region of interest.

    Raises
    ------
    RuntimeError
        If the input raster cannot be opened.
    """
    ds = gdal.Open(input_file)
    # Without GDAL exceptions enabled, a failed open returns None instead of raising.
    if ds is None:
        raise RuntimeError(f"GDAL could not open raster: {input_file}")

    # Only the raster size is needed to rebuild the tile grid
    num_tiles_x = (ds.RasterXSize // grid_x)
    num_tiles_y = (ds.RasterYSize // grid_y)
    ds = None

    xmin, xmax, ymin, ymax = roi
    tif_files = [
        os.path.join(output_dir, f"tile_{i}_{j}.tif")
        for i in range(int(xmin*num_tiles_x),int(xmax*num_tiles_x))
        for j in range(int(ymin*num_tiles_y),int(ymax*num_tiles_y))
    ]

    # plt.subplots rejects zero rows, so there is nothing to draw in that case.
    if not tif_files:
        print("No tiles fall inside the requested region.")
        return

    # squeeze=False always returns a 2-D array of axes, so a region holding a
    # single tile is indexed the same way as one holding many.
    fig, axes = plt.subplots(len(tif_files), 1, figsize=(6, 2 * len(tif_files)),
                             squeeze=False)

    # Loop through each .tif file and plot it along the column
    for axis, tif_file in zip(axes[:, 0], tif_files):
        # Load the image using PIL; the context manager closes the file handle
        with Image.open(tif_file) as tile_image:
            axis.imshow(tile_image)
        axis.set_title(tif_file)
        axis.axis('off')

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()

In [6]:
# Same raster, tile folder and tile size that were used for post event tiling.
grid_x = grid_y = 512
input_file = "./Post_Event_San_Juan.tif"
output_dir = "./Post_Event_Sampling_In_TIFF"
# Visualisation is left disabled here; uncomment to inspect one sampled region.
# visualize_samples(input_file,output_dir,grid_x,grid_y,selected_cells[4])

In [7]:
# Read the CSV file
csv_path = './post_event_annotation_selected_tiles.csv'
df = pd.read_csv(csv_path)

# pd.read_csv already consumes the header line as the column names, so every
# row of df is a real tile. Slicing with df.iloc[1:] (as done previously)
# silently dropped the first tile from the analysis.
# NOTE(review): split CSVs saved earlier were presumably produced without that
# first row - re-running the split below will yield different sets.
data = df.copy()

# Split the data into two sets randomly
# akash_set, mike_set = train_test_split(data, test_size=0.5, random_state=42)

In [8]:
# akash_set.to_csv('./akash_set_post_event_512_512.csv', index=False)

In [9]:
# mike_set.to_csv('./mike_set_post_event_512_512.csv', index=False)

Manually selected sampling cells from the pre event image have the following x and y axis values, measured in pixels on a screenshot of the image. The top left corner of the image is the origin, positive x points to the right and positive y points down. After dividing by the screenshot width and height (done in the next cell), the bottom right corner is (1,1).
1. xmin: 195, xmax: 319, ymin: 261, ymax: 334
2. xmin: 135, xmax: 213, ymin: 415, ymax: 481
3. xmin: 128, xmax: 206, ymin: 557, ymax: 627
4. xmin: 389, xmax: 466, ymin: 546, ymax: 618
5. xmin: 268, xmax: 345, ymin: 620, ymax: 691
6. xmin: 248, xmax: 325, ymin: 720, ymax: 795

In [10]:
#each cell corresponds to the xmin, xmax, ymin, ymax value taken from above
pre_event_selected_cells = [[195, 319, 261, 334],
                  [135, 213, 415, 481],
                  [128, 206, 557, 627],
                  [389,466,546,618],
                  [268,345,620,691],
                           [248,325,720,795]] 
x_limit = 636 #inferred from the pre event image screenshot
y_limit = 1240 #inferred from the pre event image screenshot
pre_event_selected_cells = [[xmin/x_limit, xmax/x_limit, ymin/y_limit, ymax/y_limit] 
                  for xmin, xmax, ymin, ymax in pre_event_selected_cells]

In [11]:
# Square 512 px tiles cut from the pre event mosaic.
grid_x = grid_y = 512
input_file = "./Pre_Event_San_Juan.tif"
output_dir = "./Pre_Event_Sampling_In_TIFF"
# Tiling is left disabled here; uncomment to regenerate the tiles.
# sample_tiles(input_file, output_dir,grid_x,grid_y,pre_event_selected_cells)

In [12]:
# Same raster, tile folder and tile size that were used for pre event tiling.
grid_x = grid_y = 512
input_file = "./Pre_Event_San_Juan.tif"
output_dir = "./Pre_Event_Sampling_In_TIFF"
# Visualisation is left disabled here; uncomment to inspect one sampled region.
# visualize_samples(input_file,output_dir,grid_x,grid_y,pre_event_selected_cells[1])

In [21]:
def convert_tiff_to_jpeg_and_rename(input_dir,output_dir,dataset_path,prefix,start):
    """Convert the tiles listed in a CSV to sequentially numbered JPEG files.

    The first line of the CSV is skipped as a header. Every following row
    must hold exactly two integers ``i, j``; the file ``tile_{i}_{j}.tif`` is
    read from ``input_dir``, converted to RGB when it is in another mode, and
    saved in ``output_dir`` as ``{prefix}{number:04}.jpg``. ``number`` begins
    at ``start`` and increases by one per converted tile.

    Parameters
    ----------
    input_dir : str
        Directory containing the ``tile_{i}_{j}.tif`` files.
    output_dir : str
        Directory receiving the JPEG files; created if it does not exist.
    dataset_path : str
        Path of the CSV file listing the tile indices.
    prefix : str
        Text placed in front of the running number in each output name.
    start : int
        First running number.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(output_dir, exist_ok=True)

    number = start

    file_extension = '.jpg'

    # newline='' is the mode the csv module documents for reading files
    with open(dataset_path, 'r', newline='') as csvfile:
        csv_reader = csv.reader(csvfile)
        # Skip the header line
        next(csv_reader, None)
        for row in tqdm(csv_reader, desc="Processing Dataset"):
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # skip them instead of failing on the unpack below.
            if not row:
                continue
            i, j = map(int, row)
            tiff_file_path = os.path.join(input_dir, f"tile_{i}_{j}.tif")

            output_filename = f"{prefix}{number:04}{file_extension}"

            # The context manager releases the TIFF file handle after saving
            with Image.open(tiff_file_path) as tile_image:
                # check if image is RGB mode, if not convert it
                rgb_image = tile_image if tile_image.mode == 'RGB' else tile_image.convert('RGB')
                rgb_image.save(os.path.join(output_dir, output_filename), 'JPEG')

            number += 1

    print("Conversion from TIFF to JPEG completed.")

In [22]:
# Convert the tiles of the akash post event split into numbered JPEGs.
prefix = 'Post_Event_'
start = 0
dataset_path = './akash_set_post_event_512_512.csv'
input_dir = './Post_Event_Sampling_In_TIFF/'
output_dir = './Post_Event_Samples_Akash_512_512'
convert_tiff_to_jpeg_and_rename(
    input_dir=input_dir,
    output_dir=output_dir,
    dataset_path=dataset_path,
    prefix=prefix,
    start=start,
)

Processing Dataset: 46it [00:00, 52.70it/s]

Conversion from TIFF to JPEG completed.





For the images that are being annotated, we now generate the same tiles again, this time with the building footprints overlaid on them, so that the annotation process becomes easier.

In [23]:
def sample_tiles_with_footprints(input_file,output_dir,grid_x,grid_y,dataset_path,shapefile_path,start,prefix):
    """Write the tiles listed in a CSV as GeoTIFFs with footprints burned in.

    The first line of the CSV is skipped as a header. Every following row
    must hold exactly two integers ``i, j`` naming a grid cell of
    ``grid_x`` x ``grid_y`` pixels. The cell is read from ``input_file`` and
    written to ``output_dir`` as ``{prefix}{number:04}.tif`` with the source
    projection and a geotransform shifted to the tile's top-left corner.
    The shapefile layer is then rasterized into band 1 of the tile with the
    value 255 (``ALL_TOUCHED=TRUE``), which overwrites the band 1 pixel
    values under the geometries. ``number`` begins at ``start`` and increases
    by one per written tile.

    Parameters
    ----------
    input_file : str
        Path of the raster to cut the tiles from.
    output_dir : str
        Directory receiving the tiles; created if it does not exist.
    grid_x, grid_y : int
        Tile width and height in pixels.
    dataset_path : str
        Path of the CSV file listing the tile indices.
    shapefile_path : str
        Path of the vector file holding the building footprints.
    start : int
        First running number.
    prefix : str
        Text placed in front of the running number in each output name.

    Raises
    ------
    RuntimeError
        If the raster or the shapefile cannot be opened, or a tile cannot be
        created.
    ValueError
        If a CSV row addresses a tile that starts outside the raster.
    """
    ds = gdal.Open(input_file)
    # Without GDAL exceptions enabled, a failed open returns None instead of raising.
    if ds is None:
        raise RuntimeError(f"GDAL could not open raster: {input_file}")

    # Get image size and number of bands
    width = ds.RasterXSize
    height = ds.RasterYSize
    num_bands = ds.RasterCount

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Open the shapefile and get the layer
    shapefile_ds = ogr.Open(shapefile_path)
    if shapefile_ds is None:
        raise RuntimeError(f"OGR could not open shapefile: {shapefile_path}")
    shp_layer = shapefile_ds.GetLayer()
    # NOTE(review): the layer is rasterized as-is, so it presumably has to be
    # in the same CRS as the raster - confirm for the provided footprints.

    # Everything below is identical for every tile, so look it up once.
    driver = gdal.GetDriverByName("GTiff")
    options = ['COMPRESS=DEFLATE', 'PREDICTOR=2', 'TILED=YES']
    data_type = ds.GetRasterBand(1).DataType
    projection = ds.GetProjection()
    src_gt = ds.GetGeoTransform()

    number = start

    file_extension = '.tif'

    # newline='' is the mode the csv module documents for reading files
    with open(dataset_path, 'r', newline='') as csvfile:

        csv_reader = csv.reader(csvfile)
        # Skip the header line
        next(csv_reader, None)

        for row in tqdm(csv_reader, desc="Processing Dataset"):
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on the unpack below.
            if not row:
                continue
            i, j = map(int, row)
            x_offset = i *  grid_x
            y_offset = j *  grid_y
            if not (0 <= x_offset < width and 0 <= y_offset < height):
                raise ValueError(f"Tile ({i}, {j}) starts outside the {width}x{height} raster")
            tile_width = min(grid_x, width - x_offset)
            tile_height = min(grid_y, height - y_offset)

            # Read every band of the tile window before creating the output
            tile = [
                ds.GetRasterBand(band).ReadAsArray(x_offset, y_offset, tile_width, tile_height)
                for band in range(1, num_bands + 1)
            ]
            output_file = os.path.join(output_dir, f"{prefix}{number:04}{file_extension}")

            # Create an output TIFF file with same CRS and band values range
            out_ds = driver.Create(output_file, tile_width, tile_height, num_bands,
                                   data_type, options=options)
            if out_ds is None:
                raise RuntimeError(f"GDAL could not create tile: {output_file}")

            # Move the origin to the tile's top-left pixel. The full affine
            # form is used so rasters with rotation terms are handled too; for
            # north-up rasters (terms 2 and 4 are zero) this equals the plain
            # offset * pixel-size shift.
            geotransform = list(src_gt)
            geotransform[0] = src_gt[0] + x_offset * src_gt[1] + y_offset * src_gt[2]
            geotransform[3] = src_gt[3] + x_offset * src_gt[4] + y_offset * src_gt[5]
            out_ds.SetGeoTransform(tuple(geotransform))

            out_ds.SetProjection(projection)

            # Write each band to the output file
            for band, band_data in enumerate(tile, start=1):
                out_ds.GetRasterBand(band).WriteArray(band_data)

            # Burn the footprints into band 1 so they stand out on the tile
            gdal.RasterizeLayer(out_ds, [1], shp_layer, burn_values=[255], options=["ALL_TOUCHED=TRUE"])

            # Dropping the reference closes the output file and flushes it to disk
            out_ds = None

            number += 1

    # Release the source datasets
    shp_layer = None
    shapefile_ds = None
    ds = None

    print("Generating sampled tiles with footprints overlaid completed.")

In [24]:
# Regenerate the akash post event split as GeoTIFF tiles with footprints burned in.
grid_x = grid_y = 512
prefix = 'Post_Event_'
start = 0
input_file = './Post_Event_San_Juan.tif'
output_dir = './Post_Event_Samples_Akash_512_512/'
dataset_path = './akash_set_post_event_512_512.csv'
shapefile_path = './provided_footprints/building_footprint_roi_challenge.shp'
sample_tiles_with_footprints(
    input_file=input_file,
    output_dir=output_dir,
    grid_x=grid_x,
    grid_y=grid_y,
    dataset_path=dataset_path,
    shapefile_path=shapefile_path,
    start=start,
    prefix=prefix,
)

Processing Dataset: 46it [01:51,  2.42s/it]

Generating sampled tiles with footprints overlaid completed.





In [26]:
def map_post_samples_to_pre(input_file,output_dir,grid_x,grid_y,dataset_path,start,prefix):
    """Write the tiles listed in a CSV as sequentially numbered GeoTIFFs.

    The first line of the CSV is skipped as a header. Every following row
    must hold exactly two integers ``i, j`` naming a grid cell of
    ``grid_x`` x ``grid_y`` pixels. The cell is read from ``input_file`` and
    written to ``output_dir`` as ``{prefix}{number:04}.tif`` with the source
    projection and a geotransform shifted to the tile's top-left corner.
    ``number`` begins at ``start`` and increases by one per written tile.

    NOTE(review): tiles are located purely by pixel offset. When the CSV
    indices come from a different raster (e.g. post event indices applied to
    the pre event image), this presumably requires both rasters to share the
    same origin, resolution and size - confirm before relying on the pairing.

    Parameters
    ----------
    input_file : str
        Path of the raster to cut the tiles from.
    output_dir : str
        Directory receiving the tiles; created if it does not exist.
    grid_x, grid_y : int
        Tile width and height in pixels.
    dataset_path : str
        Path of the CSV file listing the tile indices.
    start : int
        First running number.
    prefix : str
        Text placed in front of the running number in each output name.

    Raises
    ------
    RuntimeError
        If the raster cannot be opened or a tile cannot be created.
    ValueError
        If a CSV row addresses a tile that starts outside the raster.
    """
    ds = gdal.Open(input_file)
    # Without GDAL exceptions enabled, a failed open returns None instead of raising.
    if ds is None:
        raise RuntimeError(f"GDAL could not open raster: {input_file}")

    # Get image size and number of bands
    width = ds.RasterXSize
    height = ds.RasterYSize
    num_bands = ds.RasterCount

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Everything below is identical for every tile, so look it up once.
    driver = gdal.GetDriverByName("GTiff")
    options = ['COMPRESS=DEFLATE', 'PREDICTOR=2', 'TILED=YES']
    data_type = ds.GetRasterBand(1).DataType
    projection = ds.GetProjection()
    src_gt = ds.GetGeoTransform()

    number = start

    file_extension = '.tif'

    # newline='' is the mode the csv module documents for reading files
    with open(dataset_path, 'r', newline='') as csvfile:

        csv_reader = csv.reader(csvfile)
        # Skip the header line
        next(csv_reader, None)

        for row in tqdm(csv_reader, desc="Processing Dataset"):
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on the unpack below.
            if not row:
                continue
            i, j = map(int, row)
            x_offset = i *  grid_x
            y_offset = j *  grid_y
            if not (0 <= x_offset < width and 0 <= y_offset < height):
                raise ValueError(f"Tile ({i}, {j}) starts outside the {width}x{height} raster")
            tile_width = min(grid_x, width - x_offset)
            tile_height = min(grid_y, height - y_offset)

            # Read every band of the tile window before creating the output
            tile = [
                ds.GetRasterBand(band).ReadAsArray(x_offset, y_offset, tile_width, tile_height)
                for band in range(1, num_bands + 1)
            ]
            output_file = os.path.join(output_dir, f"{prefix}{number:04}{file_extension}")

            # Create an output TIFF file with same CRS and band values range
            out_ds = driver.Create(output_file, tile_width, tile_height, num_bands,
                                   data_type, options=options)
            if out_ds is None:
                raise RuntimeError(f"GDAL could not create tile: {output_file}")

            # Move the origin to the tile's top-left pixel. The full affine
            # form is used so rasters with rotation terms are handled too; for
            # north-up rasters (terms 2 and 4 are zero) this equals the plain
            # offset * pixel-size shift.
            geotransform = list(src_gt)
            geotransform[0] = src_gt[0] + x_offset * src_gt[1] + y_offset * src_gt[2]
            geotransform[3] = src_gt[3] + x_offset * src_gt[4] + y_offset * src_gt[5]
            out_ds.SetGeoTransform(tuple(geotransform))

            out_ds.SetProjection(projection)

            # Write each band to the output file
            for band, band_data in enumerate(tile, start=1):
                out_ds.GetRasterBand(band).WriteArray(band_data)

            # Dropping the reference closes the output file and flushes it to disk
            out_ds = None

            number += 1

    # Release the source dataset
    ds = None

    print("Generated pre event tiles for post event samples.")

In [27]:
# Cut the pre event counterparts of the akash post event split.
grid_x = grid_y = 512
prefix = 'Pre_Event_'
start = 0
input_file = './Pre_Event_San_Juan.tif'
output_dir = './Post_Event_Samples_Akash_512_512/'
dataset_path = './akash_set_post_event_512_512.csv'
map_post_samples_to_pre(
    input_file=input_file,
    output_dir=output_dir,
    grid_x=grid_x,
    grid_y=grid_y,
    dataset_path=dataset_path,
    start=start,
    prefix=prefix,
)

Processing Dataset: 46it [00:01, 26.05it/s]

Generated pre event tiles for post event samples.





In [29]:
# !tar -cvf Post_Event_Samples_Akash_512_512.tar ./Post_Event_Samples_Akash_512_512/