# Preprocessing

In [1]:
import os
import pandas as pd

In [2]:
def count_pre_images(directory, df):
    """Print, per disaster type, how many ``<uid>_pre.png`` files are in ``directory``.

    Parameters
    ----------
    directory : str
        Folder that is listed (non-recursively) for ``.png`` files.
    df : pandas.DataFrame
        Metadata with at least the columns ``uid`` and ``disaster``.

    Notes
    -----
    Only names that split on ``_`` into exactly two parts (``<uid>_<stage>.png``)
    are considered; files whose uid does not occur in ``df`` are ignored.
    The function prints its result and returns ``None``.
    """
    disaster_counts = {disaster: 0 for disaster in df['disaster'].unique()}

    # Build the uid -> disaster lookup once instead of scanning the whole
    # DataFrame for every file. keep='first' mirrors "take the first matching row".
    first_rows = df.drop_duplicates(subset='uid', keep='first')
    disaster_by_uid = dict(zip(first_rows['uid'], first_rows['disaster']))

    for filename in os.listdir(directory):
        if not filename.endswith('.png'):
            continue
        parts = filename.split('_')
        if len(parts) != 2:
            continue
        uid, stage = parts
        # Drop the extension: "pre.png" -> "pre".
        if stage.split('.')[0] != "pre":
            continue
        if uid in disaster_by_uid:
            disaster_counts[disaster_by_uid[uid]] += 1

    for disaster, count in disaster_counts.items():
        print(f"Disaster type: {disaster}, Pre images: {count}")

In [3]:
def count_post_images(directory, df):
    """Print, per disaster type, how many ``<uid>_post.png`` files are in ``directory``.

    Parameters
    ----------
    directory : str
        Folder that is listed (non-recursively) for ``.png`` files.
    df : pandas.DataFrame
        Metadata with at least the columns ``uid`` and ``disaster``.

    Notes
    -----
    Only names that split on ``_`` into exactly two parts (``<uid>_<stage>.png``)
    are considered; files whose uid does not occur in ``df`` are ignored.
    The function prints its result and returns ``None``.
    """
    disaster_counts = {disaster: 0 for disaster in df['disaster'].unique()}

    # Build the uid -> disaster lookup once instead of scanning the whole
    # DataFrame for every file. keep='first' mirrors "take the first matching row".
    first_rows = df.drop_duplicates(subset='uid', keep='first')
    disaster_by_uid = dict(zip(first_rows['uid'], first_rows['disaster']))

    for filename in os.listdir(directory):
        if not filename.endswith('.png'):
            continue
        parts = filename.split('_')
        if len(parts) != 2:
            continue
        uid, stage = parts
        # Drop the extension: "post.png" -> "post".
        if stage.split('.')[0] != "post":
            continue
        if uid in disaster_by_uid:
            disaster_counts[disaster_by_uid[uid]] += 1

    for disaster, count in disaster_counts.items():
        print(f"Disaster type: {disaster}, Post images: {count}")

In [None]:
# Folder holding the cropped building PNGs that count_pre_images() lists.
directory = './tier1/cropped_square_buildings/'
# Metadata table; count_pre_images() reads its 'uid' and 'disaster' columns.
df = pd.read_csv('building_polygons_metadata.csv')
# Prints one "Pre images" line per disaster type found in the metadata.
count_pre_images(directory, df)

# Image Generator

In [1]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
from torchvision import transforms
import os
from PIL import Image
import tifffile as tiff

In [2]:
class ResidualBlock(nn.Module):
    """Residual unit: conv -> instance norm -> ReLU -> conv -> instance norm, plus identity skip.

    Both convolutions are 3x3 with stride 1 and padding 1 and keep the channel
    count, so the block's output has the same shape as its input.
    """

    def __init__(self, channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1, stride=1)
        stages = [
            nn.Conv2d(channels, channels, **conv_kwargs),
            nn.InstanceNorm2d(channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, **conv_kwargs),
            nn.InstanceNorm2d(channels),
        ]
        # The attribute name and layer order determine the state_dict keys.
        self.conv_block = nn.Sequential(*stages)

    def forward(self, x):
        residual = self.conv_block(x)
        return x + residual

class Generator(nn.Module):
    """Convolutional encoder / residual / decoder generator conditioned on a label vector.

    ``forward`` tiles the label over the spatial grid and concatenates it to
    the image along the channel axis, so ``input_channels`` must equal the
    image channels plus the label length (default 9 = 3 RGB + 6 label entries).
    The final layer is a Tanh, so outputs lie in [-1, 1].
    """

    def __init__(self, input_channels=9):  # 3 for RGB + 6 for disaster label ## CHANGE FOR DIFFERENT LABELING TECHNIQUE
        super().__init__()

        def norm_relu(conv, width):
            # conv -> instance norm -> ReLU, in that order (Sequential indices 0/1/2).
            return nn.Sequential(conv, nn.InstanceNorm2d(width), nn.ReLU(inplace=True))

        # Layer 1: 7x7 stem, stride 1
        self.layer1 = norm_relu(
            nn.Conv2d(input_channels, 64, kernel_size=7, stride=1, padding=3), 64
        )

        # Layers 2-3: stride-2 convolutions
        self.layer2 = norm_relu(
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1), 128
        )
        self.layer3 = norm_relu(
            nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1), 256
        )

        # Layers 4-9: six residual blocks at 256 channels
        self.residual_blocks = nn.Sequential(*(ResidualBlock(256) for _ in range(6)))

        # Layers 10-11: stride-2 transposed convolutions
        self.layer10 = norm_relu(
            nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1), 128
        )
        self.layer11 = norm_relu(
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1), 64
        )

        # Layer 12: 7x7 projection to 3 channels followed by Tanh
        self.layer12 = nn.Sequential(
            nn.Conv2d(64, 3, kernel_size=7, stride=1, padding=3),
            nn.Tanh(),
        )

    def forward(self, x, disaster_label):
        # Reshape the label to (batch, label_len, 1, 1) and tile it to x's spatial size.
        label_map = disaster_label.view(disaster_label.size(0), -1, 1, 1)
        label_map = label_map.repeat(1, 1, x.size(2), x.size(3))
        features = torch.cat([x, label_map], dim=1)

        pipeline = (
            self.layer1,
            self.layer2,
            self.layer3,
            self.residual_blocks,
            self.layer10,
            self.layer11,
            self.layer12,
        )
        for stage in pipeline:
            features = stage(features)

        return features

In [3]:
# Holds at most one loaded generator, keyed by (absolute path, device, file mtime).
_GENERATOR_CACHE = {}


def _load_generator(checkpoint_path, device):
    """Return an eval-mode ``Generator`` with weights from ``checkpoint_path``, reusing a cached copy."""
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}")

    # The mtime is part of the key so an overwritten checkpoint file is reloaded.
    cache_key = (
        os.path.abspath(checkpoint_path),
        str(device),
        os.path.getmtime(checkpoint_path),
    )
    generator = _GENERATOR_CACHE.get(cache_key)
    if generator is None:
        generator = Generator().to(device)
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        checkpoint = torch.load(checkpoint_path, map_location=device)
        generator.load_state_dict(checkpoint['generator_state_dict'])
        generator.eval()
        _GENERATOR_CACHE.clear()  # keep a single model resident
        _GENERATOR_CACHE[cache_key] = generator
    return generator


def generate_fake_image(pre_image, disaster_onehot, checkpoint_path, device, show=True):
    """Run the generator on one image and optionally plot input and output side by side.

    Parameters
    ----------
    pre_image : torch.Tensor
        A single image tensor without batch dimension; a batch axis is added here.
        Presumably (3, H, W) in [0, 1] as produced by ``load_pre_image`` -- confirm.
    disaster_onehot : torch.Tensor
        Label vector for this image (no batch dimension).
    checkpoint_path : str
        File containing a dict with a ``'generator_state_dict'`` entry.
    device : torch.device or str
        Device on which the model is run.
    show : bool, default True
        When True, display the input and the generated image with matplotlib.
        Pass False inside long loops to avoid creating one figure per image.

    Returns
    -------
    torch.Tensor
        The generator output with the batch dimension kept, on ``device``.

    Raises
    ------
    FileNotFoundError
        If ``checkpoint_path`` does not exist.
    """
    # Loading the checkpoint is the expensive part; reuse the model across calls.
    generator = _load_generator(checkpoint_path, device)

    pre_image = pre_image.to(device).unsqueeze(0)
    disaster_onehot = disaster_onehot.to(device).unsqueeze(0)

    with torch.no_grad():
        fake_image = generator(pre_image, disaster_onehot)

    if show:
        pre_image_np = pre_image.squeeze(0).cpu().permute(1, 2, 0).numpy()
        pre_image_np = (pre_image_np * 255).clip(0, 255).astype('uint8')

        # NOTE(review): the generator ends in Tanh, so values may be negative; the
        # x255 + clip below renders anything < 0 as black. Confirm this matches
        # the value range used during training.
        fake_image_np = fake_image.squeeze(0).cpu().permute(1, 2, 0).numpy()
        fake_image_np = (fake_image_np * 255).clip(0, 255).astype('uint8')

        plt.figure(figsize=(8, 4))
        plt.subplot(1, 2, 1)
        plt.imshow(pre_image_np)
        plt.title("Original Pre Image")
        plt.axis('off')

        plt.subplot(1, 2, 2)
        plt.imshow(fake_image_np)
        plt.title("Generated Fake Image")
        plt.axis('off')

        plt.tight_layout()
        plt.show()

    return fake_image

In [4]:
def load_pre_image(image_path, size=(256, 256)):
    """Load an image file as an RGB float tensor resized to ``size``.

    Parameters
    ----------
    image_path : str
        Path of the image file to open.
    size : tuple of int, default (256, 256)
        Target ``(height, width)`` handed to ``transforms.Resize``.

    Returns
    -------
    torch.Tensor
        The result of ``transforms.ToTensor`` applied to the resized RGB image.
    """
    transform = transforms.Compose([
        transforms.Resize(size),
        transforms.ToTensor(),
    ])
    # The context manager releases the file handle even if conversion fails.
    with Image.open(image_path) as image:
        return transform(image.convert('RGB'))

In [5]:
def one_hot_disaster(disaster_name):
    """Encode a disaster name as a one-hot float tensor of length 6.

    Index order: flooding, wind, earthquake, tsunami, fire, volcano.

    Raises
    ------
    ValueError
        If ``disaster_name`` is not one of the six known names.
    """
    ordered_names = ('flooding', 'wind', 'earthquake', 'tsunami', 'fire', 'volcano')
    position = {name: idx for idx, name in enumerate(ordered_names)}

    if disaster_name not in position:
        raise ValueError(f"Invalid disaster name: {disaster_name}. Must be one of {list(position.keys())}")

    encoded = torch.zeros(len(position))
    encoded[position[disaster_name]] = 1.0
    return encoded

In [6]:
def save_generated_image(generated_image, output_path):
    """Write a generated image tensor to ``output_path`` and print a confirmation.

    Parameters
    ----------
    generated_image : torch.Tensor
        Image tensor; a leading dimension of size 1 is squeezed away before
        conversion. Presumably (1, 3, H, W) as returned by the generator -- confirm.
    output_path : str
        Destination file; the image format is chosen by PIL from the extension.
    """
    image_tensor = generated_image.detach().squeeze(0).cpu()
    # ToPILImage scales float tensors by 255 and casts to uint8; values outside
    # [0, 1] (possible after a Tanh output layer) would wrap around in that cast.
    if image_tensor.is_floating_point():
        image_tensor = image_tensor.clamp(0.0, 1.0)
    pil_image = transforms.ToPILImage()(image_tensor)

    pil_image.save(output_path)
    print(f"Saved generated image: {output_path}")

In [7]:
# Checkpoint file; generate_fake_image() reads its 'generator_state_dict' entry.
checkpoint_path = 'checkpoints/latest.pth'
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebalancing

In [11]:
def rebalance_disaster_classes(directory, df, checkpoint_path, output_directory, device='cuda'):
    """Generate fake post-disaster images so every disaster type reaches the largest post count.

    Parameters
    ----------
    directory : str
        Folder with the ``<uid>_<stage>.png`` crops; its ``*_pre.png`` files are
        the generator inputs.
    df : pandas.DataFrame
        Metadata with the columns ``uid``, ``stage`` and ``disaster``. It is not modified.
    checkpoint_path : str
        Generator checkpoint forwarded to ``generate_fake_image``.
    output_directory : str
        Folder for the generated images (created if missing). Files are named
        ``<uid>_post_<disaster>.png``.
    device : torch.device or str, default 'cuda'
        Device forwarded to ``generate_fake_image``.

    Notes
    -----
    * Target counts come from the metadata rows with ``stage == 'post'``, not
      from the files actually present in ``directory``.
    * For each under-represented disaster type, pre images of the *other*
      disaster types are used as inputs.
    * The output name depends only on (uid, disaster), so each input can yield
      at most one distinct file per disaster type; generation stops once every
      available pre image has been used.
    """
    # Get the post image counts for each disaster type
    post_counts = df[df['stage'] == 'post'].groupby('disaster').size()
    max_post_count = post_counts.max()

    os.makedirs(output_directory, exist_ok=True)

    # One uid -> disaster lookup instead of a DataFrame scan per file.
    first_rows = df.drop_duplicates(subset='uid', keep='first')
    disaster_by_uid = dict(zip(first_rows['uid'], first_rows['disaster']))

    # Pre-image filenames grouped by the disaster type of their uid.
    pre_images = {disaster: [] for disaster in df['disaster'].unique()}
    for filename in sorted(os.listdir(directory)):  # sorted for a reproducible order
        if not filename.endswith('.png'):
            continue
        parts = filename.split('_')
        if len(parts) != 2:
            continue
        uid, stage = parts
        if stage.split('.')[0] != "pre":
            continue
        if uid not in disaster_by_uid:
            continue  # file without a metadata row: nothing to group it under
        pre_images.setdefault(disaster_by_uid[uid], []).append(filename)

    # Generate fake post images for underrepresented disaster types
    for disaster_type, count in post_counts.items():
        if count >= max_post_count:
            continue

        # Collect pre-images from other disaster types
        available_pre_images = [
            filename
            for other_disaster, filenames in pre_images.items()
            if other_disaster != disaster_type
            for filename in filenames
        ]

        needed_fake_images = max_post_count - count
        print(f"Generating {needed_fake_images} fake post images for {disaster_type} using pre images from other disasters.")

        if not available_pre_images:
            print(f"No pre images from other disasters found for {disaster_type}; skipping.")
            continue

        # Reusing an input would regenerate and overwrite the same output file,
        # so never go past the number of distinct inputs.
        to_generate = min(int(needed_fake_images), len(available_pre_images))
        if to_generate < needed_fake_images:
            print(f"Only {to_generate} distinct pre images are available for {disaster_type}.")

        disaster_onehot = one_hot_disaster(disaster_type)

        for pre_image_filename in available_pre_images[:to_generate]:
            pre_image_path = os.path.join(directory, pre_image_filename)
            # generate_fake_image expects tensors, not a path and a name.
            pre_image = load_pre_image(pre_image_path)
            generated_image = generate_fake_image(pre_image, disaster_onehot, checkpoint_path, device)

            # Extract the 'uid' from the pre-image filename
            uid = pre_image_filename.split('_')[0]
            fake_post_image_filename = f"{uid}_post_{disaster_type}.png"

            output_path = os.path.join(output_directory, fake_post_image_filename)
            save_generated_image(generated_image, output_path)

In [12]:
directory = './tier1/cropped_square_buildings/'
df = pd.read_csv('building_polygons_metadata.csv')
checkpoint_path = 'checkpoints/latest.pth'
output_directory = './tier1/BulidingGAN_Generated/BuildingGAN_latest/'
# Pass the device explicitly: the function's default is the literal 'cuda',
# which fails on a machine without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

rebalance_disaster_classes(directory, df, checkpoint_path, output_directory, device=device)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x14db6b02d100>>
Traceback (most recent call last):
  File "/ext3/miniconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 

# Rebalanced Data

In [None]:
def recheck_disaster_value_counts(csv_path, original_images_dict, generated_images_dict):
    """Print the image count per disaster type after adding generated images.

    Parameters
    ----------
    csv_path : str
        Metadata CSV with the columns ``uid``, ``stage`` and ``disaster``.
    original_images_dict : dict
        Maps disaster type -> iterable of original image filenames.
    generated_images_dict : dict
        Maps disaster type -> iterable of generated image filenames.

    Notes
    -----
    Filenames that match a metadata row (``<uid>_<stage>.png``) are counted under
    that row's ``disaster`` value. Generated filenames that have no metadata row
    are counted under the dictionary key they are listed under; without this,
    they would be dropped by the metadata filter and never show up in the totals.
    Counts are printed in descending order; nothing is returned.
    """
    df = pd.read_csv(csv_path)
    metadata_filenames = df['uid'].astype(str) + "_" + df['stage'].astype(str) + ".png"

    original_filenames = {name for names in original_images_dict.values() for name in names}
    generated_filenames = {name for names in generated_images_dict.values() for name in names}

    # Images described by the metadata: the disaster type comes from the CSV row.
    described = df[metadata_filenames.isin(original_filenames | generated_filenames)]
    totals = {
        disaster: int(count)
        for disaster, count in described['disaster'].value_counts().items()
    }

    # Generated images without a metadata row: the disaster type is the dict key.
    known_names = set(metadata_filenames)
    for disaster_type, filenames in generated_images_dict.items():
        unlisted = {name for name in filenames if name not in known_names}
        if unlisted:
            totals[disaster_type] = totals.get(disaster_type, 0) + len(unlisted)

    # sorted() is stable, so ties keep the value_counts order.
    for disaster_type, count in sorted(totals.items(), key=lambda item: item[1], reverse=True):
        print(f"Disaster type: {disaster_type}, Total post images: {count}")


# Smaller dataset

In [1]:
def get_disaster_counts(csv_path, image_dict):
    """Count post-stage metadata rows per disaster type whose image file is listed.

    Parameters
    ----------
    csv_path : str
        Metadata CSV with the columns ``uid``, ``stage`` and ``disaster``.
    image_dict : collection of str
        Filenames to keep; a row counts when ``<uid>_<stage>.png`` is in it.

    Returns
    -------
    pandas.Series
        ``value_counts`` of the ``disaster`` column over the kept rows.
    """
    metadata = pd.read_csv(csv_path)
    names = metadata['uid'].astype(str) + "_" + metadata['stage'].astype(str) + ".png"
    keep = (metadata['stage'] == 'post') & names.isin(image_dict)
    return metadata.loc[keep, 'disaster'].value_counts()

In [3]:
import os
import pandas as pd

# Despite the name, this is the set of file names found in the crop folder
# (names only, no paths), not a directory path.
directory = set(os.listdir('./tier1/cropped_square_buildings/'))
metadata = 'building_polygons_metadata.csv'

# Post-stage row counts per disaster type, limited to files present in the folder.
get_disaster_counts(metadata, directory)

disaster
wind          27466
flooding      27293
earthquake    22531
tsunami       22164
fire          17264
volcano         767
Name: count, dtype: int64

In [11]:
import os
import pandas as pd
import random
import shutil

def save_random_post_images(csv_path, directory, image_folder, output_dir='./tier1/subset/', num_images=1000,
                            disaster_types=('wind', 'flooding', 'earthquake')):
    """Copy a random sample of post-disaster images into one sub-folder per disaster type.

    Parameters
    ----------
    csv_path : str
        Metadata CSV with the columns ``uid``, ``stage`` and ``disaster``.
    directory : collection of str
        File names that exist in ``image_folder`` (e.g. ``set(os.listdir(folder))``).
        This is NOT a path: a plain string would be iterated character by character.
    image_folder : str
        Folder the images are copied from.
    output_dir : str, default './tier1/subset/'
        Destination root; ``<output_dir>/<disaster_type>/`` is created as needed.
    num_images : int, default 1000
        Maximum number of images sampled per disaster type.
    disaster_types : iterable of str, default ('wind', 'flooding', 'earthquake')
        Disaster types to process.

    Notes
    -----
    Sampling (``random_state=42``) is done only among post images that are listed
    in ``directory``, and the printed number is the number of files actually copied.
    """
    import warnings

    os.makedirs(output_dir, exist_ok=True)

    if isinstance(directory, (str, bytes)):
        warnings.warn(
            "save_random_post_images: `directory` is a string, so it is iterated "
            "character by character and will match no file names; pass a "
            "collection of file names such as set(os.listdir(folder))."
        )

    df = pd.read_csv(csv_path)
    df['filename'] = df['uid'].astype(str) + "_" + df['stage'].astype(str) + ".png"
    post_df = df[df['stage'] == 'post']

    # Set membership keeps the existence filter linear in the number of files.
    post_names = set(post_df['filename'])
    post_image_files = {f for f in directory if f in post_names}
    available_df = post_df[post_df['filename'].isin(post_image_files)]

    for disaster_type in disaster_types:
        disaster_df = available_df[available_df['disaster'] == disaster_type]
        sampled_images = disaster_df.sample(n=min(num_images, len(disaster_df)), random_state=42)['filename']
        disaster_dir = os.path.join(output_dir, disaster_type)
        os.makedirs(disaster_dir, exist_ok=True)

        copied = 0
        for filename in sampled_images:
            image_path = os.path.join(image_folder, filename)
            dest_image_path = os.path.join(disaster_dir, filename)
            shutil.copy(image_path, dest_image_path)
            copied += 1

        print(f"Saved {copied} images for disaster type: {disaster_type}")

In [12]:
directory = './tier1/cropped_square_buildings/'
metadata = 'building_polygons_metadata.csv'
num_images = 1000
output_dir ='./tier1/subset/'

# Signature: (csv_path, directory, image_folder, output_dir, num_images).
# `directory` must be the collection of file names on disk and `image_folder`
# the path to copy from; keywords keep num_images/output_dir from being swapped.
save_random_post_images(
    metadata,
    set(os.listdir(directory)),
    directory,
    output_dir=output_dir,
    num_images=num_images,
)

Saved 1000 images for disaster type: wind
Saved 1000 images for disaster type: flooding
Saved 1000 images for disaster type: earthquake


In [13]:
def save_random_post_images(csv_path, directory, image_folder, output_dir='./tier1/subset/', num_images=500,
                            disaster_types=('tsunami', 'fire', 'volcano')):
    """Copy a random sample of post-disaster images into one sub-folder per disaster type.

    Running this cell replaces any earlier ``save_random_post_images`` in the kernel.

    Parameters
    ----------
    csv_path : str
        Metadata CSV with the columns ``uid``, ``stage`` and ``disaster``.
    directory : collection of str
        File names that exist in ``image_folder`` (e.g. ``set(os.listdir(folder))``).
        This is NOT a path: a plain string would be iterated character by character.
    image_folder : str
        Folder the images are copied from.
    output_dir : str, default './tier1/subset/'
        Destination root; ``<output_dir>/<disaster_type>/`` is created as needed.
    num_images : int, default 500
        Maximum number of images sampled per disaster type.
    disaster_types : iterable of str, default ('tsunami', 'fire', 'volcano')
        Disaster types to process.

    Notes
    -----
    Sampling (``random_state=42``) is done only among post images that are listed
    in ``directory``, and the printed number is the number of files actually copied.
    """
    import warnings

    os.makedirs(output_dir, exist_ok=True)

    if isinstance(directory, (str, bytes)):
        warnings.warn(
            "save_random_post_images: `directory` is a string, so it is iterated "
            "character by character and will match no file names; pass a "
            "collection of file names such as set(os.listdir(folder))."
        )

    df = pd.read_csv(csv_path)
    df['filename'] = df['uid'].astype(str) + "_" + df['stage'].astype(str) + ".png"
    post_df = df[df['stage'] == 'post']

    # Set membership keeps the existence filter linear in the number of files.
    post_names = set(post_df['filename'])
    post_image_files = {f for f in directory if f in post_names}
    available_df = post_df[post_df['filename'].isin(post_image_files)]

    for disaster_type in disaster_types:
        disaster_df = available_df[available_df['disaster'] == disaster_type]
        sampled_images = disaster_df.sample(n=min(num_images, len(disaster_df)), random_state=42)['filename']
        disaster_dir = os.path.join(output_dir, disaster_type)
        os.makedirs(disaster_dir, exist_ok=True)

        copied = 0
        for filename in sampled_images:
            image_path = os.path.join(image_folder, filename)
            dest_image_path = os.path.join(disaster_dir, filename)
            shutil.copy(image_path, dest_image_path)
            copied += 1

        print(f"Saved {copied} images for disaster type: {disaster_type}")


directory = './tier1/cropped_square_buildings/'
metadata = 'building_polygons_metadata.csv'
num_images = 500
output_dir ='./tier1/subset/'
# Signature: (csv_path, directory, image_folder, output_dir, num_images).
# `directory` must be the collection of file names on disk and `image_folder`
# the path to copy from; keywords keep num_images/output_dir from being swapped.
save_random_post_images(
    metadata,
    set(os.listdir(directory)),
    directory,
    output_dir=output_dir,
    num_images=num_images,
)

Saved 500 images for disaster type: tsunami
Saved 500 images for disaster type: fire
Saved 500 images for disaster type: volcano


In [17]:
def generation(csv_path, pre_image_folder, output_dir='./tier1/subset/', num_images=500, seed=None):
    """Generate fake post-disaster images for tsunami, fire and volcano from random pre images.

    Parameters
    ----------
    csv_path : str
        Metadata CSV with the columns ``uid`` and ``stage``.
    pre_image_folder : str
        Folder holding the ``<uid>_<stage>.png`` crops.
    output_dir : str, default './tier1/subset/'
        Destination root; images go to ``<output_dir>/<disaster_type>/<name>_fake.png``.
    num_images : int, default 500
        Maximum number of pre images sampled per disaster type.
    seed : int or None, default None
        When given, sampling uses a private ``random.Random(seed)`` and is
        reproducible; when None the module-level ``random`` state is used.

    Notes
    -----
    Reads the notebook-level ``checkpoint_path`` and ``device`` variables.
    Only files that the metadata lists with ``stage == 'pre'`` are used as
    inputs, so post-disaster crops in the same folder are never sampled.
    """
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(csv_path)
    df['filename'] = df['uid'].astype(str) + "_" + df['stage'].astype(str) + ".png"
    pre_names = set(df.loc[df['stage'] == 'pre', 'filename'])

    # Sorted so that a fixed seed selects the same files on every run.
    pre_image_files = sorted(
        f for f in os.listdir(pre_image_folder)
        if f.endswith('.png') and f in pre_names
    )
    if not pre_image_files:
        print(f"No pre images listed in {csv_path} were found in {pre_image_folder}")

    rng = random if seed is None else random.Random(seed)

    disaster_types = ['tsunami', 'fire', 'volcano']
    for disaster_type in disaster_types:
        selected_pre_images = rng.sample(pre_image_files, min(num_images, len(pre_image_files)))
        disaster_dir = os.path.join(output_dir, disaster_type)
        os.makedirs(disaster_dir, exist_ok=True)

        # The label is the same for every image of this disaster type.
        disaster_onehot = one_hot_disaster(disaster_type)

        for filename in selected_pre_images:
            pre_image_path = os.path.join(pre_image_folder, filename)
            fake_image_tensor = generate_fake_image(load_pre_image(pre_image_path), disaster_onehot, checkpoint_path, device)

            fake_image_filename = filename.replace('.png', '_fake.png')
            fake_image_path = os.path.join(disaster_dir, fake_image_filename)
            save_fake_image(fake_image_tensor, fake_image_path)

def save_fake_image(fake_image_tensor, save_path):
    """Convert a generated image tensor to 8-bit and write it to ``save_path``.

    The tensor is moved to the CPU if it lives on CUDA, a leading dimension of
    size 1 is squeezed, values are scaled by 255 and clamped to [0, 255], and the
    first axis is moved last before handing the array to PIL.
    """
    tensor = fake_image_tensor.cpu() if fake_image_tensor.is_cuda else fake_image_tensor
    pixels = tensor.squeeze(0).mul(255).clamp(0, 255).byte().permute(1, 2, 0)
    Image.fromarray(pixels.numpy()).save(save_path)
    print(f"Saved fake image: {save_path}")

In [1]:
import os
import pandas as pd
import random
import shutil

# Needs generation() and everything it uses (generate_fake_image, load_pre_image,
# one_hot_disaster, save_fake_image, checkpoint_path, device) to be defined in
# this kernel first; the recorded NameError shows the definition cells had not been run.
generation('building_polygons_metadata.csv', './tier1/cropped_square_buildings/')

NameError: name 'generation' is not defined