In [91]:
import os
import numpy as np
import random
import pandas as pd
from PIL import Image, ImageFilter
from tqdm import tqdm
import tensorflow as tf
from skimage.util import random_noise
from skimage.color import rgb2gray, gray2rgb
from dataset_utils import crop_and_resize, combine_and_mask

In [78]:
cub_dir = '../data/cub-200-2011'
places_dir = '../data/val_large'
output_dir = '../data/dataset'
dataset_name = 'validation_waterbird_gaussian_TO_95'

target_places_ids = [
    [36, 150],  # Land backgrounds ['bamboo_forest', 'forest/broadleaf']
    [243, 205]] # Water backgrounds ['ocean', 'lake/natural']

val_frac = 0.2             # What fraction of the training data to use as validation
confounder_strength = 0.95 # Determines relative size of majority vs. minority groups

In [79]:
images_path = os.path.join(cub_dir,'CUB_200_2011', 'CUB_200_2011', 'images.txt')

df = pd.read_csv(
    images_path,
    sep=" ",
    header=None,
    names=['img_id', 'img_filename'],
    index_col='img_id')

### Set up labels of waterbirds vs. landbirds
# We consider water birds = seabirds and waterfowl.
species = np.unique([img_filename.split('/')[0].split('.')[1].lower() for img_filename in df['img_filename']])
water_birds_list = [
    'Albatross', # Seabirds
    'Auklet',
    'Cormorant',
    'Frigatebird',
    'Fulmar',
    'Gull',
    'Jaeger',
    'Kittiwake',
    'Pelican',
    'Puffin',
    'Tern',
    'Gadwall', # Waterfowl
    'Grebe',
    'Mallard',
    'Merganser',
    'Guillemot',
    'Pacific_Loon'
]

water_birds = {}
for species_name in species:
    water_birds[species_name] = 0
    for water_bird in water_birds_list:
        if water_bird.lower() in species_name:
            water_birds[species_name] = 1
species_list = [img_filename.split('/')[0].split('.')[1].lower() for img_filename in df['img_filename']]
df['y'] = [water_birds[species] for species in species_list]

### Assign train/tesst/valid splits
# In the original CUB dataset split, split = 0 is test and split = 1 is train
# We want to change it to
# split = 0 is train,
# split = 1 is val,
# split = 2 is test

train_test_df =  pd.read_csv(
    os.path.join(cub_dir, 'CUB_200_2011', 'CUB_200_2011', 'train_test_split.txt'),
    sep=" ",
    header=None,
    names=['img_id', 'split'],
    index_col='img_id')

df = df.join(train_test_df, on='img_id')
test_ids = df.loc[df['split'] == 0].index
train_ids = np.array(df.loc[df['split'] == 1].index)
val_ids = np.random.choice(
    train_ids,
    size=int(np.round(val_frac * len(train_ids))),
    replace=False)

df.loc[train_ids, 'split'] = 0
df.loc[val_ids, 'split'] = 1
df.loc[test_ids, 'split'] = 2

### Assign confounders (place categories)

# Confounders are set up as the following:
# Y = 0, C = 0: confounder_strength
# Y = 0, C = 1: 1 - confounder_strength
# Y = 1, C = 0: 1 - confounder_strength
# Y = 1, C = 1: confounder_strength

df['place'] = 0
train_ids = np.array(df.loc[df['split'] == 0].index)
val_ids = np.array(df.loc[df['split'] == 1].index)
test_ids = np.array(df.loc[df['split'] == 2].index)
for split_idx, ids in enumerate([train_ids, val_ids, test_ids]):
    for y in (0, 1):
        if split_idx == 0: # train
            if y == 0:
                pos_fraction = 1 - confounder_strength
            else:
                pos_fraction = confounder_strength
        else:
            pos_fraction = 0.5
        subset_df = df.loc[ids, :]
        y_ids = np.array((subset_df.loc[subset_df['y'] == y]).index)
        pos_place_ids = np.random.choice(
            y_ids,
            size=int(np.round(pos_fraction * len(y_ids))),
            replace=False)
        df.loc[pos_place_ids, 'place'] = 1

for split, split_label in [(0, 'train'), (1, 'val'), (2, 'test')]:
    print(f"{split_label}:")
    split_df = df.loc[df['split'] == split, :]
    print(f"waterbirds are {np.mean(split_df['y']):.3f} of the examples")
    print(f"y = 0, c = 0: {np.mean(split_df.loc[split_df['y'] == 0, 'place'] == 0):.3f}, n = {np.sum((split_df['y'] == 0) & (split_df['place'] == 0))}")
    print(f"y = 0, c = 1: {np.mean(split_df.loc[split_df['y'] == 0, 'place'] == 1):.3f}, n = {np.sum((split_df['y'] == 0) & (split_df['place'] == 1))}")
    print(f"y = 1, c = 0: {np.mean(split_df.loc[split_df['y'] == 1, 'place'] == 0):.3f}, n = {np.sum((split_df['y'] == 1) & (split_df['place'] == 0))}")
    print(f"y = 1, c = 1: {np.mean(split_df.loc[split_df['y'] == 1, 'place'] == 1):.3f}, n = {np.sum((split_df['y'] == 1) & (split_df['place'] == 1))}")


train:
waterbirds are 0.226 of the examples
y = 0, c = 0: 0.950, n = 3525
y = 0, c = 1: 0.050, n = 186
y = 1, c = 0: 0.050, n = 54
y = 1, c = 1: 0.950, n = 1030
val:
waterbirds are 0.246 of the examples
y = 0, c = 0: 0.500, n = 452
y = 0, c = 1: 0.500, n = 452
y = 1, c = 0: 0.498, n = 147
y = 1, c = 1: 0.502, n = 148
test:
waterbirds are 0.222 of the examples
y = 0, c = 0: 0.500, n = 2255
y = 0, c = 1: 0.500, n = 2255
y = 1, c = 0: 0.500, n = 642
y = 1, c = 1: 0.500, n = 642


In [80]:
### Assign places to train, val, and test set
place_ids_df = pd.read_csv(
    os.path.join(places_dir, 'places365_val.txt'),
    sep=" ",
    header=None,
    names=['image_name', 'place_id'],
    index_col='place_id')

In [81]:
water_imgs = set()
land_imgs = set()

for i, place_type in enumerate(target_places_ids):
    for id in place_type:
        if i == 0:
            land_imgs.update(place_ids_df[place_ids_df.index == id]['image_name'])
        if i ==1:
            water_imgs.update(place_ids_df[place_ids_df.index == id]['image_name'])

In [82]:
df['water_img'] = np.random.choice(list(water_imgs), size=len(df))
df['land_img'] = np.random.choice(list(land_imgs), size=len(df))
df['place_image'] = df['place']*df['water_img'] + (1-df['place'])*df['land_img']

In [83]:
df

Unnamed: 0_level_0,img_filename,y,split,place,water_img,land_img,place_image
img_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,001.Black_footed_Albatross/Black_Footed_Albatr...,1,2,1,Places365_val_00028916.jpg,Places365_val_00018646.jpg,Places365_val_00028916.jpg
2,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,Places365_val_00015826.jpg,Places365_val_00001447.jpg,Places365_val_00015826.jpg
3,001.Black_footed_Albatross/Black_Footed_Albatr...,1,2,0,Places365_val_00024562.jpg,Places365_val_00001133.jpg,Places365_val_00001133.jpg
4,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,Places365_val_00014632.jpg,Places365_val_00011509.jpg,Places365_val_00014632.jpg
5,001.Black_footed_Albatross/Black_Footed_Albatr...,1,0,1,Places365_val_00031410.jpg,Places365_val_00009251.jpg,Places365_val_00031410.jpg
...,...,...,...,...,...,...,...
11784,200.Common_Yellowthroat/Common_Yellowthroat_00...,0,1,0,Places365_val_00021344.jpg,Places365_val_00025089.jpg,Places365_val_00025089.jpg
11785,200.Common_Yellowthroat/Common_Yellowthroat_00...,0,2,1,Places365_val_00024797.jpg,Places365_val_00017936.jpg,Places365_val_00024797.jpg
11786,200.Common_Yellowthroat/Common_Yellowthroat_00...,0,2,1,Places365_val_00011315.jpg,Places365_val_00013996.jpg,Places365_val_00011315.jpg
11787,200.Common_Yellowthroat/Common_Yellowthroat_00...,0,0,0,Places365_val_00007471.jpg,Places365_val_00027339.jpg,Places365_val_00027339.jpg


In [84]:
### Write dataset to disk
output_subfolder = os.path.join(output_dir, dataset_name)
os.makedirs(output_subfolder, exist_ok=True)

df.to_csv(os.path.join(output_subfolder, 'metadata.csv'))

for i in tqdm(df.index):
    # Load bird image and segmentation
    img_path = os.path.join(cub_dir, 'CUB_200_2011', 'CUB_200_2011', 'images', df.loc[i, 'img_filename'])
    seg_path = os.path.join(cub_dir, 'segmentations', df.loc[i, 'img_filename'].replace('.jpg','.png'))
    img_np = np.asarray(Image.open(img_path).convert('RGB'))
    seg_np = np.asarray(Image.open(seg_path).convert('RGB')) / 255

    # Load place background
    # Skip front /
    place_path = os.path.join(places_dir, df.loc[i, 'place_image'])
    place = Image.open(place_path).convert('RGB')

    if df.loc[i, 'split'] == 0:
        # place = gray2rgb(rgb2gray(np.array(place)))
        place = random_noise(np.array(place), mode='gaussian', mean=0, var=0.05, clip=True)   
        # place = random_noise(np.array(place), mode='s&p', salt_vs_pepper=0.5, clip=True)
        # place = random_noise(np.array(place), mode='speckle', mean=0, var=0.05, clip=True)
        place = Image.fromarray((place * 255).astype(np.uint8)) 

    img_black = Image.fromarray(np.around(img_np * seg_np).astype(np.uint8))
    combined_img = combine_and_mask(place, seg_np, img_black)

    output_path = os.path.join(output_subfolder, df.loc[i, 'img_filename'])
    os.makedirs('/'.join(output_path.split('/')[:-1]), exist_ok=True)

    combined_img.save(output_path)

100%|██████████| 11788/11788 [13:07<00:00, 14.97it/s]


In [86]:
# type_subfolder = os.path.join(output_dir, "water")
# os.makedirs(type_subfolder, exist_ok=True)

# for img in water_imgs:
#     place_path = os.path.join(places_dir, img)
#     output_path = os.path.join(type_subfolder, img)
#     place = Image.open(place_path).convert('RGB')
#     place.save(output_path)

In [87]:
# type_subfolder = os.path.join(output_dir, "land")
# os.makedirs(type_subfolder, exist_ok=True)

# for img in land_imgs:
#     place_path = os.path.join(places_dir, img)
#     output_path = os.path.join(type_subfolder, img)
#     place = Image.open(place_path).convert('RGB')
#     place.save(output_path)

In [90]:
water_img = "Places365_val_00000906.jpg"
bird = "012.Yellow_headed_Blackbird/Yellow_Headed_Blackbird_0008_8756.jpg"
output_subfolder = os.path.join(output_dir, "test")
os.makedirs(output_subfolder, exist_ok=True)

# Load bird image and segmentation
img_path = os.path.join(cub_dir, 'CUB_200_2011', 'CUB_200_2011', 'images', bird)
seg_path = os.path.join(cub_dir, 'segmentations', bird.replace('.jpg','.png'))
img_np = np.asarray(Image.open(img_path).convert('RGB'))
seg_np = np.asarray(Image.open(seg_path).convert('RGB')) / 255

# Load place background
# Skip front /
place_path = os.path.join(places_dir, water_img)
place = Image.open(place_path).convert('RGB')

img_black = Image.fromarray(np.around(img_np * seg_np).astype(np.uint8))
combined_img = combine_and_mask(place, seg_np, img_black)
output_path = os.path.join(output_subfolder, "original.jpg")
combined_img.save(output_path)

gaussian_place = random_noise(np.array(place), mode='gaussian', mean=0, var=0.05, clip=True) 
gaussian_place = Image.fromarray((gaussian_place * 255).astype(np.uint8)) 
combined_img = combine_and_mask(gaussian_place, seg_np, img_black)
output_path = os.path.join(output_subfolder, "gaussian.jpg")
combined_img.save(output_path)

sandp_place = random_noise(np.array(place), mode='s&p', salt_vs_pepper=0.5, clip=True)
sandp_place = Image.fromarray((sandp_place * 255).astype(np.uint8)) 
combined_img = combine_and_mask(sandp_place, seg_np, img_black)
output_path = os.path.join(output_subfolder, "sandp.jpg")
combined_img.save(output_path)

speckle_place = random_noise(np.array(place), mode='speckle', mean=0, var=0.05, clip=True)
speckle_place = Image.fromarray((speckle_place * 255).astype(np.uint8)) 
combined_img = combine_and_mask(speckle_place, seg_np, img_black)
output_path = os.path.join(output_subfolder, "speckle.jpg")
combined_img.save(output_path)

gray_place = gray2rgb(rgb2gray(np.array(place)))
gray_place = Image.fromarray((gray_place * 255).astype(np.uint8)) 
combined_img = combine_and_mask(gray_place, seg_np, img_black)
output_path = os.path.join(output_subfolder, "gray.jpg")
combined_img.save(output_path)

blur_place = place.filter(ImageFilter.BLUR)
combined_img = combine_and_mask(blur_place, seg_np, img_black)
output_path = os.path.join(output_subfolder, "blur.jpg")
combined_img.save(output_path)

# place = gray2rgb(rgb2gray(np.array(place)))
        # place = random_noise(np.array(place), mode='s&p', salt_vs_pepper=0.5, clip=True)
        # place = random_noise(np.array(place), mode='speckle', mean=0, var=0.05, clip=True)
        # place = Image.fromarray((place * 255).astype(np.uint8))  