## Build DFR samples for CelebA

In [21]:
import pandas as pd
import os
import numpy as np
def attr_idx(attr_name, names=None):
    """Return the column position of ``attr_name`` within an attribute index.

    Args:
        attr_name: Attribute (column) label to look up.
        names: Optional ``pandas.Index`` to search. When omitted, the
            notebook-level ``attr_names`` index is used, so existing
            one-argument calls keep working.

    Returns:
        The result of ``Index.get_loc`` for the label.

    Raises:
        KeyError: Propagated from ``Index.get_loc`` when the label is absent.
    """
    if names is None:
        # Resolved at call time: ``attr_names`` is only assigned further down
        # in this cell, after this function has been defined.
        names = attr_names
    return names.get_loc(attr_name)
# Configuration: dataset root, the attribute used as the label, and the
# attribute(s) treated as confounders.
root_dir = "../datasets"
target_name = "Blond_Hair"
metadata_csv_name = "list_attr_celeba.csv"
confounder_names = ['Male']
data_dir = os.path.join(root_dir, "celeba", "img_align_celeba")

# Load the attribute table, then split out filenames and attribute names.
attrs_df = pd.read_csv(
    os.path.join(root_dir, "celeba", metadata_csv_name))
filename_array = attrs_df["image_id"].values
attrs_df = attrs_df.drop(labels="image_id", axis="columns")
attr_names = attrs_df.columns.copy()

# Cast the attributes to a numpy array and remap -1 -> 0 (the file stores
# them as -1 and 1). An explicit copy is taken before the in-place write:
# a bare `.values` can be a view of the frame's data, and pandas
# Copy-on-Write marks such views read-only, which would make the assignment
# below fail.
# NOTE: from here on `attrs_df` is a numpy array, not a DataFrame.
attrs_df = attrs_df.to_numpy(copy=True)
attrs_df[attrs_df == -1] = 0

# Get the y values
target_idx = attr_idx(target_name)
y_array = attrs_df[:, target_idx]
n_classes = 2

# Map the confounder attributes to a number 0,...,2^|confounder_idx|-1
# by reading each row of confounder bits as a binary number.
confounder_idx = [attr_idx(a) for a in confounder_names]
n_confounders = len(confounder_idx)
confounders = attrs_df[:, confounder_idx]
confounder_array = np.matmul(
    confounders.astype(int),
    np.power(2, np.arange(len(confounder_idx))))

# Map to groups: group = label * (#confounder combinations) + confounder code.
# Floor division keeps the arithmetic in integers (n_groups is always even).
n_groups = n_classes * pow(2, len(confounder_idx))
group_array = (y_array * (n_groups // 2) +
               confounder_array).astype("int")

# Read in train/val/test splits
# NOTE(review): `split_array` is taken in file order and is presumably
# row-aligned with the attribute file — confirm both CSVs list images in the
# same order.
split_df = pd.read_csv(
    os.path.join(root_dir, "celeba", "list_eval_partition.csv"))
split_array = split_df["partition"].values
split_dict = {
    "train": 0,
    "val": 1,
    "test": 2,
}

In [20]:
# Inspect the partition table read from list_eval_partition.csv.
split_df

Unnamed: 0,image_id,partition
0,000001.jpg,0
1,000002.jpg,0
2,000003.jpg,0
3,000004.jpg,0
4,000005.jpg,0
...,...,...
202594,202595.jpg,2
202595,202596.jpg,2
202596,202597.jpg,2
202597,202598.jpg,2


In [9]:
# Inspect the attribute matrix. The recorded output below is the 0/1 numpy
# array produced by the numpy conversion in the loading cell, not a DataFrame.
attrs_df

array([[0, 1, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 0, 1]])

In [11]:
# Attribute column labels, copied from the table before it was converted to
# numpy; attr_idx() looks positions up in this index.
attr_names

Index(['5_o_Clock_Shadow', 'Arched_Eyebrows', 'Attractive', 'Bags_Under_Eyes',
       'Bald', 'Bangs', 'Big_Lips', 'Big_Nose', 'Black_Hair', 'Blond_Hair',
       'Blurry', 'Brown_Hair', 'Bushy_Eyebrows', 'Chubby', 'Double_Chin',
       'Eyeglasses', 'Goatee', 'Gray_Hair', 'Heavy_Makeup', 'High_Cheekbones',
       'Male', 'Mouth_Slightly_Open', 'Mustache', 'Narrow_Eyes', 'No_Beard',
       'Oval_Face', 'Pale_Skin', 'Pointy_Nose', 'Receding_Hairline',
       'Rosy_Cheeks', 'Sideburns', 'Smiling', 'Straight_Hair', 'Wavy_Hair',
       'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick',
       'Wearing_Necklace', 'Wearing_Necktie', 'Young'],
      dtype='object')

In [22]:
# Re-read the train/val/test partition table as a fresh DataFrame.
split_df = pd.read_csv(os.path.join(root_dir, "celeba", "list_eval_partition.csv"))

In [23]:
# Inspect the freshly re-read partition table.
split_df

Unnamed: 0,image_id,partition
0,000001.jpg,0
1,000002.jpg,0
2,000003.jpg,0
3,000004.jpg,0
4,000005.jpg,0
...,...,...
202594,202595.jpg,2
202595,202596.jpg,2
202596,202597.jpg,2
202597,202598.jpg,2


In [25]:
# Re-read the raw attribute table as a DataFrame (this rebinds `attrs_df`,
# which the loading cell had turned into a numpy array). No join happens
# here; the merge with the partition table is done in the next cell, keyed
# on `image_id`.
metadata_csv_name = "list_attr_celeba.csv"
confounder_names = ['Male']
attrs_df = pd.read_csv(
            os.path.join(root_dir, "celeba", metadata_csv_name))


In [26]:
# Attach each image's partition to its attribute row; only image_ids present
# in both tables are kept.
result = attrs_df.merge(split_df, how='inner', on='image_id')

In [28]:
# Show only the identifier, the confounder, the label and the split.
result.loc[:, ['image_id', 'Male', 'Blond_Hair', 'partition']]

Unnamed: 0,image_id,Male,Blond_Hair,partition
0,000001.jpg,-1,-1,0
1,000002.jpg,-1,-1,0
2,000003.jpg,1,-1,0
3,000004.jpg,-1,-1,0
4,000005.jpg,-1,-1,0
...,...,...,...,...
202594,202595.jpg,-1,1,2
202595,202596.jpg,1,1,2
202596,202597.jpg,1,-1,2
202597,202598.jpg,-1,-1,2


In [55]:
# Group the training partition (partition == 0) by 'Male' and 'Blond_Hair'.
grouped = result[result['partition']==0].groupby(['Male', 'Blond_Hair'])

# The smallest group determines how many rows every group may keep.
min_rows = grouped.size().min()

# Subsample each group to the same number of rows. The groups are iterated
# explicitly rather than through `groupby.apply`: `apply` only keeps the
# 'Male' / 'Blond_Hair' columns in the result while it still passes the
# grouping columns to the function, and that behaviour is deprecated.
# Iteration always yields the full sub-frames. Each group is sampled with
# its own random_state=42, and groups come out in sorted key order, so the
# selected rows and their order are the same as with the `apply` form.
# `ignore_index=True` gives a fresh 0..n-1 index without an in-place
# reset_index.
subsampled_df = pd.concat(
    [group.sample(min_rows, random_state=42) for _, group in grouped],
    ignore_index=True,
)

print(subsampled_df)

        image_id  5_o_Clock_Shadow  Arched_Eyebrows  Attractive  \
0     049226.jpg                -1               -1           1   
1     070383.jpg                -1               -1           1   
2     052012.jpg                -1               -1           1   
3     032348.jpg                -1               -1          -1   
4     119344.jpg                -1               -1          -1   
...          ...               ...              ...         ...   
5543  127526.jpg                -1               -1          -1   
5544  132131.jpg                 1               -1           1   
5545  150770.jpg                -1               -1          -1   
5546  100666.jpg                -1               -1          -1   
5547  131762.jpg                -1               -1          -1   

      Bags_Under_Eyes  Bald  Bangs  Big_Lips  Big_Nose  Black_Hair  ...  \
0                  -1    -1     -1         1        -1          -1  ...   
1                  -1    -1     -1         1 

In [56]:
# Sanity check: every (Male, Blond_Hair) group in the subsample should have
# the same number of rows.
print(subsampled_df.groupby(['Male', 'Blond_Hair']).size())

Male  Blond_Hair
-1    -1            1387
       1            1387
 1    -1            1387
       1            1387
dtype: int64


In [57]:
# Group sizes over the whole merged table (all partitions, no filtering),
# for comparison with the balanced subsample.
print(result.groupby(['Male', 'Blond_Hair']).size())

Male  Blond_Hair
-1    -1            89931
       1            28234
 1    -1            82685
       1             1749
dtype: int64


In [58]:
# (rows, columns) of the partition-0 slice of the merged table, before
# balancing.
print(result[result['partition']==0].shape)

(162770, 42)


In [59]:
# Scratch calculation: 1370 as a fraction of the 162770 partition-0 rows.
# NOTE(review): the balanced groups printed above contain 1387 rows each,
# not 1370 — confirm which figure was intended here.
1370/162770

0.008416784419733366

In [60]:
# Replace partition 0 with the balanced subsample; rows from every other
# partition are carried over unchanged.
r = pd.concat(
    [
        subsampled_df,
        result.loc[result['partition'] != 0],
    ]
)

In [61]:
# Group sizes per partition after recombining: partition 0 comes from the
# balanced subsample, the other partitions keep their original rows.
print(r.groupby(['partition','Male', 'Blond_Hair']).size())

partition  Male  Blond_Hair
0          -1    -1            1387
                  1            1387
            1    -1            1387
                  1            1387
1          -1    -1            8535
                  1            2874
            1    -1            8276
                  1             182
2          -1    -1            9767
                  1            2480
            1    -1            7535
                  1             180
dtype: int64


In [62]:
# Write the attribute table without the split column; the index is not
# written.
selected_df = r.drop('partition', axis=1)
selected_df.to_csv("list_attr_celeba_dfr.csv", index=False)

In [63]:
# Write the matching image_id -> partition table for the same rows.
r.loc[:, ['image_id', 'partition']].to_csv("list_eval_partition_dfr.csv", index=False)

# Shell escape: print the working directory. The two CSV files above are
# written with relative paths, so this is where they end up.
!pwd

## Sample from Datasets

In [71]:
from PIL import Image
from os import listdir
# Build a contact sheet (grid of thumbnails) from the images under `root`.
root = "pics/wb"
# listdir() returns entries in arbitrary order; sorting makes the layout of
# the grid reproducible from run to run.
image_paths = sorted(listdir(root))


def resize_image(image, target_size):
    """Return a resized copy of a PIL image.

    Args:
        image: PIL image to resize.
        target_size: ``(width, height)`` tuple passed to ``Image.resize``.

    Returns:
        The new, resized image; ``image`` itself is not modified.
    """
    return image.resize(target_size)


# Every thumbnail is forced to the same size so the tiles line up.
target_size = (50, 50)

# Grid layout
num_rows = 3
num_columns = 5

# Load only as many files as the grid can hold. Each file is opened in a
# `with` block so its handle is released once the resized copy exists.
images = []
for path in image_paths[:num_rows * num_columns]:
    with Image.open(f"{root}/{path}") as source_image:
        images.append(resize_image(source_image, target_size))

# Size of one tile and of the whole canvas
tile_width = target_size[0]
tile_height = target_size[1]
new_image_width = tile_width * num_columns
new_image_height = tile_height * num_rows
new_image = Image.new('RGB', (new_image_width, new_image_height))

# Paste each image into the canvas, filling it row by row
for i, image in enumerate(images):
    row_index, col_index = divmod(i, num_columns)
    x_offset = col_index * tile_width
    y_offset = row_index * tile_height
    new_image.paste(image, (x_offset, y_offset))

# Save the new image
new_image.save('tiled_image.jpg')

In [66]:
# Imported under an alias so that PIL's `Image`, used by the tiling cells,
# is not shadowed.
from IPython.display import Image as IPyImage, display

# Passing the bare path to display() only echoes the string; wrapping it in
# an IPython Image object makes the notebook render the picture itself.
display(IPyImage(filename="tiled_image.jpg"))

'tiled_image.jpg'

In [74]:
import torch
# Load the saved MNIST-CIFAR bundle from disk.
# NOTE: `root` was a directory in the image-tiling cell above; here it is
# rebound to a single file path.
root = "../datasets/MNISTCIFAR/MNIST_CIFAR_0.0.pth"
# SECURITY: torch.load relies on pickle to deserialize; only load files from
# a trusted source.
a = torch.load(root)

In [78]:
# Shape of the first 15 training samples; the recorded output is
# (15, 3, 64, 32).
a['train']['data'][0:15].shape

torch.Size([15, 3, 64, 32])

In [82]:
import torch
from torchvision import transforms
from PIL import Image

def tensor_to_image(tensor, target_size):
    """Resize an image tensor and convert it to a PIL image.

    Args:
        tensor: Image tensor handed to ``transforms.functional.resize``.
        target_size: Size argument forwarded unchanged to ``resize``.

    Returns:
        The PIL image produced by ``transforms.functional.to_pil_image``.
    """
    resized = transforms.functional.resize(tensor, target_size)
    return transforms.functional.to_pil_image(resized)

# Build a contact sheet from the first training samples of the loaded bundle.

# Grid layout
num_rows = 3
num_columns = 5

# Take exactly as many samples as the grid has cells (3 x 5 = 15).
tensors = a['train']['data'][:num_rows * num_columns]

# Target size for resizing. It is square, so the value means the same thing
# to the tensor resize and to the canvas arithmetic below.
# NOTE: the samples are 64 x 32 according to the shape printed above, so
# resizing to a square changes their aspect ratio.
target_size = (50, 50)

# Resize tensors to the same size and convert to PIL images
images = [tensor_to_image(tensor, target_size) for tensor in tensors]

# Size of one tile and of the whole canvas
tile_width = target_size[0]
tile_height = target_size[1]
new_image_width = tile_width * num_columns
new_image_height = tile_height * num_rows
new_image = Image.new('RGB', (new_image_width, new_image_height))

# Paste each image into the canvas, filling it row by row
for i, image in enumerate(images):
    row_index, col_index = divmod(i, num_columns)
    x_offset = col_index * tile_width
    y_offset = row_index * tile_height
    new_image.paste(image, (x_offset, y_offset))

# Save the new image.
# NOTE(review): this is the same filename the image-folder tiling cell
# writes, so the earlier contact sheet is overwritten — confirm that is
# intended.
new_image.save('tiled_image.jpg')