### Generate noisy datasets
Please make sure that the datasets have been downloaded and placed under `./data/` before running the cells below.

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def gen_noise_set(path, name = "metadata",
                  noise_levels = (0.0, 0.02, 0.04, 0.06, 0.08, 0.10), seed = None):
    """Write label-noise variants of a metadata CSV, one file per noise level.

    Reads ``{path}/{name}.csv``, which must contain the columns ``split``,
    ``y`` and ``place``. Only rows with ``split`` in {0, 1, 2}, ``y`` in {0, 1}
    and ``place`` in {0, 1} are ever considered for flipping. For every noise
    level the clean label is preserved in a new ``gth`` column and, inside each
    split and each label, ``round(noise * n_rows)`` labels are flipped
    (``y -> 1 - y``):

    * split 0: the flips are divided evenly between the two ``place`` groups;
    * splits 1 and 2: the flips are divided in proportion to the size of the
      two ``place`` groups.

    The result is written to ``{path}/metadata_{percent}.csv`` (the output name
    does not depend on ``name``), where ``percent`` is the noise level in
    percent rounded to the nearest integer.

    Args:
        path: Directory holding the input CSV; outputs are written here too.
        name: Input file name without the ``.csv`` extension.
        noise_levels: Iterable of flip fractions, one output file per entry.
        seed: Optional integer seed for reproducible flips. With ``None`` the
            rows are drawn from NumPy's global RNG, so a prior
            ``np.random.seed(...)`` by the caller is honoured.

    Raises:
        AssertionError: If a ``place`` group has fewer rows than the number of
            flips assigned to it.
    """
    # seed=None keeps the historical behaviour (global np.random state).
    rng = np.random if seed is None else np.random.RandomState(seed)

    # The clean table is identical for every level: read once, copy per level.
    clean = pd.read_csv(os.path.join(path, f"{name}.csv"))
    print(clean.columns)

    for noise in noise_levels:
        meta = clean.copy()
        meta['gth'] = meta['y']  # keep the clean label next to the noisy one
        for split in range(3):
            in_split = meta["split"] == split
            split_len = int(in_split.sum())
            print("#### split = {}, len = {} ####".format(split, split_len))
            if split_len == 0:
                # Nothing to flip; the percentages below would divide by zero.
                continue

            sampled_idxs = []
            for y in range(2):
                subdf = meta.loc[in_split & (meta["y"] == y)]
                group_dfs = [subdf.loc[subdf["place"] == i] for i in range(2)]
                n_sample = round(subdf.shape[0] * noise)
                if split == 0:
                    # Same number of flips in both place groups of this label.
                    n_flips = [n_sample // 2, (n_sample + 1) // 2]
                else:
                    # Flips proportional to the size of each place group.
                    place_rate = group_dfs[0].shape[0] / subdf.shape[0] if subdf.shape[0] else 0.0
                    n_place0 = int(n_sample * place_rate)
                    n_flips = [n_place0, n_sample - n_place0]

                print("n_sample: {}, Num of data {}".format(n_flips, [w.shape[0] for w in group_dfs]))
                assert all(n <= w.shape[0] for n, w in zip(n_flips, group_dfs)), \
                    "a place group has fewer rows than the number of labels to flip"
                sampled_idxs += [
                    rng.choice(w.index.to_numpy(), size = n, replace = False)
                    for n, w in zip(n_flips, group_dfs)
                ]

            # Flips are applied only after both labels were sampled, so a row
            # cannot be picked twice within the same split.
            sampled_idxs = np.concatenate(sampled_idxs)
            meta.loc[sampled_idxs, "y"] = 1 - meta.loc[sampled_idxs, "y"]

            # Share of rows whose label differs from the clean label / from `place`.
            y_noise = int((in_split & (meta["gth"] != meta["y"])).sum()) / split_len * 100
            p_noise = int((in_split & (meta["place"] != meta["y"])).sum()) / split_len * 100
            print(f"split = {split}, core noise = {y_noise:.2f}%, spurious noise = {p_noise:.2f}%")

        # round() rather than int(): 100 * 0.29 is 28.999..., which int() would name 28.
        core_noise = int(round(100 * noise))
        meta.to_csv(os.path.join(path, f"metadata_{core_noise}.csv"), index = False)

For the Waterbirds dataset, please download the data to `./data/waterbirds_v1.0/`, with `metadata.csv` inside.
Run the following cell to generate the noisy Waterbirds metadata files (`metadata_0.csv`, `metadata_2.csv`, …, `metadata_10.csv`) in the same directory.

In [None]:
# Seed NumPy's global RNG first: gen_noise_set picks the rows to flip at
# random, so without a fixed seed every run writes different noisy files.
np.random.seed(0)

path = "./data/waterbirds_v1.0"
gen_noise_set(path)

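For the CelebA dataset, please download the data to `./data/celebA_v1.0/`, with `metadata.csv` inside.
Run the following cell to generate the noisy CelebA metadata files.
We first subsample the dataset (saved as `metadata_subsample.csv`) to make the noise level consistent, and then generate the noisy files from that subsample.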

In [None]:
csv = pd.read_csv("./data/celebA_v1.0/metadata.csv")
print("Sub-sampling data ...")
lis = []
for split in range(3):
    df = csv.loc[csv["split"] == split]
    # Row counts of the four (y, place) groups, ordered (0,0), (0,1), (1,0), (1,1).
    # They are looked up by key so that a group missing from this split raises
    # a KeyError instead of silently shifting the positions used below.
    group_sizes = df.groupby(["y", "place"]).size()
    bc = np.array([group_sizes.loc[(y, place)] for y in range(2) for place in range(2)])
    # Target size of group (y=0, place=0) such that, after sub-sampling,
    # n(0,0) / n(0,1) == n(1,1) / n(1,0).
    g1_num = int(bc[3] / bc[2] * bc[1])
    print("original group 1 num is {}, project to {}".format(bc[0], g1_num))
    assert g1_num <= bc[0]
    # Keep the first g1_num rows of group (y=0, place=0) and drop the rest.
    index = df.loc[(df["y"] == 0) & (df["place"] == 0)].index
    lis.append(df.drop(index[g1_num:]))
    print(lis[-1].groupby(["y", "place"]).size().to_numpy())
df = pd.concat(lis, ignore_index=True)
df.to_csv("./data/celebA_v1.0/metadata_subsample.csv", index = False)

# Seed NumPy's global RNG: gen_noise_set picks the rows to flip at random, so
# without a fixed seed every run writes different noisy files.
np.random.seed(0)

path = "./data/celebA_v1.0"
gen_noise_set(path, "metadata_subsample")