## Build DFR samples for Waterbirds

In [21]:
import pandas as pd
import os
import numpy as np

root_dir = "../datasets/"
metadata_csv_name = "metadata.csv"

# For each Waterbirds variant, write a `metadata_dfr.csv` next to the original
# metadata in which the split-0 rows are subsampled so that every (y, place)
# group has the same number of rows. Rows from all other splits are copied
# through unchanged.
# NOTE(review): split 0 is presumably the training split — confirm against
# the dataset's documentation.
for spur in [50, 625, 75, 875, 95]:
    ds = f"waterbird_complete{spur}_forest2water2"
    data_dir = os.path.join(root_dir, ds)
    attrs_df = pd.read_csv(os.path.join(data_dir, metadata_csv_name))

    # Rows that get rebalanced; `n` is their count before subsampling and is
    # the denominator of the percentage printed below.
    split0_df = attrs_df[attrs_df['split'] == 0]
    n = len(split0_df)

    # Group by the label ('y') and background ('place') columns.
    grouped = split0_df.groupby(['y', 'place'])
    # The smallest group determines how many rows every group keeps.
    min_rows = grouped.size().min()

    # Subsample each group down to `min_rows` rows. Iterating over the groups
    # (rather than `grouped.apply(...)`) hands each sample the full group frame,
    # so the 'y' and 'place' columns are kept on every pandas version;
    # `groupby.apply` on the grouping columns is deprecated and newer pandas
    # excludes them, which would drop both columns from the output file.
    # Every group is sampled with the same fixed seed, and groups are visited
    # in sorted key order, so the result is deterministic.
    subsampled_df = pd.concat(
        [group_df.sample(min_rows, random_state=42) for _, group_df in grouped],
        ignore_index=True,
    )

    # Balanced split-0 rows first, followed by the untouched remaining splits.
    r = pd.concat([subsampled_df, attrs_df[attrs_df['split'] != 0]])
    r.to_csv(os.path.join(data_dir, "metadata_dfr.csv"), index=False)

    # Read the file back so the summary reflects what was actually written.
    attrs_df = pd.read_csv(os.path.join(data_dir, "metadata_dfr.csv"))
    print(f"For {ds}, % of original size: {100*len(subsampled_df)/n:.2f}%:")
    print(attrs_df.groupby(['split', 'y', 'place']).size())

For waterbird_complete50_forest2water2, % of original size: 46.30%:
split  y  place
0      0  0         555
          1         555
       1  0         555
          1         555
1      0  0         465
          1         466
       1  0         134
          1         134
2      0  0        2255
          1        2255
       1  0         642
          1         642
dtype: int64
For waterbird_complete625_forest2water2, % of original size: 35.70%:
split  y  place
0      0  0         428
          1         428
       1  0         428
          1         428
1      0  0         481
          1         481
       1  0         119
          1         118
2      0  0        2255
          1        2255
       1  0         642
          1         642
dtype: int64
For waterbird_complete75_forest2water2, % of original size: 22.52%:
split  y  place
0      0  0         270
          1         270
       1  0         270
          1         270
1      0  0         451
          1         451
 

In [3]:
# Each balanced-subset percentage divided by 46.3, the percentage printed for
# waterbird_complete50 in the cell above.
# NOTE(review): 35.70 and 22.52 match the printed values for the 625 and 75
# variants; 11.76 and 4.67 are presumably the 875 and 95 variants (their output
# is truncated here) — confirm, or compute these from the data instead of
# copying them by hand.
35.70/46.3, 22.52/46.3, 11.76/46.3, 4.67/46.3

(0.7710583153347733,
 0.4863930885529158,
 0.25399568034557235,
 0.10086393088552917)

split  y  place
0      0  0         555
          1         555
       1  0         555
          1         555
1      0  0         465
          1         466
       1  0         134
          1         134
2      0  0        2255
          1        2255
       1  0         642
          1         642
dtype: int64


In [9]:
# Display whatever `attrs_df` is currently bound to in the kernel.
# NOTE(review): the saved output of this cell is a NumPy array, whereas the
# loader cell binds `attrs_df` to a DataFrame, so this cell was presumably run
# against different kernel state — re-run after Restart & Run All, or remove.
attrs_df

array([[0, 1, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 0, 1]])

In [59]:
# Ratio of two hard-coded counts (about 0.84%).
# NOTE(review): neither 1370 nor 162770 is derived anywhere in the visible
# notebook; presumably they belong to a different dataset — document where
# they come from or compute them from the data.
1370/162770

0.008416784419733366