In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
# Load the source metadata CSV that every later cell filters and relabels.
metadata_csv_path = '/vol/biomedic3/bglocker/mammo/mammo-net/data/embed-non-negative.csv'
df = pd.read_csv(metadata_csv_path)

  df = pd.read_csv('/vol/biomedic3/bglocker/mammo/mammo-net/data/embed-non-negative.csv')


### 1. filter data
* keep only top 3 manufacturers
* remove subjects with more than one density value assigned to them
* add pseudo-subgroup labels
* balance binary density classes

In [27]:
# filtering
# The criteria are applied one after another; each step only sees the rows
# that survived the previous one.
df = (
    df
    # FFDM only
    .loc[lambda d: d['FinalImageType'] == '2D']
    # Female only
    .loc[lambda d: d['GENDER_DESC'] == 'Female']
    # Remove unclear breast density cases (missing value, or a code of 5 and above)
    .loc[lambda d: d['tissueden'].notna()]
    .loc[lambda d: d['tissueden'] < 5]
    # MLO and CC only
    .loc[lambda d: d['ViewPosition'].isin(['MLO', 'CC'])]
    # Remove spot compression or magnification
    .loc[lambda d: d['spot_mag'].isna()]
)

In [28]:
# binarize density: tissueden 3 and 4 become class 1, every other value stays class 0
is_class_one = df['tissueden'].isin([3, 4])
df['density_binary'] = is_class_one.astype('int64')

In [29]:
# Class distribution of the categorical and the binary density labels.
for label_column in ('density', 'density_binary'):
    print(df[label_column].value_counts(dropna=False))

density
B    108146
C    100353
A     29947
D     13007
Name: count, dtype: int64
density_binary
0    138093
1    113360
Name: count, dtype: int64


In [30]:
# Share of images per manufacturer (missing values counted as their own group).
manufacturer_share = df['Manufacturer'].value_counts(normalize=True, dropna=False)
print(manufacturer_share)

Manufacturer
HOLOGIC, Inc.           0.890250
GE MEDICAL SYSTEMS      0.083173
FUJIFILM Corporation    0.024915
GE HEALTHCARE           0.001662
Name: proportion, dtype: float64


In [31]:
# remove GE HEALTHCARE (rows with any other Manufacturer value are kept)
not_ge_healthcare = df['Manufacturer'].ne('GE HEALTHCARE')
df = df.loc[not_ge_healthcare]

In [32]:
# Manufacturer distribution, first as proportions and then as raw counts.
manufacturer_column = df['Manufacturer']
for as_proportion in (True, False):
    print(manufacturer_column.value_counts(dropna=False, normalize=as_proportion))

Manufacturer
HOLOGIC, Inc.           0.891732
GE MEDICAL SYSTEMS      0.083311
FUJIFILM Corporation    0.024957
Name: proportion, dtype: float64
Manufacturer
HOLOGIC, Inc.           223856
GE MEDICAL SYSTEMS       20914
FUJIFILM Corporation      6265
Name: count, dtype: int64


In [None]:
# check if all subjects have a single value for density

attr = 'density'
# One row per subject (empi_anon) holding the number of distinct density values.
check = df.groupby('empi_anon', as_index=False)[attr].nunique()
# If every subject had exactly one density value, these two numbers would match.
print(check[attr].sum())
print(check.shape[0])

22741
20760


In [None]:
# they don't - tabulate how many subjects have 1, 2, 3, ... distinct density values
distinct_density_counts = check['density'].value_counts()
print(distinct_density_counts)

density
1    18788
2     1963
3        9
Name: count, dtype: int64


In [36]:
# only include subjects with a single density value
has_single_density = check['density'].eq(1)
single_density_ids = check.loc[has_single_density, 'empi_anon']
print(single_density_ids)
print(single_density_ids.shape[0])

0        10000879
1        10009146
2        10015693
3        10019048
4        10023113
           ...   
20755    99967713
20756    99980766
20757    99986224
20758    99996622
20759    99999564
Name: empi_anon, Length: 18788, dtype: int64
18788


In [37]:
# Keep only rows whose subject has a single density value, then repeat the check:
# the two printed numbers should now be equal.
belongs_to_kept_subject = df['empi_anon'].isin(single_density_ids)
df = df.loc[belongs_to_kept_subject]

check = df.groupby('empi_anon', as_index=False)[attr].nunique()
print(check['density'].sum())
print(check.shape[0])

18788
18788


In [38]:
# add pseudo-subgroup labels
# Draw the labels (0, 1 or 2) from a seeded generator so the groups, and every
# file derived from them, are identical on each Restart & Run All.
# NOTE(review): labels are drawn per row, so rows sharing an empi_anon can land
# in different pseudo-subgroups - confirm that this is intended.
pseudo_rng = np.random.default_rng(42)
df['pseudo_subgroup'] = pseudo_rng.integers(low=0, high=3, size=len(df))

In [39]:
# Number of rows assigned to each pseudo-subgroup.
df['pseudo_subgroup'].value_counts()

pseudo_subgroup
2    70429
0    70329
1    70215
Name: count, dtype: int64

In [42]:
# Class distribution of both density labels before the binary classes are balanced.
for label_column in ('density', 'density_binary'):
    print(df[label_column].value_counts(dropna=False))

density
B    91658
C    86570
A    23270
D     9475
Name: count, dtype: int64
density_binary
0    114928
1     96045
Name: count, dtype: int64


In [48]:
# undersample to balance classes

def undersample_class(df, label_to_undersample, n_to_drop, random_state=42):
    """Randomly drop rows of one ``density_binary`` class and report the class ratio.

    Args:
        df: DataFrame with a ``density_binary`` column. It is not modified.
        label_to_undersample: value of ``density_binary`` whose rows are dropped.
        n_to_drop: number of rows of that class to remove.
        random_state: seed passed to ``DataFrame.sample`` (default 42).

    Returns:
        A new DataFrame without the sampled rows.

    Raises:
        ValueError: raised by ``DataFrame.sample`` when ``n_to_drop`` is negative
            or larger than the number of rows in the selected class.
    """

    def _print_ratio(frame, tag):
        # Count by label value. value_counts() is ordered by frequency, so
        # positional indexing would not reliably mean "class 0" / "class 1"
        # and would fail outright when only one class is present.
        labels = frame['density_binary']
        n_0 = int((labels == 0).sum())
        n_1 = int((labels == 1).sum())
        ratio = n_1 / n_0 if n_0 else float('nan')
        print(f'{tag} ratio: {ratio}')

    _print_ratio(df, 'original')

    class_df = df.loc[df['density_binary'] == label_to_undersample]
    inds = class_df.sample(n=n_to_drop, random_state=random_state).index

    # Rows are removed by index label, so every row carrying a sampled label goes.
    df = df.drop(inds)

    _print_ratio(df, 'new')

    return df

In [49]:
# Count each class by its label. value_counts() is ordered by frequency, so
# taking .iloc[0] / .iloc[1] would give "most frequent" / "second most frequent"
# rather than class 0 / class 1.
n_0 = (df['density_binary'] == 0).sum()
n_1 = (df['density_binary'] == 1).sum()
print(n_0)
print(n_1)

114928
96045


In [50]:
# Remove n_0 - n_1 rows of class 0 from a copy of df; df itself stays untouched.
n_0_to_drop = n_0 - n_1
df1 = undersample_class(df.copy(), label_to_undersample=0, n_to_drop=n_0_to_drop)
print(df1['density_binary'].value_counts())

original ratio: 0.8356971321174996
new ratio: 1.0
density_binary
1    96045
0    96045
Name: count, dtype: int64


In [None]:
# save baseline dataframe (the row index is written as the first CSV column)
baseline_csv_path = '/vol/biomedic3/es525/mammo-label-bias/modified_metadata/embed-non-negative_singledensity_3manu_pseudo_balanced.csv'
df1.to_csv(baseline_csv_path)

### 2. add label bias

In [57]:
def add_subgroup_label_bias_to_subclass(df, initial_subclass_label, target_class_label, bias_proportion, attribute, subgroup, random_state=42):
    """Add a ``biased_target`` column in which part of one subgroup is relabeled.

    Rows are in scope when ``density == initial_subclass_label`` and
    ``df[attribute] == subgroup``. A fraction ``bias_proportion`` of the
    subjects (``empi_anon``) with in-scope rows is sampled, and every in-scope
    row of those subjects gets ``target_class_label``. All other rows keep
    their ``density_binary`` value.

    Args:
        df: DataFrame with ``density``, ``density_binary``, ``empi_anon`` and
            ``attribute`` columns. It is modified in place (``biased_target``
            is added or overwritten) and also returned.
        initial_subclass_label: ``density`` value whose rows may be relabeled.
        target_class_label: value written to ``biased_target`` for sampled rows.
        bias_proportion: fraction of in-scope subjects to relabel.
        attribute: name of the column that defines the subgroups.
        subgroup: value of ``attribute`` selecting the subgroup.
        random_state: seed passed to ``DataFrame.sample`` (default 42).

    Returns:
        The same DataFrame object, now carrying ``biased_target``.
    """

    # rows with the desired initial density subclass label + attribute combination
    in_scope = (df['density'] == initial_subclass_label) & (df[attribute] == subgroup)
    filtered_df = df.loc[in_scope]

    # one row per subject
    subjects_df = filtered_df.drop_duplicates(subset=['empi_anon'])
    n_subjects = len(subjects_df)

    print(f'number of unique IDS: {n_subjects}')
    # Every in-scope row belongs to one of the unique IDs, so this is len(filtered_df).
    # Computing it outside the f-string also avoids nested same-type quotes,
    # which are a SyntaxError before Python 3.12.
    n_images = len(filtered_df)
    print(f'number of images corresponding to unique IDs = {n_images}')

    # sample desired amount of those subjects to have wrong labels
    sample_df = subjects_df.sample(frac=bias_proportion, random_state=random_state)
    sample_ids = sample_df['empi_anon'].values

    final_sample = filtered_df[filtered_df['empi_anon'].isin(sample_ids)]
    final_sample_idx = final_sample.index  # index labels to change biased target for

    print(f'sample: {bias_proportion} of {n_subjects} = {len(sample_ids)}')
    print(f'number of images corresponding to {len(sample_ids)} unique IDs = {len(final_sample)}')

    # change targets for all in-scope images of the sampled subjects
    # NOTE(review): the assignment is label-based, so it assumes unique index labels in df.
    df['biased_target'] = df['density_binary']
    df.loc[final_sample_idx, 'biased_target'] = target_class_label

    print(df['density_binary'].value_counts())
    print(df['biased_target'].value_counts())

    return df

In [58]:
# Label bias for density 'C' rows from GE MEDICAL SYSTEMS: target label 0, proportion 0.3.
initial_subclass_label, target_class_label = 'C', 0
attribute, subgroup = 'Manufacturer', 'GE MEDICAL SYSTEMS'
bias_proportion = 0.3

# Work on a copy so the balanced baseline df1 is left unchanged.
df2 = add_subgroup_label_bias_to_subclass(
    df1.copy(),
    initial_subclass_label=initial_subclass_label,
    target_class_label=target_class_label,
    bias_proportion=bias_proportion,
    attribute=attribute,
    subgroup=subgroup,
)

number of unique IDS: 1346
number of images corresponding to unique IDs = 7590
sample: 0.3 of 1346 = 404
number of images corresponding to 404 unique IDs = 2383
density_binary
1    96045
0    96045
Name: count, dtype: int64
biased_target
0    98428
1    93662
Name: count, dtype: int64


In [None]:
# Save the GE MEDICAL SYSTEMS label-bias variant (the row index is written as the first CSV column).
gems_bias_csv_path = '/vol/biomedic3/es525/mammo-label-bias/modified_metadata/labelbias_subjectlevel_3manu_gems_c_30.csv'
df2.to_csv(gems_bias_csv_path)

In [61]:
# Label bias for density 'C' rows from HOLOGIC, Inc.: target label 0, proportion 0.3.
initial_subclass_label, target_class_label = 'C', 0
attribute, subgroup = 'Manufacturer', 'HOLOGIC, Inc.'
bias_proportion = 0.3

# Work on a copy so the balanced baseline df1 is left unchanged.
df3 = add_subgroup_label_bias_to_subclass(
    df1.copy(),
    initial_subclass_label=initial_subclass_label,
    target_class_label=target_class_label,
    bias_proportion=bias_proportion,
    attribute=attribute,
    subgroup=subgroup,
)

number of unique IDS: 7184
number of images corresponding to unique IDs = 76720
sample: 0.3 of 7184 = 2155
number of images corresponding to 2155 unique IDs = 22599
density_binary
1    96045
0    96045
Name: count, dtype: int64
biased_target
0    118644
1     73446
Name: count, dtype: int64


In [None]:
# Save the HOLOGIC, Inc. label-bias variant (the row index is written as the first CSV column).
hologic_bias_csv_path = '/vol/biomedic3/es525/mammo-label-bias/modified_metadata/labelbias_subjectlevel_3manu_hologic_c_30.csv'
df3.to_csv(hologic_bias_csv_path)

In [63]:
# Label bias for density 'C' rows from FUJIFILM Corporation: target label 0, proportion 0.3.
initial_subclass_label, target_class_label = 'C', 0
attribute, subgroup = 'Manufacturer', 'FUJIFILM Corporation'
bias_proportion = 0.3

# Work on a copy so the balanced baseline df1 is left unchanged.
df4 = add_subgroup_label_bias_to_subclass(
    df1.copy(),
    initial_subclass_label=initial_subclass_label,
    target_class_label=target_class_label,
    bias_proportion=bias_proportion,
    attribute=attribute,
    subgroup=subgroup,
)

number of unique IDS: 455
number of images corresponding to unique IDs = 2260
sample: 0.3 of 455 = 136
number of images corresponding to 136 unique IDs = 651
density_binary
1    96045
0    96045
Name: count, dtype: int64
biased_target
0    96696
1    95394
Name: count, dtype: int64


In [None]:
# Save the FUJIFILM Corporation label-bias variant (the row index is written as the first CSV column).
fuji_bias_csv_path = '/vol/biomedic3/es525/mammo-label-bias/modified_metadata/labelbias_subjectlevel_3manu_fuji_c_30.csv'
df4.to_csv(fuji_bias_csv_path)

### pseudo-subgroup label noise

In [65]:
# Label bias for density 'C' rows in pseudo-subgroup 0: target label 0, proportion 0.3.
initial_subclass_label, target_class_label = 'C', 0
attribute, subgroup = 'pseudo_subgroup', 0
bias_proportion = 0.3

# Work on a copy so the balanced baseline df1 is left unchanged.
df5 = add_subgroup_label_bias_to_subclass(
    df1.copy(),
    initial_subclass_label=initial_subclass_label,
    target_class_label=target_class_label,
    bias_proportion=bias_proportion,
    attribute=attribute,
    subgroup=subgroup,
)

number of unique IDS: 7123
number of images corresponding to unique IDs = 28715
sample: 0.3 of 7123 = 2137
number of images corresponding to 2137 unique IDs = 8503
density_binary
1    96045
0    96045
Name: count, dtype: int64
biased_target
0    104548
1     87542
Name: count, dtype: int64


In [None]:
# Save the pseudo-subgroup 0 label-bias variant (the row index is written as the first CSV column).
pseudo0_bias_csv_path = '/vol/biomedic3/es525/mammo-label-bias/modified_metadata/labelbias_subjectlevel_3manu_pseudo0_c_30.csv'
df5.to_csv(pseudo0_bias_csv_path)