In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize



# Generate simple synthetic datasets 
Goal: test how dataset size and number of features impact bias-robustness

In [233]:
# Per-feature class means for the synthetic datasets.
# A name ending in _1 is the mean placed in mus_pos (positive class) and a
# name ending in _2 is the mean placed in mus_neg (negative class).
mu1 = 0.5
mu2 = -0.5
mu_x2 = 0  # shared by both classes
mu_x3_1 = -0.5
mu_x3_2 = 0.5
mu_x4_1 = 1
mu_x4_2 = -1
mu_x5_1 = 0
mu_x5_2 = 0
sigma = 1

# Entry j of each list is the mean of feature j for that class.
mus_pos = [mu1, mu_x2, mu_x3_1, mu_x4_1, mu_x5_1]
mus_neg = [mu2, mu_x2, mu_x3_2, mu_x4_2, mu_x5_2]

# how many of each label? (total size = 2 * this)
dataset_sizes = [100, 250, 500]
# Feature counts to generate datasets for.
features = [3, 4, 5]

# Fix the global NumPy RNG state so the generated values are reproducible.
np.random.seed(42)

In [234]:
# Write one CSV per (feature count, per-label size) combination.
# Each file holds `dataset_size` negative rows followed by `dataset_size`
# positive rows, with one column per feature plus a 'label' column (-1 / +1).
for num_features in features:
    for dataset_size in dataset_sizes:
        x_pos = []
        x_neg = []
        for i in range(dataset_size):
            # Build one row per sample. Appending the individual draws straight
            # onto x_pos / x_neg would flatten every feature into a single
            # column and multiply the row count by num_features.
            pos_row = []
            neg_row = []
            for j in range(num_features):
                # Positive draw first, then negative, for each feature, so the
                # sequence of values taken from the seeded RNG is unchanged.
                pos_row.append(np.random.normal(mus_pos[j], sigma))
                neg_row.append(np.random.normal(mus_neg[j], sigma))
            x_pos.append(pos_row)
            x_neg.append(neg_row)

        y_neg = [-1 for i in range(len(x_neg))]
        y_pos = [1 for i in range(len(x_pos))]
        x = x_neg + x_pos
        y = y_neg + y_pos
        # A list of equal-length rows becomes a frame with one column per feature.
        df = pd.DataFrame(x)
        df['label'] = y
        df.to_csv("synth_" + str(dataset_size) + "_" + str(num_features) + ".txt", index=False)

# Generate demographics data
Goal: how does demographic group size impact robustness?

In [14]:
# Reset NumPy's global RNG to a fixed state so draws made after this cell are
# reproducible regardless of how many values earlier cells consumed.
np.random.seed(42)

In [2]:
def _draw_group(mean, cov, count, group_flag):
    """Draw `count` samples from N(mean, cov), appending `group_flag` to each."""
    samples = []
    for _ in range(count):
        point = np.random.multivariate_normal(mean, cov)
        samples.append(np.append(point, group_flag))
    return samples


def generate_data(mu1, mu2, mu3, mu4, cov1, cov2, cov3, cov4, num1, num2, num3, num4):
    """Sample four Gaussian groups and split them into two lists.

    Groups 1 and 2 are returned together as ``x_pos``; groups 3 and 4 are
    returned together as ``x_neg``. Every sample is the multivariate-normal
    draw with one extra trailing value appended: 1 for groups 1 and 3,
    0 for groups 2 and 4.

    Parameters
    ----------
    mu1, mu2, mu3, mu4 : array-like
        Mean vector of each group.
    cov1, cov2, cov3, cov4 : array-like
        Covariance matrix of each group.
    num1, num2, num3, num4 : int
        Number of samples to draw for each group; 0 yields no samples.

    Returns
    -------
    (x_pos, x_neg) : tuple of list of numpy.ndarray
        ``x_pos`` holds the group-1 samples followed by the group-2 samples;
        ``x_neg`` holds the group-3 samples followed by the group-4 samples.

    Notes
    -----
    Draws come from NumPy's global RNG in group order 1, 2, 3, 4, so the
    result is reproducible after ``np.random.seed``.
    """
    # Left-to-right evaluation keeps the draw order 1 -> 2 -> 3 -> 4.
    x_pos = _draw_group(mu1, cov1, num1, 1) + _draw_group(mu2, cov2, num2, 0)
    x_neg = _draw_group(mu3, cov3, num3, 1) + _draw_group(mu4, cov4, num4, 0)
    return x_pos, x_neg

In [24]:
# Mean vectors handed to generate_data as mu1..mu4 (one per sampled group).
mu1 = np.array([-1, 1])
mu2 = np.array([-2, 1])
mu3 = np.array([2, -1])
mu4 = np.array([1, -1])

# Every group uses the same covariance values; each name gets its own array.
cov1 = np.array([[0.5, 0.1], [0.1, 0.5]])
cov2 = np.array([[0.5, 0.1], [0.1, 0.5]])
cov3 = np.array([[0.5, 0.1], [0.1, 0.5]])
cov4 = np.array([[0.5, 0.1], [0.1, 0.5]])

# Group sizes to sweep over: 100, 200, ..., 1000.
min_group_size = list(range(100, 1001, 100))

In [25]:
# For each group size, sample a fresh dataset and save it as demo_<size>.csv.
# Groups 2 and 4 get `size` samples each and groups 1 and 3 get 2000 - size
# each, so every file holds 4000 rows.
for size in min_group_size:
    x_pos, x_neg = generate_data(
        mu1, mu2, mu3, mu4,
        cov1, cov2, cov3, cov4,
        2000 - size, size, 2000 - size, size,
    )

    # Column-wise views of the samples (two coordinates plus the appended flag).
    # Not used further in this cell; presumably read by later cells — TODO confirm.
    x1_coord_pos = [sample[0] for sample in x_pos]
    x2_coord_pos = [sample[1] for sample in x_pos]
    label_pos = [sample[2] for sample in x_pos]
    x1_coord_neg = [sample[0] for sample in x_neg]
    x2_coord_neg = [sample[1] for sample in x_neg]
    label_neg = [sample[2] for sample in x_neg]

    # Class labels: +1 for x_pos rows, -1 for x_neg rows; negatives are stored first.
    y_pos = [1] * len(x_pos)
    y_neg = [-1] * len(x_neg)
    x = x_neg + x_pos
    y = y_neg + y_pos

    df = pd.DataFrame(x)
    df['label'] = y
    df.to_csv("demo_{}.csv".format(size), index=False)