This script generates a synthetic loan application dataset that is deliberately biased against female applicants. We use Figure 1 from [Karimi et al. (2020)](https://arxiv.org/pdf/2002.06278.pdf) as the base data generating model (DGM). It describes a fictitious **Loan Application** process such that:
- $Y$ is the loan approval decision, which is known because the decision-maker's rule is known too;
- $X_1$ is annual salary;
- $X_2$ is account balance; and
- $A$ is gender.

In [1]:
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# set working directory: the parent of the directory the notebook is run from
wrk_dir = os.path.dirname(os.getcwd())
# set data path: the 'data' folder under wrk_dir, built with the platform's
# own separator instead of a hard-coded backslash; the trailing separator is
# kept because this string is later extended by concatenation
data_path = os.path.join(wrk_dir, 'data') + os.sep

### Version 1: Causal Sufficiency (without $A$)
With the DAG: $X_1 \rightarrow Y$; $X_1 \rightarrow X_2$; and $X_2 \rightarrow Y$.

In [3]:
# overall params
np.random.seed(2022)  # fixed seed so the draws below are reproducible
n = 5000              # number of synthetic applicants

# (hyper)parameters
p_rate = 10       # rate of the Poisson draw behind u1
n_mu = 0.0        # mean of the Normal draw behind u2
n_sigma = 1.0     # standard deviation of the Normal draw behind u2
lambda_1 = 10000  # scale applied to the Poisson draw
lambda_2 = 2500   # scale applied to the Normal draw
beta_1 = (3/10)   # weight of annual salary in account balance
beta_2 = 5        # weight of account balance in the decision rule
epsilon = 225000  # threshold of the decision rule

# unobserved terms (the Poisson draw comes first, so the random stream is
# consumed in the same order as before)
u1 = lambda_1*np.random.poisson(lam=p_rate, size=n)
u2 = lambda_2*np.random.normal(loc=n_mu, scale=n_sigma, size=n)

# annual salary
x1 = u1
# account balance
x2 = beta_1*x1 + u2
# loan approval: +1 when x1 + beta_2*x2 is above epsilon, -1 when below
# (np.sign returns 0 on an exact tie)
y = np.sign(x1 + beta_2*x2 - epsilon)

In [4]:
# gather the decision, the two observed features and the two unobserved
# terms into one table, one column per variable
d = dict(
    LoanApproval=y,
    AnnualSalary=x1,
    AccountBalance=x2,
    u1=u1,
    u2=u2,
)
data = pd.DataFrame(data=d)
# preview the first five rows
data.head(5)

Unnamed: 0,LoanApproval,AnnualSalary,AccountBalance,u1,u2
0,-1.0,50000,14026.847358,50000,-973.152642
1,1.0,120000,36940.097383,120000,940.097383
2,-1.0,90000,23564.129008,90000,-3435.870992
3,-1.0,80000,27596.570524,80000,3596.570524
4,1.0,210000,62294.22162,210000,-705.77838


In [5]:
# keep only the rows whose salary and balance are both non-negative; the
# shape is printed before and after so any dropped rows are visible
print(data.shape)
data = data[data[['AnnualSalary', 'AccountBalance']].ge(0).all(axis=1)]
print(data.shape)

(5000, 5)
(5000, 5)


In [None]:
# store in data folder as a '|'-separated file without the index column;
# os.path.join supplies the separator, avoiding a hard-coded backslash
data.to_csv(os.path.join(data_path, 'LoanApplication_v1.csv'), sep='|', index=False)

### Version 2: Causal Sufficiency (with $A$)
With the DAG: $X_1 \rightarrow Y$; $X_1 \rightarrow X_2$; $X_2 \rightarrow Y$; $A \rightarrow X_1$; and $A \rightarrow X_2$.

In [6]:
# create systematic penalties around A
# NOTE: uA below is drawn with the standard-library `random` module, which
# np.random.seed does not affect, so both generators are seeded here;
# without random.seed the gender column differs on every run
random.seed(2020)
np.random.seed(2020) # to avoid identical draws as the Us
n = 5000

# Let A be Gender
p_men = 0.65
# 0 is drawn with weight p_men and 1 with weight (1 - p_men)
uA = random.choices(population=[0, 1], weights=[p_men, (1 - p_men)], k=n)
A = np.asanyarray(uA) # where A=1 represents female

# penalty terms: a negative constant times a non-negative draw, so every
# entry is <= 0
bias_a1 = (-1500)*np.random.poisson(lam=10, size=n)
bias_a2 = (-300)*np.random.chisquare(df=4, size=n)

In [7]:
# overall params
np.random.seed(2022)  # same seed as Version 1, so u1 and u2 repeat its draws
n = 5000              # number of synthetic applicants

# previous hyperparams
p_rate = 10       # rate of the Poisson draw behind u1
n_mu = 0.0        # mean of the Normal draw behind u2
n_sigma = 1.0     # standard deviation of the Normal draw behind u2
lambda_1 = 10000  # scale applied to the Poisson draw
lambda_2 = 2500   # scale applied to the Normal draw
beta_1 = (3/10)   # weight of annual salary in account balance
beta_2 = 5        # weight of account balance in the decision rule
epsilon = 225000  # threshold of the decision rule

# unobserved terms (the Poisson draw comes first, so the random stream is
# consumed in the same order as before)
u1 = lambda_1*np.random.poisson(lam=p_rate, size=n)
u2 = lambda_2*np.random.normal(loc=n_mu, scale=n_sigma, size=n)

# annual salary: the penalty bias_a1 applies only where A == 1
n_x1 = u1 + A*bias_a1
# account balance: inherits the penalised salary and adds bias_a2 where A == 1
n_x2 = beta_1*n_x1 + u2 + A*bias_a2
# loan approval: +1 when n_x1 + beta_2*n_x2 is above epsilon, -1 when below
# (np.sign returns 0 on an exact tie)
n_y = np.sign(n_x1 + beta_2*n_x2 - epsilon)

In [8]:
# gather the decision, the two observed features, the two unobserved terms
# and the gender indicator into one table, one column per variable
n_d = dict(
    LoanApproval=n_y,
    AnnualSalary=n_x1,
    AccountBalance=n_x2,
    u1=u1,
    u2=u2,
    Gender=A,
)
n_data = pd.DataFrame(data=n_d)
# preview the first five rows
n_data.head(5)

Unnamed: 0,LoanApproval,AnnualSalary,AccountBalance,u1,u2,Gender
0,-1.0,35000,7947.67809,50000,-973.152642,1
1,1.0,108000,32234.738442,120000,940.097383,1
2,-1.0,90000,23564.129008,90000,-3435.870992,0
3,-1.0,80000,27596.570524,80000,3596.570524,0
4,1.0,210000,62294.22162,210000,-705.77838,0


In [9]:
# keep only the rows whose salary and balance are both non-negative; the
# shape is printed before and after so any dropped rows are visible
print(n_data.shape)
n_data = n_data[n_data[['AnnualSalary', 'AccountBalance']].ge(0).all(axis=1)]
print(n_data.shape)

(5000, 6)
(4994, 6)


In [None]:
# store in data folder as a '|'-separated file without the index column;
# os.path.join supplies the separator, avoiding a hard-coded backslash
n_data.to_csv(os.path.join(data_path, 'LoanApplication_v2.csv'), sep='|', index=False)