In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import rpy2.robjects as robjects
import rpy2.robjects.pandas2ri as pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr, isinstalled
import rpy2.robjects.packages as rpackages
from rpy2.robjects import r
import os
import sys

# Make the local CFA_python checkout importable. The location can be overridden
# with the CFA_PYTHON_PATH environment variable; the original hard-coded path
# is kept as the fallback so existing setups keep working unchanged.
CFA_PYTHON_PATH = os.environ.get('CFA_PYTHON_PATH', '/Users/alanma/Documents/CFA_python')
# Insert only when the path is not already first, so re-running this cell does
# not keep stacking duplicate entries at the front of sys.path.
if sys.path[:1] != [CFA_PYTHON_PATH]:
    sys.path.insert(0, CFA_PYTHON_PATH)
import faircause.faircause as faircause


In [22]:
# Report whether the R package `faircause` is visible to rpy2.
# This cell only checks and prints; it does not install anything
# (install the package beforehand, either using conda or R).
_faircause_found = isinstalled('faircause')
print("Is faircause installed now?", _faircause_found)


Is faircause installed now? True


In [23]:
# Load the R-side packages through rpy2. The faircause handle is bound to
# `faircause_r` so that it does not overwrite the Python `faircause` module
# imported earlier under the same name.
base = importr('base')
faircause_r = importr('faircause')

# Load census dataset (still an R data frame at this point).
gov_census_r = robjects.r('''
    data("gov_census", package = "faircause")
    gov_census[seq_len(20000), ]  # Take first 20000 rows as in the vignette
''')

# Convert to pd
with localconverter(robjects.default_converter + pandas2ri.converter):
    data = robjects.conversion.rpy2py(gov_census_r)

# Rename columns to match the vignette. This is done on the pandas frame:
# rpy2's R data frame exposes `colnames` rather than `columns`, so assigning
# `.columns` before the conversion presumably never reached the converted
# frame (TODO confirm against the rpy2 version in use). Assigning here also
# fails loudly if the number of columns is not the expected 17.
data.columns = ['sex', 'age', 'race', 'hispanic_origin', 'citizenship', 'nativity',
                'marital', 'family_size', 'children', 'education_level', 'english_level',
                'salary', 'hours_worked', 'weeks_worked', 'occupation', 'industry',
                'economic_region']

# Replace the index carried over from R with a fresh 0-based RangeIndex.
# Reassignment (instead of inplace=True) keeps the cell safe to re-run.
data = data.reset_index(drop=True)

In [24]:
# Preview the converted census frame; the notebook renders a truncated view
# (first and last rows) together with the 17 renamed columns.
data

Unnamed: 0,sex,age,race,hispanic_origin,citizenship,nativity,marital,family_size,children,education_level,english_level,salary,hours_worked,weeks_worked,occupation,industry,economic_region
0,male,64.0,black,no,1.0,native,married,2.0,0.0,20.0,0.0,43000.0,56.0,49.0,13-1081,928P,Southeast
1,female,54.0,white,no,1.0,native,married,3.0,1.0,20.0,0.0,45000.0,42.0,49.0,29-2061,6231,Southeast
2,male,38.0,black,no,1.0,native,married,3.0,1.0,24.0,0.0,99000.0,50.0,49.0,25-1000,611M1,Southeast
3,female,41.0,asian,no,1.0,native,married,3.0,1.0,24.0,0.0,63000.0,50.0,49.0,25-1000,611M1,Southeast
4,female,40.0,white,no,1.0,native,married,4.0,2.0,21.0,0.0,45200.0,40.0,49.0,27-1010,611M1,Southeast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,female,54.0,white,no,1.0,native,married,2.0,0.0,22.0,0.0,38000.0,40.0,39.0,27-3092,923,Far West
19996,male,43.0,other,yes,4.0,foreign-born,divorced,4.0,0.0,21.0,1.0,100000.0,40.0,49.0,33-3021,92MP,Far West
19997,male,35.0,AIAN,yes,4.0,foreign-born,married,4.0,2.0,22.0,2.0,73000.0,40.0,47.0,25-2030,6111,Far West
19998,male,26.0,white,no,1.0,native,married,2.0,0.0,18.0,0.0,60000.0,75.0,49.0,55-2010,928110P4,Far West


In [25]:
# Variable roles for the census analysis.
X = "sex"      # protected attribute
Y = "salary"   # outcome
x0, x1 = "male", "female"   # the two levels of X that are compared

# Covariate sets passed to FairCause alongside X and Y.
# NOTE(review): in the R faircause documentation Z denotes the confounders and
# W the mediators — confirm the Python port follows the same convention.
Z = [
    "age",
    "race",
    "hispanic_origin",
    "citizenship",
    "nativity",
    "economic_region",
]
W = [
    "marital",
    "family_size",
    "children",
    "education_level",
    "english_level",
    "hours_worked",
    "weeks_worked",
    "occupation",
    "industry",
]


In [26]:
# Re-import the Python implementation so that `faircause` refers to the Python
# module here even if the name was rebound in between (an earlier cell assigns
# an rpy2 package handle to the same name).
import faircause.faircause as faircause

# Pass the variable roles by keyword rather than by position. With positional
# arguments the recorded run printed the W columns under "Confounders" and the
# Z columns under "Mediators", so the call depended on an argument order that
# is easy to get wrong; keywords bind each list to its role explicitly.
# NOTE(review): assumes FairCause's parameters are named X, Z, W, Y, x0, x1 —
# confirm against the class signature.
fc_census = faircause.FairCause(data, X=X, Z=Z, W=W, Y=Y, x0=x0, x1=x1)

In [27]:
# Print the object's text representation: attribute, outcome, and the column
# lists shown under "Confounders" and "Mediators". Use this output to verify
# that each column set appears under the role it is meant to have.
print(fc_census)

faircause object:

Attribute:       sex
Outcome:         salary
Confounders:     marital, family_size, children, education_level, english_level, hours_worked, weeks_worked, occupation, industry
Mediators:       age, race, hispanic_origin, citizenship, nativity, economic_region


In [28]:
# Run the effect estimation on the fitted object.
# In the recorded run this printed several probability-like arrays, emitted
# warnings at the library's y0[ts]/y1[ts] update lines, and reported that
# 4.08 percent of P(x|z) or p(x|z,w) probabilities were extreme, with results
# given for the overlap population.
# NOTE(review): the printed arrays look like leftover debug output from the
# library — confirm and silence there if so.
fc_census.estimate_effects()

[0.80619412 0.49638208 0.44466834 ... 0.64521215 0.65666012 0.69263492]


  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y1[ts] = (Y.loc[ts].values - y_z1_ts) * (X.loc[ts] == 1).values / (px_z_ts) + y_z1_ts
  y1[ts] = (Y.loc[ts].values - y_z1_ts) * (X.loc[ts] == 1).values / (px_z_ts) + y_z1_ts


[0.60213653 0.61381029 0.60264914 ... 0.5595243  0.68869048 0.85987664]


  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y1[ts] = (Y.loc[ts].values - y_z1_ts) * (X.loc[ts] == 1).values / (px_z_ts) + y_z1_ts
  y1[ts] = (Y.loc[ts].values - y_z1_ts) * (X.loc[ts] == 1).values / (px_z_ts) + y_z1_ts


[0.48152279 0.65326804 0.46315985 ... 0.43400915 0.55463448 0.61481824]


  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y1[ts] = (Y.loc[ts].values - y_z1_ts) * (X.loc[ts] == 1).values / (px_z_ts) + y_z1_ts


[0.468069   0.43718953 0.51920711 ... 0.59655039 0.25162338 0.71451689]


  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y1[ts] = (Y.loc[ts].values - y_z1_ts) * (X.loc[ts] == 1).values / (px_z_ts) + y_z1_ts
  y1[ts] = (Y.loc[ts].values - y_z1_ts) * (X.loc[ts] == 1).values / (px_z_ts) + y_z1_ts


[0.53331779 0.63753683 0.55554604 ... 0.50065857 0.63841213 0.46310818]


  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y0[ts] = (Y.loc[ts].values - y_z0_ts) * (X.loc[ts] == 0).values / (1-px_z_ts) + y_z0_ts
  y1[ts] = (Y.loc[ts].values - y_z1_ts) * (X.loc[ts] == 1).values / (px_z_ts) + y_z1_ts


4.08 percent of extreme P(x|z) or p(x|z,w) prob
 Reported results are for the overlap pop. Consider investigating overlap issues


In [29]:
# Show the summary: protected attribute and its levels, the total variation
# (TV) with its decomposition, and a table of each measure's value and sd.
fc_census.summary()

faircause object summary:

Protected attribute:                 sex
Protected attribute levels:          male, female
Total Variation (TV): -15011.0780

TV decomposition(s):

TV_malefemale(y) (-15011.0780) = CtfDE_malefemale(y | male) (-7034.2133) - CtfIE_femalemale(y | male) (8878.2473) - CtfSE_femalemale(y) (-901.3826)


Unnamed: 0,measure,value,sd
0,ctfde,-7034.213282,1085.397309
1,ctfie,8878.247308,1086.900396
2,ctfse,-901.382623,481.627536
3,ett,-15912.46059,484.763498
4,expse_x0,-859.674213,654.167655
5,expse_x1,149.953415,385.140326
6,nde,-6616.992802,1941.518183
7,nie,9403.712793,1682.725194
8,te,-16020.705595,1066.697886
9,tv,-15011.077967,683.271623


In [30]:
# Draw the bar chart of the estimated measures.
# NOTE(review): in the recorded run this cell raised ValueError/ParseException
# while rendering the title "$$TV_{male, female}(y)$$ decomposition".
# Matplotlib's mathtext expects single-dollar delimiters ($...$), so the title
# string built inside plot() likely needs fixing in the library.
# The same run also shows a seaborn deprecation warning for `palette` without
# `hue`, raised from the library's sns.barplot call.
fc_census.plot()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(x='Measure', y='Value', data=df, palette='Set2')
  ax.set_xticklabels([rename.get(m, m) for m in df['Measure']], rotation=45, ha='right')


ValueError: 
$$TV_{male, female}(y)$$ decomposition 
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)

Error in callback <function _draw_all_if_interactive at 0x10c44e840> (for post_execute), with arguments args (),kwargs {}:


ValueError: 
$$TV_{male, female}(y)$$ decomposition 
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)

ValueError: 
$$TV_{male, female}(y)$$ decomposition 
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)

<Figure size 1200x800 with 1 Axes>