In [1]:
from gen_data_stability import gen_data_and_resample_noise, sampling_distribution
from numpy.random import normal, binomial

In [2]:
# Call gen_data_and_resample_noise with n_runs=5 and both the x- and y-error
# input flags enabled; the result is kept in `single_samp` for inspection.
# NOTE(review): no random seed is set in the visible cells -- if the generator
# is stochastic the result will differ between runs; confirm.
single_samp = gen_data_and_resample_noise(
    n_runs=5,
    x_err_input=True,
    y_err_input=True,
)

In [3]:
# Call sampling_distribution with s_samples=100 and n_runs=1000, again with
# both the x- and y-error input flags enabled.
hundred_samp = sampling_distribution(
    s_samples=100, n_runs=1000, x_err_input=True, y_err_input=True
)

In [125]:
import pandas as pd
import os, argparse, pathlib, itertools
from utils.basic_utils import writeToCSV


def _counterfactuals_for_sample(cur_df, x_params, y_params):
    """Build the counterfactual table for one observed sample.

    Parameters
    ----------
    cur_df : pandas.DataFrame
        Observed sample; the columns 'a', 'x' and 'y' are read.
    x_params : mapping
        Coefficients of the linear X model, read by the keys 'a' and
        'intercept'.
    y_params : mapping
        Coefficients of the linear Y model, read by the keys 'a', 'x' and
        'intercept'.

    Returns
    -------
    pandas.DataFrame
        Columns 'original_a' and 'original_y', plus, for every distinct value
        ``g`` found in ``cur_df['a']``, the columns 'cf_y_nonres_a{g}' and
        'cf_y_xres_a{g}'.
    """
    # Initialize dataframe to hold counterfactual data
    counter_df = pd.DataFrame({'original_a': cur_df['a'], 'original_y': cur_df['y']})

    # Calculate X-residuals: observed X minus the fitted prediction from A.
    # The prediction is converted to a plain array before subtracting.
    x_fitted = (x_params['a'] * cur_df['a'] + x_params['intercept']).values
    x_residuals = cur_df['x'] - x_fitted

    # Calculate Y-residuals: observed Y minus the fitted prediction from A and X
    y_fitted = (y_params['a'] * cur_df['a']
                + y_params['x'] * cur_df['x']
                + y_params['intercept']).values
    y_residuals = cur_df['y'] - y_fitted

    # Loop through groups in A present in this sample (usually [0, 1])
    for group in list(cur_df['a'].unique()):

        # Counterfactual X for A <- group: model prediction plus each row's
        # own X-residual
        counter_x = group * x_params['a'] + x_params['intercept'] + x_residuals

        # Non-resolving X: the intervention on A also changes X, so the
        # counterfactual X feeds the Y model
        counter_y_nonres = (group * y_params['a']
                            + counter_x * y_params['x']
                            + y_params['intercept']
                            + y_residuals)

        # Resolving X: X is held at its observed value, only the direct
        # A term of the Y model changes
        counter_y_res = (group * y_params['a']
                         + cur_df['x'] * y_params['x']
                         + y_params['intercept']
                         + y_residuals)

        counter_df['cf_y_nonres_a{}'.format(group)] = counter_y_nonres
        counter_df['cf_y_xres_a{}'.format(group)] = counter_y_res

    return counter_df


def get_counterfactual_data(output_dir='default', base_dir=None):
    """Write one counterfactual CSV per observed sample of a stability run.

    Reads ``params_x.csv`` and ``params_y.csv`` from
    ``<base_dir>/out/parameter_data/stability/<output_dir>``. For every
    sample number ``s`` from 1 to the number of rows in the parameter files
    it reads
    ``<base_dir>/out/synthetic_data/stability/<output_dir>/data/observed_samp_<s>.csv``
    and writes
    ``<base_dir>/out/counterfactual_data/stability/<output_dir>/counter_samp_<s>.csv``.
    The parameter rows are looked up by index label ``s``, so the parameter
    files must be indexed 1..s_samples.

    Parameters
    ----------
    output_dir : str, optional
        Name of the run sub-directory used in all three paths above.
    base_dir : str or path-like, optional
        Repository root. When omitted it is derived from ``__file__`` (two
        levels above the script's directory) or, if ``__file__`` is not
        defined (e.g. in a notebook), from the grandparent of the current
        working directory.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the two parameter files have a different number of rows.
    """
    # Define base repo directory
    if base_dir is None:
        try:
            # When executed directly from script. resolve() makes a relative
            # __file__ absolute so that parents[2] exists.
            base_dir = pathlib.Path(__file__).resolve().parents[2]
        except NameError:
            # When executed from jupyter notebook (__file__ is undefined)
            base_dir = pathlib.Path(os.getcwd()).parents[1]
    else:
        base_dir = pathlib.Path(base_dir)

    # Read in estimated parameters
    param_dir = base_dir / 'out/parameter_data/stability/{}'.format(output_dir)
    y_params_df = pd.read_csv(param_dir / 'params_y.csv', index_col='Unnamed: 0')
    x_params_df = pd.read_csv(param_dir / 'params_x.csv', index_col='Unnamed: 0')

    # Get number of samples drawn from parameter files
    s_samples = len(y_params_df)

    # Assert parameter files are the same length
    assert s_samples == len(x_params_df), \
           'Parameter files indicate different number of samples.'

    data_dir = base_dir / 'out/synthetic_data/stability/{}/data'.format(output_dir)
    counter_dir = base_dir / 'out/counterfactual_data/stability/{}'.format(output_dir)

    # Make sure the destination exists so to_csv does not fail on a fresh run
    counter_dir.mkdir(parents=True, exist_ok=True)

    # Loop through s_samples
    for s in range(1, s_samples + 1):

        # Read in this sample of the data
        cur_df = pd.read_csv(data_dir / 'observed_samp_{}.csv'.format(s),
                             index_col='Unnamed: 0')

        # Use the parameters estimated from this sample
        counter_df = _counterfactuals_for_sample(cur_df,
                                                 x_params_df.loc[s],
                                                 y_params_df.loc[s])

        counter_df.to_csv(counter_dir / 'counter_samp_{}.csv'.format(s))

    return

In [127]:
# Generate the counterfactual CSVs for the 'default' run directory
# (the function's default argument, spelled out here).
get_counterfactual_data(output_dir='default')

In [61]:
# NOTE(review): `counter_base_y` is not assigned at notebook scope in any
# visible cell, and the recorded output of this cell is a NameError.
# Presumably a leftover debugging cell -- confirm and remove.
print(counter_base_y)

NameError: name 'counter_base_y' is not defined

In [37]:
# Locate the repository root. `__file__` only exists when the code runs as a
# script; in a notebook the name is undefined and a NameError is raised, so
# only that error triggers the working-directory fallback.
try:
    # When executed directly from script
    base_dir = pathlib.Path(__file__).resolve().parents[2]
except NameError:
    # When executed from jupyter notebook
    base_dir = pathlib.Path(os.getcwd()).parents[1]

# Display the Y-model parameter file of the 'default' run
param_base = base_dir / 'out/parameter_data/stability/{}'.format('default')
pd.read_csv(param_base / 'params_y.csv', index_col='Unnamed: 0')

Unnamed: 0,a,x,intercept
1,0.278099,-0.076967,0.398750
2,0.215806,0.479788,0.187558
3,0.190565,0.223374,0.257857
4,0.067204,-0.244976,0.646443
5,0.087950,0.177786,0.470748
...,...,...,...
96,0.117461,0.489131,0.262182
97,-0.145523,0.644839,0.220770
98,0.430925,0.071027,-0.004378
99,0.310773,0.478864,0.120493


In [49]:
# NOTE(review): `x_residuals` is only assigned as a local variable inside
# get_counterfactual_data, so it is not visible at notebook scope; the
# recorded output of this cell is a NameError. Presumably a leftover
# debugging cell -- confirm and remove.
print(x_residuals)

NameError: name 'x_residuals' is not defined