# Simulation 5: Interactive Instrumental Variable Regression Model

<!-- model description: 
$$
  Y_i^{(j)} = \gamma^*(D_i^{(j)}, X_i^{(j)}) + U_i^{(j)}, \quad \mathbb{E}[U_i^{(j)} | X_i^{(j)}, D_i^{(j)}] = 0, \\
  D_i^{(j)} = \mu^*(X_i^{(j)}) + V_i^{(j)}, \quad \mathbb{E}[V_i^{(j)} | X_i^{(j)}] = 0,
$$ -->

In [1]:
import pandas as pd
import holoviews as hv
from holoviews import dim
hv.extension('bokeh')

(1) setting: sample size n = 1000

In [None]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5'
            f'_K{K}_n1000_psid{psi_d}_rnp100_rds10.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others, the
            # same way the other cells of this notebook do.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress toggle (rows / 10):
        # print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 10, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5 (n1000) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

K 5   | psi_d 0.1 :  100.0 %
K 5   | psi_d 0.5 :  100.0 %
K 5   | psi_d 1.0 :  100.0 %
K 10  | psi_d 0.1 :  100.0 %
K 10  | psi_d 0.5 :  100.0 %
K 10  | psi_d 1.0 :  100.0 %


setting: n_rft = 200 vs. 250

In [12]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_nrft = ['200', '250']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > n_rft > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, n_rft, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_nrft
        for x3 in label_method
    ]


# K is held fixed in this cell; only n_rft and psi_d vary.
K = '5'

# Read one CSV per (n_rft, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for n_rft in label_nrft:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5'
            f'_K{K}_n1000_psid{psi_d}_nrft{n_rft}_rnp100_rds10.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others, the
            # same way the other cells of this notebook do.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['n_rft'] = n_rft

        # Progress toggle (rows / 10):
        # print("n_rft", '{0: <2}'.format(n_rft), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 10, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5 (n_rft comparison) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, n_rft). rnd_ds is not a grouping
# key, so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'n_rft'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'n_rft', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'n_rft', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

setting: n -> 2000

In [30]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5'
            f'_K{K}_n2000_psid{psi_d}_rnp100_rds10.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress toggle (rows / 10):
        # print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 10, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5 (n2000) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

(2) setting: hard classifier for model_nu

In [5]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5_hard'
            f'_K{K}_n1000_psid{psi_d}_rnp100_rds10.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress toggle (rows / 10):
        # print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 10, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5_hard (n1000) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

(3) setting: split (model_mu and model_nu) with n = 1000

In [14]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5_split'
            f'_K{K}_n1000_psid{psi_d}_rnp100_rds10.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress toggle (rows / 10):
        # print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 10, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5_split (n1000) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

setting: split with n -> 2000

In [2]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5_split'
            f'_K{K}_n2000_psid{psi_d}_rnp100_rds10.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress toggle (rows / 10):
        # print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 10, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5_split (n2000) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

**setting: n_rds -> 20**

In [31]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5_split'
            f'_K{K}_n1000_psid{psi_d}_nrft300_rnp100_rds20.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress report: rows / 20, shown as a percentage (presumably of
        # the 100 x 20 rows a finished run produces -- confirm).
        print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 20, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5_split (nrft300, rds20) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

K 5   | psi_d 0.1 :  92.65 %
K 5   | psi_d 0.5 :  90.6 %
K 5   | psi_d 1.0 :  90.1 %
FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5_split_K10_n1000_psid0.1_nrft300_rnp100_rds20.csv
FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5_split_K10_n1000_psid0.5_nrft300_rnp100_rds20.csv
FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5_split_K10_n1000_psid1.0_nrft300_rnp100_rds20.csv


setting: nrft -> 200, rds -> 20

In [40]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5_split'
            f'_K{K}_n1000_psid{psi_d}_nrft200_rnp100_rds20.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress toggle (rows / 20):
        # print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 20, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5_split (nrft200, rds20) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5_split_K10_n1000_psid0.1_nrft200_rnp100_rds20.csv
FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5_split_K10_n1000_psid0.5_nrft200_rnp100_rds20.csv
FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5_split_K10_n1000_psid1.0_nrft200_rnp100_rds20.csv


New setting: 

In [2]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5s_split'
            f'_K{K}_n1000_psid{psi_d}_nrft300_rnp100_rds20.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress toggle (rows / 20):
        # print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 20, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5s_split (nrft300, rds20) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline

FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5s_split_K10_n1000_psid0.1_nrft300_rnp100_rds20.csv
FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5s_split_K10_n1000_psid0.5_nrft300_rnp100_rds20.csv
FileNotFoundError:  /project/Stat/s1155168529/programs/DDML/output/out_sim5s_split_K10_n1000_psid1.0_nrft300_rnp100_rds20.csv


Setting: p -> 20

In [4]:
# Factor levels, listed in the order they should appear on the x-axis.
label_method = ['Average', 'M1']
label_K = ['5', '10']
label_psid = ['0.1', '0.5', '1.0']


def hook_change_order(plot, element):
    """Bokeh hook that pins the nested x-axis order to psi_d > K > Method.

    Overwrites the categorical factors of the rendered figure's x-range with
    every (psi_d, K, Method) combination built from the label lists above.
    ``element`` belongs to the hook signature and is not used.
    """
    plot.state.x_range.factors = [
        (x1, x2, x3)
        for x1 in label_psid
        for x2 in label_K
        for x3 in label_method
    ]


# Read one CSV per (K, psi_d) setting. Frames are collected in a list and
# concatenated once instead of re-copying a growing DataFrame on every pass.
frames = []
for K in label_K:
    for psi_d in label_psid:
        file_name = (
            '/project/Stat/s1155168529/programs/DDML/output/out_sim5s_split'
            f'_K{K}_n1000_psid{psi_d}_nrft300_p20_rnp100_rds20.csv'
        )

        try:
            out = pd.read_csv(file_name)
        except FileNotFoundError:
            # Report the missing setting and carry on with the others.
            print("FileNotFoundError: ", file_name)
            continue

        # Tag every row with the setting it was read for.
        out['psi_d'] = psi_d
        out['K'] = K

        # Progress toggle (rows / 20):
        # print("K", '{0: <2}'.format(K), " | psi_d", '{0: <3}'.format(psi_d), ": ", out.shape[0] / 20, "%")

        frames.append(out)

if not frames:
    # Fail with a readable message rather than a KeyError from groupby below.
    raise FileNotFoundError('no out_sim5s_split (p20, rds20) result file could be read')

out_comp = pd.concat(frames, axis=0)

# Average all rows sharing (rnd_np, psi_d, K). rnd_ds is not a grouping key,
# so only its mean would remain; that column is dropped.
out_comp = (
    out_comp.groupby(['rnd_np', 'psi_d', 'K'])
    .mean()
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'K', 'psi_d'])
)

# Long format: the remaining columns become 'Method', their values 'EST'.
out_long = out_comp.melt(
    ignore_index=False, var_name='Method', value_name='EST'
).reset_index()

# bdw = .05

boxwhisker = hv.BoxWhisker(out_long, ['psi_d', 'K', 'Method'], 'EST')
boxwhisker.opts(
    show_legend=False,
    width=1000,
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    # ylim=(.5 - bdw, .5 + bdw),
    hooks=[hook_change_order],
)

# Horizontal reference line at 0.5 (presumably the target value -- confirm).
plt_hline = hv.HLine(.5)
plt_hline.opts(
    color='black',
    active_tools=[],
)

boxwhisker * plt_hline