# Simulation 2: Partial linear regression model 

Model description: 
$$
\begin{aligned}
    Y^{(j)} &= D^{(j)} \beta + \gamma_j(X^{(j)}) + U^{(j)}, \quad & E[U^{(j)} \mid X^{(j)}, D^{(j)}] &= 0, \\
    D^{(j)} &= \mu_j(X^{(j)}) + V^{(j)}, \quad & E[V^{(j)} \mid X^{(j)}] &= 0.
\end{aligned}
$$
    

In [1]:
import pandas as pd
import holoviews as hv
from holoviews import dim
# from bokeh.sampledata.autompg import autompg as df
# from bokeh.models import Range1d
# Load the Bokeh plotting backend for HoloViews.
hv.extension('bokeh')

## Scenario 1
Basic parameter setting: $K = 5$, $n = 100$, $c_j = 0.25$, $100$ replications.

In [37]:
# Load the Scenario 1 results file (K = 5, n = 100) and index the rows by `rnd`.
out = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_iter100.csv'
).set_index('rnd')

# Reshape wide -> long: every remaining column name goes to `Method`, its
# values go to `EST`, and `rnd` is restored as an ordinary column.
out_long = out.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# One box per method, filled with a colour from the Set1 palette.
boxwhisker = hv.BoxWhisker(out_long, kdims='Method', vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=300,
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

# Overlay the reference line on the boxes, with an empty default tool list.
(boxwhisker * plt_hline).opts(default_tools=[])

Parameter setting: $n = 100 \rightarrow 500$.

In [None]:
# Load the Scenario 1 results file (K = 5, n = 500) and index the rows by `rnd`.
out = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n500_iter100.csv'
).set_index('rnd')

# Reshape wide -> long: every remaining column name goes to `Method`, its
# values go to `EST`, and `rnd` is restored as an ordinary column.
out_long = out.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# One box per method, filled with a colour from the Set1 palette.
boxwhisker = hv.BoxWhisker(out_long, kdims='Method', vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=300,
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

# The overlay is the cell's last expression, so it is what gets rendered.
boxwhisker * plt_hline

Comparison

In [36]:
# Read both result files and tag each with the sample size it was produced for.
out100 = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_iter100.csv'
).assign(n=100)

out500 = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n500_iter100.csv'
).assign(n=500)

# Stack the two runs and key every row by (rnd, n).
out_comp = pd.concat([out100, out500], axis=0).set_index(['rnd', 'n'])

# Reshape wide -> long; `rnd` and `n` come back as ordinary columns.
out_long = out_comp.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# Boxes are grouped by sample size first, then by method within each size.
boxwhisker = hv.BoxWhisker(out_long, kdims=['n', 'Method'], vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=500,
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

# Overlay the reference line on the boxes, with an empty default tool list.
(boxwhisker * plt_hline).opts(default_tools=[])

## Scenario 2: random data splitting

Setting: a single random data split (the split with `rnd_ds == 128`).

In [None]:

out = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_rnp100_rds10.csv',
)

# Keep only the rows with rnd_ds == 128, then drop that column and index the
# remaining rows by `rnd_np`.
out = (
    out[out['rnd_ds'] == 128]
    .drop(columns='rnd_ds')
    .set_index('rnd_np')
)

# Reshape wide -> long; `rnd_np` comes back as an ordinary column.
out_long = out.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# One box per method, filled with a colour from the Set1 palette.
boxwhisker = hv.BoxWhisker(out_long, kdims='Method', vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=300,
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

boxwhisker * plt_hline


Setting: $10$ random data splits, aggregated across splits by the mean and by the median.

In [34]:
out = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_rnp100_rds10.csv',
)

# Aggregate over all rows that share an `rnd_np`, once by mean and once by
# median, and label each aggregate so the two can be compared side by side.
out_mean = out.groupby('rnd_np').mean().assign(type='Mean')
out_median = out.groupby('rnd_np').median().assign(type='Median')

# Stack both aggregates, discard the (also aggregated) `rnd_ds` column, and
# key every row by (rnd_np, type).
out_comp = (
    pd.concat([out_mean, out_median])
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'type'])
)

# Reshape wide -> long; `rnd_np` and `type` come back as ordinary columns.
out_long = out_comp.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# Boxes are grouped by aggregation type first, then by method.
boxwhisker = hv.BoxWhisker(out_long, kdims=['type', 'Method'], vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=500,
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

boxwhisker * plt_hline

Sanity check: non-random data splitting

In [None]:
out = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_rnp100_nds.csv',
)

# No filter on `rnd_ds` is applied here: every row of the file is used.
# NOTE(review): if a single split was meant to be selected, assign the
# filtered frame explicitly, e.g. `out = out[out['rnd_ds'] == 128]`.
out = out.drop(columns='rnd_ds').set_index('rnd_np')

# Reshape wide -> long; `rnd_np` comes back as an ordinary column.
out_long = out.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# One box per method, filled with a colour from the Set1 palette.
boxwhisker = hv.BoxWhisker(out_long, kdims='Method', vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=300,
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

boxwhisker * plt_hline

Sanity check: results for individual data splits (`rnd_ds <= 130`), one group of boxes per split.

In [68]:
out = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_rnp100_rds10.csv',
)

# Restrict to rows with rnd_ds <= 130 and average within each
# (rnd_np, rnd_ds) pair; both keys end up in the index.
out_filter = (
    out[out['rnd_ds'] <= 130]
    .groupby(['rnd_np', 'rnd_ds'])
    .mean()
)

# Reshape wide -> long; the two index levels come back as ordinary columns.
out_long = out_filter.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# Boxes are grouped by `rnd_ds` first, then by method; the y-range is fixed.
boxwhisker = hv.BoxWhisker(out_long, kdims=['rnd_ds', 'Method'], vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=600,
    ylim=(1.5, 2.5),
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

boxwhisker * plt_hline

Comparison of $10$, $20$, and $50$ random data splits

In [55]:
out = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_M5_rnp100_rds50.csv',
)

# Average the estimates per `rnd_np` over three nested subsets of the rows.
# NOTE(review): the thresholds 138 and 148 presumably select the first 10 and
# 20 of 50 consecutive `rnd_ds` values starting at 128 -- confirm against the
# simulation script.
out10 = out[out['rnd_ds'] < 138].groupby('rnd_np').mean().assign(iter=10)
out20 = out[out['rnd_ds'] < 148].groupby('rnd_np').mean().assign(iter=20)
out50 = out.groupby('rnd_np').mean().assign(iter=50)

# Stack the three aggregates, discard the (also averaged) `rnd_ds` column,
# and key every row by (rnd_np, iter).
out_comp = (
    pd.concat([out10, out20, out50], axis=0)
    .drop(columns='rnd_ds')
    .reset_index()
    .set_index(['rnd_np', 'iter'])
)

# Reshape wide -> long; `rnd_np` and `iter` come back as ordinary columns.
out_long = out_comp.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# Boxes are grouped by `iter` first, then by method.
boxwhisker = hv.BoxWhisker(out_long, kdims=['iter', 'Method'], vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=800,
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

boxwhisker * plt_hline

Comparison of per-block sample sizes $n = 100$ and $n = 200$

In [77]:
# Read both result files and tag each with the sample size it was produced for.
out100 = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_rnp100_rds10.csv',
).assign(n=100)

out200 = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n200_rnp100_rds10.csv',
).assign(n=200)

# Stack the two runs, average within each (rnd_np, n) pair, and discard the
# (also averaged) `rnd_ds` column.
out_comp = (
    pd.concat([out100, out200], axis=0)
    .groupby(['rnd_np', 'n'])
    .mean()
    .drop(columns='rnd_ds')
)

# Reshape wide -> long; `rnd_np` and `n` come back as ordinary columns.
out_long = out_comp.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# Boxes are grouped by sample size first, then by method within each size.
boxwhisker = hv.BoxWhisker(out_long, kdims=['n', 'Method'], vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=500,
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

boxwhisker * plt_hline


## ~~Scenario 3: updating random data splitting~~

In [None]:
out = pd.read_csv(
    '/project/Stat/s1155168529/programs/DDML/output/out_sim2_org_ini_K5_n100_rnp100_rdsu10.csv',
)

# Drop the column labelled 'NaN' and index the remaining rows by `rnd_np`.
out = out.drop(columns='NaN').set_index('rnd_np')

# Reshape wide -> long; `rnd_np` comes back as an ordinary column.
out_long = out.melt(
    var_name='Method', value_name='EST', ignore_index=False
).reset_index()

# One box per method, filled from the Set1 palette; the y-range is fixed.
boxwhisker = hv.BoxWhisker(out_long, kdims='Method', vdims='EST').opts(
    box_fill_color=dim('Method').str(),
    cmap='Set1',
    width=400,
    ylim=(1.5, 2.5),
    show_legend=False,
)

# Horizontal reference line at 2 (presumably the true beta -- TODO confirm).
plt_hline = hv.HLine(2).opts(color='black')

boxwhisker * plt_hline