In [None]:
import jsonpickle
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import rankdata

Helper Functions

In [None]:
# helper function to read json file
def open_json(name):
    """Read a jsonpickle-encoded file and return the decoded Python object.

    Parameters
    ----------
    name : str or path-like
        Path of the file to read (opened in text mode).

    Returns
    -------
    object
        Whatever ``jsonpickle.decode`` reconstructs from the file's content.

    Notes
    -----
    NOTE(review): jsonpickle can rebuild arbitrary Python objects from the
    payload, so this should only be pointed at result files you produced
    yourself, never at untrusted input.
    """
    # the context manager closes the handle even if reading fails
    with open(name, 'r') as f:
        json_str = f.read()
    return jsonpickle.decode(json_str)


In [None]:
# helper function to make data plot ready
def make_plot_ready(data_set, setting):
    """Convert the raw results of one simulation setting into one long-format frame.

    Parameters
    ----------
    data_set : sequence
        Indexed at positions 0, 1 and 2. Each entry must be convertible to a
        DataFrame with nine columns: eight meta-learner columns followed by
        the sample size.
    setting : hashable
        Value stored in the ``Setting`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``SampleSize`` (int), ``MetaLearner`` (category), ``MSE``,
        ``BaseLearner`` and ``Setting`` (category), with a fresh 0..n-1 index.
    """
    column_names = ['T-Learner', 'S-Learner', 'X-Learner', 'R-Learner', 'DR-Learner', 'RA-Learner', 'PW-Learner',
                    'U-Learner',
                    'SampleSize']
    # position in data_set -> label written to the BaseLearner column
    base_learner_labels = ('Random Forest', 'Lasso-Based', 'Neural Network')

    long_frames = []
    for position, label in enumerate(base_learner_labels):
        wide = pd.DataFrame(data_set[position])
        wide.columns = column_names
        # wide format -> long format: one row per (sample size, meta-learner)
        long = wide.melt('SampleSize', var_name='MetaLearner', value_name='MSE')
        long['BaseLearner'] = label
        long_frames.append(long)

    combined = pd.concat(long_frames, ignore_index=True)
    combined['Setting'] = setting
    return combined.astype({'SampleSize': int, 'MetaLearner': 'category', 'Setting': 'category'})

In [None]:
# helper function to make data plot ready (rankings)
def make_rank_ready(results=None, settings=range(1, 25), sample_sizes=(500, 1000, 2000, 5000)):
    """Compute the mean rank of every meta-learner per base learner and sample size.

    For each base learner (positions 0, 1, 2 of every setting's results) the
    result arrays of all requested settings are stacked. Within each sample
    size, every row is ranked across the eight meta-learner columns with
    ``scipy.stats.rankdata`` and the ranks are averaged over rows.

    Parameters
    ----------
    results : dict, optional
        Maps ``"data<i>"`` to a sequence of three 2-D arrays whose columns
        0-7 hold the meta-learner values and whose column 8 holds the sample
        size. Defaults to the notebook-level dict ``d1``, so the loading cell
        must have been run before calling without arguments.
    settings : iterable of int, optional
        Setting numbers ``i`` used to build the ``"data<i>"`` keys.
    sample_sizes : sequence, optional
        Sample sizes to report; must not be empty. A size that never occurs
        in column 8 yields NaN mean ranks.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(melted_rf, melted_lasso, melted_nn)``, each in long format with
        columns ``SampleSize`` (category), ``MetaLearner`` and ``MeanRank``.
    """
    if results is None:
        # backward-compatible default: the global filled by the loading cell
        results = d1

    learner_columns = ['T-Learner', 'S-Learner', 'X-Learner', 'R-Learner', 'DR-Learner', 'RA-Learner',
                       'PW-Learner', 'U-Learner']
    n_learners = len(learner_columns)
    size_column = n_learners  # the sample size sits right after the learner columns
    setting_keys = ["data{0}".format(i) for i in settings]

    melted_frames = []
    for base_idx in range(3):  # 0 = random forest, 1 = lasso-based, 2 = neural network
        blocks = [results[key][base_idx][:, 0:n_learners + 1] for key in setting_keys]
        # the empty float block keeps the stacked array float-typed and makes
        # an empty `settings` behave like the original incremental vstack
        stacked = np.vstack([np.zeros((0, n_learners + 1))] + blocks)

        mean_rank_rows = []
        for size in sample_sizes:
            errors = stacked[stacked[:, size_column] == size][:, 0:n_learners]
            # rank the meta-learners within each run (row), then average over runs
            ranks = rankdata(errors, axis=1)
            mean_rank_rows.append(np.mean(ranks, axis=0).reshape(1, n_learners))

        wide = pd.DataFrame(np.vstack(mean_rank_rows), columns=learner_columns)
        wide['SampleSize'] = list(sample_sizes)
        long = wide.melt('SampleSize', var_name='MetaLearner', value_name='MeanRank')
        long['SampleSize'] = long['SampleSize'].astype('category')
        melted_frames.append(long)

    melted_rf, melted_lasso, melted_nn = melted_frames
    return melted_rf, melted_lasso, melted_nn


In [None]:
def ihdp_make_plot_ready(path="final_results_final/results_ihdp_100run(s)_final.json"):
    """Load the IHDP results and return them as one long-format DataFrame.

    Parameters
    ----------
    path : str, optional
        Result file read via ``open_json``. The decoded object is indexed at
        positions 0, 1 and 2 (random forest, lasso-based, neural network);
        each entry must be convertible to a DataFrame with eight columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``MetaLearner`` (one of 'T', 'S', 'X', 'R', 'DR', 'RA', 'PW',
        'U'), ``MSE`` and ``BaseLearner``, with a fresh 0..n-1 index.
    """
    ihdp = open_json(path)
    short_names = ['T', 'S', 'X', 'R', 'DR', 'RA', 'PW',
                   'U']
    # position in the decoded results -> label written to the BaseLearner column
    base_learner_labels = ('RandomForest', 'Lasso-Based', 'NeuralNetwork')

    long_frames = []
    for position, label in enumerate(base_learner_labels):
        wide = pd.DataFrame(ihdp[position])
        wide.columns = short_names
        long = wide.melt(var_name='MetaLearner', value_name='MSE')
        long['BaseLearner'] = label
        long_frames.append(long)

    # ignore_index avoids repeated index labels in the combined frame,
    # matching what make_plot_ready does for the simulated settings
    ihdp_all = pd.concat(long_frames, ignore_index=True)

    return ihdp_all


In [None]:
def ihdp_make_rank_ready():
    """Return the mean rank of every meta-learner per base learner on IHDP.

    Reads the IHDP result file with ``open_json``. For each of its three
    entries, every row is ranked across its columns and the ranks are averaged
    over rows. The three rows of mean ranks are labelled with the global
    ``learner_names`` and melted.

    Returns
    -------
    pandas.DataFrame
        Long format with columns ``BaseLearner`` (category), ``MetaLearner``
        and ``MeanRank``.
    """
    ihdp = open_json("final_results_final/results_ihdp_100run(s)_final.json")
    base_learner_labels = ['RandomForest', 'Lasso-Based', 'NeuralNetwork']

    # one (1, 8) row of mean ranks per base learner, in the order of the labels above
    mean_rank_rows = [
        np.mean(rankdata(ihdp[position], axis=1), axis=0).reshape(1, 8)
        for position in range(3)
    ]

    wide = pd.DataFrame(np.vstack(mean_rank_rows))
    wide.columns = learner_names
    wide['BaseLearner'] = base_learner_labels
    wide['BaseLearner'] = wide['BaseLearner'].astype('category')

    return wide.melt('BaseLearner', var_name='MetaLearner', value_name='MeanRank')


In [None]:
# colour assigned to each meta-learner, keyed by its display name
learner_colors = dict([
    ('T-Learner', 'blue'),
    ('S-Learner', 'orange'),
    ('X-Learner', 'green'),
    ('R-Learner', 'red'),
    ('DR-Learner', 'purple'),
    ('RA-Learner', 'brown'),
    ('PW-Learner', 'pink'),
    ('U-Learner', 'gray'),
])
# the same display names, in the same order, as a list (used as column labels)
learner_names = list(learner_colors)

In [None]:
# hue order and palette for the line plots of the simulation settings
col_names = [
    'T-Learner', 'S-Learner', 'X-Learner', 'R-Learner',
    'DR-Learner', 'RA-Learner', 'PW-Learner', 'U-Learner',
]
metalearner_colors = dict(
    zip(col_names, ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray'])
)

In [None]:
# short meta-learner labels (box order) and their colours for the IHDP boxplots
columns = ['T', 'S', 'X', 'R', 'DR', 'RA', 'PW', 'U']
metalearners_colors = dict(
    zip(columns, ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray'])
)

In [None]:
# x-axis order of the base learners in the IHDP mean-rank strip plot
order_x = ['RandomForest', 'Lasso-Based', 'NeuralNetwork']

Process Data

In [None]:
# load all settings
# d1: decoded raw results, keyed "data1" .. "data24" (also read by make_rank_ready)
# d2: the matching long-format DataFrames, keyed "df_1" .. "df_24"
d1 = {}
d2 = {}
for i in range(1, 25):
    file_name = f'final_results_final/results_simulated_setting{i}_10run(s)_final.json'
    d1["data{0}".format(i)] = open_json(file_name)
    # the setting number i is stored in the frame's 'Setting' column
    d2["df_{0}".format(i)] = make_plot_ready(d1["data{0}".format(i)], i)

In [None]:
# one combined frame per group of six consecutive settings: 1-6, 7-12, 13-18, 19-24
df_1_6 = pd.concat([d2["df_{0}".format(k)] for k in range(1, 7)])
df_7_12 = pd.concat([d2["df_{0}".format(k)] for k in range(7, 13)])
df_13_18 = pd.concat([d2["df_{0}".format(k)] for k in range(13, 19)])
df_19_24 = pd.concat([d2["df_{0}".format(k)] for k in range(19, 25)])

In [None]:
# sns options: global appearance for every figure drawn below
sns.set_style('darkgrid')
# NOTE(review): sns.set() presumably re-applies seaborn's default theme as well;
# confirm the set_style call above is still needed
sns.set(font_scale=1.0)

# Plotting Results (fully-synthetic data)

Settings 1-6

In [None]:
# grid with one row per setting and one column per base learner;
# each panel shows the mean MSE against sample size, one line per meta-learner
face1 = sns.FacetGrid(data=df_1_6, row='Setting', col='BaseLearner', sharey="row", height=4, aspect=1,
                      margin_titles=True)
face1.map(sns.lineplot, 'SampleSize', 'MSE', 'MetaLearner', errorbar=None, marker='o', estimator="mean", linewidth=1,
          hue_order=col_names, palette=metalearner_colors)
face1.add_legend()
face1.set(yscale='log')
# `.figure` replaces the deprecated `.fig` accessor; top=0.95 leaves room for the title
face1.figure.subplots_adjust(top=0.95)
face1.figure.suptitle('Performances of the Metalearners in Simulation Settings 1-6', fontsize=22)
face1.set_xlabels("sample size")
face1.set_ylabels("MSE")

In [None]:
# save the settings 1-6 grid as an EPS file
face1.savefig('plots_final/setting_1_6_final.eps', format="eps")

Settings 7-12

In [None]:
# grid with one row per setting and one column per base learner;
# each panel shows the mean MSE against sample size, one line per meta-learner
face2 = sns.FacetGrid(data=df_7_12, row='Setting', col='BaseLearner', sharey="row", height=4, aspect=1,
                      margin_titles=True)
face2.map(sns.lineplot, 'SampleSize', 'MSE', 'MetaLearner', errorbar=None, marker='o', estimator="mean", linewidth=1,
          hue_order=col_names, palette=metalearner_colors)
face2.add_legend()
face2.set(yscale='log')  #symlog
# `.figure` replaces the deprecated `.fig` accessor; top=0.95 leaves room for the title
face2.figure.subplots_adjust(top=0.95)
face2.figure.suptitle('Performances of the Metalearners in Simulation Settings 7-12', fontsize=22)
face2.set_xlabels("sample size")
face2.set_ylabels("MSE")

In [None]:
# save the settings 7-12 grid as an EPS file
face2.savefig('plots_final/setting_7_12_final.eps', format="eps")

Settings 13-18

In [None]:
# grid with one row per setting and one column per base learner;
# each panel shows the mean MSE against sample size, one line per meta-learner
face3 = sns.FacetGrid(data=df_13_18, row='Setting', col='BaseLearner', sharey="row", height=4, aspect=1,
                      margin_titles=True)
face3.map(sns.lineplot, 'SampleSize', 'MSE', 'MetaLearner', errorbar=None, marker='o', estimator="mean", linewidth=1,
          hue_order=col_names, palette=metalearner_colors)
face3.add_legend()
face3.set(yscale='log')
# `.figure` replaces the deprecated `.fig` accessor; top=0.95 leaves room for the title
face3.figure.subplots_adjust(top=0.95)
face3.figure.suptitle('Performances of the Metalearners in Simulation Settings 13-18', fontsize=22)
face3.set_xlabels("sample size")
face3.set_ylabels("MSE")

In [None]:
# save the settings 13-18 grid as an EPS file
face3.savefig('plots_final/setting_13_18_final.eps', format="eps")

Settings 19-24

In [None]:
# grid with one row per setting and one column per base learner;
# each panel shows the mean MSE against sample size, one line per meta-learner
face4 = sns.FacetGrid(data=df_19_24, row='Setting', col='BaseLearner', sharey="row", height=4, aspect=1,
                      margin_titles=True)
face4.map(sns.lineplot, 'SampleSize', 'MSE', 'MetaLearner', errorbar=None, marker='o', estimator="mean", linewidth=1,
          hue_order=col_names, palette=metalearner_colors)
face4.add_legend()
face4.set(yscale='log')
# `.figure` replaces the deprecated `.fig` accessor; top=0.95 leaves room for the title
face4.figure.subplots_adjust(top=0.95)
face4.figure.suptitle('Performances of the Metalearners in Simulation Settings 19-24', fontsize=22)
face4.set_xlabels("sample size")
face4.set_ylabels("MSE")

In [None]:
# save the settings 19-24 grid as an EPS file
face4.savefig('plots_final/setting_19_24_final.eps', format="eps")

# Plotting Mean Rankings

In [None]:
# get data: one long-format frame of mean ranks per base learner
# (random forest, lasso-based, neural network), built from the global d1
melted_rf, melted_lasso, melted_nn = make_rank_ready()

For Random Forests

In [None]:
# mean rank of each meta-learner (hue) per sample size, random-forest base learner; jitter disabled
ax1 = sns.stripplot(data=melted_rf, x='SampleSize', y='MeanRank', hue='MetaLearner', jitter=False)
# anchor the legend's upper-left corner at the axes' upper-right corner, i.e. outside the plot area
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))
ax1.set_title('Mean Ranking of the Metalearners with Random Forests', fontsize=14)
ax1.set_ylabel("mean ranking")
ax1.set_xlabel("sample size")

In [None]:
# save the random-forest rank plot as EPS; bbox_inches='tight' is passed because the legend sits outside the axes
fig1 = ax1.get_figure()
fig1.savefig("plots_final/rf_ranks_final.eps", format='eps', bbox_inches='tight')

For Lasso-Based Regression

In [None]:
# mean rank of each meta-learner (hue) per sample size, lasso-based base learner; jitter disabled
ax2 = sns.stripplot(data=melted_lasso, x='SampleSize', y='MeanRank', hue='MetaLearner', jitter=False)
# anchor the legend's upper-left corner at the axes' upper-right corner, i.e. outside the plot area
sns.move_legend(ax2, "upper left", bbox_to_anchor=(1, 1))
ax2.set_title('Mean Ranking of the Metalearners with Lasso-Based Regression', fontsize=14)
ax2.set_ylabel("mean ranking")
ax2.set_xlabel("sample size")

In [None]:
# save the lasso rank plot as EPS; bbox_inches='tight' is passed because the legend sits outside the axes
fig2 = ax2.get_figure()
fig2.savefig("plots_final/lasso_ranks_final.eps", format='eps', bbox_inches='tight')

For Neural Networks

In [None]:
# mean rank of each meta-learner (hue) per sample size, neural-network base learner; jitter disabled
ax3 = sns.stripplot(data=melted_nn, x='SampleSize', y='MeanRank', hue='MetaLearner', jitter=False)
# anchor the legend's upper-left corner at the axes' upper-right corner, i.e. outside the plot area
sns.move_legend(ax3, "upper left", bbox_to_anchor=(1, 1))
ax3.set_title('Mean Ranking of the Metalearners with Neural Networks', fontsize=14)
ax3.set_ylabel("mean ranking")
ax3.set_xlabel("sample size")

In [None]:
# save the neural-network rank plot as EPS; bbox_inches='tight' is passed because the legend sits outside the axes
fig3 = ax3.get_figure()
fig3.savefig("plots_final/nn_ranks_final.eps", format='eps', bbox_inches='tight')

# Plotting Results for Semi-Synthetic Data (IHDP)

Boxplots of MSEs

In [None]:
# long-format IHDP results with columns MetaLearner, MSE and BaseLearner
ihdp_all = ihdp_make_plot_ready()

In [None]:
sns.set(font_scale=1.0)
# one panel per base learner; each panel holds one MSE boxplot per meta-learner on a shared log y-axis
face_ihdp = sns.FacetGrid(data=ihdp_all, col='BaseLearner', sharey=True, height=4, aspect=1,
                          margin_titles=True)
face_ihdp.map(sns.boxplot, 'MetaLearner', 'MSE', orient='v', order=columns, palette=metalearners_colors,
              fliersize=0.5, linewidth=1, flierprops={"marker": "."}, )
face_ihdp.set(yscale='log')
# `.figure` replaces the deprecated `.fig` accessor; top=0.8 leaves room for the title
face_ihdp.figure.subplots_adjust(top=0.8)
face_ihdp.figure.suptitle('Boxplots of the MSEs for the Semi-Synthetic Experiment', fontsize=16)


In [None]:
# save the IHDP boxplot grid as an EPS file
face_ihdp.savefig('plots_final/ihdp_final.eps', format='eps')

Mean Rankings

In [None]:
# long-format IHDP mean ranks with columns BaseLearner, MetaLearner and MeanRank
meanranks_ihdp = ihdp_make_rank_ready()

In [None]:
# mean rank of each meta-learner (hue) per base learner, in the x-axis order given by order_x; jitter disabled
ax5 = sns.stripplot(meanranks_ihdp, x='BaseLearner', y='MeanRank', jitter=False, hue='MetaLearner', order=order_x)
ax5.set_title('Mean Rankings of the Metalearners in the Semi-Synthetic Experiment', fontsize=13)
ax5.set_ylabel("mean ranking")
# anchor the legend's upper-left corner at the axes' upper-right corner, i.e. outside the plot area
sns.move_legend(ax5, "upper left", bbox_to_anchor=(1, 1))

In [None]:
# save the IHDP rank plot as EPS; bbox_inches='tight' is passed because the legend sits outside the axes
fig5 = ax5.get_figure()
fig5.savefig("plots_final/ranks_ihdp_final.eps", format='eps', bbox_inches='tight')