# simulate stacking experiment

In [None]:
import numpy as np
from simulation_experiment import synexp
from visualize_sim import sim_plots
import pandas as pd

# Which parameter the experiment sweeps:
#   1 - dimensionality of X1 with respect to the other feature spaces
#   2 - weight of X1 with respect to the other feature spaces
#   3 - noise level
#   4 - number of samples
type_sim = 1

# Number of simulation experiments (handed to synexp as `runs`).
n_runs = 50

# Stage toggles: run the experiment and/or plot its results.
run_sim, plot_sim = True, True

# The settings used further down are the ones from the paper; change them as needed.
# For large experiments it may be more practical to write a script that calls
# synexp non-interactively.

In [None]:
import random

# Seed the random number generators so repeated runs draw the same numbers.
# random.seed() only affects Python's `random` module; NumPy's global generator
# is independent of it, so it is seeded explicitly as well.
# NOTE(review): whether synexp draws from either global generator is not visible
# from this file - confirm against simulation_experiment.
rng_seed = 10
random.seed(rng_seed)
np.random.seed(rng_seed)

# Tag inserted into the result file names ('sweep_*_cluster_<version>.npy').
version = 'v4'

# Settings defined by each experiment below:
#   ds_settings - dimensionality of each feature space
#   alpha       - stacking weights for simulated data
#   n           - number of samples
#   sigma       - noise level

# Run and/or plot one of four synthetic stacking experiments, selected by type_sim.
# Every branch follows the same pattern:
#   1. define the settings (ds / ds_settings, n, sigma, alpha); one of them holds
#      the list of values being swept;
#   2. if run_sim: call synexp and pickle its result to 'sweep_*_cluster_<version>.npy';
#   3. if plot_sim: read that pickle back and hand it to sim_plots together with
#      the swept values.
# NOTE(review): the result files are written with to_pickle, so they are pickles
# despite the '.npy' suffix - read them with pd.read_pickle.
# An unsupported type_sim raises ValueError instead of silently doing nothing.
if type_sim == 1:  # Vary the dimensionality of X1 with respect to other feature spaces

    # Only the first entry (X1) changes from one setting to the next.
    ds_settings = [[3, 200, 200, 200], [20, 200, 200, 200], [50, 200, 200, 200],
                   [100, 200, 200, 200], [200, 200, 200, 200]]
    n = 400
    sigma = 0.2
    alpha = [0.5, 0.2, 0.2, 0.1]
    # Same sanity check the other experiments apply to their weights.
    assert np.round(sum(alpha), 2) == 1, 'alpha must sum to 1'
    results_file = 'sweep_d_cluster_{}.npy'.format(version)

    print('setup')

    if run_sim:
        print('running')
        R_d3 = synexp(runs=n_runs, sim_type='Feat_Dim_ratio', samples_settings=n,
                      ds_settings=ds_settings, y_dim=2, correl=0, alpha_settings=alpha,
                      scale=0.5, y_noise_settings=sigma)

        R_d3.to_pickle(results_file)

    if plot_sim:
        R_d3 = pd.read_pickle(results_file)
        # d_sum: summed dimensionality of the feature spaces other than X1.
        var = dict(sigma=sigma, d_sum=sum(ds_settings[0][1:]), n=n, alphas=alpha)
        # Swept values: the dimensionality of X1 in each setting.
        ratio = [d[0] for d in ds_settings]
        sim_plots(R_d3, 'Feat_Dim_ratio', ratio, filename='sweep_d', var_dict=var,
                  ylim0=[-0.1, 1], ylim1=[-.2, 0.6], ylim2=[0, 0.5])

elif type_sim == 2:  # Vary the weight of X1 with respect to other feature spaces

    ds = [10, 200, 200, 200]
    n = 400
    sigma = 0.2
    # One weight vector per setting; the first entry (X1's weight) is what is swept.
    alpha = [[0.1, 0.2, 0.5, 0.2], [0.3, 0.2, 0.3, 0.2], [0.5, 0.1, 0.2, 0.2],
             [0.7, 0.1, 0, 0.2], [0.9, 0.1, 0., 0.]]
    for a in alpha:
        assert np.round(sum(a), 2) == 1, 'each alpha setting must sum to 1'
    results_file = 'sweep_c_cluster_{}.npy'.format(version)

    if run_sim:
        R_C3 = synexp(runs=n_runs, sim_type='Cond', samples_settings=n, ds_settings=ds,
                      y_dim=2, correl=0, alpha_settings=alpha, scale=0.5,
                      y_noise_settings=sigma)

        R_C3.to_pickle(results_file)

    if plot_sim:
        R_C3 = pd.read_pickle(results_file)
        var = dict(sigma=sigma, ds=ds, n=n, alphas=alpha)

        # Swept values: the weight of X1 in each setting.
        ratio = [a[0] for a in alpha]
        sim_plots(R_C3, 'Cond', ratio, filename='sweep_alpha', var_dict=var,
                  ylim0=[-0.1, 1], ylim1=[-.2, 0.6], ylim2=[0, 0.5])

elif type_sim == 3:  # Vary the noise level

    ds = [10, 200, 200, 200]
    n = 400
    sigma = [0, 0.2, 0.5, 1, 1.5]  # swept values
    alpha = [0.5, 0.2, 0.2, 0.1]
    assert np.round(sum(alpha), 2) == 1, 'alpha must sum to 1'
    results_file = 'sweep_sigma_cluster_{}.npy'.format(version)

    if run_sim:
        R_sigma3 = synexp(runs=n_runs, sim_type='noise', samples_settings=n, ds_settings=ds,
                          y_dim=2, correl=0, alpha_settings=alpha, scale=0.5,
                          y_noise_settings=sigma)

        R_sigma3.to_pickle(results_file)

    if plot_sim:
        R_sigma3 = pd.read_pickle(results_file)
        var = dict(ds=ds, alphas=alpha, n=n)
        sim_plots(R_sigma3, 'noise', sigma, filename='sweep_sigma', var_dict=var,
                  ylim0=[-0.1, 1], ylim1=[-.2, 0.6], ylim2=[0, 0.5])

elif type_sim == 4:  # Vary the number of samples

    ds = [10, 200, 200, 200]
    n = [100, 200, 400, 800, 1600]  # swept values
    sigma = 0.2
    alpha = [0.5, 0.2, 0.2, 0.1]
    assert np.round(sum(alpha), 2) == 1, 'alpha must sum to 1'
    results_file = 'sweep_n_cluster_{}.npy'.format(version)

    if run_sim:
        R_n3 = synexp(runs=n_runs, sim_type='Sample_Dim_ratio', samples_settings=n,
                      ds_settings=ds, y_dim=2, correl=0, alpha_settings=alpha, scale=0.5,
                      y_noise_settings=sigma)

        R_n3.to_pickle(results_file)

    if plot_sim:
        R_n3 = pd.read_pickle(results_file)

        # NOTE(review): the key here is 'alpha' while the other experiments use
        # 'alphas'; left unchanged because how sim_plots reads var_dict is not
        # visible from this file - confirm which spelling it expects.
        var = dict(sigma=sigma, ds=ds, alpha=alpha)
        sim_plots(R_n3, 'Sample_Dim_ratio', n, filename='sweep_n', var_dict=var,
                  ylim0=[-0.1, 1], ylim1=[-.2, 0.6], ylim2=[0, 0.5])

else:
    raise ValueError('type_sim must be 1, 2, 3 or 4, got {!r}'.format(type_sim))