In [1]:
# based on Hu et al. (2023) - Interpretable Machine Learning based on Functional ANOVA Framework: Algorithms and Comparisons
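# generate 10 independent variables (two batches of 5) from a multivariate Gaussian distribution
# with mean 0 and identity covariance (variance 1, no correlation between variables)
# NOTE(review): this comment previously described 20 variables with equal correlation 0.5 between
# all pairs; the code below does not implement that setup - confirm which one is intended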
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(0)
n = 10000  # number of observations (rows)
p = 5      # number of variables drawn per batch


def _sample_clipped_normal(n_samples, n_features, limit=2.5):
    """Draw independent standard-normal samples and clip them to [-limit, limit].

    Parameters
    ----------
    n_samples : int
        Number of rows to draw.
    n_features : int
        Number of columns to draw.
    limit : float, default 2.5
        Values above ``limit`` are set to ``limit`` and values below
        ``-limit`` are set to ``-limit`` (clipping, not rejection sampling).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_features)``.
    """
    mean = np.zeros(n_features)
    cov = np.eye(n_features)  # identity covariance: unit variance, no correlation
    draws = np.random.multivariate_normal(mean, cov, n_samples)
    return np.clip(draws, -limit, limit)


# first batch of variables, columns labelled 1..p
data_1 = _sample_clipped_normal(n, p)
df_1 = pd.DataFrame(data_1, columns=range(1, p + 1))

# second batch of variables, columns labelled p+1..2p
# (drawn after the first batch so the seeded random stream is consumed in the same order)
data_2 = _sample_clipped_normal(n, p)
df_2 = pd.DataFrame(data_2, columns=range(p + 1, 2 * p + 1))

# combine the two batches side by side into one feature matrix
X = pd.concat([df_1, df_2], axis=1)
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,1.764052,0.400157,0.978738,2.240893,1.867558,0.066041,1.699982,-1.619216,-0.596377,-2.117494
1,-0.977278,0.950088,-0.151357,-0.103219,0.410599,-0.863075,0.396472,-0.626770,-2.006807,-1.733378
2,0.144044,1.454274,0.761038,0.121675,0.443863,0.348964,-0.382994,2.032733,1.987586,0.267726
3,0.333674,1.494079,-0.205158,0.313068,-0.854096,-1.447437,0.349444,-0.864214,-0.501443,2.074074
4,-2.500000,0.653619,0.864436,-0.742165,2.269755,-0.388298,-0.431737,-0.405904,1.008090,-0.695019
...,...,...,...,...,...,...,...,...,...,...
9995,-0.393226,-0.035262,0.586450,-0.529516,0.685231,0.251562,-0.338182,1.014143,-0.582528,0.638790
9996,-0.164385,-0.344142,0.206748,0.112080,-1.841391,-0.367787,-0.568358,-0.721248,0.198280,-0.217422
9997,1.084542,0.797747,-1.619339,-0.335616,0.511565,0.914757,0.443757,-0.306204,-0.949056,-0.448083
9998,0.143326,0.618511,1.355440,0.732079,0.795743,0.428571,-0.164999,0.864963,-0.647662,0.720058


In [16]:
# calculate the following function based on the current instance of X:
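# y = x_1 + x_1*x_2 + I(x_3 > 0)*x_4*x_5
# NOTE(review): this comment previously read "x_1 + x_1x_2  0.5I(x_3>0)x_4x_5" (operator missing,
# 0.5 coefficient on the indicator term); the code below applies no 0.5 factor - confirm which is intended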

def g(x):
    """Evaluate the noise-free synthetic response for every row of ``x``.

    Computes ``x_1 + x_1 * x_2 + I(x_3 > 0) * x_4 * x_5``, where ``x_k`` is the
    k-th column of ``x`` (1-based) and ``I`` is the indicator function. Only
    the first five columns are read; any further columns are ignored.

    Parameters
    ----------
    x : array-like, 2-D
        Rows are observations, columns are features (at least 5 columns).
        Anything ``np.asarray`` can convert is accepted (ndarray, nested
        list, DataFrame).

    Returns
    -------
    numpy.ndarray
        1-D array with one response value per row of ``x``.
    """
    # coerce so that positional column slicing works for non-ndarray inputs too
    x = np.asarray(x)
    y = x[:, 0] + x[:, 0] * x[:, 1] + (x[:, 2] > 0) * x[:, 3] * x[:, 4]
    # additive N(0, 0.5^2) noise is currently disabled:
    # y += np.random.normal(0, 0.5, y.shape)
    return y

In [17]:
# evaluate the synthetic response on every row of the feature matrix
y = g(X.to_numpy())
y

array([ 6.65494861, -1.90577827,  0.40752939, ...,  1.94973094,
        0.81452151, -1.01327901])

In [9]:
# write the feature matrix and the target to separate CSV files, omitting the row index;
# the target array is wrapped in a DataFrame so it can be written with to_csv
X.to_csv(path_or_buf='../datasets/synth_simple.csv', index=False)
pd.DataFrame(data=y).to_csv(path_or_buf='../datasets/synth_simple_y.csv', index=False)