In [51]:
# based on Hu et al. (2023) - Interpretable Machine Learning based on Functional ANOVA Framework: Algorithms and Comparisons
# generate 20 variables from a multivariate gaussian distribution with mean 0, variance 1 and equal correlation 0.5 between all pairs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

SEED = 42  # fixed seed so that Restart Kernel -> Run All reproduces the same dataset
n = 2000   # number of rows in the generated feature matrix


def make_equicorrelated_block(n_samples, n_vars, first_col, rho=0.5, bound=2.5):
    """Draw one block of equicorrelated standard-normal features, clipped to a box.

    The covariance matrix has 1 on the diagonal and ``rho`` everywhere else.
    Values outside ``[-bound, bound]`` are set to the nearest bound (they are
    clipped, not re-drawn).

    Parameters
    ----------
    n_samples : int
        Number of rows to draw.
    n_vars : int
        Number of variables (columns) in the block.
    first_col : int
        Label of the first column; columns are labelled
        ``first_col, ..., first_col + n_vars - 1``.
    rho : float, default 0.5
        Common pairwise correlation between the variables of the block.
    bound : float, default 2.5
        Half-width of the clipping interval.

    Returns
    -------
    pandas.DataFrame
        Frame of shape ``(n_samples, n_vars)``.

    Notes
    -----
    Draws from the global ``np.random`` state, so the result depends on the
    most recent ``np.random.seed`` call.
    """
    # rho off the diagonal, rho + (1 - rho) = 1 on the diagonal
    cov = np.ones((n_vars, n_vars)) * rho + np.eye(n_vars) * (1.0 - rho)
    draws = np.random.multivariate_normal(np.zeros(n_vars), cov, n_samples)
    clipped = np.clip(draws, -bound, bound)
    return pd.DataFrame(clipped, columns=range(first_col, first_col + n_vars))


# Seed the global generator once, right before the first draw.
# NOTE: re-running this cell reproduces exactly the same draws. An independent
# sample (e.g. for evaluation) must be drawn without re-seeding, not by
# re-running this cell.
np.random.seed(SEED)

# variables 1..20: one equicorrelated block
df_20 = make_equicorrelated_block(n, 20, first_col=1)
# variables 21..30: a second block, drawn independently of the first
df_10 = make_equicorrelated_block(n, 10, first_col=21)

# combine the two blocks column-wise
X = pd.concat([df_20, df_10], axis=1)
assert X.shape == (n, 30), "expected 30 feature columns"
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
0,-0.768909,-0.435434,-0.874395,-0.456231,-1.025847,-0.725063,-0.120541,0.671386,-0.833334,-0.045331,...,0.410338,0.238435,1.604473,1.375358,0.225867,0.844801,0.976682,0.551789,0.038735,0.678123
1,0.656114,-0.331512,-1.663123,0.276044,-0.424792,-0.458579,0.055915,0.142532,-1.169747,0.382103,...,1.173566,1.837238,0.142870,0.288052,1.870870,1.385225,1.981938,1.609727,1.620806,0.430001
2,1.320909,-1.322968,-0.013313,0.309164,0.842732,-0.593569,-1.244140,-0.859752,-0.453841,-0.548486,...,1.649354,1.648770,1.167639,2.036628,-0.068143,-0.669495,0.306118,0.072043,1.423382,0.918833
3,0.158134,0.940538,0.324161,0.114515,1.572493,1.139180,-0.715875,1.439754,1.010380,1.047889,...,1.000577,-0.410611,1.263685,-0.084249,-0.527212,0.659976,-0.726049,0.180396,2.317532,-0.317440
4,-0.726109,0.716073,0.024014,1.377482,-1.015093,-0.394446,0.200622,-0.958536,0.819457,0.361684,...,0.023509,-0.494439,1.059284,-0.359750,-1.724223,-0.030290,-0.935891,-0.339516,0.255487,-0.337784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1.107907,0.322001,-1.724493,0.329758,0.837447,0.900160,-0.295785,-1.572189,-0.200053,-1.959943,...,0.663050,-1.373198,0.526919,-0.028057,-1.512144,-1.220845,-0.534435,0.110804,1.199760,-0.543039
1996,-0.290148,0.762832,-0.146224,-0.341477,0.633661,-0.107037,-0.017232,0.031719,0.652277,0.475212,...,-0.228750,-1.047474,-0.387730,-0.000067,-1.207721,-0.803479,-0.165109,0.451004,-0.906664,0.435873
1997,1.391011,-0.658509,1.271064,2.125396,0.839381,2.203365,1.225170,0.987445,2.143287,0.913989,...,1.395471,-1.037651,0.618483,-0.427495,-1.229250,-0.547733,-0.591020,-0.043034,-1.722360,-0.065805
1998,1.153055,-0.092462,-0.207188,-0.373478,-0.658168,0.224624,-0.210658,0.159894,1.041563,-0.609779,...,-0.384010,0.528656,-0.787615,1.046218,-0.911493,0.558775,0.521212,0.248517,-1.351705,-0.975287


In [48]:
# calculate the following function based on the current instance of X:
# y = \sum_{j=1}^5x_j + 0.5*\sum_{j=6}^8x_j^2+\sum_{j=9}^{10} x_j I(x_j>0) + x_1x_2 + x_1x_3 + x_2x_3 + 0.5x_1x_2x_3 + x_4x_5 + x_4x_6 + x_5x_6 + 0.5I(x_4>0)x_5x_6

def g(x):
    """Evaluate the synthetic response for every row of ``x`` and add noise.

    Only the first ten columns of ``x`` are read (``x`` is indexed as a 2-D
    array). Writing ``x_j`` for column ``j - 1``, the noiseless part is

        sum_{j=1..5} x_j + 0.5 * sum_{j=6..8} x_j^2 + sum_{j=9..10} x_j * I(x_j > 0)
        + x_1 x_2 + x_1 x_3 + x_2 x_3 + 0.5 x_1 x_2 x_3
        + x_4 x_5 + x_4 x_6 + x_5 x_6 + 0.5 I(x_4 > 0) x_5 x_6

    Gaussian noise with mean 0 and standard deviation 0.5 is then added, drawn
    from the global ``np.random`` state.

    Returns a 1-D array with one value per row of ``x``.
    """
    x1, x2, x3, x4, x5, x6 = (x[:, j] for j in range(6))
    hinge_cols = x[:, 8:10]

    # Terms are listed and accumulated in the same left-to-right order as the formula.
    terms = [
        x[:, 0:5].sum(axis=1),                      # linear main effects
        0.5 * (x[:, 5:8] ** 2).sum(axis=1),         # quadratic main effects
        (hinge_cols * (hinge_cols > 0)).sum(axis=1),  # positive-part main effects
        x1 * x2,
        x1 * x3,
        x2 * x3,
        0.5 * x1 * x2 * x3,
        x4 * x5,
        x4 * x6,
        x5 * x6,
        0.5 * (x4 > 0) * x5 * x6,
    ]
    y = terms[0]
    for term in terms[1:]:
        y = y + term

    # additive noise N(0, 0.5^2)
    y += np.random.normal(0, 0.5, y.shape)
    return y

In [52]:
# Evaluate the noisy response g on the raw feature matrix (one value per row of X)
y = g(X.to_numpy())
y

array([ 0.02897766, -1.79400366, -0.55899401, ..., 19.70568542,
        0.08578997, -1.18114313])

In [50]:
# Persist the features and the target as two separate CSV files (row index omitted)
features_csv = '../datasets/synth_Hu4.csv'
target_csv = '../datasets/synth_Hu4_y.csv'

X.to_csv(features_csv, index=False)

target_frame = pd.DataFrame(y)
target_frame.to_csv(target_csv, index=False)

In [53]:
# Create a second, independent dataset with 1000 samples for the evaluation.
#
# The previous version of this cell wrote the current `X` / `y` again, so on a
# fresh Restart & Run All the evaluation files were exact copies of the
# training files. The evaluation sample is now drawn here, into its own
# variables, so it never depends on re-running earlier cells by hand.
#
# NOTE(review): the sample size follows the comment above (1000); the recorded
# notebook output suggests 2000 rows may have been used originally - confirm.
n_eval = 1000

# Same feature distribution as the training data: two mutually independent
# equicorrelated blocks (variables 1-20 and 21-30), unit variance, pairwise
# correlation 0.5 inside a block and 0 across blocks.
cov_eval = np.eye(30) * 0.5
cov_eval[:20, :20] += 0.5
cov_eval[20:, 20:] += 0.5

draws_eval = np.random.multivariate_normal(np.zeros(30), cov_eval, n_eval)
# clip to [-2.5, 2.5], as for the training features
X_eval = pd.DataFrame(np.clip(draws_eval, -2.5, 2.5), columns=range(1, 31))
y_eval = g(X_eval.values)

X_eval.to_csv('../datasets/synth_Hu4_eval.csv', index=False)
pd.DataFrame(y_eval).to_csv('../datasets/synth_Hu4_y_eval.csv', index=False)