In [2]:
import pandas as pd
from pymer4 import Lmer
import scipy.stats as stats

from significance_analysis import conduct_analysis

# Load the example dataset from a CSV file located relative to the current
# working directory. The cells below read at least the columns "acquisition",
# "benchmark" and "mean" from this frame.
data = pd.read_csv("./example_dataset.csv")

def GLRT(mod1, mod2):
    """Run a generalized likelihood-ratio test between two fitted models.

    Absolute differences are used throughout, so the order in which the two
    models are passed does not matter.

    Args:
        mod1: Fitted model exposing ``logLike`` (a number) and ``coefs``
            (a sized object whose length is taken as the number of
            estimated coefficients).
        mod2: Second fitted model with the same two attributes.

    Returns:
        dict with the keys
        ``"chi_square"`` - twice the absolute log-likelihood difference,
        ``"df"`` - absolute difference in the number of coefficients,
        ``"p"`` - upper-tail probability of ``chi_square`` under a
        chi-square distribution with ``df`` degrees of freedom.
    """
    chi_square = 2 * abs(mod1.logLike - mod2.logLike)
    delta_params = abs(len(mod1.coefs) - len(mod2.coefs))
    return {
        "chi_square": chi_square,
        "df": delta_params,
        # Use the survival function rather than ``1 - cdf``: for large test
        # statistics the cdf rounds to 1.0 and the subtraction collapses the
        # p-value to exactly 0.0, whereas sf() keeps the tiny tail probability.
        "p": stats.chi2.sf(chi_square, df=delta_params),
    }

In [3]:
# First analysis: compare the acquisition functions on the "mean" metric with
# "benchmark" as the second identifier. Rows that are both
# acquisition == "ExpectedImprovement" and benchmark == "Branin" are left out;
# every other row is kept.
data2 = data.loc[
    data["acquisition"].ne("ExpectedImprovement") | data["benchmark"].ne("Branin")
]
conduct_analysis(data2, "mean", "acquisition", "benchmark", show_plots=False)

P-value: 0.0

As the p-value 0.0 is smaller than 0.05, we can reject the Null-Hypothesis that the model that does not consider the acquisition describes the data as well as the one that does. Therefore there is significant difference within acquisition.

P-values adjusted by tukey method for family of 36 estimates
                 acquisition  Estimate  2.5_ci  97.5_ci     SE     DF
1        ExpectedImprovement     5.736  -7.793   19.266  4.881  4.016
2   ProbabilityOfImprovement     6.097  -7.433   19.626  4.881  4.016
3       UpperConfidenceBound     5.878  -7.651   19.408  4.881  4.016
4       qExpectedImprovement     5.822  -7.708   19.351  4.881  4.016
5         qKnowledgeGradient     6.255  -7.274   19.784  4.881  4.016
6  qProbabilityOfImprovement     6.283  -7.247   19.812  4.881  4.016
7              qSimpleRegret     5.856  -7.673   19.385  4.881  4.016
8      qUpperConfidenceBound     5.912  -7.617   19.442  4.881  4.016
9               randomSearch     6.652  -6.878   20.18

({'chi_square': 265.9322355031036, 'df': 8, 'p': 0.0},
 (                 acquisition  Estimate  2.5_ci  97.5_ci     SE     DF
  1        ExpectedImprovement     5.736  -7.793   19.266  4.881  4.016
  2   ProbabilityOfImprovement     6.097  -7.433   19.626  4.881  4.016
  3       UpperConfidenceBound     5.878  -7.651   19.408  4.881  4.016
  4       qExpectedImprovement     5.822  -7.708   19.351  4.881  4.016
  5         qKnowledgeGradient     6.255  -7.274   19.784  4.881  4.016
  6  qProbabilityOfImprovement     6.283  -7.247   19.812  4.881  4.016
  7              qSimpleRegret     5.856  -7.673   19.385  4.881  4.016
  8      qUpperConfidenceBound     5.912  -7.617   19.442  4.881  4.016
  9               randomSearch     6.652  -6.878   20.181  4.881  4.016,
                                               Contrast  Estimate  2.5_ci  97.5_ci     SE          DF  T-stat  P-val  Sig              acquisition_1              acquisition_2
  1      ExpectedImprovement - ProbabilityOfImpr

In [32]:
# Second analysis: test whether the benchmark's parameter-type category
# improves a mixed model of `mean` that already contains the benchmark's
# dimensionality. Two models that differ only in that term are fitted with
# REML=False and compared with GLRT.

# Per-benchmark metadata used as fixed-effect predictors.
dimension = {
    "Branin": 2,
    "Hartmann6": 6,
    "Jahs_Bench": 10,
    "NN_HPO_Bench": 10,
}
categorical = {
    "Branin": "Numerical",
    "Hartmann6": "Numerical",
    "Jahs_Bench": "Categorical",
    "NN_HPO_Bench": "Numerical",
}

# Build a new frame rather than aliasing `data`: adding columns through an
# alias would silently modify the frame that the other cells read.
# A benchmark missing from either mapping raises KeyError on purpose.
# To restrict the comparison to particular acquisition functions, filter
# data3 here before the models are constructed.
data3 = data.assign(
    benchmark_dim=data["benchmark"].apply(lambda x: dimension[x]),
    benchmark_cat=data["benchmark"].apply(lambda x: categorical[x]),
)

metric = "mean"
bin_id = "budget"  # not used in this cell; kept in case other cells read it
meta_dim = "benchmark_dim"
meta_cat = "benchmark_cat"
input_id = "acquisition"  # grouping variable of the random intercept
system_id = "benchmark"

# The complex model adds the categorical meta-feature on top of the simple one.
complex_model = Lmer(
    formula=f"{metric}~{meta_dim}+{meta_cat}+(1|{input_id})", data=data3
)
simple_model = Lmer(
    formula=f"{metric}~{meta_dim}+(1|{input_id})", data=data3
)

# `factors` maps a column name to the list of its levels.
# NOTE(review): system_id ("benchmark") is not a term in either formula -
# confirm that declaring it as a factor here is intended.
for model in (complex_model, simple_model):
    model.fit(
        factors={system_id: list(data3[system_id].unique())},
        REML=False,
        summarize=False,
    )

print(GLRT(simple_model, complex_model))
print(complex_model.summary())
print(complex_model.ranef)
print(simple_model.summary())
print(simple_model.ranef)


{'chi_square': 156548.23927646526, 'df': 1, 'p': 0.0}
Linear mixed model fit by maximum likelihood  ['lmerMod']
Formula: mean~benchmark_dim+benchmark_cat+(1|acquisition)

Family: gaussian	 Inference: parametric

Number of observations: 180000	 Groups: {'acquisition': 9.0}

Log-likelihood: -617985.444 	 AIC: 1235980.888

Random effects:

                    Name     Var    Std
acquisition  (Intercept)   0.073  0.270
Residual                  56.175  7.495

No random effect correlations specified

Fixed effects:

                        Estimate  2.5_ci  97.5_ci     SE          DF   T-stat  P-val  Sig
(Intercept)               27.300  27.074   27.525  0.115      22.298  237.033    0.0  ***
benchmark_dim             -0.474  -0.486   -0.462  0.006  179991.000  -75.929    0.0  ***
benchmark_catNumerical   -23.897 -23.991  -23.803  0.048  179991.000 -499.526    0.0  ***
                           X.Intercept.
ExpectedImprovement           -0.286488
ProbabilityOfImprovement       0.038029
qEx