In [5]:
# Fit the mixed effects model
from patsy import dmatrix
from statsmodels.formula.api import mixedlm
from statsmodels.stats.multicomp import MultiComparison
import pandas as pd
import numpy as np
from pymer4.models import Lmer


In [6]:
data = pd.read_csv("./dataset_copy_DELETEAFTER.csv")

metric="mean"
input_id="benchmark"
system_id="acquisition"
bin_id="budget"


differentMeans_model = mixedlm(formula=f"{metric} ~ {system_id}", data=data, groups=input_id)
diffModelFit = differentMeans_model.fit( reml=False)
print(diffModelFit.summary())


                       Mixed Linear Model Regression Results
Model:                     MixedLM         Dependent Variable:         mean        
No. Observations:          180000          Method:                     ML          
No. Groups:                4               Scale:                      53.4669     
Min. group size:           45000           Log-Likelihood:             -613546.8042
Max. group size:           45000           Converged:                  Yes         
Mean group size:           45000.0                                                 
-----------------------------------------------------------------------------------
                                         Coef.  Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------
Intercept                                 5.760    4.374  1.317 0.188 -2.813 14.332
acquisition[T.ProbabilityOfImprovement]   0.337    0.073  4.609 0.000  0.194  0.480
acquisition[T.U

In [7]:
# Get predicted values for each level of system_id
grid = (
    np.array(
        np.meshgrid(
            data[input_id].unique(),
            data[system_id].unique(),
        )
    )
    .reshape(2, len(data[input_id].unique()) * len(data[system_id].unique())).T)

grid = pd.DataFrame(grid, columns=[input_id, system_id])
betas = diffModelFit.fe_params
# print(betas)
mat = dmatrix(f"C({system_id})", grid, return_type="matrix")
# print(mat)
emmeans = grid
emmeans["means"] = mat @ betas
#print(emmeans)
vcov = diffModelFit.cov_params()
# print(vcov)

vcov = vcov[~vcov.index.str.contains("Var|Cor")]
vcov = vcov.loc[:, ~vcov.columns.str.contains("Var|Cor")]
#print(vcov)
emmeans["SE"] = np.sqrt(np.diagonal(mat @ vcov) @ mat.T)
print(emmeans)


       benchmark                acquisition     means        SE
0         Branin        ExpectedImprovement  5.759636  4.373900
1      Hartmann6        ExpectedImprovement  5.759636  4.373900
2     Jahs_Bench        ExpectedImprovement  5.759636  4.373900
3   NN_HPO_Bench        ExpectedImprovement  5.759636  4.373900
4         Branin   ProbabilityOfImprovement  6.096633  4.373594
5      Hartmann6   ProbabilityOfImprovement  6.096633  4.373594
6     Jahs_Bench   ProbabilityOfImprovement  6.096633  4.373594
7   NN_HPO_Bench   ProbabilityOfImprovement  6.096633  4.373594
8         Branin       UpperConfidenceBound  5.878372  4.373594
9      Hartmann6       UpperConfidenceBound  5.878372  4.373594
10    Jahs_Bench       UpperConfidenceBound  5.878372  4.373594
11  NN_HPO_Bench       UpperConfidenceBound  5.878372  4.373594
12        Branin       qExpectedImprovement  5.821530  4.373594
13     Hartmann6       qExpectedImprovement  5.821530  4.373594
14    Jahs_Bench       qExpectedImprovem

In [8]:
predicted_values = diffModelFit.predict(grid)
print(pd.DataFrame(predicted_values, columns=["pred"])["pred"].unique())
# Perform Tukey's HSD test
tukey_results = MultiComparison(predicted_values, grid[system_id]).tukeyhsd(
    alpha=0.05
)

# calculate the standard deviation for each pair of groups
print(tukey_results.std_pairs)
print(tukey_results.summary())
print("tukey end")

[5.75963622 6.09663289 5.87837206 5.82153017 6.25490228 6.28279609
 5.85607482 5.91246751 6.65186281]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
                   Multiple Comparison of Means - Tukey HSD, FWER=0.05                   
          group1                    group2          meandiff p-adj  lower   upper  reject
-----------------------------------------------------------------------------------------
      ExpectedImprovement  ProbabilityOfImprovement    0.337   0.0   0.337   0.337   True
      ExpectedImprovement      UpperConfidenceBound   0.1187   0.0  0.1187  0.1187   True
      ExpectedImprovement      qExpectedImprovement   0.0619   0.0  0.0619  0.0619   True
      ExpectedImprovement        qKnowledgeGradient   0.4953   0.0  0.4953  0.4953   True
      ExpectedImprovement qProbabilityOfImprovement   0.5232   0.0  0.5232  0.5232   True
      ExpectedImprovement             qSimpleRegret   0.0964   0.0 

  st_range = np.abs(meandiffs) / std_pairs #studentized range statistic
