In [1]:
import optuna
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from glob import glob
import re

In [2]:
def load_study(study_name):
    """Load the Optuna study stored in the journal file ``<study_name>.log``.

    The journal file is opened from the current working directory. If the
    journal contains at least one study, the name recorded for the first one
    is used to load it, regardless of the file-derived ``study_name``;
    otherwise the given ``study_name`` is used as-is.

    Parameters
    ----------
    study_name : str
        Base name of the journal file (without the ``.log`` extension).

    Returns
    -------
    optuna.study.Study
        The study loaded from the journal storage.
    """
    journal_name = f'{study_name}.log'
    study_storage = optuna.storages.JournalStorage(
        optuna.storages.JournalFileStorage(journal_name))

    # Read the name through the public ``study_name`` attribute of the stored
    # study summary instead of going through the private ``_study_id`` and a
    # second storage lookup.
    stored_studies = study_storage.get_all_studies()
    if stored_studies:
        study_name = stored_studies[0].study_name

    return optuna.study.load_study(study_name=study_name, storage=study_storage)

In [3]:
def parse_studies(prefix='criteo_', suffix='.log', pattern='*.log'):
    """Discover study journal files and split their names into components.

    File names are expected to look like
    ``<prefix><experiment>_<embed_dim><suffix>``, e.g. ``criteo_bins_32.log``.

    Parameters
    ----------
    prefix : str
        Leading part of the study name that is stripped to get the experiment name.
    suffix : str
        File extension that is stripped to get the study name.
    pattern : str
        Glob pattern (relative to the current working directory) used to
        find the journal files.

    Returns
    -------
    list[tuple[str, int, str]]
        One ``(experiment, embed_dim, study_name)`` tuple per matching file.
        Files whose name does not end in ``_<digits>`` are skipped.
    """
    def helper():
        # Honour the caller-supplied pattern rather than a hard-coded '*.log'.
        for filename in glob(pattern):
            study_name = filename.removesuffix(suffix)
            clean_name = study_name.removeprefix(prefix)
            embed_dim_match = re.search('_([0-9]+)$', clean_name)
            if embed_dim_match is None:
                # Not a study journal following the naming scheme; skip it
                # instead of failing on ``None.group``.
                continue
            embed_dim = int(embed_dim_match.group(1))
            exp_name = clean_name.removesuffix(embed_dim_match.group(0))
            yield exp_name, embed_dim, study_name
    return list(helper())

In [4]:
# Sanity check: list the (experiment, embed_dim, study_name) tuples parsed from the journal files.
parse_studies()

[('bins', 32, 'criteo_bins_32'),
 ('splines_0', 32, 'criteo_splines_0_32'),
 ('splines_3', 32, 'criteo_splines_3_32'),
 ('bins', 48, 'criteo_bins_48'),
 ('splines_0', 48, 'criteo_splines_0_48'),
 ('splines_3', 48, 'criteo_splines_3_48'),
 ('bins', 40, 'criteo_bins_40'),
 ('splines_3', 40, 'criteo_splines_3_40'),
 ('splines_0', 40, 'criteo_splines_0_40'),
 ('splines_3', 24, 'criteo_splines_3_24'),
 ('splines_0', 24, 'criteo_splines_0_24'),
 ('bins', 24, 'criteo_bins_24')]

In [5]:
# Collect the best trial of every discovered study as one flat record:
# sampled hyper-parameters, user attributes, and identifying metadata.
best_trial_records = []
for experiment, embed_dim, study_name in parse_studies():
    study = load_study(study_name)
    print(f'Study {study_name} has {len(study.trials)} trials')

    best_trial = study.best_trial
    best_trial_records.append(
        best_trial.params
        | best_trial.user_attrs
        | dict(val_loss=best_trial.value, study_name=study_name,
               experiment=experiment, embed_dim=embed_dim)
    )

# Build the frame once from all records: this yields a unique 0..n-1 index
# (concatenating one-row frames left every row labelled 0) and keeps
# `trials_df` bound to a DataFrame only.
trials_df = pd.DataFrame.from_records(best_trial_records)
trials_df

Study criteo_bins_32 has 50 trials
Study criteo_splines_0_32 has 50 trials


  study_storage = optuna.storages.JournalStorage(


Study criteo_splines_3_32 has 50 trials
Study criteo_bins_48 has 50 trials
Study criteo_splines_0_48 has 50 trials
Study criteo_splines_3_48 has 50 trials
Study criteo_bins_40 has 50 trials
Study criteo_splines_3_40 has 50 trials
Study criteo_splines_0_40 has 50 trials
Study criteo_splines_3_24 has 50 trials
Study criteo_splines_0_24 has 50 trials
Study criteo_bins_24 has 50 trials


Unnamed: 0,lr,l2reg,random_seed,degree,best_epoch,test_loss,val_loss,study_name,experiment,embed_dim
0,0.00012,3.157859e-08,42,0,1,0.45159,0.446994,criteo_bins_32,bins,32
0,0.000119,1.280773e-07,42,0,1,0.450906,0.446243,criteo_splines_0_32,splines_0,32
0,0.000114,6.730283e-08,42,3,1,0.450473,0.445821,criteo_splines_3_32,splines_3,32
0,0.000108,2.827414e-07,42,0,1,0.451844,0.447224,criteo_bins_48,bins,48
0,9.6e-05,4.389726e-06,42,0,1,0.450997,0.446305,criteo_splines_0_48,splines_0,48
0,9.5e-05,0.0004177857,42,3,1,0.450431,0.44579,criteo_splines_3_48,splines_3,48
0,0.00011,6.344493e-07,42,0,1,0.451276,0.446637,criteo_bins_40,bins,40
0,0.000104,1.716062e-05,42,3,1,0.450731,0.445985,criteo_splines_3_40,splines_3,40
0,0.000102,3.16921e-05,42,0,1,0.450896,0.446257,criteo_splines_0_40,splines_0,40
0,0.00013,0.0008988797,42,3,1,0.450712,0.446049,criteo_splines_3_24,splines_3,24


In [6]:
# Best learning rate and L2 regularisation per embedding dimension (rows)
# and experiment (columns).
(
    trials_df
    .loc[:, ['experiment', 'embed_dim', 'lr', 'l2reg']]
    .pivot(index='embed_dim', columns=['experiment'], values=['lr', 'l2reg'])
)

Unnamed: 0_level_0,lr,lr,lr,l2reg,l2reg,l2reg
experiment,bins,splines_0,splines_3,bins,splines_0,splines_3
embed_dim,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
24,0.000131,0.00013,0.00013,0.0003240514,0.0008988797,0.0008988797
32,0.00012,0.000119,0.000114,3.157859e-08,1.280773e-07,6.730283e-08
40,0.00011,0.000102,0.000104,6.344493e-07,3.16921e-05,1.716062e-05
48,0.000108,9.6e-05,9.5e-05,2.827414e-07,4.389726e-06,0.0004177857


In [7]:
# Validation loss of the best trial per embedding dimension (rows) and
# experiment (columns). Kept under its own name so that `pivoted`, which the
# lift computation reads, is only ever bound to the test-loss table.
val_loss_pivot = (
    trials_df[['experiment', 'embed_dim', 'val_loss']]
    .pivot(columns=['experiment'], index='embed_dim', values=['val_loss'])
)
val_loss_pivot

Unnamed: 0_level_0,val_loss,val_loss,val_loss
experiment,bins,splines_0,splines_3
embed_dim,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
24,0.446373,0.446044,0.446049
32,0.446994,0.446243,0.445821
40,0.446637,0.446257,0.445985
48,0.447224,0.446305,0.44579


In [8]:
# Test loss of the best trial per embedding dimension (rows) and
# experiment (columns).
pivoted = (
    trials_df
    .loc[:, ['experiment', 'embed_dim', 'test_loss']]
    .pivot(index='embed_dim', columns=['experiment'], values=['test_loss'])
)
pivoted

Unnamed: 0_level_0,test_loss,test_loss,test_loss
experiment,bins,splines_0,splines_3
embed_dim,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
24,0.451172,0.450681,0.450712
32,0.45159,0.450906,0.450473
40,0.451276,0.450896,0.450731
48,0.451844,0.450997,0.450431


In [9]:
# Relative loss reduction, in percent, of every remaining column with respect
# to the first column of `pivoted`, computed row by row.
# NOTE(review): the baseline is picked by position; confirm the first column is the intended one.
lift_pct = 100 * (1 - pivoted.iloc[:, 1:].div(pivoted.iloc[:, 0], axis=0))
lift_pct

Unnamed: 0_level_0,test_loss,test_loss
experiment,splines_0,splines_3
embed_dim,Unnamed: 1_level_2,Unnamed: 2_level_2
24,0.108882,0.102011
32,0.151523,0.247429
40,0.084409,0.120768
48,0.187404,0.312794
