In [1]:
import math
import random
import wandb
import pprint
import json
import polars as pl
import numpy as np
from src.utils import load_yaml
from wandb.sdk.internal.internal_api import gql

from typing import Union, List, Dict
from src.constant import PROJECT_NAME
from src.utils.config import load_yaml, parse_parameters, parse_config, parse_scientific_notation

In [2]:
# Client for the W&B public API; later cells use api.client and api.default_entity to query runs.
api = wandb.Api() # Initialize Weights & Biases API, used for fetching run data

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: [wandb.Api()] Loaded credentials for https://api.wandb.ai from /home/hafidh_rendyanto/.netrc.


# Get Config

In [3]:
# Load the hyperparameter-search config for the matrix-factorization baseline and
# keep only its "parameters" section, which maps each hyperparameter name to its spec.
config = load_yaml("configs/hyperparameter_search/mf:baseline.yaml")
parameters_config = config["parameters"]
parameters_config

{'model': {'value': 'matrix_factorization'},
 'embedding_dimension': {'distribution': 'categorical',
  'values': [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]},
 'max_epoch': {'value': 64},
 'batch_size': {'value': 16384},
 'shuffle': {'distribution': 'categorical', 'values': [True, False]},
 'learning_rate': {'value': 0.01},
 'l1_regularization': {'value': 0.0},
 'l2_regularization': {'distribution': 'categorical',
  'values': [0.0,
   '1e-10',
   '1e-9',
   '1e-8',
   '1e-7',
   '1e-6',
   '1e-5',
   '1e-4',
   '1e-3',
   '1e-2']},
 'embedding_dropout_rate': {'value': 0.0},
 'log_freq': {'value': 'epoch'},
 'evaluation_cutoffs': {'distribution': 'constant', 'value': [2, 10, 20, 50]},
 'early_stopping': {'value': False},
 'early_stopping_monitor': {'value': 'test_recall@10'},
 'early_stopping_mode': {'value': 'max'},
 'early_stopping_patience': {'value': 0},
 'random_seed': {'distribution': 'int_uniform', 'min': 1, 'max': 16384},
 'store_model': {'value': False}}

In [4]:
# Hand-written reference configuration holding one concrete value per hyperparameter.
# In this notebook it is only used to list the keys that are not covered by
# fixed_parameters (see the set difference a few cells below).
# NOTE(review): 'l2_regularization' is the string '1e-9' here, while the parsed
# search space holds floats — confirm whether this should be parsed as well.
ground_truth_config = {
    'model': 'matrix_factorization',
    'embedding_dimension': 4,
    'max_epoch': 64,
    'batch_size': 16384,
    'shuffle': False,
    'learning_rate': 0.01,
    'l1_regularization': 0.0,
    'l2_regularization': '1e-9',
    'embedding_dropout_rate': 0.0,
    'log_freq': 'epoch',
    'evaluation_cutoffs': [2, 10, 20, 50],
    'early_stopping': False,
    'early_stopping_monitor': 'test_recall@10',
    'early_stopping_mode': 'max',
    'early_stopping_patience': 0,
    'random_seed': 14353,
    'store_model': False
}


In [5]:
# Split the search specification into three buckets:
#   * fixed_parameters            - one value shared by every run
#   * free_categorical_parameters - a finite list of candidate values
#   * free_random_parameters      - a distribution spec, stored untouched
# Values pass through parse_scientific_notation (the displayed results show
# strings such as '1e-9' coming back as floats).
fixed_parameters: Dict[str, Union[int, float, str]] = {}
free_categorical_parameters: Dict[str, List[Union[int, float, str]]] = {}
free_random_parameters: Dict[str, Dict[str, Union[int, float, str]]] = {}

for parameter, parameter_config in parameters_config.items():
    # A bare (non-dict) entry is itself the value.
    if not isinstance(parameter_config, dict):
        fixed_parameters[parameter] = parse_scientific_notation(parameter_config)
        continue

    # An explicit "value" key takes precedence over any "distribution" key.
    if "value" in parameter_config:
        fixed_parameters[parameter] = parse_scientific_notation(parameter_config["value"])
        continue

    if "distribution" not in parameter_config:
        raise ValueError(f"Invalid parameter configuration for {parameter}: {parameter_config}")

    distribution = parameter_config["distribution"]
    if distribution in ("int_uniform", "uniform", "log_uniform"):
        free_random_parameters[parameter] = parameter_config
    elif distribution == "categorical":
        free_categorical_parameters[parameter] = parse_scientific_notation(parameter_config["values"])
    elif distribution == "constant":
        fixed_parameters[parameter] = parse_scientific_notation(parameter_config["value"])
    else:
        raise ValueError(f"Unsupported distribution type: {distribution} for parameter: {parameter}")

# Order the categorical axes from most to fewest candidate values;
# axes of equal size keep their original relative order.
free_categorical_parameters = {
    name: free_categorical_parameters[name]
    for name in sorted(
        free_categorical_parameters,
        key=lambda name: len(free_categorical_parameters[name]),
        reverse=True,
    )
}

In [6]:
# Inspect the parameters that resolve to a single fixed value.
fixed_parameters

{'model': 'matrix_factorization',
 'max_epoch': 64,
 'batch_size': 16384,
 'learning_rate': 0.01,
 'l1_regularization': 0.0,
 'embedding_dropout_rate': 0.0,
 'log_freq': 'epoch',
 'evaluation_cutoffs': [2, 10, 20, 50],
 'early_stopping': False,
 'early_stopping_monitor': 'test_recall@10',
 'early_stopping_mode': 'max',
 'early_stopping_patience': 0,
 'store_model': False}

In [7]:
# Inspect the categorical search axes (sorted by number of candidate values, largest first).
free_categorical_parameters

{'embedding_dimension': [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024],
 'l2_regularization': [0.0,
  1e-10,
  1e-09,
  1e-08,
  1e-07,
  1e-06,
  1e-05,
  0.0001,
  0.001,
  0.01],
 'shuffle': [True, False]}

In [8]:
# Inspect the parameters described by a random distribution spec.
free_random_parameters

{'random_seed': {'distribution': 'int_uniform', 'min': 1, 'max': 16384}}

In [9]:
# Keys of the reference config that are not fixed, i.e. still have to be chosen per run.
set(ground_truth_config) - set(fixed_parameters)

{'embedding_dimension', 'l2_regularization', 'random_seed', 'shuffle'}

# Get Current State of the Hyperparameter Field

In [None]:
# GraphQL document that pages through the project's runs, 256 per request,
# returning each run's id, name and raw config plus the paging cursor.
query = gql("""
    query Runs($project: String!, $entity: String!, $cursor: String, $filters: JSONString) {
        project(name: $project, entityName: $entity) {
            runs(first: 256, after: $cursor, filters: $filters) {
                edges {
                    node {
                        id
                        name
                        config
                    }
                    cursor
                }
                pageInfo {
                    hasNextPage
                    endCursor
                }
            }
        }
    }
""")

# Restrict the query to runs whose config matches every fixed parameter.
fixed_parameter_filter = {}
for key, value in fixed_parameters.items():
    fixed_parameter_filter[f"config.{key}"] = value
filters = json.dumps(fixed_parameter_filter)

experiment_runs = []
cursor = None  # None requests the first page

while True:
    result = api.client.execute(
        query,
        {
            "project": PROJECT_NAME,
            "entity": api.default_entity,
            "cursor": cursor,
            "filters": filters,
        },
    )
    runs_data = result["project"]["runs"]

    # Flatten every run into one record: identifiers first, then its parsed config keys.
    for edge in runs_data["edges"]:
        node = edge["node"]
        parsed_config = parse_config(node["config"])
        experiment_runs.append({"id": node["id"], "name": node["name"], **parsed_config})

    # Running total after each page, including the last one.
    print(f"Fetched {len(experiment_runs)} runs...")

    page_info = runs_data["pageInfo"]
    if not page_info["hasNextPage"]:
        break
    cursor = page_info["endCursor"]

# infer_schema_length=None makes polars scan all records when inferring column types.
experiment_runs = pl.DataFrame(experiment_runs, infer_schema_length=None)
experiment_runs

Fetched 256 runs...
Fetched 512 runs...
Fetched 768 runs...
Fetched 985 runs...


id,name,model,config,shuffle,log_freq,max_epoch,batch_size,random_seed,store_model,learning_rate,early_stopping,l1_regularization,l2_regularization,evaluation_cutoffs,early_stopping_mode,embedding_dimension,early_stopping_monitor,embedding_dropout_rate,early_stopping_patience
str,str,str,str,bool,str,i64,i64,i64,bool,f64,bool,i64,f64,list[i64],str,i64,str,i64,i64
"""UnVuOnYxOmdiNGVrZzhrOnBlcHBlcm…","""gb4ekg8k""","""matrix_factorization""","""configs/hyperparameter_search/…",false,"""epoch""",64,16384,2973,false,0.01,false,0,0.0,"[2, 10, … 50]","""max""",4,"""test_recall@10""",0,0
"""UnVuOnYxOjU3dTdqazcyOnBlcHBlcm…","""57u7jk72""","""matrix_factorization""","""configs/hyperparameter_search/…",false,"""epoch""",64,16384,4762,false,0.01,false,0,0.01,"[2, 10, … 50]","""max""",8,"""test_recall@10""",0,0
"""UnVuOnYxOnlnNTQ1b3BlOnBlcHBlcm…","""yg545ope""","""matrix_factorization""","""configs/hyperparameter_search/…",false,"""epoch""",64,16384,15239,false,0.01,false,0,1.0000e-9,"[2, 10, … 50]","""max""",128,"""test_recall@10""",0,0
"""UnVuOnYxOmh3MTh4Z2Z1OnBlcHBlcm…","""hw18xgfu""","""matrix_factorization""",,false,"""epoch""",64,16384,3360,false,0.01,false,0,0.0001,"[2, 10, … 50]","""max""",128,"""test_recall@10""",0,0
"""UnVuOnYxOjdiOW82cXUxOnBlcHBlcm…","""7b9o6qu1""","""matrix_factorization""",,false,"""epoch""",64,16384,619,false,0.01,false,0,1.0000e-8,"[2, 10, … 50]","""max""",128,"""test_recall@10""",0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""UnVuOnYxOmZmdHoxZGVrOnBlcHBlcm…","""fftz1dek""","""matrix_factorization""",,true,"""epoch""",64,16384,14785,false,0.01,false,0,0.000001,"[2, 10, … 50]","""max""",256,"""test_recall@10""",0,0
"""UnVuOnYxOm85NHEwanVrOnBlcHBlcm…","""o94q0juk""","""matrix_factorization""",,false,"""epoch""",64,16384,3060,false,0.01,false,0,0.000001,"[2, 10, … 50]","""max""",256,"""test_recall@10""",0,0
"""UnVuOnYxOmZ3YXk1dTJ6OnBlcHBlcm…","""fway5u2z""","""matrix_factorization""",,false,"""epoch""",64,16384,15171,false,0.01,false,0,0.0001,"[2, 10, … 50]","""max""",512,"""test_recall@10""",0,0
"""UnVuOnYxOmJwaGNsMnhmOnBlcHBlcm…","""bphcl2xf""","""matrix_factorization""",,false,"""epoch""",64,16384,2606,false,0.01,false,0,0.0,"[2, 10, … 50]","""max""",1024,"""test_recall@10""",0,0


In [11]:
# Total number of runs fetched.
len(experiment_runs)

985

# Find the Least-Explored Config

In [12]:
# Recap of the categorical axes that define the grid built in the following cells.
free_categorical_parameters

{'embedding_dimension': [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024],
 'l2_regularization': [0.0,
  1e-10,
  1e-09,
  1e-08,
  1e-07,
  1e-06,
  1e-05,
  0.0001,
  0.001,
  0.01],
 'shuffle': [True, False]}

In [13]:
# Axis order of the grid: one name per categorical parameter, in dictionary order.
categorical_parameter_names = [*free_categorical_parameters]
categorical_parameter_names

['embedding_dimension', 'l2_regularization', 'shuffle']

In [14]:
# One axis per categorical parameter, sized by its number of candidate values;
# every cell of the grid starts at zero.
parameter_space_dimension = [len(candidates) for candidates in free_categorical_parameters.values()]
parameter_space = np.zeros(parameter_space_dimension, dtype=int)
parameter_space.shape

(10, 10, 2)

In [17]:
# Count how many fetched runs fall into each cell of the categorical grid.
# The tally is reset first so that re-running this cell recounts the runs
# instead of stacking a second pass on top of the previous one.
parameter_space.fill(0)

for row in experiment_runs.select(categorical_parameter_names).iter_rows():
    # Translate each value of the row into its position along the matching axis.
    # list.index raises ValueError when a run holds a value outside the configured
    # candidates, which surfaces runs that do not belong to this grid.
    parameter_index = tuple(
        free_categorical_parameters[parameter_name].index(value)
        for parameter_name, value in zip(categorical_parameter_names, row)
    )

    # Tuple indexing addresses a single cell and, unlike star-unpacking inside
    # a subscript, does not require Python 3.11+.
    parameter_space[parameter_index] += 1

In [18]:
# Run counts per combination of categorical values (axes ordered as categorical_parameter_names).
parameter_space

array([[[ 7,  6],
        [ 3,  4],
        [ 1,  4],
        [ 5,  5],
        [ 2,  5],
        [ 3,  3],
        [ 5,  6],
        [ 2,  4],
        [ 7,  5],
        [ 4,  6]],

       [[ 7,  9],
        [ 3,  2],
        [ 5,  4],
        [ 6,  2],
        [ 5,  2],
        [ 4,  8],
        [ 5,  4],
        [ 8,  6],
        [ 5,  4],
        [ 3,  7]],

       [[ 9,  9],
        [ 4,  9],
        [ 2,  4],
        [ 5,  3],
        [ 2,  7],
        [ 6,  4],
        [ 5,  3],
        [ 5,  4],
        [ 4,  5],
        [ 7,  3]],

       [[ 8,  8],
        [ 3,  2],
        [ 2,  2],
        [10,  8],
        [ 4,  2],
        [ 4,  5],
        [ 9,  3],
        [ 4,  9],
        [ 5,  4],
        [ 6,  4]],

       [[11,  4],
        [ 2,  3],
        [ 2,  3],
        [ 3,  5],
        [ 5,  5],
        [10,  6],
        [ 3,  4],
        [ 3,  6],
        [ 3,  9],
        [ 7,  8]],

       [[ 6,  6],
        [ 5,  3],
        [ 4,  2],
        [ 4,  4],
        [ 6,  8],


In [19]:
# Find the flat index of the minimum value
# Position of the smallest count in the flattened grid (first occurrence on ties).
min_flat_index = parameter_space.argmin()

# Turn that flat position back into one coordinate per axis.
min_indices = np.unravel_index(min_flat_index, parameter_space.shape)

# Map each axis coordinate to the candidate value it stands for.
least_explored_categorical_config = {
    parameter_name: free_categorical_parameters[parameter_name][axis_position]
    for parameter_name, axis_position in zip(categorical_parameter_names, min_indices)
}

In [20]:
# Grid coordinates of the cell with the smallest run count.
min_indices

(np.int64(0), np.int64(2), np.int64(0))

In [21]:
# The same cell expressed as parameter values.
least_explored_categorical_config

{'embedding_dimension': 2, 'l2_regularization': 1e-09, 'shuffle': True}

In [62]:
# Add one run to the grid cell that was just selected.
# Index through min_indices instead of hard-coded coordinates so the update
# keeps targeting the selected cell when the least-explored cell changes.
# NOTE(review): presumably this makes a subsequent argmin move on to another
# cell — confirm the intended workflow.
parameter_space[min_indices] += 1

# Sample the Free Random Parameters

In [22]:
# Recap of the parameters described by a random distribution spec.
free_random_parameters

{'random_seed': {'distribution': 'int_uniform', 'min': 1, 'max': 16384}}

In [24]:
# Resolve the random specs into concrete values. The outputs recorded in this
# notebook show a different random_seed per call, so each call appears to draw anew.
parse_parameters(free_random_parameters)

{'random_seed': 14070}

In [25]:
# Assemble the full configuration for the next run. Later sources override
# earlier ones on key clashes: fixed values first, then the least-explored
# categorical choice, then freshly resolved random parameters.
next_run_config = dict(fixed_parameters)
next_run_config.update(least_explored_categorical_config)
next_run_config.update(parse_parameters(free_random_parameters))
next_run_config

{'model': 'matrix_factorization',
 'max_epoch': 64,
 'batch_size': 16384,
 'learning_rate': 0.01,
 'l1_regularization': 0.0,
 'embedding_dropout_rate': 0.0,
 'log_freq': 'epoch',
 'evaluation_cutoffs': [2, 10, 20, 50],
 'early_stopping': False,
 'early_stopping_monitor': 'test_recall@10',
 'early_stopping_mode': 'max',
 'early_stopping_patience': 0,
 'store_model': False,
 'embedding_dimension': 2,
 'l2_regularization': 1e-09,
 'shuffle': True,
 'random_seed': 16140}