## Test Sweep Config


In [1]:
from epsilon_transformers.configs.hyperparam_sweep_cfg import generate_hyperparam_combinations, hyperparams
combos = list(generate_hyperparam_combinations(hyperparams))
print(len(combos))  # Should print the total number of combinations
for combo in combos[:3]:
    print(combo)

108
{'learning_rate': 1e-05, 'batch_size': 32, 'n_layers': 1, 'd_model': 32, 'n_heads': 4}
{'learning_rate': 1e-05, 'batch_size': 32, 'n_layers': 1, 'd_model': 32, 'n_heads': 8}
{'learning_rate': 1e-05, 'batch_size': 32, 'n_layers': 1, 'd_model': 64, 'n_heads': 4}


## Test Logger

In [2]:
from epsilon_transformers.training.logger import CSVLogger
logger = CSVLogger('test_log.csv', ['epoch', 'loss'])
logger.log({'epoch': 1, 'loss': 0.5})
logger.close()

In [3]:
# open test_log.csv and check that it has the correct data
import pandas as pd
df = pd.read_csv('test_log.csv')
print(df)

   epoch  loss
0      1   0.5


## testing Training argparse

In [120]:
# launcher.py

import argparse
import itertools
import subprocess
import yaml
import os
from datetime import datetime

def load_config(config_path):
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)

def calculate_model_params(n_heads, d_attn):
    d_model = d_attn * n_heads
    d_mlp = 4 * d_model
    return d_model, d_mlp

def run_experiment(run_config, gpu_id):
    cmd = [
        'python', 'scripts/train.py',
        '--config', run_config['config_path'],
        '--gpu_id', str(gpu_id)
    ]
    subprocess.Popen(cmd)


config = load_config('./experiment_config.yaml')


In [121]:
config

{'global_config': {'output_dir': './results'},
 'train_config': {'batches_per_epoch': 100},
 'model_config': {'n_ctx': 9,
  'n_heads': 2,
  'act_fn': 'relu',
  'normalization_type': 'LN',
  'attn_only': False,
  'seed': 42,
  'dtype': 'float32'},
 'sweep_config': {'train_config': {'learning_rates': ['1e-4'],
   'batch_sizes': [64]},
  'model_config': {'d_head': [8],
   'n_layers': [1, 2, 4, 8, 12],
   'n_heads': [1, 2, 4, 8]},
  'process_config': [{'name': 'post_quantum',
    'params': {'alpha': 1.0, 'beta': 1.0}},
   {'name': 'tom_quantum', 'params': {'alpha': 1.5, 'beta': 0.5}},
   {'name': 'fanizza', 'params': {'alpha': 2.0, 'lamb': 0.7}},
   {'name': 'rrxor', 'params': {'pR1': 0.5, 'pR2': 0.5}},
   {'name': 'mess3', 'params': {'x': 0.2, 'a': 0.7}}]}}

In [122]:

def create_config_sweep(config):
    global_config = config.get('global_config', {})

    model_config = config.get('model_config', {})
    train_config = config.get('train_config', {})

    sweep_config = config.get('sweep_config', {})
    sweep_train_config = sweep_config.get('train_config', {})
    sweep_model_config = sweep_config.get('model_config', {})
    sweep_process_config = sweep_config.get('process_config', {})

    # use itertools.product to create the combinations
    model_config_combinations = [dict(zip(sweep_model_config.keys(), combination)) for combination in itertools.product(*sweep_model_config.values())]
    train_config_combinations = [dict(zip(sweep_train_config.keys(), combination)) for combination in itertools.product(*sweep_train_config.values())]
    # now append the constant values from the model_config and train_config
    for cfg in model_config_combinations:
        cfg.update(model_config)
    for cfg in train_config_combinations:
        cfg.update(train_config)


    # Create a combined iterator
    combined_config_iter = itertools.product(model_config_combinations, train_config_combinations, sweep_process_config)

    # Create the final iterator of dict of dicts
    config_sweep_iter = (
        {
            'global_config': global_config,
            'model_config': model_cfg,
            'train_config': train_cfg,
            'process_config': process_cfg
        }
        for model_cfg, train_cfg, process_cfg in combined_config_iter
    )
    return config_sweep_iter


In [123]:
config_sweep_iter = create_config_sweep(config)
for cfg in config_sweep_iter:
    print(cfg)
    break

{'global_config': {'output_dir': './results'}, 'model_config': {'d_head': 8, 'n_layers': 1, 'n_heads': 2, 'n_ctx': 9, 'act_fn': 'relu', 'normalization_type': 'LN', 'attn_only': False, 'seed': 42, 'dtype': 'float32'}, 'train_config': {'learning_rates': '1e-4', 'batch_sizes': 64, 'batches_per_epoch': 100}, 'process_config': {'name': 'post_quantum', 'params': {'alpha': 1.0, 'beta': 1.0}}}


In [118]:
def create_individal_config_sweep(config, sweep_config_key):

    sweep_has_values = not config['sweep_config'][sweep_config_key] is None
    constant_has_values = not config[sweep_config_key] is None

    if sweep_has_values:
        sweep_configs = {list(param.keys())[0]: param[list(param.keys())[0]] for param in config['sweep_config'][sweep_config_key]}
        config_combinations = list(itertools.product(*sweep_configs.values()))
        for combination in config_combinations:
            config_sweep = dict(zip(sweep_configs.keys(), combination))
            if constant_has_values:
                config_sweep.update(config[sweep_config_key])
            yield config_sweep
    elif constant_has_values:
        yield config[sweep_config_key]
    else:
        yield {}

def create_process_config_sweep(config):
    return create_individal_config_sweep(config, 'process_config')

def create_model_config_sweep(config):
    return create_individal_config_sweep(config, 'model_config')

def create_training_config_sweep(config):
    return create_individal_config_sweep(config, 'train_config')

In [119]:
for config_sweep in create_model_config_sweep(config):
    print(config_sweep)


AttributeError: 'str' object has no attribute 'keys'

In [65]:
config['model_config']

{'n_ctx': 9,
 'n_heads': 2,
 'act_fn': 'relu',
 'normalization_type': 'LN',
 'attn_only': False,
 'seed': 42,
 'dtype': 'float32'}

In [42]:
# create model configs
sweep_model_configs = {list(param.keys())[0]: param[list(param.keys())[0]] for param in config['sweep_config']['model_config']}
model_config_combinations = list(itertools.product(*sweep_model_configs.values()))
for combination in model_config_combinations:
    model_config_sweep = dict(zip(sweep_model_configs.keys(), combination))
    # add the model config to the config
    model_config_sweep.update(config['model_config'])


{'d_head': 8,
 'n_layers': 12,
 'n_heads': 2,
 'n_ctx': 9,
 'act_fn': 'relu',
 'normalization_type': 'LN',
 'attn_only': False,
 'seed': 42,
 'dtype': 'float32'}

In [12]:


sweep_params = itertools.product(
    *[(key, value) for key in config['sweep_config'].keys() for value in config['sweep_config'][key]]
)
for param in sweep_params:
    print(param)
    break

('learning_rates', 'batch_sizes', 'model_architectures', 'model_architectures', 'model_architectures', 'processes', 'processes', 'processes', 'processes', 'processes')


In [18]:

    # Generate all combinations for the sweep
    sweep_params = itertools.product(
        config['sweep_config']['learning_rates'],
        config['sweep_config']['batch_sizes'],
        config['sweep_config']['model_architectures'][1]['n_layers'],
        config['sweep_config']['model_architectures'][2]['n_heads'],
        config['sweep_config']['processes']
    )

    run_id = config.get('run_id', datetime.now().strftime("%Y%m%d_%H%M%S"))
    os.makedirs(f"{config['global_config']['output_dir']}/{run_id}", exist_ok=True)

    for i, (lr, bs, n_layers, n_heads, process) in enumerate(sweep_params):
        d_attn = config['sweep_config']['model_architectures'][0]['d_attn'][0]
        d_model, d_mlp = calculate_model_params(n_heads, d_attn)

        run_config = {
            'global_config': config['global_config'],
            'model_config': {
                'activation': config['model_config']['activation'],
                'learning_rate': lr,
                'batch_size': bs,
                'n_layers': n_layers,
                'n_heads': n_heads,
                'd_attn': d_attn,
                'd_model': d_model,
                'd_mlp': d_mlp
            },
            'process': process
        }

        config_path = f"{config['global_config']['output_dir']}/{run_id}/config_{i}.yaml"
        with open(config_path, 'w') as f:
            yaml.dump(run_config, f)

        run_config['config_path'] = config_path
        gpu_id = i % config['global_config']['num_gpus']
        run_experiment(run_config, gpu_id)

if __name__ == "__main__":
    main()


Command output:
Arguments parsed successfully!



In [4]:
import yaml

def load_config(config_path):
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)
config = load_config('/Users/adamimos/Documents/GitHub/epsilon-transformers/results/20241002151222/config_run_0_L1_H2_DH8_DM16_fanizza.yaml')
config

{'config_path': './results/20241002151222/config_run_0_L1_H2_DH8_DM16_fanizza.yaml',
 'global_config': {'device': 'cpu',
  'output_dir': './results',
  'sweep_id': '20241002151222'},
 'model_config': {'act_fn': 'relu',
  'attn_only': False,
  'd_head': 8,
  'd_mlp': 64,
  'd_model': 16,
  'device': 'cpu',
  'dtype': 'float32',
  'n_ctx': 9,
  'n_heads': 2,
  'n_layers': 1,
  'normalization_type': 'LN',
  'seed': 42},
 'process_config': {'alpha': 2000.0, 'lamb': 0.49, 'name': 'fanizza'},
 'run_id': 'run_0_L1_H2_DH8_DM16_fanizza',
 'train_config': {'batch_size': 64,
  'batches_per_epoch': 100,
  'bos': False,
  'learning_rate': '1e-4'}}

In [5]:
%reload_ext autoreload
%autoreload 2
from epsilon_transformers.training.dataloader import get_dataloader_and_loss_lower_bound
dataloader, loss_lower_bound, d_vocab = get_dataloader_and_loss_lower_bound(process_params=config['process_config'],
                                    n_ctx=config['model_config']['n_ctx'],
                                    batches_per_epoch=config['train_config']['batches_per_epoch'],
                                    batch_size=config['train_config']['batch_size'],
                                    device='cpu',
                                    bos=config['train_config']['bos'])

Process initialized successfully!


In [6]:
for i in dataloader:
    print(i)
    break

(tensor([[1, 0, 1, 1, 0, 1, 1, 0, 1],
        [0, 1, 1, 0, 1, 1, 0, 1, 1],
        [1, 0, 1, 0, 1, 0, 1, 0, 0],
        [0, 0, 1, 0, 1, 0, 0, 1, 0],
        [0, 1, 1, 0, 1, 1, 0, 0, 1],
        [1, 0, 1, 1, 0, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 1, 1],
        [0, 1, 0, 1, 0, 1, 1, 0, 0],
        [1, 0, 1, 1, 0, 1, 1, 0, 0],
        [0, 0, 1, 1, 0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 1, 1, 0, 0],
        [1, 1, 0, 1, 0, 1, 0, 1, 1],
        [1, 0, 0, 0, 0, 1, 1, 1, 1],
        [1, 1, 0, 1, 1, 1, 1, 0, 1],
        [1, 0, 0, 0, 0, 0, 1, 1, 0],
        [1, 0, 0, 0, 0, 1, 0, 0, 1],
        [0, 1, 1, 0, 1, 1, 0, 0, 1],
        [1, 0, 1, 1, 1, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 1, 1, 1, 1],
        [0, 1, 0, 1, 0, 0, 0, 0, 1],
        [0, 1, 1, 0, 1, 0, 1, 1, 0],
        [0, 0, 0, 0, 1, 0, 1, 1, 0],
        [0, 0, 1, 1, 0, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 0, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 1, 0, 0],
        [0, 1, 1, 0, 0, 1, 0, 1, 0],
        [0, 1, 0, 0, 0, 1, 0, 1, 1],


In [7]:
print(d_vocab)

2


In [8]:
from transformer_lens import HookedTransformer, HookedTransformerConfig
import torch
#compute d_vocab
# add d_vocab to model_config
config['model_config']['d_vocab'] = d_vocab
config['model_config']['dtype'] = getattr(torch, config['model_config']['dtype'])
hooked_model_config = HookedTransformerConfig(**config['model_config'])
model = HookedTransformer(hooked_model_config)

In [2]:
hooked_model_config.to_dict()

NameError: name 'hooked_model_config' is not defined

In [2]:
from epsilon_transformers.process.transition_matrices import get_matrix_from_args
from epsilon_transformers.process.processes import TransitionMatrixGHMM
import matplotlib.pyplot as plt
import numpy as np
T = get_matrix_from_args("post_quantum", alpha=np.exp(1), beta=.01)
print(T.shape)
process = TransitionMatrixGHMM(T)
process.name = "post_quantum"
msp = process.derive_mixed_state_tree(9)
beliefs = np.array(msp.belief_states).squeeze()
import plotly.graph_objects as go

fig = go.Figure(data=[go.Scatter3d(
    x=beliefs[:, 0],
    y=beliefs[:, 1],
    z=beliefs[:, 2],
    mode='markers',
    marker=dict(
        size=5,
        color=beliefs[:, 2],  # color by z-axis value
        colorscale='Viridis',
        opacity=0.8
    )
)])

fig.update_layout(scene=dict(
    xaxis_title='X',
    yaxis_title='Y',
    zaxis_title='Z'
),
    width=800,
    height=800,
    title='3D Scatter Plot of Belief States'
)

fig.show()

(3, 3, 3)


In [3]:
beliefs.shape

(8003, 1, 3)

# Test Quantum Training