In [6]:
import yaml
import os
import copy
import numpy as np

import utils

In [7]:
# Instantiate the project tokenizer and record how many tokens it reports;
# n_tokens is used below as the model's "in_size".
tokenizer = utils.Tokenizer_v2()
n_tokens = tokenizer.get_n_tokens()
n_tokens  # bare expression: displayed as the cell output

676

In [8]:
# Base experiment configuration. Every sweep cell below deep-copies this dict
# and then overrides "exp_dir", "wandb", "data_params", parts of "gpt_config"
# and the "training_params" step counts, so several values here are only
# placeholders.
config = {
    "exp_dir": "./data/",
    "wandb": {"project": "markov_mixture"},
    "seed": 0,
    "task": "markov_mixture",
    "mask_loss": False,

    # Replaced wholesale by each sweep cell.
    "data_params": {
        "k": 5,
        "transition_params": {"type": "sparse", "sparsity": 0.5, "random_state": 42},
        "l": 128,
        "batch_size": 128,
    },

    "model_params": {
        "model_type": "transformer",
        "gpt_config": {
            "tokenized": True,
            "in_size": n_tokens,  # taken from the tokenizer cell above
            "n_embd": 256,
            "n_head": 8,
            "n_layer": 6,
            "rope": False,
        },
        "optimizer_type": "AdamW",
        "optimizer_params": {
            "lr": 6e-4,
            "weight_decay": 1e-1,
            "betas": [0.9, 0.95],
        },
    },

    # n_steps / save_steps are recomputed per experiment from a FLOPs budget.
    "training_params": {
        "n_steps": 100_000,
        "save_steps": [5000, 10_000, 20_000, 40_000, 60_000, 80_000, 100_000],
        "save_opt": False,
    },
}
None

In [9]:
def get_ckpt_steps(n_steps,step_first=30,n_ckpts=50):
    """Return log-spaced checkpoint steps between ``step_first`` and ``n_steps``.

    Args:
        n_steps: Total number of training steps; the last checkpoint step.
        step_first: First checkpoint step. Clamped to ``n_steps`` when the run
            is shorter than ``step_first``.
        n_ckpts: Number of log-spaced points to generate. The returned list can
            be shorter because steps that collapse onto the same integer are
            merged.

    Returns:
        A strictly increasing list of Python ints.

    Raises:
        ValueError: If ``n_steps`` or ``step_first`` is smaller than 1 (the
            logarithm is undefined or the range is empty).
    """
    if n_steps<1 or step_first<1:
        raise ValueError(
            f"n_steps and step_first must be >= 1 (got n_steps={n_steps}, step_first={step_first})"
        )
    # A run shorter than step_first still gets a checkpoint at its final step.
    step_first=min(step_first,n_steps)
    raw=np.logspace(np.log10(step_first),np.log10(n_steps),n_ckpts,base=10)
    # Round to nearest instead of truncating: 10**log10(x) can land a hair below
    # x, and truncation would then turn the endpoints into x-1, so the final
    # step would never be checkpointed.
    steps=np.rint(raw).astype(int).tolist()
    # Neighbouring points can round to the same integer; keep each step once.
    return sorted(set(steps))

## Variants (RoPE, no MLP, etc.)

In [21]:
# Variant sweep: for each architecture variant and each n in `ns`, derive a
# config from the base `config` and write it to ./yamls/<exp_name>.yaml.
ns=[4,8,16,32,64,128,256,512,1024,2048]
k=10
l=512
batch_size=128
seed=42
for variant in ["default_long","lte_long","default_long_rope_only","lte_long_rope_only","default_long_rope_only_nomlp"]:
    # Variants whose name contains "long" get the larger total FLOPs budget.
    flops_total=1e16 if "long" in variant else 1e15
    n_embd=64
    n_head=4
    n_layer=2
    model_name=variant
    group_name=f"final_v1/{variant}"
    for n in ns:
        # Re-seed for every n so ps / rss depend only on (seed, n).
        np.random.seed(seed)
        ks=[k]*n
        ps=np.random.random(n)
        ps/=ps.sum()  # normalise the n weights so they sum to 1
        ps=ps.tolist()
        rss=np.random.randint(0,100000,n).tolist()

        exp_name=f"markov_mixture/{group_name}/{model_name}/n={n}"
        # Deep copy so the shared base config is never mutated.
        config_=copy.deepcopy(config)
        config_["exp_dir"]=f"./data/{exp_name}"
        config_["wandb"]={
            "project":"markov_mixture",
            "group":group_name,
            "name":f"{model_name}_n={n}",
        }

        config_["data_params"]={
            "ks":ks,
            "ps":ps,
            "transition_paramss":[{"type":"dense","random_state":rs} for rs in rss],
            "l":l,
            "batch_size":batch_size,
            "fast_dataset":True,
            "two_token_input":"lte" in variant,
        }
        gpt_config_=config_["model_params"]["gpt_config"]
        gpt_config_["n_embd"]=n_embd
        gpt_config_["n_head"]=n_head
        gpt_config_["n_layer"]=n_layer
        gpt_config_["mlp"]="nomlp" not in variant
        if "rope_only" in variant:
            gpt_config_["rope"]=True
            gpt_config_["pos_embed"]=False

        ## set n_steps
        # 3x the forward-pass FLOPs per sample to account for training.
        flops_per_step=3*(batch_size)*utils.get_nano_gpt_forward_flops(gpt_config=gpt_config_,seq_len=l)
        n_steps=int(flops_total/flops_per_step)
        config_["memo"]={"flops_per_step":flops_per_step,}
        config_["training_params"]["n_steps"]=n_steps
        config_["training_params"]["save_steps"]=get_ckpt_steps(n_steps)

        yaml_path=f"./yamls/{exp_name}.yaml"
        fol=os.path.dirname(yaml_path)
        os.makedirs(fol,exist_ok=True)
        # Context manager so the file is flushed and closed deterministically.
        with open(yaml_path, 'w') as yaml_file:
            yaml.dump(config_, yaml_file)

## model scaling

In [11]:
# Width sweep: vary n_embd at fixed depth (2 layers) and 4 heads, writing one
# YAML config per (width, variant, n) under ./yamls/.
ns=[4,8,16,32,64,128,256,512,1024,2048]
k=10
l=512
batch_size=128
seed=42
for width in [32,48,64,96,128,192,256]:
    for variant in ["default_long","default_long_rope_only"]:
        # Variants whose name contains "long" get the larger total FLOPs budget.
        flops_total=1e16 if "long" in variant else 1e15
        n_embd=width
        n_head=4
        n_layer=2
        model_name=f"width={width}"
        group_name=f"final_v1/{variant}_widths"
        for n in ns:
            # Re-seed for every n so ps / rss depend only on (seed, n).
            np.random.seed(seed)
            ks=[k]*n
            ps=np.random.random(n)
            ps/=ps.sum()  # normalise the n weights so they sum to 1
            ps=ps.tolist()
            rss=np.random.randint(0,100000,n).tolist()

            exp_name=f"markov_mixture/{group_name}/{model_name}/n={n}"
            # Deep copy so the shared base config is never mutated.
            config_=copy.deepcopy(config)
            config_["exp_dir"]=f"./data/{exp_name}"
            config_["wandb"]={
                "project":"markov_mixture",
                "group":group_name,
                "name":f"{model_name}_n={n}",
            }

            config_["data_params"]={
                "ks":ks,
                "ps":ps,
                "transition_paramss":[{"type":"dense","random_state":rs} for rs in rss],
                "l":l,
                "batch_size":batch_size,
                "fast_dataset":True,
                "two_token_input":"lte" in variant,
            }
            gpt_config_=config_["model_params"]["gpt_config"]
            gpt_config_["n_embd"]=n_embd
            gpt_config_["n_head"]=n_head
            gpt_config_["n_layer"]=n_layer
            if "rope_only" in variant:
                gpt_config_["rope"]=True
                gpt_config_["pos_embed"]=False

            ## set n_steps
            # 3x the forward-pass FLOPs per sample to account for training.
            flops_per_step=3*(batch_size)*utils.get_nano_gpt_forward_flops(gpt_config=gpt_config_,seq_len=l)
            n_steps=int(flops_total/flops_per_step)
            config_["memo"]={"flops_per_step":flops_per_step,}
            config_["training_params"]["n_steps"]=n_steps
            config_["training_params"]["save_steps"]=get_ckpt_steps(n_steps)

            yaml_path=f"./yamls/{exp_name}.yaml"
            fol=os.path.dirname(yaml_path)
            os.makedirs(fol,exist_ok=True)
            # Context manager so the file is flushed and closed deterministically.
            with open(yaml_path, 'w') as yaml_file:
                yaml.dump(config_, yaml_file)

## depths

In [12]:
# Depth sweep: vary n_layer at fixed width (64) and 4 heads, writing one YAML
# config per (n_layer, variant, n) under ./yamls/.
ns=[4,8,16,32,64,128,256,512,1024,2048]
k=10
l=512
batch_size=128
seed=42
for n_layer in [1,2,4,6,8,12,16]:
    for variant in ["default_long","default_long_rope_only"]:
        # Variants whose name contains "long" get the larger total FLOPs budget.
        flops_total=1e16 if "long" in variant else 1e15
        n_embd=64
        n_head=4
        model_name=f"n_layer={n_layer}"
        group_name=f"final_v1/{variant}_depths"
        for n in ns:
            # Re-seed for every n so ps / rss depend only on (seed, n).
            np.random.seed(seed)
            ks=[k]*n
            ps=np.random.random(n)
            ps/=ps.sum()  # normalise the n weights so they sum to 1
            ps=ps.tolist()
            rss=np.random.randint(0,100000,n).tolist()

            exp_name=f"markov_mixture/{group_name}/{model_name}/n={n}"
            # Deep copy so the shared base config is never mutated.
            config_=copy.deepcopy(config)
            config_["exp_dir"]=f"./data/{exp_name}"
            config_["wandb"]={
                "project":"markov_mixture",
                "group":group_name,
                "name":f"{model_name}_n={n}",
            }

            config_["data_params"]={
                "ks":ks,
                "ps":ps,
                "transition_paramss":[{"type":"dense","random_state":rs} for rs in rss],
                "l":l,
                "batch_size":batch_size,
                "fast_dataset":True,
                "two_token_input":"lte" in variant,
            }
            gpt_config_=config_["model_params"]["gpt_config"]
            gpt_config_["n_embd"]=n_embd
            gpt_config_["n_head"]=n_head
            gpt_config_["n_layer"]=n_layer
            if "rope_only" in variant:
                gpt_config_["rope"]=True
                gpt_config_["pos_embed"]=False

            ## set n_steps
            # 3x the forward-pass FLOPs per sample to account for training.
            flops_per_step=3*(batch_size)*utils.get_nano_gpt_forward_flops(gpt_config=gpt_config_,seq_len=l)
            n_steps=int(flops_total/flops_per_step)
            config_["memo"]={"flops_per_step":flops_per_step,}
            config_["training_params"]["n_steps"]=n_steps
            config_["training_params"]["save_steps"]=get_ckpt_steps(n_steps)

            yaml_path=f"./yamls/{exp_name}.yaml"
            fol=os.path.dirname(yaml_path)
            os.makedirs(fol,exist_ok=True)
            # Context manager so the file is flushed and closed deterministically.
            with open(yaml_path, 'w') as yaml_file:
                yaml.dump(config_, yaml_file)

## rmlps

In [13]:
# rmlp sweep: vary the "rmlp" entry of gpt_config at fixed width (64), depth (2)
# and 4 heads, writing one YAML config per (rmlp, variant, n) under ./yamls/.
ns=[4,8,16,32,64,128,256,512,1024,2048]
k=10
l=512
batch_size=128
seed=42
for rmlp in [0.25,1,16]:#4 is default
    for variant in ["default_long","default_long_rope_only"]:
        # Variants whose name contains "long" get the larger total FLOPs budget.
        flops_total=1e16 if "long" in variant else 1e15
        n_embd=64
        n_head=4
        n_layer=2
        model_name=f"rmlp={rmlp}"
        group_name=f"final_v1/{variant}_rmlps"
        for n in ns:
            # Re-seed for every n so ps / rss depend only on (seed, n).
            np.random.seed(seed)
            ks=[k]*n
            ps=np.random.random(n)
            ps/=ps.sum()  # normalise the n weights so they sum to 1
            ps=ps.tolist()
            rss=np.random.randint(0,100000,n).tolist()

            exp_name=f"markov_mixture/{group_name}/{model_name}/n={n}"
            # Deep copy so the shared base config is never mutated.
            config_=copy.deepcopy(config)
            config_["exp_dir"]=f"./data/{exp_name}"
            config_["wandb"]={
                "project":"markov_mixture",
                "group":group_name,
                "name":f"{model_name}_n={n}",
            }

            config_["data_params"]={
                "ks":ks,
                "ps":ps,
                "transition_paramss":[{"type":"dense","random_state":rs} for rs in rss],
                "l":l,
                "batch_size":batch_size,
                "fast_dataset":True,
                "two_token_input":"lte" in variant,
            }
            gpt_config_=config_["model_params"]["gpt_config"]
            gpt_config_["n_embd"]=n_embd
            gpt_config_["n_head"]=n_head
            gpt_config_["n_layer"]=n_layer
            if "rope_only" in variant:
                gpt_config_["rope"]=True
                gpt_config_["pos_embed"]=False

            # Set before the FLOPs estimate, which receives the full gpt_config.
            gpt_config_["rmlp"]=rmlp

            ## set n_steps
            # 3x the forward-pass FLOPs per sample to account for training.
            flops_per_step=3*(batch_size)*utils.get_nano_gpt_forward_flops(gpt_config=gpt_config_,seq_len=l)
            n_steps=int(flops_total/flops_per_step)
            config_["memo"]={"flops_per_step":flops_per_step,}
            config_["training_params"]["n_steps"]=n_steps
            config_["training_params"]["save_steps"]=get_ckpt_steps(n_steps)

            yaml_path=f"./yamls/{exp_name}.yaml"
            fol=os.path.dirname(yaml_path)
            os.makedirs(fol,exist_ok=True)
            # Context manager so the file is flushed and closed deterministically.
            with open(yaml_path, 'w') as yaml_file:
                yaml.dump(config_, yaml_file)

## random permute

In [14]:
# Permutation sweep: same model as the default runs, with the "online_permute"
# data option switched on, writing one YAML config per (mode, variant, n).
ns=[4,8,16,32,64,128,256,512,1024,2048]
k=10
l=512
batch_size=128
seed=42
for mode in ["online"]:
    for variant in ["default_long","default_long_rope_only"]:
        # Variants whose name contains "long" get the larger total FLOPs budget.
        flops_total=1e16 if "long" in variant else 1e15
        n_embd=64
        n_head=4
        n_layer=2
        model_name=f"mode={mode}"
        group_name=f"final_v1/{variant}_permutes"
        for n in ns:
            # Re-seed for every n so ps / rss depend only on (seed, n).
            np.random.seed(seed)
            ks=[k]*n
            ps=np.random.random(n)
            ps/=ps.sum()  # normalise the n weights so they sum to 1
            ps=ps.tolist()
            rss=np.random.randint(0,100000,n).tolist()

            exp_name=f"markov_mixture/{group_name}/{model_name}/n={n}"
            # Deep copy so the shared base config is never mutated.
            config_=copy.deepcopy(config)
            config_["exp_dir"]=f"./data/{exp_name}"
            config_["wandb"]={
                "project":"markov_mixture",
                "group":group_name,
                "name":f"{model_name}_n={n}",
            }

            config_["data_params"]={
                "ks":ks,
                "ps":ps,
                "transition_paramss":[{"type":"dense","random_state":rs} for rs in rss],
                "l":l,
                "batch_size":batch_size,
                "fast_dataset":True,
                "two_token_input":"lte" in variant,
                "online_permute":mode=="online",
            }
            gpt_config_=config_["model_params"]["gpt_config"]
            gpt_config_["n_embd"]=n_embd
            gpt_config_["n_head"]=n_head
            gpt_config_["n_layer"]=n_layer
            if "rope_only" in variant:
                gpt_config_["rope"]=True
                gpt_config_["pos_embed"]=False

            ## set n_steps
            # 3x the forward-pass FLOPs per sample to account for training.
            flops_per_step=3*(batch_size)*utils.get_nano_gpt_forward_flops(gpt_config=gpt_config_,seq_len=l)
            n_steps=int(flops_total/flops_per_step)
            config_["memo"]={"flops_per_step":flops_per_step,}
            config_["training_params"]["n_steps"]=n_steps
            config_["training_params"]["save_steps"]=get_ckpt_steps(n_steps)

            yaml_path=f"./yamls/{exp_name}.yaml"
            fol=os.path.dirname(yaml_path)
            os.makedirs(fol,exist_ok=True)
            # Context manager so the file is flushed and closed deterministically.
            with open(yaml_path, 'w') as yaml_file:
                yaml.dump(config_, yaml_file)

## heads

In [15]:
# Head-count sweep: vary n_head at fixed width (64) and depth (2), writing one
# YAML config per (nhead, variant, n) under ./yamls/.
ns=[4,8,16,32,64,128,256,512,1024,2048]
k=10
l=512
batch_size=128
seed=42
for nhead in [1,8,16]:
    for variant in ["default_long","default_long_rope_only"]:
        # Variants whose name contains "long" get the larger total FLOPs budget.
        flops_total=1e16 if "long" in variant else 1e15
        n_embd=64
        n_head=nhead
        n_layer=2
        model_name=f"nhead={nhead}"
        group_name=f"final_v1/{variant}_nheads"
        for n in ns:
            # Re-seed for every n so ps / rss depend only on (seed, n).
            np.random.seed(seed)
            ks=[k]*n
            ps=np.random.random(n)
            ps/=ps.sum()  # normalise the n weights so they sum to 1
            ps=ps.tolist()
            rss=np.random.randint(0,100000,n).tolist()

            exp_name=f"markov_mixture/{group_name}/{model_name}/n={n}"
            # Deep copy so the shared base config is never mutated.
            config_=copy.deepcopy(config)
            config_["exp_dir"]=f"./data/{exp_name}"
            config_["wandb"]={
                "project":"markov_mixture",
                "group":group_name,
                "name":f"{model_name}_n={n}",
            }

            config_["data_params"]={
                "ks":ks,
                "ps":ps,
                "transition_paramss":[{"type":"dense","random_state":rs} for rs in rss],
                "l":l,
                "batch_size":batch_size,
                "fast_dataset":True,
                "two_token_input":"lte" in variant,
            }
            gpt_config_=config_["model_params"]["gpt_config"]
            gpt_config_["n_embd"]=n_embd
            gpt_config_["n_head"]=n_head
            gpt_config_["n_layer"]=n_layer
            if "rope_only" in variant:
                gpt_config_["rope"]=True
                gpt_config_["pos_embed"]=False

            ## set n_steps
            # 3x the forward-pass FLOPs per sample to account for training.
            flops_per_step=3*(batch_size)*utils.get_nano_gpt_forward_flops(gpt_config=gpt_config_,seq_len=l)
            n_steps=int(flops_total/flops_per_step)
            config_["memo"]={"flops_per_step":flops_per_step,}
            config_["training_params"]["n_steps"]=n_steps
            config_["training_params"]["save_steps"]=get_ckpt_steps(n_steps)

            yaml_path=f"./yamls/{exp_name}.yaml"
            fol=os.path.dirname(yaml_path)
            os.makedirs(fol,exist_ok=True)
            # Context manager so the file is flushed and closed deterministically.
            with open(yaml_path, 'w') as yaml_file:
                yaml.dump(config_, yaml_file)

In [16]:
# ks: sweep over k, with the total FLOPs budget scaled in proportion to k

In [17]:
def get_flops_total(flops_k10,k):
    """Scale a FLOPs budget defined at k=10 linearly to another ``k``.

    Args:
        flops_k10: Total FLOPs budget used when k equals 10.
        k: The k value to scale the budget to.

    Returns:
        ``flops_k10`` multiplied by ``k/10``.
    """
    ratio_to_k10=k/10
    return flops_k10*ratio_to_k10

In [18]:
# k sweep: vary k with the total FLOPs budget scaled linearly in k (relative to
# the k=10 budget), writing one YAML config per (variant, k, n) under ./yamls/.
ns=[4,8,16,32,64,128,256,512,1024,2048]
l=512
batch_size=128
seed=42
for variant in ["default_long","default_long_rope_only"]:
    # Budget at k=10; variants whose name contains "long" get the larger one.
    flops_total_k10=1e16 if "long" in variant else 1e15
    for k in [2,4,8,16,32,64]:
        flops_total=get_flops_total(flops_k10=flops_total_k10,k=k)
        n_embd=64
        n_head=4
        n_layer=2
        model_name=f"k={k}"
        group_name=f"final_v1/{variant}_ks"
        for n in ns:
            # Re-seed for every n so ps / rss depend only on (seed, n).
            np.random.seed(seed)
            ks=[k]*n
            ps=np.random.random(n)
            ps/=ps.sum()  # normalise the n weights so they sum to 1
            ps=ps.tolist()
            rss=np.random.randint(0,100000,n).tolist()

            exp_name=f"markov_mixture/{group_name}/{model_name}/n={n}"
            # Deep copy so the shared base config is never mutated.
            config_=copy.deepcopy(config)
            config_["exp_dir"]=f"./data/{exp_name}"
            config_["wandb"]={
                "project":"markov_mixture",
                "group":group_name,
                "name":f"{model_name}_n={n}",
            }

            config_["seed"]=0
            config_["task"]="markov_mixture"
            config_["data_params"]={
                "ks":ks,
                "ps":ps,
                "transition_paramss":[{"type":"dense","random_state":rs} for rs in rss],
                "l":l,
                "batch_size":batch_size,
                "fast_dataset":True,
                "two_token_input":"lte" in variant,
            }
            gpt_config=config_["model_params"]["gpt_config"]
            gpt_config["in_size"]=n_tokens
            gpt_config["n_embd"]=n_embd
            gpt_config["n_head"]=n_head
            gpt_config["n_layer"]=n_layer
            if "rope_only" in variant:
                gpt_config["rope"]=True
                gpt_config["pos_embed"]=False

            ## set n_steps
            # 3x the forward-pass FLOPs per sample to account for training.
            flops_per_step=3*(batch_size)*utils.get_nano_gpt_forward_flops(gpt_config=gpt_config,seq_len=l)
            n_steps=int(flops_total/flops_per_step)
            config_["memo"]={
                "flops_per_step":flops_per_step,
            }
            config_["training_params"]["n_steps"]=n_steps
            config_["training_params"]["save_steps"]=get_ckpt_steps(n_steps)

            yaml_path=f"./yamls/{exp_name}.yaml"
            fol=os.path.dirname(yaml_path)
            os.makedirs(fol,exist_ok=True)
            # Context manager so the file is flushed and closed deterministically.
            with open(yaml_path, 'w') as yaml_file:
                yaml.dump(config_, yaml_file)

In [19]:
# context lengths: sweep over the sequence length l

In [20]:
# Context-length sweep: vary the sequence length l for the RoPE-only variant,
# writing one YAML config per (l, variant, n) under ./yamls/.
ns=[4,8,16,32,64,128,256,512,1024,2048]
k=10
batch_size=128
seed=42
for l in [64,128,256,512,1024,2048]:
    for variant in ["default_long_rope_only"]:
        # Variants whose name contains "long" get the larger total FLOPs budget.
        flops_total=1e16 if "long" in variant else 1e15
        n_embd=64
        n_head=4
        n_layer=2
        model_name=f"l={l}"
        group_name=f"final_v1/{variant}_ls"
        for n in ns:
            # Re-seed for every n so ps / rss depend only on (seed, n).
            np.random.seed(seed)
            ks=[k]*n
            ps=np.random.random(n)
            ps/=ps.sum()  # normalise the n weights so they sum to 1
            ps=ps.tolist()
            rss=np.random.randint(0,100000,n).tolist()

            exp_name=f"markov_mixture/{group_name}/{model_name}/n={n}"
            # Deep copy so the shared base config is never mutated.
            config_=copy.deepcopy(config)
            config_["exp_dir"]=f"./data/{exp_name}"
            config_["wandb"]={
                "project":"markov_mixture",
                "group":group_name,
                "name":f"{model_name}_n={n}",
            }

            config_["data_params"]={
                "ks":ks,
                "ps":ps,
                "transition_paramss":[{"type":"dense","random_state":rs} for rs in rss],
                "l":l,
                "batch_size":batch_size,
                "fast_dataset":True,
                "two_token_input":"lte" in variant,
            }
            gpt_config_=config_["model_params"]["gpt_config"]
            gpt_config_["n_embd"]=n_embd
            gpt_config_["n_head"]=n_head
            gpt_config_["n_layer"]=n_layer
            # block_size never drops below 1024 and grows with l beyond that.
            gpt_config_["block_size"]=max(l,1024)
            if "rope_only" in variant:
                gpt_config_["rope"]=True
                gpt_config_["pos_embed"]=False

            ## set n_steps
            # 3 for training
            flops_per_step=3*(batch_size)*utils.get_nano_gpt_forward_flops(gpt_config=gpt_config_,seq_len=l)
            n_steps=int(flops_total/flops_per_step)
            config_["memo"]={"flops_per_step":flops_per_step,}
            config_["training_params"]["n_steps"]=n_steps
            config_["training_params"]["save_steps"]=get_ckpt_steps(n_steps)

            yaml_path=f"./yamls/{exp_name}.yaml"
            fol=os.path.dirname(yaml_path)
            os.makedirs(fol,exist_ok=True)
            # Context manager so the file is flushed and closed deterministically.
            with open(yaml_path, 'w') as yaml_file:
                yaml.dump(config_, yaml_file)