In [1]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformer_lens import HookedTransformer
from utils import *
from trainer import Trainer
from huggingface_hub import login
import pickle

login(
    token='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',
    add_to_git_credential=False
)

checkpoint = torch.load("datasets/none_hooked.pt", map_location="cuda")
base_model = HookedTransformer(checkpoint["config"])
base_model.load_state_dict(checkpoint["state_dict"])

checkpoint = torch.load("datasets/all_hooked.pt", map_location="cuda")
tuned_model = HookedTransformer(checkpoint["config"])
tuned_model.load_state_dict(checkpoint["state_dict"])

with open('datasets/uniform_tokens', 'rb') as f:
    tokens = pickle.load(f)
tokens = torch.stack([torch.tensor(t) for t in tokens])

default_cfg = {
    "seed": 49,
    "batch_size": 4096,
    "buffer_mult": 128,
    "lr": 5e-5,
    "num_tokens": 60_000_000,
    "l1_coeff": 2,
    "beta1": 0.9,
    "beta2": 0.999,
    "d_in": base_model.cfg.d_model,
    "dict_size": 2**14,
    "seq_len": 1024,
    "enc_dtype": "fp32",
    "model_name": "gemma-2-2b",
    "site": "resid_pre",
    "device": "cuda:0",
    "model_batch_size": 4,
    "log_every": 200,
    "save_every": 3000,
    "dec_init_norm": 0.08,
    "hook_point": "blocks.22.hook_resid_pre",
    "wandb_project": "a",
    "wandb_entity": "santiago-aranguri-reg-new-york-university",
}

cfg = arg_parse_update_cfg(default_cfg)

trainer = Trainer(cfg, base_model, tuned_model, tokens)
trainer.train()
#trainer.save()

In IPython - skipped argparse


Estimating norm scaling factor: 100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
Estimating norm scaling factor: 100%|██████████| 100/100 [00:45<00:00,  2.19it/s]


Refreshing the buffer!


100%|██████████| 128/128 [00:37<00:00,  3.42it/s]
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msantiago-aranguri-reg[0m ([33msantiago-aranguri-reg-new-york-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 3/14648 [00:00<22:57, 10.63it/s]

{'loss': 4367.791015625, 'l2_loss': 4367.791015625, 'l1_loss': 117.06669616699219, 'l0_loss': 8187.79052734375, 'l1_coeff': 0.0, 'lr': 5e-05, 'explained_variance': -0.2979230284690857, 'explained_variance_A': -0.29202109575271606, 'explained_variance_B': -0.3061803877353668}


  0%|          | 61/14648 [00:04<16:51, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.45it/s]
  1%|          | 125/14648 [00:14<16:47, 14.41it/s] 

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.46it/s]
  1%|▏         | 187/14648 [00:25<16:43, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.47it/s]
  1%|▏         | 203/14648 [00:32<34:59,  6.88it/s]  

{'loss': 1586.200927734375, 'l2_loss': 990.142578125, 'l1_loss': 1091.3826904296875, 'l0_loss': 8258.8701171875, 'l1_coeff': 0.5461496450027307, 'lr': 5e-05, 'explained_variance': 0.7111722826957703, 'explained_variance_A': 0.7095806002616882, 'explained_variance_B': 0.7132670879364014}


  2%|▏         | 251/14648 [00:35<16:41, 14.37it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
  2%|▏         | 313/14648 [00:46<16:40, 14.33it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.10it/s]
  3%|▎         | 377/14648 [00:57<16:35, 14.34it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.79it/s]
  3%|▎         | 403/14648 [01:05<19:44, 12.03it/s]  

{'loss': 1679.0648193359375, 'l2_loss': 946.0999755859375, 'l1_loss': 671.029296875, 'l0_loss': 3295.8720703125, 'l1_coeff': 1.0922992900054613, 'lr': 5e-05, 'explained_variance': 0.7247815728187561, 'explained_variance_A': 0.7220245599746704, 'explained_variance_B': 0.7264257073402405}


  3%|▎         | 439/14648 [01:08<16:26, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.41it/s]
  3%|▎         | 503/14648 [01:18<16:21, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.56it/s]
  4%|▍         | 565/14648 [01:29<16:17, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.57it/s]
  4%|▍         | 603/14648 [01:37<16:36, 14.09it/s]  

{'loss': 1762.0941162109375, 'l2_loss': 1057.070556640625, 'l1_loss': 430.29937744140625, 'l0_loss': 1519.279052734375, 'l1_coeff': 1.638448935008192, 'lr': 5e-05, 'explained_variance': 0.6885404586791992, 'explained_variance_A': 0.6864312887191772, 'explained_variance_B': 0.6908007860183716}


  4%|▍         | 629/14648 [01:39<16:13, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.52it/s]
  5%|▍         | 691/14648 [01:50<16:10, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.50it/s]
  5%|▌         | 755/14648 [02:00<16:04, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.46it/s]
  5%|▌         | 803/14648 [02:10<16:08, 14.30it/s]  

{'loss': 1762.5294189453125, 'l2_loss': 1117.6397705078125, 'l1_loss': 322.44482421875, 'l0_loss': 846.15966796875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.673366904258728, 'explained_variance_A': 0.6713030338287354, 'explained_variance_B': 0.6756473779678345}


  6%|▌         | 817/14648 [02:11<16:03, 14.35it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
  6%|▌         | 881/14648 [02:21<15:58, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.19it/s]
  6%|▋         | 943/14648 [02:32<15:51, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.43it/s]
  7%|▋         | 1003/14648 [02:42<15:49, 14.37it/s] 

{'loss': 1658.1251220703125, 'l2_loss': 1067.889404296875, 'l1_loss': 295.11785888671875, 'l0_loss': 632.64794921875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.6864092946052551, 'explained_variance_A': 0.683882474899292, 'explained_variance_B': 0.6891486048698425}


  7%|▋         | 1007/14648 [02:43<15:48, 14.38it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.47it/s]
  7%|▋         | 1069/14648 [02:53<15:44, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.50it/s]
  8%|▊         | 1133/14648 [03:04<15:39, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.28it/s]
  8%|▊         | 1195/14648 [03:14<15:35, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.48it/s]
  8%|▊         | 1203/14648 [03:21<1:26:13,  2.60it/s]

{'loss': 1582.5369873046875, 'l2_loss': 1021.1358642578125, 'l1_loss': 280.7005615234375, 'l0_loss': 515.45751953125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.6995224356651306, 'explained_variance_A': 0.6968647241592407, 'explained_variance_B': 0.7023978233337402}


  9%|▊         | 1259/14648 [03:25<15:31, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.38it/s]
  9%|▉         | 1321/14648 [03:35<15:26, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.44it/s]
  9%|▉         | 1385/14648 [03:46<15:21, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 10%|▉         | 1403/14648 [03:53<27:11,  8.12it/s]  

{'loss': 1535.9150390625, 'l2_loss': 994.3480834960938, 'l1_loss': 270.783447265625, 'l0_loss': 435.444580078125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7085280418395996, 'explained_variance_A': 0.7058030962944031, 'explained_variance_B': 0.7114382386207581}


 10%|▉         | 1447/14648 [03:56<15:19, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.19it/s]
 10%|█         | 1511/14648 [04:07<15:13, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.45it/s]
 11%|█         | 1573/14648 [04:17<15:11, 14.35it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 11%|█         | 1603/14648 [04:26<16:28, 13.19it/s]  

{'loss': 1488.3525390625, 'l2_loss': 962.6171875, 'l1_loss': 262.86767578125, 'l0_loss': 383.39404296875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7157243490219116, 'explained_variance_A': 0.7132321000099182, 'explained_variance_B': 0.7183006405830383}


 11%|█         | 1637/14648 [04:28<15:06, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 12%|█▏        | 1699/14648 [04:39<15:02, 14.35it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.28it/s]
 12%|█▏        | 1763/14648 [04:49<14:57, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.97it/s]
 12%|█▏        | 1803/14648 [04:59<15:11, 14.09it/s]  

{'loss': 1453.083251953125, 'l2_loss': 936.9307861328125, 'l1_loss': 258.07623291015625, 'l0_loss': 341.75830078125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.725476861000061, 'explained_variance_A': 0.7223192453384399, 'explained_variance_B': 0.7286897301673889}


 12%|█▏        | 1825/14648 [05:00<14:55, 14.32it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.12it/s]
 13%|█▎        | 1889/14648 [05:11<14:49, 14.35it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 13%|█▎        | 1951/14648 [05:21<14:40, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.26it/s]
 14%|█▎        | 2003/14648 [05:31<14:42, 14.33it/s]  

{'loss': 1435.138427734375, 'l2_loss': 927.2041015625, 'l1_loss': 253.96719360351562, 'l0_loss': 312.02197265625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7273223400115967, 'explained_variance_A': 0.7243239879608154, 'explained_variance_B': 0.7302642464637756}


 14%|█▍        | 2015/14648 [05:32<14:40, 14.35it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.13it/s]
 14%|█▍        | 2077/14648 [05:43<14:37, 14.33it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.15it/s]
 15%|█▍        | 2141/14648 [05:54<14:29, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.19it/s]
 15%|█▌        | 2203/14648 [06:04<14:29, 14.32it/s]  

{'loss': 1406.6168212890625, 'l2_loss': 908.8285522460938, 'l1_loss': 248.89414978027344, 'l0_loss': 284.93408203125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7327145338058472, 'explained_variance_A': 0.7299231290817261, 'explained_variance_B': 0.7356675863265991}
Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.75it/s]
 15%|█▌        | 2267/14648 [06:15<14:23, 14.33it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 16%|█▌        | 2329/14648 [06:26<14:18, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.97it/s]
 16%|█▋        | 2393/14648 [06:37<14:13, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.16it/s]
 16%|█▋        | 2403/14648 [06:44<1:00:41,  3.36it/s]

{'loss': 1383.4423828125, 'l2_loss': 891.3148803710938, 'l1_loss': 246.063720703125, 'l0_loss': 264.947021484375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7381731271743774, 'explained_variance_A': 0.7353579998016357, 'explained_variance_B': 0.7411300539970398}


 17%|█▋        | 2455/14648 [06:47<14:10, 14.33it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.21it/s]
 17%|█▋        | 2519/14648 [06:58<14:04, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.26it/s]
 18%|█▊        | 2581/14648 [07:09<14:01, 14.34it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.11it/s]
 18%|█▊        | 2603/14648 [07:17<19:24, 10.34it/s]  

{'loss': 1368.1531982421875, 'l2_loss': 882.22412109375, 'l1_loss': 242.9645538330078, 'l0_loss': 249.755615234375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7402889728546143, 'explained_variance_A': 0.7374692559242249, 'explained_variance_B': 0.7431868314743042}


 18%|█▊        | 2645/14648 [07:20<13:55, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.19it/s]
 18%|█▊        | 2707/14648 [07:30<13:51, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.06it/s]
 19%|█▉        | 2771/14648 [07:41<13:48, 14.34it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.84it/s]
 19%|█▉        | 2803/14648 [07:50<14:41, 13.44it/s]  

{'loss': 1361.2110595703125, 'l2_loss': 877.2327880859375, 'l1_loss': 241.9891357421875, 'l0_loss': 235.15771484375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7436168193817139, 'explained_variance_A': 0.7407275438308716, 'explained_variance_B': 0.7464490532875061}


 19%|█▉        | 2833/14648 [07:52<13:44, 14.32it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.02it/s]
 20%|█▉        | 2897/14648 [08:03<13:37, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.01it/s]
 20%|██        | 2959/14648 [08:13<13:35, 14.33it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.12it/s]
 20%|██        | 3001/14648 [08:24<38:18,  5.07it/s]  

Saved as version 0 in checkpoints/version_14
{'loss': 1342.041259765625, 'l2_loss': 864.3353271484375, 'l1_loss': 238.85296630859375, 'l0_loss': 222.012451171875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7471309304237366, 'explained_variance_A': 0.7442789077758789, 'explained_variance_B': 0.7499542236328125}


 21%|██        | 3023/14648 [08:25<14:01, 13.82it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.99it/s]
 21%|██        | 3085/14648 [08:36<13:25, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.13it/s]
 21%|██▏       | 3149/14648 [08:47<13:22, 14.34it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.02it/s]
 22%|██▏       | 3203/14648 [08:57<13:18, 14.33it/s]  

{'loss': 1324.69580078125, 'l2_loss': 851.894287109375, 'l1_loss': 236.40072631835938, 'l0_loss': 212.222900390625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7489075660705566, 'explained_variance_A': 0.7458019256591797, 'explained_variance_B': 0.7520802617073059}


 22%|██▏       | 3211/14648 [08:57<13:17, 14.34it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.98it/s]
 22%|██▏       | 3275/14648 [09:08<13:13, 14.34it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.11it/s]
 23%|██▎       | 3337/14648 [09:19<13:07, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.09it/s]
 23%|██▎       | 3401/14648 [09:30<13:04, 14.34it/s]  

{'loss': 1319.8719482421875, 'l2_loss': 852.8035888671875, 'l1_loss': 233.53419494628906, 'l0_loss': 202.151123046875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7490020990371704, 'explained_variance_A': 0.7460277676582336, 'explained_variance_B': 0.7520563006401062}
Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.17it/s]
 24%|██▎       | 3463/14648 [09:40<12:58, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.27it/s]
 24%|██▍       | 3527/14648 [09:51<12:54, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.22it/s]
 25%|██▍       | 3589/14648 [10:02<12:50, 14.35it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.20it/s]
 25%|██▍       | 3603/14648 [10:09<33:14,  5.54it/s]  

{'loss': 1307.304931640625, 'l2_loss': 839.9310302734375, 'l1_loss': 233.68692016601562, 'l0_loss': 195.313232421875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7541117668151855, 'explained_variance_A': 0.7511242032051086, 'explained_variance_B': 0.7570462226867676}


 25%|██▍       | 3653/14648 [10:12<12:44, 14.37it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.16it/s]
 25%|██▌       | 3715/14648 [10:23<12:40, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.03it/s]
 26%|██▌       | 3779/14648 [10:34<12:36, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.14it/s]
 26%|██▌       | 3803/14648 [10:42<15:58, 11.32it/s]  

{'loss': 1291.332763671875, 'l2_loss': 828.7554931640625, 'l1_loss': 231.28860473632812, 'l0_loss': 187.69873046875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7563475370407104, 'explained_variance_A': 0.753368616104126, 'explained_variance_B': 0.7594094276428223}


 26%|██▌       | 3841/14648 [10:45<12:32, 14.37it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.21it/s]
 27%|██▋       | 3905/14648 [10:55<12:27, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.22it/s]
 27%|██▋       | 3967/14648 [11:06<12:22, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.25it/s]
 27%|██▋       | 4003/14648 [11:15<12:43, 13.94it/s]  

{'loss': 1289.0377197265625, 'l2_loss': 830.8641967773438, 'l1_loss': 229.08677673339844, 'l0_loss': 180.711181640625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7560280561447144, 'explained_variance_A': 0.7532860040664673, 'explained_variance_B': 0.7587857246398926}


 28%|██▊       | 4031/14648 [11:17<12:18, 14.37it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.00it/s]
 28%|██▊       | 4093/14648 [11:27<12:12, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.35it/s]
 28%|██▊       | 4157/14648 [11:38<12:08, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 29%|██▊       | 4203/14648 [11:47<12:09, 14.33it/s]  

{'loss': 1272.9781494140625, 'l2_loss': 819.2994384765625, 'l1_loss': 226.83934020996094, 'l0_loss': 174.141845703125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7589225769042969, 'explained_variance_A': 0.7560113668441772, 'explained_variance_B': 0.761928379535675}


 29%|██▉       | 4219/14648 [11:48<12:05, 14.38it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.32it/s]
 29%|██▉       | 4283/14648 [11:59<11:59, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 30%|██▉       | 4345/14648 [12:10<11:56, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 30%|███       | 4403/14648 [12:20<11:52, 14.39it/s]  

{'loss': 1274.53759765625, 'l2_loss': 821.3565673828125, 'l1_loss': 226.59048461914062, 'l0_loss': 171.249755859375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7570720911026001, 'explained_variance_A': 0.7542845010757446, 'explained_variance_B': 0.7599532604217529}


 30%|███       | 4409/14648 [12:20<11:51, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 31%|███       | 4471/14648 [12:31<11:47, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 31%|███       | 4535/14648 [12:41<11:41, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 31%|███▏      | 4597/14648 [12:52<11:39, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 31%|███▏      | 4603/14648 [12:59<1:27:42,  1.91it/s]

{'loss': 1269.8934326171875, 'l2_loss': 818.099365234375, 'l1_loss': 225.89703369140625, 'l0_loss': 165.931396484375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7599549293518066, 'explained_variance_A': 0.7568554878234863, 'explained_variance_B': 0.7631725668907166}


 32%|███▏      | 4661/14648 [13:03<11:33, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 32%|███▏      | 4723/14648 [13:13<11:29, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.35it/s]
 33%|███▎      | 4787/14648 [13:24<11:25, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.37it/s]
 33%|███▎      | 4803/14648 [13:31<23:56,  6.85it/s]  

{'loss': 1266.0362548828125, 'l2_loss': 815.4766845703125, 'l1_loss': 225.27978515625, 'l0_loss': 163.513427734375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7590293288230896, 'explained_variance_A': 0.7561632394790649, 'explained_variance_B': 0.7618860006332397}


 33%|███▎      | 4849/14648 [13:34<11:20, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.40it/s]
 34%|███▎      | 4913/14648 [13:45<11:15, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.37it/s]
 34%|███▍      | 4975/14648 [13:55<11:11, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 34%|███▍      | 5003/14648 [14:04<12:37, 12.73it/s]  

{'loss': 1254.7969970703125, 'l2_loss': 809.0067138671875, 'l1_loss': 222.89512634277344, 'l0_loss': 157.307861328125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7609937787055969, 'explained_variance_A': 0.7583412528038025, 'explained_variance_B': 0.7636047601699829}


 34%|███▍      | 5039/14648 [14:06<11:06, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 35%|███▍      | 5101/14648 [14:17<11:02, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 35%|███▌      | 5165/14648 [14:27<10:58, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.41it/s]
 36%|███▌      | 5203/14648 [14:36<11:09, 14.10it/s]  

{'loss': 1254.8543701171875, 'l2_loss': 809.0692749023438, 'l1_loss': 222.89256286621094, 'l0_loss': 154.397216796875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7627261281013489, 'explained_variance_A': 0.7599284648895264, 'explained_variance_B': 0.7656360268592834}


 36%|███▌      | 5227/14648 [14:38<10:54, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.44it/s]
 36%|███▌      | 5291/14648 [14:48<10:49, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.45it/s]
 37%|███▋      | 5353/14648 [14:59<10:44, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.25it/s]
 37%|███▋      | 5403/14648 [15:08<10:43, 14.37it/s]  

{'loss': 1238.5341796875, 'l2_loss': 797.0131225585938, 'l1_loss': 220.76055908203125, 'l0_loss': 149.6494140625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7655702829360962, 'explained_variance_A': 0.762563943862915, 'explained_variance_B': 0.7684772610664368}


 37%|███▋      | 5417/14648 [15:09<10:40, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 37%|███▋      | 5479/14648 [15:20<10:37, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 38%|███▊      | 5543/14648 [15:31<10:31, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.37it/s]
 38%|███▊      | 5603/14648 [15:41<10:28, 14.38it/s]  

{'loss': 1247.1533203125, 'l2_loss': 805.279296875, 'l1_loss': 220.93702697753906, 'l0_loss': 147.513916015625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7633547186851501, 'explained_variance_A': 0.7599564790725708, 'explained_variance_B': 0.766814112663269}


 38%|███▊      | 5605/14648 [15:41<10:28, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 39%|███▊      | 5669/14648 [15:52<10:26, 14.33it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.15it/s]
 39%|███▉      | 5731/14648 [16:02<10:21, 14.34it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.23it/s]
 40%|███▉      | 5795/14648 [16:13<10:14, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.26it/s]
 40%|███▉      | 5803/14648 [16:20<57:42,  2.55it/s]  

{'loss': 1228.38330078125, 'l2_loss': 791.6129150390625, 'l1_loss': 218.38519287109375, 'l0_loss': 143.760498046875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7663115859031677, 'explained_variance_A': 0.763296365737915, 'explained_variance_B': 0.7693337798118591}


 40%|███▉      | 5857/14648 [16:24<10:14, 14.31it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.11it/s]
 40%|████      | 5921/14648 [16:35<10:06, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.28it/s]
 41%|████      | 5983/14648 [16:45<10:01, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.09it/s]
 41%|████      | 6001/14648 [16:54<36:10,  3.98it/s]  

Saved as version 1 in checkpoints/version_14
{'loss': 1232.050537109375, 'l2_loss': 792.7603759765625, 'l1_loss': 219.64508056640625, 'l0_loss': 143.242919921875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7665172219276428, 'explained_variance_A': 0.7635282278060913, 'explained_variance_B': 0.7695135474205017}


 41%|████▏     | 6047/14648 [16:57<09:59, 14.34it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.16it/s]
 42%|████▏     | 6109/14648 [17:07<09:55, 14.35it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.20it/s]
 42%|████▏     | 6173/14648 [17:18<09:50, 14.35it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.24it/s]
 42%|████▏     | 6203/14648 [17:27<10:48, 13.03it/s]  

{'loss': 1230.7490234375, 'l2_loss': 792.9290771484375, 'l1_loss': 218.9099578857422, 'l0_loss': 139.52392578125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7675915360450745, 'explained_variance_A': 0.7645105719566345, 'explained_variance_B': 0.7707250118255615}


 43%|████▎     | 6235/14648 [17:29<09:45, 14.38it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.74it/s]
 43%|████▎     | 6299/14648 [17:40<09:40, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.67it/s]
 43%|████▎     | 6361/14648 [17:51<09:37, 14.34it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.18it/s]
 44%|████▎     | 6403/14648 [18:01<09:40, 14.20it/s]  

{'loss': 1224.0592041015625, 'l2_loss': 790.5673828125, 'l1_loss': 216.7459259033203, 'l0_loss': 137.00927734375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7667484283447266, 'explained_variance_A': 0.7638864517211914, 'explained_variance_B': 0.7696642875671387}


 44%|████▍     | 6425/14648 [18:02<09:33, 14.35it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.26it/s]
 44%|████▍     | 6487/14648 [18:13<09:28, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.06it/s]
 45%|████▍     | 6551/14648 [18:24<09:23, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.05it/s]
 45%|████▌     | 6603/14648 [18:34<09:22, 14.31it/s]  

{'loss': 1220.4061279296875, 'l2_loss': 788.4611206054688, 'l1_loss': 215.97251892089844, 'l0_loss': 134.046142578125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7674593925476074, 'explained_variance_A': 0.764519214630127, 'explained_variance_B': 0.7704052925109863}


 45%|████▌     | 6613/14648 [18:34<09:19, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.14it/s]
 46%|████▌     | 6677/14648 [18:45<09:14, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.91it/s]
 46%|████▌     | 6739/14648 [18:56<09:09, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.28it/s]
 46%|████▋     | 6803/14648 [19:07<09:05, 14.39it/s]  

{'loss': 1226.450439453125, 'l2_loss': 794.70263671875, 'l1_loss': 215.87388610839844, 'l0_loss': 131.96484375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7667765617370605, 'explained_variance_A': 0.7639414072036743, 'explained_variance_B': 0.7695239782333374}
Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.04it/s]
 47%|████▋     | 6865/14648 [19:17<09:00, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.19it/s]
 47%|████▋     | 6929/14648 [19:28<08:56, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.12it/s]
 48%|████▊     | 6991/14648 [19:39<08:53, 14.35it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 48%|████▊     | 7003/14648 [19:46<28:52,  4.41it/s]  

{'loss': 1223.1661376953125, 'l2_loss': 787.9380493164062, 'l1_loss': 217.61404418945312, 'l0_loss': 132.09521484375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7691245079040527, 'explained_variance_A': 0.7659647464752197, 'explained_variance_B': 0.7723253965377808}


 48%|████▊     | 7055/14648 [19:49<08:47, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.35it/s]
 49%|████▊     | 7117/14648 [20:00<08:42, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.42it/s]
 49%|████▉     | 7181/14648 [20:10<08:38, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 49%|████▉     | 7203/14648 [20:18<11:53, 10.44it/s]  

{'loss': 1213.8621826171875, 'l2_loss': 782.620849609375, 'l1_loss': 215.62066650390625, 'l0_loss': 129.650634765625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7690184116363525, 'explained_variance_A': 0.7660481333732605, 'explained_variance_B': 0.7719410061836243}


 49%|████▉     | 7243/14648 [20:21<08:33, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.27it/s]
 50%|████▉     | 7307/14648 [20:32<08:30, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 50%|█████     | 7369/14648 [20:42<08:25, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.36it/s]
 51%|█████     | 7403/14648 [20:51<08:46, 13.77it/s]  

{'loss': 1213.02587890625, 'l2_loss': 782.8696899414062, 'l1_loss': 215.07806396484375, 'l0_loss': 127.4521484375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7705440521240234, 'explained_variance_A': 0.7678072452545166, 'explained_variance_B': 0.7732632160186768}


 51%|█████     | 7433/14648 [20:53<08:22, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.17it/s]
 51%|█████     | 7495/14648 [21:04<08:19, 14.33it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.32it/s]
 52%|█████▏    | 7559/14648 [21:14<08:11, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.28it/s]
 52%|█████▏    | 7603/14648 [21:23<08:12, 14.30it/s]  

{'loss': 1212.327880859375, 'l2_loss': 784.5350341796875, 'l1_loss': 213.89642333984375, 'l0_loss': 125.490478515625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7689913511276245, 'explained_variance_A': 0.7661737203598022, 'explained_variance_B': 0.7716843485832214}


 52%|█████▏    | 7621/14648 [21:25<08:07, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.26it/s]
 52%|█████▏    | 7685/14648 [21:35<08:05, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.16it/s]
 53%|█████▎    | 7747/14648 [21:46<07:59, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.20it/s]
 53%|█████▎    | 7803/14648 [21:56<07:57, 14.33it/s]  

{'loss': 1213.4501953125, 'l2_loss': 785.5167846679688, 'l1_loss': 213.9667205810547, 'l0_loss': 124.461181640625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7690039277076721, 'explained_variance_A': 0.7660636901855469, 'explained_variance_B': 0.7719242572784424}


 53%|█████▎    | 7811/14648 [21:57<07:57, 14.32it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 54%|█████▎    | 7873/14648 [22:07<07:50, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.21it/s]
 54%|█████▍    | 7937/14648 [22:18<07:45, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.37it/s]
 55%|█████▍    | 7999/14648 [22:29<07:43, 14.35it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.05it/s]
 55%|█████▍    | 8003/14648 [22:35<1:21:59,  1.35it/s]

{'loss': 1208.10205078125, 'l2_loss': 781.6787719726562, 'l1_loss': 213.21163940429688, 'l0_loss': 122.229736328125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7705109119415283, 'explained_variance_A': 0.7676750421524048, 'explained_variance_B': 0.7732341289520264}


 55%|█████▌    | 8063/14648 [22:39<07:37, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 55%|█████▌    | 8125/14648 [22:50<07:34, 14.34it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.18it/s]
 56%|█████▌    | 8189/14648 [23:01<07:28, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.27it/s]
 56%|█████▌    | 8203/14648 [23:08<19:19,  5.56it/s]  

{'loss': 1203.307861328125, 'l2_loss': 776.0548095703125, 'l1_loss': 213.62655639648438, 'l0_loss': 121.75390625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7720658779144287, 'explained_variance_A': 0.769402265548706, 'explained_variance_B': 0.774732768535614}


 56%|█████▋    | 8251/14648 [23:11<07:24, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 57%|█████▋    | 8315/14648 [23:22<07:20, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.18it/s]
 57%|█████▋    | 8377/14648 [23:33<07:15, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 57%|█████▋    | 8403/14648 [23:41<08:34, 12.15it/s]  

{'loss': 1213.0264892578125, 'l2_loss': 785.1517333984375, 'l1_loss': 213.9373779296875, 'l0_loss': 121.098876953125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7702188491821289, 'explained_variance_A': 0.7673633098602295, 'explained_variance_B': 0.7731214761734009}


 58%|█████▊    | 8441/14648 [23:43<07:11, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.38it/s]
 58%|█████▊    | 8503/14648 [23:54<07:06, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 58%|█████▊    | 8567/14648 [24:04<07:02, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.11it/s]
 59%|█████▊    | 8603/14648 [24:13<07:13, 13.95it/s]  

{'loss': 1206.0323486328125, 'l2_loss': 781.3464965820312, 'l1_loss': 212.3429412841797, 'l0_loss': 119.050048828125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7706763744354248, 'explained_variance_A': 0.7675762176513672, 'explained_variance_B': 0.7736868858337402}


 59%|█████▉    | 8629/14648 [24:15<06:58, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 59%|█████▉    | 8693/14648 [24:26<06:54, 14.38it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.17it/s]
 60%|█████▉    | 8755/14648 [24:36<06:49, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 60%|██████    | 8803/14648 [24:46<06:49, 14.29it/s]  

{'loss': 1199.41064453125, 'l2_loss': 778.938720703125, 'l1_loss': 210.23599243164062, 'l0_loss': 117.243408203125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7699235677719116, 'explained_variance_A': 0.7668095827102661, 'explained_variance_B': 0.7730391025543213}


 60%|██████    | 8819/14648 [24:47<06:44, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.35it/s]
 61%|██████    | 8881/14648 [24:58<06:40, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 61%|██████    | 8945/14648 [25:08<06:36, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.35it/s]
 61%|██████▏   | 9001/14648 [25:19<18:20,  5.13it/s]  

Saved as version 2 in checkpoints/version_14
{'loss': 1195.6634521484375, 'l2_loss': 777.5621948242188, 'l1_loss': 209.05062866210938, 'l0_loss': 115.1630859375, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7699626684188843, 'explained_variance_A': 0.7667761445045471, 'explained_variance_B': 0.7729599475860596}


 61%|██████▏   | 9007/14648 [25:20<10:35,  8.88it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 62%|██████▏   | 9071/14648 [25:30<06:27, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 62%|██████▏   | 9133/14648 [25:41<06:23, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 63%|██████▎   | 9197/14648 [25:52<06:19, 14.36it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 63%|██████▎   | 9203/14648 [25:58<47:51,  1.90it/s]  

{'loss': 1190.8585205078125, 'l2_loss': 771.6563720703125, 'l1_loss': 209.60108947753906, 'l0_loss': 114.326171875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7721801996231079, 'explained_variance_A': 0.7692241668701172, 'explained_variance_B': 0.775118887424469}


 63%|██████▎   | 9259/14648 [26:02<06:15, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.23it/s]
 64%|██████▎   | 9323/14648 [26:13<06:09, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 64%|██████▍   | 9385/14648 [26:23<06:05, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.32it/s]
 64%|██████▍   | 9403/14648 [26:31<10:46,  8.11it/s]  

{'loss': 1194.338623046875, 'l2_loss': 774.1512451171875, 'l1_loss': 210.09368896484375, 'l0_loss': 113.815185546875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.773054838180542, 'explained_variance_A': 0.7704920172691345, 'explained_variance_B': 0.775696337223053}


 65%|██████▍   | 9449/14648 [26:34<06:00, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 65%|██████▍   | 9511/14648 [26:44<05:56, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 65%|██████▌   | 9575/14648 [26:55<05:51, 14.42it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 66%|██████▌   | 9603/14648 [27:03<06:35, 12.75it/s]  

{'loss': 1185.524658203125, 'l2_loss': 767.7637939453125, 'l1_loss': 208.8804473876953, 'l0_loss': 112.52978515625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.772899329662323, 'explained_variance_A': 0.7702279090881348, 'explained_variance_B': 0.7755495309829712}


 66%|██████▌   | 9637/14648 [27:06<05:48, 14.38it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 66%|██████▌   | 9701/14648 [27:16<05:43, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.28it/s]
 67%|██████▋   | 9763/14648 [27:27<05:39, 14.39it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.26it/s]
 67%|██████▋   | 9803/14648 [27:36<05:41, 14.20it/s]  

{'loss': 1186.8035888671875, 'l2_loss': 766.756103515625, 'l1_loss': 210.0237579345703, 'l0_loss': 112.75732421875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.773876428604126, 'explained_variance_A': 0.7708449363708496, 'explained_variance_B': 0.7767958045005798}


 67%|██████▋   | 9827/14648 [27:38<05:34, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 68%|██████▊   | 9889/14648 [27:48<05:30, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 68%|██████▊   | 9953/14648 [27:59<05:25, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 68%|██████▊   | 10003/14648 [28:09<05:23, 14.37it/s] 

{'loss': 1188.7735595703125, 'l2_loss': 769.2962646484375, 'l1_loss': 209.7386474609375, 'l0_loss': 112.213134765625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7736728191375732, 'explained_variance_A': 0.7704188823699951, 'explained_variance_B': 0.7767630219459534}


 68%|██████▊   | 10015/14648 [28:09<05:21, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.32it/s]
 69%|██████▉   | 10079/14648 [28:20<05:17, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.25it/s]
 69%|██████▉   | 10141/14648 [28:31<05:12, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.25it/s]
 70%|██████▉   | 10203/14648 [28:41<05:08, 14.40it/s]  

{'loss': 1191.578857421875, 'l2_loss': 773.6963500976562, 'l1_loss': 208.94122314453125, 'l0_loss': 110.871826171875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7727370262145996, 'explained_variance_A': 0.769904613494873, 'explained_variance_B': 0.7755680084228516}


 70%|██████▉   | 10205/14648 [28:41<05:08, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 70%|███████   | 10267/14648 [28:52<05:04, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 71%|███████   | 10331/14648 [29:03<04:59, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 71%|███████   | 10393/14648 [29:13<04:55, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 71%|███████   | 10403/14648 [29:20<20:44,  3.41it/s]  

{'loss': 1177.6083984375, 'l2_loss': 762.4500732421875, 'l1_loss': 207.57916259765625, 'l0_loss': 108.775634765625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7751813530921936, 'explained_variance_A': 0.7722856402397156, 'explained_variance_B': 0.7781023979187012}


 71%|███████▏  | 10457/14648 [29:24<04:51, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 72%|███████▏  | 10519/14648 [29:34<04:46, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.22it/s]
 72%|███████▏  | 10583/14648 [29:45<04:42, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.25it/s]
 72%|███████▏  | 10603/14648 [29:53<07:14,  9.30it/s]  

{'loss': 1191.92333984375, 'l2_loss': 774.1607055664062, 'l1_loss': 208.88131713867188, 'l0_loss': 109.2236328125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7731782793998718, 'explained_variance_A': 0.7704463005065918, 'explained_variance_B': 0.7758881449699402}


 73%|███████▎  | 10645/14648 [29:56<04:37, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 73%|███████▎  | 10709/14648 [30:06<04:33, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.35it/s]
 74%|███████▎  | 10771/14648 [30:17<04:29, 14.40it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 74%|███████▍  | 10803/14648 [30:25<04:43, 13.55it/s]  

{'loss': 1192.378173828125, 'l2_loss': 774.6395263671875, 'l1_loss': 208.86935424804688, 'l0_loss': 108.135986328125, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7724909782409668, 'explained_variance_A': 0.7696245908737183, 'explained_variance_B': 0.7754092812538147}


 74%|███████▍  | 10835/14648 [30:27<04:24, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 74%|███████▍  | 10897/14648 [30:38<04:21, 14.37it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 75%|███████▍  | 10961/14648 [30:49<04:15, 14.42it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 75%|███████▌  | 11003/14648 [30:58<04:15, 14.26it/s]  

{'loss': 1182.59033203125, 'l2_loss': 766.8575439453125, 'l1_loss': 207.86642456054688, 'l0_loss': 107.771728515625, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7730039358139038, 'explained_variance_A': 0.7699358463287354, 'explained_variance_B': 0.7760923504829407}


 75%|███████▌  | 11023/14648 [30:59<04:11, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.37it/s]
 76%|███████▌  | 11087/14648 [31:10<04:07, 14.41it/s]  

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 76%|███████▌  | 11149/14648 [31:20<04:03, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 76%|███████▋  | 11203/14648 [31:30<03:59, 14.39it/s]

{'loss': 1190.813232421875, 'l2_loss': 774.583984375, 'l1_loss': 208.11465454101562, 'l0_loss': 107.571044921875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7726938128471375, 'explained_variance_A': 0.7701160907745361, 'explained_variance_B': 0.7752329111099243}


 77%|███████▋  | 11213/14648 [31:31<03:58, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 77%|███████▋  | 11275/14648 [31:41<03:53, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 77%|███████▋  | 11339/14648 [31:52<03:49, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.23it/s]
 78%|███████▊  | 11401/14648 [32:03<03:45, 14.41it/s]

{'loss': 1182.018310546875, 'l2_loss': 768.2464599609375, 'l1_loss': 206.88592529296875, 'l0_loss': 106.326904296875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7732788920402527, 'explained_variance_A': 0.7702999114990234, 'explained_variance_B': 0.7762143015861511}
Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 78%|███████▊  | 11465/14648 [32:13<03:40, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.41it/s]
 79%|███████▊  | 11527/14648 [32:24<03:36, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 79%|███████▉  | 11591/14648 [32:35<03:32, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.14it/s]
 79%|███████▉  | 11603/14648 [32:42<11:38,  4.36it/s]

{'loss': 1176.369384765625, 'l2_loss': 762.0472412109375, 'l1_loss': 207.16104125976562, 'l0_loss': 105.36669921875, 'l1_coeff': 2, 'lr': 5e-05, 'explained_variance': 0.7758142948150635, 'explained_variance_A': 0.7733038067817688, 'explained_variance_B': 0.7783418893814087}


 80%|███████▉  | 11653/14648 [32:45<03:28, 14.35it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.99it/s]
 80%|███████▉  | 11717/14648 [32:56<03:24, 14.34it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.28it/s]
 80%|████████  | 11779/14648 [33:07<03:19, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 81%|████████  | 11803/14648 [33:15<04:10, 11.36it/s]

{'loss': 1177.4559326171875, 'l2_loss': 763.5989990234375, 'l1_loss': 206.928466796875, 'l0_loss': 104.324951171875, 'l1_coeff': 2, 'lr': 4.859025122883673e-05, 'explained_variance': 0.7754935026168823, 'explained_variance_A': 0.7722240686416626, 'explained_variance_B': 0.7786104083061218}


 81%|████████  | 11843/14648 [33:17<03:14, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.32it/s]
 81%|████████▏ | 11905/14648 [33:28<03:10, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.20it/s]
 82%|████████▏ | 11969/14648 [33:39<03:05, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.36it/s]
 82%|████████▏ | 12001/14648 [33:48<08:47,  5.02it/s]

Saved as version 3 in checkpoints/version_14
{'loss': 1175.384521484375, 'l2_loss': 761.989990234375, 'l1_loss': 206.697265625, 'l0_loss': 104.90283203125, 'l1_coeff': 2, 'lr': 4.5176815947569665e-05, 'explained_variance': 0.7740140557289124, 'explained_variance_A': 0.7708516120910645, 'explained_variance_B': 0.777055561542511}


 82%|████████▏ | 12031/14648 [33:50<03:03, 14.27it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 83%|████████▎ | 12095/14648 [34:01<02:57, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.38it/s]
 83%|████████▎ | 12157/14648 [34:11<02:53, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.25it/s]
 83%|████████▎ | 12203/14648 [34:21<02:50, 14.32it/s]

{'loss': 1178.5498046875, 'l2_loss': 767.3719482421875, 'l1_loss': 205.5889434814453, 'l0_loss': 103.681396484375, 'l1_coeff': 2, 'lr': 4.176338066630259e-05, 'explained_variance': 0.7738198637962341, 'explained_variance_A': 0.770758867263794, 'explained_variance_B': 0.7768080830574036}


 83%|████████▎ | 12221/14648 [34:22<02:48, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.15it/s]
 84%|████████▍ | 12283/14648 [34:33<02:44, 14.37it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.19it/s]
 84%|████████▍ | 12347/14648 [34:43<02:39, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.27it/s]
 85%|████████▍ | 12403/14648 [34:53<02:36, 14.39it/s]

{'loss': 1185.6724853515625, 'l2_loss': 770.79296875, 'l1_loss': 207.4397735595703, 'l0_loss': 103.916259765625, 'l1_coeff': 2, 'lr': 3.834994538503553e-05, 'explained_variance': 0.77411949634552, 'explained_variance_A': 0.7713510394096375, 'explained_variance_B': 0.7768954038619995}


 85%|████████▍ | 12409/14648 [34:54<02:35, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 85%|████████▌ | 12473/14648 [35:05<02:31, 14.34it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.14it/s]
 86%|████████▌ | 12535/14648 [35:15<02:27, 14.32it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.17it/s]
 86%|████████▌ | 12599/14648 [35:26<02:22, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.25it/s]
 86%|████████▌ | 12603/14648 [35:33<24:48,  1.37it/s]

{'loss': 1161.349365234375, 'l2_loss': 749.7286376953125, 'l1_loss': 205.8103485107422, 'l0_loss': 102.656982421875, 'l1_coeff': 2, 'lr': 3.493651010376846e-05, 'explained_variance': 0.7783855199813843, 'explained_variance_A': 0.7759454250335693, 'explained_variance_B': 0.7807656526565552}


 86%|████████▋ | 12661/14648 [35:37<02:21, 14.08it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 87%|████████▋ | 12725/14648 [35:47<02:13, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.31it/s]
 87%|████████▋ | 12787/14648 [35:58<02:09, 14.32it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 87%|████████▋ | 12803/14648 [36:05<04:30,  6.83it/s]

{'loss': 1176.3768310546875, 'l2_loss': 763.2481689453125, 'l1_loss': 206.5643310546875, 'l0_loss': 102.719482421875, 'l1_coeff': 2, 'lr': 3.1523074822501395e-05, 'explained_variance': 0.7750289440155029, 'explained_variance_A': 0.772417426109314, 'explained_variance_B': 0.7776013016700745}


 88%|████████▊ | 12851/14648 [36:09<02:04, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 88%|████████▊ | 12913/14648 [36:19<02:00, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.39it/s]
 89%|████████▊ | 12977/14648 [36:30<01:56, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 89%|████████▉ | 13003/14648 [36:38<02:15, 12.14it/s]

{'loss': 1171.1256103515625, 'l2_loss': 760.5335693359375, 'l1_loss': 205.2960205078125, 'l0_loss': 102.308349609375, 'l1_coeff': 2, 'lr': 2.8109639541234327e-05, 'explained_variance': 0.7754004001617432, 'explained_variance_A': 0.7725438475608826, 'explained_variance_B': 0.7782148718833923}


 89%|████████▉ | 13039/14648 [36:40<01:51, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 89%|████████▉ | 13103/14648 [36:51<01:47, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.30it/s]
 90%|████████▉ | 13165/14648 [37:01<01:43, 14.37it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.28it/s]
 90%|█████████ | 13203/14648 [37:10<01:42, 14.10it/s]

{'loss': 1175.332763671875, 'l2_loss': 762.06787109375, 'l1_loss': 206.6324462890625, 'l0_loss': 102.810302734375, 'l1_coeff': 2, 'lr': 2.469620425996726e-05, 'explained_variance': 0.7750117778778076, 'explained_variance_A': 0.7721002697944641, 'explained_variance_B': 0.777833878993988}


 90%|█████████ | 13229/14648 [37:12<01:38, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.52it/s]
 91%|█████████ | 13291/14648 [37:22<01:34, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.49it/s]
 91%|█████████ | 13355/14648 [37:33<01:30, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.14it/s]
 92%|█████████▏| 13403/14648 [37:43<01:27, 14.30it/s]

{'loss': 1173.20654296875, 'l2_loss': 761.582275390625, 'l1_loss': 205.81211853027344, 'l0_loss': 101.738525390625, 'l1_coeff': 2, 'lr': 2.1282768978700195e-05, 'explained_variance': 0.7751990556716919, 'explained_variance_A': 0.7722412347793579, 'explained_variance_B': 0.778176486492157}


 92%|█████████▏| 13417/14648 [37:44<01:25, 14.35it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.17it/s]
 92%|█████████▏| 13481/14648 [37:55<01:21, 14.33it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.79it/s]
 92%|█████████▏| 13543/14648 [38:05<01:16, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.23it/s]
 93%|█████████▎| 13603/14648 [38:16<01:12, 14.38it/s]

{'loss': 1170.9666748046875, 'l2_loss': 758.8694458007812, 'l1_loss': 206.04861450195312, 'l0_loss': 102.07421875, 'l1_coeff': 2, 'lr': 1.7869333697433128e-05, 'explained_variance': 0.776222825050354, 'explained_variance_A': 0.7732902765274048, 'explained_variance_B': 0.779160737991333}


 93%|█████████▎| 13607/14648 [38:16<01:12, 14.37it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.23it/s]
 93%|█████████▎| 13669/14648 [38:27<01:07, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.22it/s]
 94%|█████████▍| 13733/14648 [38:37<01:03, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.19it/s]
 94%|█████████▍| 13795/14648 [38:48<00:59, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:07<00:00,  8.75it/s]
 94%|█████████▍| 13803/14648 [38:56<06:17,  2.24it/s]

{'loss': 1167.1214599609375, 'l2_loss': 757.61181640625, 'l1_loss': 204.7548370361328, 'l0_loss': 100.9404296875, 'l1_coeff': 2, 'lr': 1.4455898416166058e-05, 'explained_variance': 0.7761411666870117, 'explained_variance_A': 0.7733922600746155, 'explained_variance_B': 0.7788332104682922}


 95%|█████████▍| 13859/14648 [39:00<00:54, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.38it/s]
 95%|█████████▌| 13921/14648 [39:10<00:50, 14.42it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.19it/s]
 95%|█████████▌| 13985/14648 [39:21<00:46, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00,  9.87it/s]
 96%|█████████▌| 14003/14648 [39:29<01:21,  7.96it/s]

{'loss': 1172.81787109375, 'l2_loss': 760.666015625, 'l1_loss': 206.07595825195312, 'l0_loss': 101.412353515625, 'l1_coeff': 2, 'lr': 1.1042463134898994e-05, 'explained_variance': 0.7760593295097351, 'explained_variance_A': 0.7728973031044006, 'explained_variance_B': 0.7791706919670105}


 96%|█████████▌| 14047/14648 [39:32<00:41, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 96%|█████████▋| 14111/14648 [39:43<00:37, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.33it/s]
 97%|█████████▋| 14173/14648 [39:53<00:33, 14.37it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.37it/s]
 97%|█████████▋| 14203/14648 [40:01<00:33, 13.21it/s]

{'loss': 1174.0526123046875, 'l2_loss': 760.8176879882812, 'l1_loss': 206.61746215820312, 'l0_loss': 101.65234375, 'l1_coeff': 2, 'lr': 7.629027853631926e-06, 'explained_variance': 0.7766748070716858, 'explained_variance_A': 0.7739760875701904, 'explained_variance_B': 0.7793198227882385}


 97%|█████████▋| 14237/14648 [40:04<00:28, 14.39it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.27it/s]
 98%|█████████▊| 14299/14648 [40:14<00:24, 14.40it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.34it/s]
 98%|█████████▊| 14363/14648 [40:25<00:19, 14.43it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.26it/s]
 98%|█████████▊| 14403/14648 [40:34<00:17, 14.15it/s]

{'loss': 1161.623291015625, 'l2_loss': 754.0944213867188, 'l1_loss': 203.76443481445312, 'l0_loss': 100.127197265625, 'l1_coeff': 2, 'lr': 4.215592572364857e-06, 'explained_variance': 0.7764337658882141, 'explained_variance_A': 0.773331880569458, 'explained_variance_B': 0.779484748840332}


 98%|█████████▊| 14425/14648 [40:35<00:15, 14.36it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.29it/s]
 99%|█████████▉| 14489/14648 [40:46<00:11, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.24it/s]
 99%|█████████▉| 14551/14648 [40:57<00:06, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.14it/s]
100%|█████████▉| 14603/14648 [41:07<00:03, 14.38it/s]

{'loss': 1165.5321044921875, 'l2_loss': 754.591796875, 'l1_loss': 205.47015380859375, 'l0_loss': 101.281494140625, 'l1_coeff': 2, 'lr': 8.021572910977937e-07, 'explained_variance': 0.7767516374588013, 'explained_variance_A': 0.7740790247917175, 'explained_variance_B': 0.7794264554977417}


100%|█████████▉| 14615/14648 [41:08<00:02, 14.41it/s]

Refreshing the buffer!


100%|██████████| 64/64 [00:06<00:00, 10.23it/s]
100%|██████████| 14648/14648 [41:16<00:00,  5.91it/s]


Saved as version 4 in checkpoints/version_14


: 

In [7]:
trainer.save()

Saved as version 5 in checkpoints/version_13


In [None]:
input_text = "The color of the grass is"

with torch.no_grad():
    output_tokens = base_model.generate()
    print(output_tokens)

tensor([[     2,    185,   2299,    577,    947,    573,   1618,    576,    476,
           8792,    575,    476,   1411,    575,  21237, 235336,    192,    109,
         235285,    791,    476]])


In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
tokenizer.decode(output_tokens[0])

'<bos><h1>How to get the value of a variable in a function in Python?</h1>\n\nI have a'