In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time, json
import sys
sys.path.append("../")
import os
import torch
import numpy as np
from tqdm import tqdm

import logging
from src.utils import logging_utils
from src.utils import env_utils, experiment_utils
from src import functional
import wandb

logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.DEBUG,
    format=logging_utils.DEFAULT_FORMAT,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    stream=sys.stdout,
)

logger.info(f"{torch.__version__=}, {torch.version.cuda=}")

  from .autonotebook import tqdm as notebook_tqdm


2024-10-28 20:29:48 __main__ INFO     torch.__version__='2.5.0+cu124', torch.version.cuda='12.4'


In [3]:
from src.models import ModelandTokenizer

# MODEL_KEY = "meta-llama/Llama-3.2-3B-Instruct"
# MODEL_KEY = "meta-llama/Llama-3.1-8B-Instruct"

MODEL_KEY = "meta-llama/Llama-3.2-3B"
# MODEL_KEY = "google/gemma-2-2b"
# MODEL_KEY = "meta-llama/Llama-3.1-8B"

#! torch.adaptive precision
mt = ModelandTokenizer(
    model_key=MODEL_KEY,
    torch_dtype=torch.float32,
)

2024-10-28 20:29:49 accelerate.utils.modeling INFO     We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.65s/it]

2024-10-28 20:29:52 src.models INFO     loaded model </home/local_arnab/Codes/00_MODEL/meta-llama/Llama-3.2-3B> | size: 12255.675 MB | dtype: torch.float32 | device: cuda:0





### Utils

In [4]:
from src.tokens import prepare_input, find_token_range
from src.functional import interpret_logits, get_module_nnsight
from src.activation_manager import ActivationLoader, ActivationSample

def prepare_batch_input(batch: list[ActivationSample], mt: ModelandTokenizer):
    batch_prompts = [b.query for b in batch]
    batch_tokenized = prepare_input(
        prompts=batch_prompts,
        tokenizer=mt,
        return_offsets_mapping=True
    )

    int_tok_idx = []
    for idx in range(len(batch)):
        try:
            offset_mapping = batch_tokenized["offset_mapping"][idx]
            act_range = find_token_range(
                string=batch[idx].query,
                substring="#",
                occurrence=0,
                tokenizer=mt,
                offset_mapping=offset_mapping
            )
            int_tok_idx.append(act_range[1] - 1)
        except:
            logger.error(f"can't find '#' in \"{batch[idx].query}\" ==> bad training data")
            first_attn_token = batch_tokenized["attention_mask"].index(1) + 1
            int_tok_idx.append(first_attn_token) 

    batch_tokenized.pop("offset_mapping")

    return batch_tokenized, int_tok_idx

In [5]:
from src.functional import free_gpu_cache

@torch.inference_mode()
def evaluate_batch(batch: list[ActivationSample], mt: ModelandTokenizer):
    batch_tokenized, int_tok_idx = prepare_batch_input(batch, mt)
    # logger.debug(f"{batch_tokenized.input_ids.shape=}")
    activations = [b.activation for b in batch]

    with mt.trace(batch_tokenized):
        # patch activation at every layer
        module_names = mt.layer_names
        for idx, act, int_tok in zip(range(len(batch)), activations, int_tok_idx):
            for module_name in module_names:
                module = get_module_nnsight(mt, module_name)
                module.output[0][idx, int_tok, :] = torch.tensor(act, device=mt.device)
        last_logits = [
            mt.output.logits[idx, -1, :].save()
            for idx in range(len(batch))
        ]
        # output = mt.output.save() # do not save output, save some memory

    last_logits = torch.stack(last_logits)

    predicted_labels = [
        interpret_logits(
            tokenizer=mt,
            logits=last_logits[idx],
            # logits = output.logits[idx, -1, :],
            k = 2,
        )[0]
        for idx in range(len(batch))
    ]

    correct_labels = [b.label for b in batch]
    correct_count = 0

    for pred, correct in zip(predicted_labels, correct_labels):
        # print(f"{str(pred)=} | {correct=}")
        if pred.token.strip().lower() == correct.strip().lower():
            correct_count += 1

    free_gpu_cache()
    return correct_count

### Check Performance

In [6]:
# Loading PatchScope

from transformers import AutoModelForCausalLM
from nnsight import LanguageModel

finetuned_path = os.path.join(
    env_utils.DEFAULT_RESULTS_DIR, MODEL_KEY.split("/")[-1], "patchscope_finetuned"
)
finetuned_path = os.path.join(finetuned_path, os.listdir(finetuned_path)[-1])
tuned_model = AutoModelForCausalLM.from_pretrained(
    finetuned_path, torch_dtype=torch.float32
).to("cuda")

tuned_lm = LanguageModel(tuned_model)
patchscope = ModelandTokenizer(
    base_lm = tuned_lm,
    tokenizer = mt.tokenizer
)

FileNotFoundError: [Errno 2] No such file or directory: '/home/local_arnab/Codes/Projects/talkative_probes/results/Llama-3.2-3B/patchscope_finetuned'

In [22]:
from src.functional import get_concept_latents
from src.tokens import prepare_input

test = [
    ("The sky is blue.",  " yes"),
    ("The sky is green.", " no"),
    ("The sun rises in the north.", " no"),
    ("Michael Jordan used to play Cricket.", " no"),
    ("The capital of France is Paris.", " yes"),
    ("The capital of France is Berlin.", " no"),
]

activations = get_concept_latents(
    mt=mt, 
    queries=test, 
    interested_layers=[mt.layer_name_format.format(l) for l in range(10, 15)],
    check_answer=False,
)

lcc = LatentCacheCollection(latents=activations)

latent_arr = populate_latent_arr(lcc)

  0%|          | 0/6 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 6/6 [00:03<00:00,  1.89it/s]

2024-10-28 13:11:14 src.functional DEBUG    Collected 6 latents, out of 6





In [24]:
buffer = LatentSampleBuffer(latent_arr, batch_size=32)
len(buffer)

30

In [25]:
batch = next(iter(buffer))
len(batch)

30

In [26]:
evaluate_batch(batch, mt)  # Evaluate the batch

2024-10-28 13:11:34 __main__ DEBUG    batch_tokenized.input_ids.shape=torch.Size([30, 21])


  output = self.target(*args, **kwargs)


str(pred)='" No" (p=0.214)' | correct=' no'
str(pred)='" Yes" (p=0.163)' | correct=' yes'
str(pred)='" Yes" (p=0.310)' | correct=' no'
str(pred)='" yes" (p=0.171)' | correct=' yes'
str(pred)='" no" (p=0.226)' | correct=' yes'
str(pred)='" No" (p=0.220)' | correct=' no'
str(pred)='" Yes" (p=0.247)' | correct=' no'
str(pred)='" No" (p=0.182)' | correct=' yes'
str(pred)='" yes" (p=0.219)' | correct=' no'
str(pred)='" Yes" (p=0.152)' | correct=' yes'
str(pred)='" no" (p=0.212)' | correct=' yes'
str(pred)='" Yes" (p=0.176)' | correct=' yes'
str(pred)='" Yes" (p=0.236)' | correct=' no'
str(pred)='" yes" (p=0.251)' | correct=' no'
str(pred)='" Yes" (p=0.150)' | correct=' yes'
str(pred)='" Yes" (p=0.232)' | correct=' yes'
str(pred)='" yes" (p=0.240)' | correct=' no'
str(pred)='" no" (p=0.218)' | correct=' yes'
str(pred)='" No" (p=0.188)' | correct=' no'
str(pred)='" Yes" (p=0.172)' | correct=' yes'
str(pred)='" Yes" (p=0.268)' | correct=' no'
str(pred)='" No" (p=0.162)' | correct=' yes'
str(pr

0.4

In [27]:
patchscope.name = f"Patchscope_{MODEL_KEY.split('/')[-1]}"
patchscope.name

'Patchscope_Llama-3.2-3B'

In [28]:
evaluate_batch(batch, patchscope)  # Evaluate the batch again

2024-10-28 13:11:39 __main__ DEBUG    batch_tokenized.input_ids.shape=torch.Size([30, 21])
str(pred)='" yes" (p=0.958)' | correct=' no'
str(pred)='" yes" (p=0.974)' | correct=' yes'
str(pred)='" no" (p=0.749)' | correct=' no'
str(pred)='" yes" (p=0.649)' | correct=' yes'
str(pred)='" no" (p=0.970)' | correct=' yes'
str(pred)='" no" (p=1.000)' | correct=' no'
str(pred)='" no" (p=1.000)' | correct=' no'
str(pred)='" yes" (p=1.000)' | correct=' yes'
str(pred)='" no" (p=1.000)' | correct=' no'
str(pred)='" yes" (p=1.000)' | correct=' yes'
str(pred)='" yes" (p=1.000)' | correct=' yes'
str(pred)='" yes" (p=1.000)' | correct=' yes'
str(pred)='" no" (p=0.996)' | correct=' no'
str(pred)='" no" (p=0.999)' | correct=' no'
str(pred)='" yes" (p=1.000)' | correct=' yes'
str(pred)='" yes" (p=1.000)' | correct=' yes'
str(pred)='" no" (p=0.991)' | correct=' no'
str(pred)='" yes" (p=1.000)' | correct=' yes'
str(pred)='" no" (p=0.994)' | correct=' no'
str(pred)='" yes" (p=1.000)' | correct=' yes'
str(pre

0.9

In [29]:
prompt = "The Space Needle is located"
inputs = prepare_input(prompt, mt)

with torch.inference_mode():
    output = mt._model.generate(**inputs, max_new_tokens=10, do_sample=False)

print(mt.tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


The Space Needle is located in Seattle, Washington. It is a 605


In [30]:
with torch.inference_mode():
    output = patchscope._model.generate(**inputs, max_new_tokens=10, do_sample=False)

print(patchscope.tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


The Space Needle is located yes yes yes no no no no no no no


In [1]:
# asking whether it discusses something
# whether some concept is present -> does it mention a man or a woman
# is it a math problem
# is "token" is Noun
# is this past tense

lst = ['companies_true_false.json', 'sp_en_trans.json', 'larger_than.json', 'smaller_than.json', 'cities.json']
lst

['neg_sp_en_trans.json',
 'companies_true_false.json',
 'neg_cities.json',
 'sp_en_trans.json',
 'common_claim_true_false.json',
 'larger_than.json',
 'smaller_than.json',
 'cities.json']

### LoRA (check later)

In [4]:
# from peft import LoraConfig, get_peft_model

# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.1,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# model = get_peft_model(mt._model, lora_config)

In [5]:
# type(model), type(model.model)

In [6]:
# for p in model.model.named_parameters():
#     print(p[0])

### Dataset Preparation (outdated)

In [9]:
from src.dataset import GMTDataset

ds = GMTDataset.from_csv(
    [
        # "sp_en_trans.csv", 
        "cities.csv"
    ], 
    # "sp_en_trans"
    "cities"
)

ds.select_few_shot(0)

queries = [ds.examples[i] for i in range(len(ds))]
interested_layers = mt.layer_names
q, a = queries[0]
print(q)
print(a)

2024-10-27 13:20:38 src.dataset INFO     initialized cities with 1493 examples.
The city of Moradabad is in India.
True


In [None]:
from src.functional import get_concept_latents

# don't need to run this if you already have cached the results

latents = get_concept_latents(
    mt=mt, 
    queries=queries, 
    interested_layers=interested_layers,
    check_answer=False,
)

In [10]:
latent_dir = os.path.join(
    env_utils.DEFAULT_RESULTS_DIR, 
    "cached_latents",
    MODEL_KEY.split("/")[-1],
)
os.makedirs(latent_dir, exist_ok=True)

In [8]:
# caching the latents

from src.utils.typing import LatentCacheCollection
from src.utils import env_utils

lcc = LatentCacheCollection(latents=latents)
lcc.detensorize()
with open(os.path.join(latent_dir, f"{ds.name}.json"), "w") as f:
    f.write(lcc.to_json())

In [11]:
# loading the latents

from src.utils.typing import LatentCacheCollection
from src.utils import env_utils

with open(os.path.join(latent_dir, f"{ds.name}.json"), "r") as f:
    dct = json.load(f)
lcc = LatentCacheCollection.from_dict(dct)
lcc.retensorize(device=mt.device)

In [5]:
# q, a = queries[10]
# print(q, a)
# yes_ans = "true" if str(a).lower().strip() == "true" else "false"
# no_ans = "false" if yes_ans == "true" else "true"
# latent_q = get_latent_qa(yes_ans, no_ans)
# print(latent_q)

### Patchscope Finetuning

In [7]:
from src.activation_manager import get_batch_paths
import random

activation_batch_paths = list(get_batch_paths())
random.shuffle(activation_batch_paths)

train_split = int(len(activation_batch_paths) * 0.8)
train_act_batch_paths = activation_batch_paths[:train_split]
test_act_batch_paths = activation_batch_paths[train_split:]

train_act_loader = ActivationLoader(
    latent_cache_files=train_act_batch_paths,
    batch_size=32, shuffle=True
)

validate_act_loader = ActivationLoader(
    latent_cache_files=test_act_batch_paths,
    batch_size=32, shuffle=True
)

len(train_act_loader.latent_cache_files), len(validate_act_loader.latent_cache_files)

(541, 136)

In [8]:
batch = train_act_loader.next_batch()

In [12]:
batch_tokenized, int_tok_idx = prepare_batch_input(batch, mt)

In [10]:
batch_tokenized.input_ids.shape

torch.Size([32, 21])

In [13]:
@torch.inference_mode()
def evaluate(
    mt: ModelandTokenizer, 
    eval_set: list[ActivationSample], 
    batch_size=32
):
    correct_count = 0
    total_count = 0
    for i in tqdm(range(0, len(eval_set), batch_size)):
        batch = eval_set[i:i+batch_size]
        with torch.no_grad():
            correct_count += evaluate_batch(batch, mt)
            total_count += len(batch)
    return correct_count / total_count

In [14]:
def get_validation_set(num=200):
    cur_eval_batch = []
    cur_eval_loader = ActivationLoader(
        latent_cache_files = random.choices(
            validate_act_loader.latent_cache_files,
            k=25
        ),
        batch_size=train_act_loader.batch_size,
    )
    while(True):
        try:
            cur_eval_batch.extend(cur_eval_loader.next_batch())
        except StopIteration:
            break
    random.shuffle(cur_eval_batch)
    cur_eval_batch = cur_eval_batch[:num]

    return cur_eval_batch

cur_eval_batch = get_validation_set(500)
evaluate(mt, cur_eval_batch)

  output = self.target(*args, **kwargs)
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 16/16 [00:07<00:00,  2.04it/s]


0.416

In [80]:
experiment_utils.set_seed(123456)

model = mt._model
model.train()
############################## Hyperparameters ##############################
learning_rate = 5e-5
log_steps = 50
checkpoint_interval = 1000
num_warmup_steps = 1000
limit_training_steps = 10000
batch_size = train_act_loader.batch_size
num_final_layers_to_tune = 10
############################################################################

2024-10-28 18:55:07 src.utils.experiment_utils INFO     setting all seeds to 123456


In [81]:
import shutil
def remove_dir(path):
    if os.path.exists(path):
        shutil.rmtree(path)

# remove_dir(model_save_dir)
remove_dir(".wandb")

In [82]:
wandb_logging = False

if wandb_logging:
    wandb.init(
        entity="dl-homeworks",
        project="talkative_probes",
        name=f"{MODEL_KEY.split('/')[-1]}_patchscope_tune",
        config={
            "model_key": MODEL_KEY.split("/")[-1],
            "learning_rate": learning_rate,
            "wandb_log_interval": log_steps,
            "checkpoint_interval": checkpoint_interval,
            "num_warmup_steps": num_warmup_steps,
            "batch_size": batch_size,
        },
    )

In [83]:
from transformers import get_linear_schedule_with_warmup
import baukit

tunable_params = []
for layer_name in mt.layer_names[-num_final_layers_to_tune:]:
    module = baukit.get_module(model, layer_name)
    for param in module.parameters():
        param.requires_grad = True
        tunable_params.append(param)
    # tunable_params.extend(list(module.parameters()))

optimizer = torch.optim.AdamW(tunable_params, lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=limit_training_steps,
)
loss_func = torch.nn.CrossEntropyLoss()

In [85]:
checkpoint_save_dir = os.path.join(
    env_utils.DEFAULT_RESULTS_DIR,
    "patchscope_test",
    MODEL_KEY.split("/")[-1], 
)
checkpoint_save_dir

'/home/local_arnab/Codes/Projects/talkative_probes/results/patchscope_test/Llama-3.2-3B'

In [86]:
from src.functional import free_gpu_cache

for step in tqdm(range(limit_training_steps), desc="Training"):
    optimizer.zero_grad()

    try:
        batch = train_act_loader.next_batch()
    except StopIteration:
        logger.info(f"End of training data at step {step + 1}")
        break
    
    batch_tokenized, int_tok_idx = prepare_batch_input(batch, mt)

    activations = [b.activation for b in batch]

    with mt.trace(batch_tokenized):
        # replace the latent on all the residual layers
        module_names = mt.layer_names 
        for idx, act, int_tok in zip(range(len(batch)), activations, int_tok_idx):
            for module_name in module_names:
                module = get_module_nnsight(mt, module_name)
                module.output[0][idx, int_tok, :] = torch.tensor(act, device=mt.device).to(mt.dtype)
        
        # output = mt.output.save()
        last_logits = [
            mt.output.logits[idx, -1, :].save()
            for idx in range(len(batch))
        ]

    last_logits = torch.stack(last_logits)
    batch_labels = [
        mt.tokenizer(b.label).input_ids[-1] for b in batch
    ]
    batch_labels = torch.tensor(batch_labels, device=mt.device)

    # Cross-entropy loss
    patchscope_loss = loss_func(last_logits, batch_labels)
    
    # TODO: include natural text and generation loss
    loss = patchscope_loss
    
    loss.backward()
    optimizer.step()
    scheduler.step()

    free_gpu_cache()

    if (step + 1) % log_steps == 0:
        cur_eval_batch = get_validation_set(500)
        eval_accuracy = evaluate(mt, cur_eval_batch)
        log_data = {
            "loss": loss.item(),
            "learning_rate": scheduler.get_last_lr()[0],
            "eval_accuracy": eval_accuracy
        }
        logger.info(f"Step {step + 1}: {log_data}")
        if wandb_logging:
            wandb.log(log_data)

    if ((step + 1) % checkpoint_interval == 0) or (step + 1) == limit_training_steps:
        if len(os.listdir(checkpoint_save_dir)) > 0:
            last_checkpoint_path = os.path.join(checkpoint_save_dir, os.listdir(checkpoint_save_dir)[-1])
            remove_dir(last_checkpoint_path)
        
        new_checkpoint_path = os.path.join(checkpoint_save_dir, f"checkpoint-{step + 1}")
        model.save_pretrained(new_checkpoint_path)


print("Training completed!")

Training:   0%|          | 0/10000 [00:00<?, ?it/s]

2024-10-28 18:59:23 __main__ INFO     torch.Size([32, 21])


  output = self.target(*args, **kwargs)
Training:   0%|          | 1/10000 [00:01<4:07:06,  1.48s/it]

2024-10-28 18:59:25 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 2/10000 [00:02<3:45:37,  1.35s/it]

2024-10-28 18:59:26 __main__ INFO     torch.Size([32, 20])


Training:   0%|          | 3/10000 [00:03<3:27:28,  1.25s/it]

2024-10-28 18:59:27 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 4/10000 [00:05<3:26:40,  1.24s/it]

2024-10-28 18:59:28 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 5/10000 [00:06<3:22:02,  1.21s/it]

2024-10-28 18:59:30 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 6/10000 [00:07<3:23:40,  1.22s/it]

2024-10-28 18:59:31 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 7/10000 [00:08<3:28:01,  1.25s/it]

2024-10-28 18:59:32 __main__ INFO     torch.Size([4, 17])


Training:   0%|          | 8/10000 [00:09<2:50:09,  1.02s/it]

2024-10-28 18:59:33 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 9/10000 [00:10<3:08:45,  1.13s/it]

2024-10-28 18:59:34 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 10/10000 [00:11<3:11:41,  1.15s/it]

2024-10-28 18:59:35 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 11/10000 [00:13<3:18:57,  1.20s/it]

2024-10-28 18:59:36 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 12/10000 [00:14<3:19:15,  1.20s/it]

2024-10-28 18:59:38 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 13/10000 [00:15<3:27:49,  1.25s/it]

2024-10-28 18:59:39 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 14/10000 [00:16<3:24:51,  1.23s/it]

2024-10-28 18:59:40 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 15/10000 [00:18<3:31:20,  1.27s/it]

2024-10-28 18:59:42 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 16/10000 [00:19<3:27:33,  1.25s/it]

2024-10-28 18:59:43 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 17/10000 [00:20<3:32:32,  1.28s/it]

2024-10-28 18:59:44 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 18/10000 [00:22<3:33:35,  1.28s/it]

2024-10-28 18:59:45 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 19/10000 [00:23<3:32:21,  1.28s/it]

2024-10-28 18:59:47 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 20/10000 [00:24<3:29:25,  1.26s/it]

2024-10-28 18:59:48 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 21/10000 [00:25<3:29:03,  1.26s/it]

2024-10-28 18:59:49 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 22/10000 [00:27<3:25:28,  1.24s/it]

2024-10-28 18:59:50 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 23/10000 [00:28<3:27:12,  1.25s/it]

2024-10-28 18:59:52 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 24/10000 [00:29<3:31:08,  1.27s/it]

2024-10-28 18:59:53 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 25/10000 [00:30<3:31:27,  1.27s/it]

2024-10-28 18:59:54 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 26/10000 [00:32<3:30:32,  1.27s/it]

2024-10-28 18:59:56 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 27/10000 [00:33<3:40:03,  1.32s/it]

2024-10-28 18:59:57 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 28/10000 [00:34<3:34:18,  1.29s/it]

2024-10-28 18:59:58 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 29/10000 [00:36<3:34:07,  1.29s/it]

2024-10-28 18:59:59 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 30/10000 [00:37<3:30:08,  1.26s/it]

2024-10-28 19:00:01 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 31/10000 [00:38<3:29:14,  1.26s/it]

2024-10-28 19:00:02 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 32/10000 [00:39<3:26:38,  1.24s/it]

2024-10-28 19:00:03 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 33/10000 [00:41<3:33:00,  1.28s/it]

2024-10-28 19:00:04 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 34/10000 [00:42<3:29:15,  1.26s/it]

2024-10-28 19:00:06 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 35/10000 [00:43<3:31:52,  1.28s/it]

2024-10-28 19:00:07 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 36/10000 [00:44<3:28:25,  1.26s/it]

2024-10-28 19:00:08 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 37/10000 [00:46<3:32:04,  1.28s/it]

2024-10-28 19:00:10 __main__ INFO     torch.Size([32, 20])


Training:   0%|          | 38/10000 [00:47<3:32:42,  1.28s/it]

2024-10-28 19:00:11 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 39/10000 [00:48<3:31:43,  1.28s/it]

2024-10-28 19:00:12 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 40/10000 [00:50<3:29:19,  1.26s/it]

2024-10-28 19:00:13 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 41/10000 [00:51<3:29:46,  1.26s/it]

2024-10-28 19:00:15 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 42/10000 [00:52<3:33:06,  1.28s/it]

2024-10-28 19:00:16 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 43/10000 [00:53<3:32:12,  1.28s/it]

2024-10-28 19:00:17 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 44/10000 [00:55<3:33:43,  1.29s/it]

2024-10-28 19:00:19 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 45/10000 [00:56<3:42:33,  1.34s/it]

2024-10-28 19:00:20 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 46/10000 [00:57<3:35:55,  1.30s/it]

2024-10-28 19:00:21 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 47/10000 [00:59<3:35:22,  1.30s/it]

2024-10-28 19:00:22 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 48/10000 [01:00<3:31:36,  1.28s/it]

2024-10-28 19:00:24 __main__ INFO     torch.Size([32, 21])


Training:   0%|          | 49/10000 [01:01<3:37:50,  1.31s/it]

2024-10-28 19:00:25 __main__ INFO     torch.Size([32, 21])


100%|██████████| 16/16 [00:08<00:00,  1.87it/s]

2024-10-28 19:00:36 __main__ INFO     Step 50: {'loss': 0.6848629713058472, 'learning_rate': 2.5e-06, 'eval_accuracy': 0.52}



Training:   0%|          | 50/10000 [01:12<11:48:35,  4.27s/it]

2024-10-28 19:00:36 __main__ INFO     torch.Size([32, 21])


Training:   1%|          | 51/10000 [01:14<9:19:14,  3.37s/it] 

2024-10-28 19:00:38 __main__ INFO     torch.Size([32, 21])


Training:   1%|          | 52/10000 [01:15<7:40:24,  2.78s/it]

2024-10-28 19:00:39 __main__ INFO     torch.Size([32, 21])


Training:   1%|          | 53/10000 [01:16<6:25:18,  2.32s/it]

2024-10-28 19:00:40 __main__ INFO     torch.Size([32, 21])


Training:   1%|          | 54/10000 [01:18<5:29:22,  1.99s/it]

2024-10-28 19:00:41 __main__ INFO     torch.Size([32, 21])


Training:   1%|          | 54/10000 [01:19<4:02:53,  1.47s/it]


KeyboardInterrupt: 