<a href="https://colab.research.google.com/github/aikindergarten/rechallenge_smart/blob/master/glue_benchmark_hyperparameter_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# hide
!nvidia-smi

Sat May 15 11:40:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P5000        On   | 00000000:00:05.0 Off |                  Off |
| 26%   27C    P8     6W / 180W |      1MiB / 16278MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# hide
import sys
if 'google.colab' in sys.modules: 
    !pip install -Uqq fastai transformers datasets wandb
    !pip install -q git+git://github.com/aikindergarten/fasthugs.git
    !pip install -q git+git://github.com/aikindergarten/vat.git

In [3]:
#all_slow

# GLUE Benchmark

In [4]:
from transformers import AutoModelForSequenceClassification
from fastai.text.all import *
# from fastai.callback.wandb import *

from fasthugs.learner import TransLearner
from fasthugs.data import TransformersTextBlock, TextGetter, get_splits

from datasets import load_dataset, concatenate_datasets
import wandb
import gc

## Setup

In [5]:
%env WANDB_ENTITY=fastai_community
%env WANDB_PROJECT=rechallenge-smart

env: WANDB_ENTITY=fastai_community
env: WANDB_PROJECT=rechallenge-smart


> Note: You can adjust the maximum `dls` batch size (`bs`) here. If the `bs` value from the sweep config exceeds the `bs` defined here, the larger value is reached through gradient accumulation.

In [6]:
# Dataset collection and pretrained checkpoint used throughout the notebook
ds_name = 'glue'
model_name = "bert-base-uncased"
# model_name = 'roberta-large'

max_len = 512  # NOTE(review): not referenced again in the visible notebook — confirm it is still needed
bs = 16  # cap for the training DataLoader batch size; `train()` applies min(bs, cfg.bs) and accumulates gradients above it
val_bs = bs*2

n_epoch = 6  # number of epochs passed to `fit_one_cycle` in `train()`
# NOTE(review): `train()` takes lr from the sweep config, passes a literal wd=0. and picks its own
# optimizer, so the three values below are not read by the visible code.
lr = 2e-5
wd = 0.
opt_func = Adam

In [7]:
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]
def validate_task(task_name=None):
    """Check that a task name is one of `GLUE_TASKS`.

    Args:
        task_name: name to validate. When omitted, the module-level `task`
            variable is checked, so existing `validate_task()` calls keep working.

    Raises:
        AssertionError: if the name is not listed in `GLUE_TASKS`. Raised explicitly
            (rather than via `assert`) so the check is not stripped under `python -O`
            and the message names the offending value.
    """
    name = task if task_name is None else task_name
    if name not in GLUE_TASKS:
        raise AssertionError(f"Unknown GLUE task {name!r}; expected one of {GLUE_TASKS}")

In [8]:
# Validation metrics per GLUE task; the first entry of each list is what the sweep cell
# picks up as `metric_to_monitor`.
# NOTE(review): "mnli-mm" is listed in GLUE_TASKS but has no entry in the tables below, so
# selecting it would fail with a KeyError on lookup.
glue_metrics = {
    'cola':[MatthewsCorrCoef()],
    'sst2':[accuracy],
    'mrpc':[F1Score(), accuracy],
    'stsb':[PearsonCorrCoef(), SpearmanCorrCoef()],
    'qqp' :[F1Score(), accuracy],
    'mnli':[accuracy],
    'qnli':[accuracy],
    'rte' :[accuracy],
    'wnli':[accuracy],
}

# Names of the dataset columns holding the input text for each task, as a
# [first, second] pair; the second entry is None for single-sentence tasks.
glue_textfields = {
    'cola':['sentence', None],
    'sst2':['sentence', None],
    'mrpc':['sentence1', 'sentence2'],
    'stsb':['sentence1', 'sentence2'],
    'qqp' :['question1', 'question2'],
    'mnli':['premise', 'hypothesis'],
    'qnli':['question', 'sentence'],
    'rte' :['sentence1', 'sentence2'],
    'wnli':['sentence1', 'sentence2'],
}

# Only tasks whose label count differs from 2 are listed; `get_learner` falls back to 2
# via `glue_num_labels.get(task, 2)`.
glue_num_labels = {'mnli':3, 'stsb':1}

In [9]:
#collapse_input
def layerwise_splitter(model):
    "Split `model` into parameter groups: embeddings, one group per encoder layer, then the remaining top-level modules"
    backbone = model.base_model
    embedding_group = L(backbone.embeddings)
    encoder_groups = L(backbone.encoder.layer.children())
    # every top-level child after the first one, skipping modules that own no parameters
    head_modules = [child for child in list(model.children())[1:] if params(child)]
    return (embedding_group + encoder_groups + L(head_modules)).map(params)

### wandb callback

Small modifications to fastai's `WandbCallback` so that it also logs the best value of each metric. To be removed once this becomes obsolete.

In [10]:
# Cell
import wandb

# Cell
class WandbCallback(Callback):
    "Saves model topology, losses & metrics"
    # Ordered just after `Recorder`: `after_epoch`/`after_fit` read `self.recorder.log`.
    # NOTE(review): `remove_on_fetch` presumably makes fastai drop this callback while predictions are fetched — confirm.
    remove_on_fetch,order = True,Recorder.order+1
    # Record if watch has been called previously (even in another instance)
    _wandb_watch_called = False

    def __init__(self, log="gradients", log_preds=True, log_model=True, log_dataset=False, dataset_name=None, valid_dl=None, n_preds=36, seed=12345, reorder=True, compare=None):
        "Store the logging options; raises `ValueError` when no W&B run is active. `compare` holds the per-metric comparison functions used to track best values (built in `before_fit` when None)"
        # Check if wandb.init has been called
        if wandb.run is None:
            raise ValueError('You must call wandb.init() before WandbCallback()')
        # W&B log step
        self._wandb_step = wandb.run.step - 1  # -1 except if the run has previously logged data (incremented at each batch)
        self._wandb_epoch = 0 if not(wandb.run.step) else math.ceil(wandb.run.summary['epoch']) # continue to next epoch
        store_attr('log,log_preds,log_model,log_dataset,dataset_name,valid_dl,n_preds,seed,reorder,compare')

    def before_fit(self):
        "Call watch method to log model topology, gradients & weights"
        # Skipped when the learner has an `lr_finder` attribute, when this callback has `gather_preds`, or on non-zero ranks
        self.run = not hasattr(self.learn, 'lr_finder') and not hasattr(self, "gather_preds") and rank_distrib()==0
        if not self.run: return

        # Log config parameters
        log_config = self.learn.gather_args()
        _format_config(log_config)
        try:
            wandb.config.update(log_config, allow_val_change=True)
        except Exception as e:
            print(f'WandbCallback could not log config parameters -> {e}')

        # The flag lives on the class, so `wandb.watch` is called at most once per process,
        # even when several callback instances (e.g. successive sweep runs) are created.
        if not WandbCallback._wandb_watch_called:
            WandbCallback._wandb_watch_called = True
            # Logs model topology and optionally gradients and weights
            wandb.watch(self.learn.model, log=self.log)

        # log dataset
        assert isinstance(self.log_dataset, (str, Path, bool)), 'log_dataset must be a path or a boolean'
        if self.log_dataset is True:
            if Path(self.dls.path) == Path('.'):
                print('WandbCallback could not retrieve the dataset path, please provide it explicitly to "log_dataset"')
                self.log_dataset = False
            else:
                self.log_dataset = self.dls.path
        if self.log_dataset:
            self.log_dataset = Path(self.log_dataset)
            assert self.log_dataset.is_dir(), f'log_dataset must be a valid directory: {self.log_dataset}'
            metadata = {'path relative to learner': os.path.relpath(self.log_dataset, self.learn.path)}
            log_dataset(path=self.log_dataset, name=self.dataset_name, metadata=metadata)

        # log model
        if self.log_model and not hasattr(self, 'save_model'):
            print('WandbCallback requires use of "SaveModelCallback" to log best model')
            self.log_model = False

        if self.log_preds:
            try:
                if not self.valid_dl:
                    #Initializes the batch watched
                    wandbRandom = random.Random(self.seed)  # For repeatability
                    self.n_preds = min(self.n_preds, len(self.dls.valid_ds))
                    idxs = wandbRandom.sample(range(len(self.dls.valid_ds)), self.n_preds)
                    # NOTE(review): `TabularDataLoaders` is not imported in the visible part of this notebook;
                    # if it is not in scope the NameError is caught below and `log_preds` is switched off — confirm.
                    if isinstance(self.dls,  TabularDataLoaders):
                        test_items = getattr(self.dls.valid_ds.items, 'iloc', self.dls.valid_ds.items)[idxs]
                        self.valid_dl = self.dls.test_dl(test_items, with_labels=True, process=False)
                    else:
                        test_items = [getattr(self.dls.valid_ds.items, 'iloc', self.dls.valid_ds.items)[i] for i in idxs]
                        self.valid_dl = self.dls.test_dl(test_items, with_labels=True)
                self.learn.add_cb(FetchPredsCallback(dl=self.valid_dl, with_input=True, with_decoded=True, reorder=self.reorder))
            except Exception as e:
                self.log_preds = False
                print(f'WandbCallback was not able to prepare a DataLoader for logging prediction samples -> {e}')

        # Best-value tracking (the addition made in this notebook): reset the running bests for this fit.
        # When `compare` was not given, one comparator is derived per name in `recorder.metric_names[2:-1]`:
        # `np.less` for names containing 'loss' or 'error', `np.greater` otherwise.
        self.best_metrics = {}
        if self.compare is None:
            self.compare = [np.less if (('loss' in n) or ('error' in n)) else np.greater for n in self.recorder.metric_names[2:-1]]
        elif not islisty(self.compare): self.compare = [self.compare]


    def after_batch(self):
        "Log hyper-parameters and training loss"
        if self.training:
            self._wandb_step += 1
            # fractional epoch: advances by 1/n_iter per training batch
            self._wandb_epoch += 1/self.n_iter
            # one entry per hyper-parameter and parameter group, keyed as '<name>_<group index>'
            hypers = {f'{k}_{i}':v for i,h in enumerate(self.opt.hypers) for k,v in h.items()}
            wandb.log({'epoch': self._wandb_epoch, 'train_loss': to_detach(self.smooth_loss.clone()), 'raw_loss': to_detach(self.loss.clone()), **hypers}, step=self._wandb_step)

    def log_predictions(self, preds):
        "Log the sample predictions in `preds` (an `(inp, preds, targs, out)` tuple) at the current W&B step"
        inp,preds,targs,out = preds
        b = tuplify(inp) + tuplify(targs)
        # show=False: collect the decoded samples/outputs instead of displaying them
        x,y,its,outs = self.valid_dl.show_results(b, out, show=False, max_n=self.n_preds)
        wandb.log(wandb_process(x, y, its, outs), step=self._wandb_step)

    def after_epoch(self):
        "Log validation loss and custom metrics & log prediction samples"
        # Correct any epoch rounding error and overwrite value
        self._wandb_epoch = round(self._wandb_epoch)
        wandb.log({'epoch': self._wandb_epoch}, step=self._wandb_step)
        # Log sample predictions
        if self.log_preds:
            try:
                self.log_predictions(self.learn.fetch_preds.preds)
            except Exception as e:
                self.log_preds = False
                print(f'WandbCallback was not able to get prediction samples -> {e}')
        # everything the recorder logged this epoch except train_loss, epoch and time
        metrics = {n:s for n,s in zip(self.recorder.metric_names, self.recorder.log) if n not in ['train_loss', 'epoch', 'time']}
        wandb.log(metrics, step=self._wandb_step)
        # additionally log the best value seen so far for each metric under a 'best_<name>' key
        self.update_best(metrics)
        wandb.log(self.best_metrics, step=self._wandb_step)


    def after_fit(self):
        "Upload the last saved model when `log_model` is set, remove the prediction-fetching callback and flush the last step"
        if self.log_model:
            if self.save_model.last_saved_path is None:
                print('WandbCallback could not retrieve a model to upload')
            else:
                metadata = {n:s for n,s in zip(self.recorder.metric_names, self.recorder.log) if n not in ['train_loss', 'epoch', 'time']}
                log_model(self.save_model.last_saved_path, metadata=metadata)
        self.run = True
        if self.log_preds: self.remove_cb(FetchPredsCallback)
        wandb.log({})  # ensure sync of last step
        self._wandb_step += 1

    def update_best(self, metrics):
        "Store in `self.best_metrics['best_<name>']` every value of `metrics` that is new or beats the stored one according to `self.compare`"
        # comparators and metrics are paired positionally; `zip` stops at the shorter of the two,
        # so metrics beyond the number of comparators are not tracked
        for is_better, (n,v) in zip(self.compare, metrics.items()):
            current_best = self.best_metrics.get(f'best_{n}', None)
            if current_best is None or is_better(v, current_best):
                self.best_metrics[f'best_{n}'] = v

# Cell
@patch
def gather_args(self:Learner):
    """Gather config parameters accessible to the learner.

    Returns a dict with the learner itself, one entry per callback (its
    `__stored_args__` if present, else True), the input dimensions of one training
    batch and assorted DataLoaders/optimizer information. Both lookups are
    best-effort: a failure is reported or skipped, never raised.
    """
    # args stored by `store_attr`
    cb_args = {f'{cb}':getattr(cb,'__stored_args__',True) for cb in self.cbs}
    args = {'Learner':self, **cb_args}
    # input dimensions
    try:
        n_inp = self.dls.train.n_inp
        args['n_inp'] = n_inp
        xb = self.dls.train.one_batch()[:n_inp]
        args.update({f'input {n+1} dim {i+1}':d for n in range(n_inp) for i,d in enumerate(list(detuplify(xb[n]).shape))})
    # `Exception` rather than a bare `except`, so Ctrl+C / SystemExit raised while the batch
    # is being built are not swallowed; the cause is reported like the other messages in this file
    except Exception as e: print(f'Could not gather input dimensions -> {e}')
    # other useful information
    with ignore_exceptions():
        args['batch size'] = self.dls.bs
        args['batch per epoch'] = len(self.dls.train)
        args['model parameters'] = total_params(self.model)[0]
        args['device'] = self.dls.device.type
        args['frozen'] = bool(self.opt.frozen_idx)
        args['frozen idx'] = self.opt.frozen_idx
        args['dataset.tfms'] = f'{self.dls.dataset.tfms}'
        args['dls.after_item'] = f'{self.dls.after_item}'
        args['dls.before_batch'] = f'{self.dls.before_batch}'
        args['dls.after_batch'] = f'{self.dls.after_batch}'
    return args

# Cell
def _make_plt(img):
    "Create a frameless figure sized to `img`'s resolution, with one axes covering the whole canvas"
    # from https://stackoverflow.com/a/13714915
    dpi = 100
    figure = plt.figure(frameon=False, dpi=dpi)
    height, width = img.shape[:2]
    # size in inches = pixels / dpi
    figure.set_size_inches(width / dpi, height / dpi)
    # axes rectangle [left, bottom, width, height] in figure fractions: the full canvas
    axes = plt.Axes(figure, [0., 0., 1., 1.])
    axes.set_axis_off()
    figure.add_axes(axes)
    return figure, axes

# Cell
def _format_config_value(v):
    if isinstance(v, list):
        return [_format_config_value(item) for item in v]
    elif hasattr(v, '__stored_args__'):
        return {**_format_config(v.__stored_args__), '_name': v}
    return v

# Cell
def _format_config(config):
    "Format config parameters before logging them"
    for k,v in config.items():
        if isinstance(v, dict):
            config[k] = _format_config(v)
        else:
            config[k] = _format_config_value(v)
    return config

# Cell
def _format_metadata(metadata):
    "Format metadata associated to artifacts"
    for k,v in metadata.items(): metadata[k] = str(v)

# Cell
def log_dataset(path, name=None, metadata={}, description='raw dataset'):
    """Log dataset folder as a W&B artifact of type 'dataset'.

    Args:
        path: directory to log; everything in it except a top-level "models" folder is added.
        name: artifact name; defaults to the directory name.
        metadata: dict attached to the artifact. Values are stringified on a copy, so the
            caller's dict (and the shared default) is left untouched.
        description: artifact description.

    Raises:
        ValueError: if no W&B run is active, or if `path` is not a directory.
    """
    # Check if wandb.init has been called in case datasets are logged manually
    if wandb.run is None:
        raise ValueError('You must call wandb.init() before log_dataset()')
    path = Path(path)
    if not path.is_dir():
        # a real exception type: raising a plain string is itself a TypeError in Python 3
        raise ValueError(f'path must be a valid directory: {path}')
    name = ifnone(name, path.name)
    metadata = {} if metadata is None else dict(metadata)
    _format_metadata(metadata)
    artifact_dataset = wandb.Artifact(name=name, type='dataset', metadata=metadata, description=description)
    # log everything except "models" folder
    for p in path.ls():
        if p.is_dir():
            if p.name != 'models': artifact_dataset.add_dir(str(p.resolve()), name=p.name)
        else: artifact_dataset.add_file(str(p.resolve()))
    wandb.run.use_artifact(artifact_dataset)

# Cell
def log_model(path, name=None, metadata={}, description='trained model'):
    """Log model file as a W&B artifact of type 'model'.

    Args:
        path: model file to upload.
        name: artifact name; defaults to 'run-<run id>-model'.
        metadata: dict attached to the artifact. Values are stringified on a copy, so the
            caller's dict (and the shared default) is left untouched.
        description: artifact description.

    Raises:
        ValueError: if no W&B run is active, or if `path` is not a file.
    """
    if wandb.run is None:
        raise ValueError('You must call wandb.init() before log_model()')
    path = Path(path)
    if not path.is_file():
        # a real exception type: raising a plain string is itself a TypeError in Python 3
        raise ValueError(f'path must be a valid file: {path}')
    name = ifnone(name, f'run-{wandb.run.id}-model')
    metadata = {} if metadata is None else dict(metadata)
    _format_metadata(metadata)
    artifact_model = wandb.Artifact(name=name, type='model', metadata=metadata, description=description)
    # the file's bytes are written into a new artifact entry named after the artifact
    with artifact_model.new_file(name, mode='wb') as fa:
        fa.write(path.read_bytes())
    wandb.run.log_artifact(artifact_model)

# Cell
@typedispatch
def wandb_process(x:TensorImage, y, samples, outs):
    "Process `sample` and `out` depending on the type of `x/y`"
    inputs, predictions, labels = [], [], []
    for sample, out in zip(samples, outs):
        img = sample[0].permute(1,2,0)
        inputs.append(wandb.Image(img, caption='Input data'))
        overlays = ((out[0], "Prediction", predictions), (sample[1], "Ground Truth", labels))
        for overlay, caption, bucket in overlays:
            fig, ax = _make_plt(img)
            # Draw the input image first, then the label or prediction on top of it
            ax = img.show(ctx=ax)
            ax = overlay.show(ctx=ax)
            bucket.append(wandb.Image(fig, caption=caption))
            plt.close(fig)
    return {"Inputs":inputs, "Predictions":predictions, "Ground Truth":labels}

# Cell
@typedispatch
def wandb_process(x:TensorImage, y:(TensorCategory,TensorMultiCategory), samples, outs):
    "Log each input image with its ground truth and prediction written in the caption"
    images = []
    for sample, out in zip(samples, outs):
        pixels = sample[0].permute(1,2,0)
        caption = f'Ground Truth: {sample[1]}\nPrediction: {out[0]}'
        images.append(wandb.Image(pixels, caption=caption))
    return {"Prediction Samples": images}

# Cell
@typedispatch
def wandb_process(x:TensorImage, y:TensorMask, samples, outs):
    "Log each input image with its predicted and ground-truth masks attached"
    codes = getattr(y, 'codes', None)
    # class id -> label mapping, built only when `y` carries `codes`
    class_labels = None if codes is None else {i:f'{c}' for i,c in enumerate(codes)}
    logged = []
    for sample, out in zip(samples, outs):
        img = sample[0].permute(1,2,0)
        masks = {}
        for caption, mask in (("Prediction", out[0]), ("Ground Truth", sample[1])):
            entry = {'mask_data':mask.numpy().astype(np.uint8)}
            masks[caption] = entry
            if class_labels: entry['class_labels'] = class_labels
        logged.append(wandb.Image(img, masks=masks))
    return {"Prediction Samples":logged}

# Cell
@typedispatch
def wandb_process(x:TensorText, y:(TensorCategory,TensorMultiCategory), samples, outs):
    "Log text samples as a table with one row per sample: text, target, prediction"
    rows = []
    for sample, out in zip(samples, outs):
        rows.append([sample[0], sample[1], out[0]])
    table = wandb.Table(data=rows, columns=["Text", "Target", "Prediction"])
    return {"Prediction Samples": table}

# Cell
## @typedispatch
# def wandb_process(x:Tabular, y:Tabular, samples, outs):
#     df = x.all_cols
#     for n in x.y_names: df[n+'_pred'] = y[n].values
#     return {"Prediction Samples": wandb.Table(dataframe=df)}

## Running a GLUE task

In [11]:
task = 'qnli'
validate_task()

In [12]:
ds = load_dataset(ds_name, task)

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [13]:
len(ds['train']), len(ds['validation'])

(104743, 5463)

In [14]:
# NOTE(review): `get_splits` comes from fasthugs; presumably it returns the train/validation row
# indices into the concatenated dataset built on the next line — confirm against fasthugs.
train_idx, valid_idx = get_splits(ds)
# Single dataset holding the train rows followed by the validation rows
train_ds = concatenate_datasets([ds['train'], ds['validation']])

In [15]:
train_ds[0]

{'idx': 0,
 'label': 1,
 'question': 'When did the third Digimon series begin?',
 'sentence': 'Unlike the two seasons before it and most of the seasons that followed, Digimon Tamers takes a darker and more realistic approach to its story featuring Digimon who do not reincarnate after their deaths and more complex character development in the original Japanese.'}

Here I use number of characters as a proxy for length of tokenized text to speed up `dls` creation.

In [16]:
# Per-example length: total number of characters across the task's text fields
# (a `None` field, as used for single-sentence tasks, is skipped). All original
# columns are dropped, so `lens` only has a 'len' column.
lens = train_ds.map(lambda s: {'len': sum([len(s[i]) for i in glue_textfields[task] if i])},
                    remove_columns=train_ds.column_names, num_proc=2, keep_in_memory=True)
# Lengths of the rows selected by the train / validation indices
train_lens = lens.select(train_idx)['len']
valid_lens = lens.select(valid_idx)['len']





In [17]:
# Inputs: the task's text column(s), processed by the fasthugs block for `model_name`;
# targets: the 'label' column as a category. The validation set is the rows at `valid_idx`.
dblock = DataBlock(blocks = [TransformersTextBlock(pretrained_model_name=model_name), CategoryBlock()],
                   get_x=TextGetter(*glue_textfields[task]),
                   get_y=ItemGetter('label'),
                   splitter=IndexSplitter(valid_idx))

In [18]:
%%time
# One kwargs dict per DataLoader (train, valid) carrying the precomputed character lengths.
# NOTE(review): presumably `res` / `val_res` are used as the sort keys so the texts need not be
# tokenized just to measure their length — confirm against the fasthugs/fastai dataloader.
dl_kwargs=[{'res':train_lens}, {'val_res':valid_lens}]
dls = dblock.dataloaders(train_ds, bs=bs, val_bs=val_bs, dl_kwargs=dl_kwargs, shuffle=True)

CPU times: user 29.1 s, sys: 1.18 s, total: 30.3 s
Wall time: 30.2 s


In [19]:
dls.show_batch(max_n=4)

Unnamed: 0,text,text_,category
0,what italian prime minister attended yale?,"among the best - known are u. s. presidents william howard taft, gerald ford, george h. w. bush, bill clinton and george w. bush ; royals crown princess victoria bernadotte, prince rostislav romanov and prince akiiki hosea nyabongo ; heads of state, including italian prime minister mario monti, turkish prime minister tansu ciller, mexican president ernesto zedillo, german president karl carstens, and philippines president jose paciano laurel ; u. s. supreme court justices sonia sotomayor, samuel alito and clarence thomas ; u. s. secretaries of state john kerry, hillary clinton, cyrus vance, and dean acheson ; authors sinclair lewis, stephen vincent benet, and tom wolfe ; lexicographer noah webster ; inventors samuel f. b. morse and eli whitney ; patriot and "" first spy "" nathan hale ; theologian jonathan edwards ; actors, directors and producers paul newman, henry winkler, vincent price, meryl streep, sigourney weaver, jodie foster, angela bassett,",0
1,nasser was a leader of what country?,"tito met many world leaders during his rule, such as soviet rulers joseph stalin, nikita khrushchev and leonid brezhnev ; egypt's gamal abdel nasser, indian politicians jawaharlal nehru and indira gandhi ; british prime ministers winston churchill, james callaghan and margaret thatcher ; u. s. presidents dwight d. eisenhower, john f. kennedy, richard nixon, gerald ford and jimmy carter ; other political leaders, dignitaries and heads of state that tito met at least once in his lifetime included che guevara, fidel castro, yasser arafat, willy brandt, helmut schmidt, georges pompidou, queen elizabeth ii, hua guofeng, kim il sung, sukarno, sheikh mujibur rahman, suharto, idi amin, haile selassie, kenneth kaunda, gaddafi, erich honecker, nicolae ceausescu, janos kadar and urho kekkonen.",0
2,how many international universities operate in london?,"a number of universities in london are outside the university of london system, including brunel university, city university london, imperial college london, kingston university, london metropolitan university, middlesex university, university of east london, university of west london and university of westminster, ( with over 34, 000 students, the largest unitary university in london ), london south bank university, middlesex university, university of the arts london ( the largest university of art, design, fashion, communication and the performing arts in europe ), university of east london, the university of west london and the university of westminster.",1
3,when was the philadelphia baptist confession created?,"shared doctrines would include beliefs about one god ; the virgin birth ; miracles ; atonement for sins through the death, burial, and bodily resurrection of jesus ; the trinity ; the need for salvation ( through belief in jesus christ as the son of god, his death and resurrection, and confession of christ as lord ) ; grace ; the kingdom of god ; last things ( eschatology ) ( jesus christ will return personally and visibly in glory to the earth, the dead will be raised, and christ will judge everyone in righteousness ) ; and evangelism and missions.",1


### Sweeps

In [20]:
def get_learner(model_name, task, **kwargs):
    "Build a `TransLearner` on the module-level `dls` for `task`, with the task's metrics and gradient clipping; `kwargs` go to `TransLearner`"
    n_labels = glue_num_labels.get(task, 2)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_labels)
    # set task specific dropouts as in paper
    task_dropout = {'cola': 0.05, 'mnli': 0.3}.get(task)
    if task_dropout is not None:
        # roberta checkpoints keep the dropout inside `classifier`; other models expose it on the model
        dropout = model.classifier.dropout if 'roberta' in model_name else model.dropout
        dropout.p = task_dropout

    return TransLearner(dls, model, metrics=glue_metrics[task], cbs=GradientClip(), **kwargs)

In [21]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfastai_community[0m (use `wandb login --relogin` to force relogin)


True

In [22]:
def train():
    "Run one sweep trial: read the hyper-parameters from the W&B run config, fine-tune, then release memory"
    with wandb.init() as run:
        cfg = run.config
        # the DataLoader never goes above the notebook-level `bs`
        dls.train.bs = min(bs, cfg.bs)
        # `opt` in the sweep config: 0 selects Adam, any other value RAdam
        optimizer = RAdam if cfg.opt != 0 else Adam
        learn = get_learner(model_name, task, opt_func=optimizer)

        callbacks = [WandbCallback(log_preds=False, log_model=False)]
        # when the requested batch size is larger than the DataLoader's, accumulate gradients up to it
        if dls.train.bs < cfg.bs:
            callbacks.append(GradientAccumulation(cfg.bs))
        learn.fit_one_cycle(n_epoch, cfg.lr, wd=0., cbs=callbacks, pct_start=0.1)
        # drop the learner and clear the CUDA cache before the next trial
        del learn
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

In [23]:
metrics = glue_metrics[task]
# Name of the task's first metric: `.name` for fastai `Metric` objects, `__name__` for plain functions.
# NOTE(review): `metric_to_monitor` is not used by the sweep config below (no "metric" entry).
metric_to_monitor = metrics[0].name if isinstance(metrics[0], Metric) else metrics[0].__name__
sweep_name = f"glue-{task}-sweep"
# Grid search over learning rate, effective batch size and optimizer (`opt`: 0 -> Adam, 1 -> RAdam in `train()`).
sweep_config = {
    "project":"rechallenge-smart",
    # entity spelled as in WANDB_ENTITY (was misspelled "fastai_cimmunity")
    "entity": "fastai_community",
    "name": sweep_name,
    "method": "grid",
    "parameters": {
        "lr": {"values":[1e-5,2e-5,3e-5,5e-5]},
        "bs": {"values":[16,32,64]},
        "opt": {"values":[0, 1]}
    }
}

Total number of runs per sweep is 24.

If you're starting a new sweep, run this:

In [24]:
sweep_id = wandb.sweep(sweep_config)

Create sweep with ID: 8v3vm1nu
Sweep URL: https://wandb.ai/fastai_community/rechallenge-smart/sweeps/8v3vm1nu


But if you want to add runs to an existing sweep, set `sweep_id` accordingly:

In [24]:
# sweep_id = '8fh54e6n'

In [25]:
wandb.agent(sweep_id, function=train)

[34m[1mwandb[0m: Agent Starting Run: kwg6xy2t with config:
[34m[1mwandb[0m: 	bs: 16
[34m[1mwandb[0m: 	lr: 1e-05
[34m[1mwandb[0m: 	opt: 1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Could not gather input dimensions


epoch,train_loss,valid_loss,accuracy,time
0,0.344502,0.267861,0.887241,23:17
1,0.250752,0.2521,0.910306,23:18
2,0.183412,0.279516,0.913051,23:20
3,0.139958,0.369258,0.910306,23:13
4,0.104389,0.402042,0.912502,23:05
5,0.085061,0.43533,0.913418,23:00


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,6.0
train_loss,0.08506
raw_loss,0.00157
wd_0,0.0
sqr_mom_0,0.99
lr_0,0.0
mom_0,0.95
eps_0,1e-05
beta_0,0.0
wd_1,0.0


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▆▅▅▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▂▁▂▁▁▁▁▁▁▁
raw_loss,▅▄▄▄▅▅▃▄█▂▅▂▁▄▄▁▁▂▃▁▁▁▁▃▃▁▃▁▁▁▁▁▁▁▃▁▂▁▁▁
wd_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
sqr_mom_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
lr_0,▂▂▅▇██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
mom_0,█▇▄▂▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇██████
eps_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
beta_0,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
wd_1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
