<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# NeurIPS - Run Debug

Goal : understand issues with ConvLNP

Data : MNIST

Models : SelfAttnLNP and ConvLNPXL and ConvCNPXL 

Loss : NllLNPF and CNPF

Runs : 5

In [1]:
import os

# Run from the parent of the directory the kernel started in, so that the
# relative paths used below ("logs/...", "results/...") resolve from there.
# The starting directory is remembered on first execution: a bare
# `os.chdir("..")` would climb one more level every time this cell is re-run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
import neuralproc

In [3]:
import submitit

from train_imgs import main, parse_arguments

# Per-job log directory handed to the submitit executor below; "%j" is a
# placeholder that gets replaced by the job id (the captured output further
# down shows paths such as logs/26741186_0/).
log_folder = "logs/%j"

In [4]:
def repeat_list(l, n):
    """Return a new list where each element of `l` appears `n` times in a row.

    The relative order of the elements is preserved, e.g.
    ``repeat_list([1, 2], 2) -> [1, 1, 2, 2]``.
    """
    repeated = []
    for item in l:
        repeated.extend([item] * n)
    return repeated

class Run:
    """Callable submitted to the cluster: trains one run and supports requeueing.

    `__call__` is what each task executes; `checkpoint` is the hook submitit
    invokes (with the same arguments) when the job times out or is preempted.
    """

    def checkpoint(self, args):
        """Resubmit the same callable with the same arguments, continuing from the last checkpoint.

        Parameters
        ----------
        args : namespace
            The arguments `__call__` was invoked with. `is_continue_train` is
            switched on so that the requeued job resumes instead of restarting.

        Returns
        -------
        submitit.utils.DelayedSubmission
            The submission to requeue.
        """
        args.is_continue_train = True
        return submitit.utils.DelayedSubmission(self, args)

    def __call__(self, args):
        """Run `main` with a `starting_run` index that is unique to this task.

        The index is derived as ``starting_run * num_tasks + local_rank``. It is
        written to a shallow copy of `args` rather than to `args` itself:
        `checkpoint` receives the same `args` object, and if it had already been
        rewritten here, the requeued job would apply the formula a second time
        and train a different run whenever `num_tasks > 1`.
        """
        import copy  # local import keeps this cell self-contained

        job_env = submitit.utils.JobEnvironment()
        task_args = copy.copy(args)
        task_args.starting_run = args.starting_run * job_env.num_tasks + job_env.local_rank
        return main(task_args)

In [5]:
# Command-line options shared by every run of the sweep.
global_kwargs = "--max-epochs 50 --n-z-samples 16 --is-progressbar"
n_runs = 3  # 5

# One entry per experiment: (run name, model, loss, experiment-specific options).
kwargs = [
    (
        "minsig_ConvNPFXL",
        "ConvNPF",
        "NllLNPF",
        "--n-blocks 7 --kernel-size 9 --init-kernel-size 11 --n-conv-layers 2 --batch-size 32 --min-sigma 0.1",
    ),
    (
        "minsig_SelfAttnNPF",
        "SelfAttnNPF",
        "NllLNPF",
        "--batch-size 32 --min-sigma 0.1",
    ),
    (
        "homosked_SelfAttnNPF",
        "SelfAttnNPF",
        "NllLNPF",
        "--batch-size 32 --is-change-sked",
    ),
    (
        "heterosked_ConvNPFXL",
        "ConvNPF",
        "NllLNPF",
        "--n-blocks 7 --kernel-size 9 --init-kernel-size 11 --n-conv-layers 2 --batch-size 32 --is-change-sked",
    ),
    (
        "both_ConvNPFXL",
        "ConvNPF",
        "NllLNPF",
        "--n-blocks 7 --kernel-size 9 --init-kernel-size 11 --n-conv-layers 2 --batch-size 32 --is-both-paths",
    ),
    (
        "global_ConvNPFXL",
        "ConvNPF",
        "NllLNPF",
        "--n-blocks 7 --kernel-size 9 --init-kernel-size 11 --n-conv-layers 2 --batch-size 32 --is-global",
    ),
    (
        "convnlp_ConvNPFXL",
        "ConvNPF",
        "CNPF",
        "--n-blocks 7 --kernel-size 9 --init-kernel-size 11 --n-conv-layers 2 --batch-size 32 --is-convlnp-arch",
    ),
]

# Parsed arguments for every (run index, dataset, experiment) combination.
# The run index varies slowest, so all experiments of run 0 come first.
args = [
    parse_arguments(
        f"{model} {loss} {data} --starting-run {s} --name {name}_{loss} --chckpnt-dirname results/neurips/debug/ {global_kwargs} {other}".split()
    )
    for s in range(n_runs)
    for data in ["mnist"]
    for name, model, loss, other in kwargs
]

In [6]:
# Slurm executor logging under `log_folder`, created with max_num_timeout=3.
executor = submitit.SlurmExecutor(folder=log_folder, max_num_timeout=3)

# Resources requested for each job.
slurm_params = dict(
    num_gpus=1,
    time=60 * 24 * 2,  # 2880; presumably minutes, i.e. two days -- confirm
    cpus_per_task=10,
    mem="32GB",
    partition="priority",
    comment="neurips",
)
executor.update_parameters(**slurm_params)

In [7]:
# Submit one job per entry of `args`, each running `Run()(args[i])`; the
# returned list holds one job handle per entry, in the same order.
jobs = executor.map_array(Run(), args)

In [8]:
############################################################

In [19]:
# Display the submitted jobs together with their current state.
jobs

[SlurmJob<job_id=26741186_0, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_1, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_2, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_3, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_4, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_5, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_6, task_id=0, state="FAILED">,
 SlurmJob<job_id=26741186_7, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_8, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_9, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_10, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_11, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_12, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_13, task_id=0, state="FAILED">,
 SlurmJob<job_id=26741186_14, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_15, task_id=0, state="COMPLETED">,
 SlurmJob<job_id=26741186_16, task_id=0,

In [13]:
# Dump the captured standard output of every job, one ruled section per job.
separator = "--------------------------------"
for job in jobs:
    print(separator)
    print(job.stdout())

--------------------------------
submitit INFO (2020-05-23 14:38:48,170) - Starting with JobEnvironment(job_id=26741186_0, hostname=learnfair0617, local_rank=0(1), node=0(1), global_rank=0(1))
submitit INFO (2020-05-23 14:38:48,170) - Loading pickle: /private/home/yannd/projects/NPF/logs/26741186_0/26741186_0_submitted.pkl

--- Training mnist/minsig_ConvNPFXL_NllLNPF/run_0 ---

  epoch    train_loss    valid_loss    cp       dur
-------  ------------  ------------  ----  --------
      1     [36m-834.7973[0m    [32m-1086.4018[0m     +  709.5262
      2    [36m-1080.0863[0m    [32m-1131.4718[0m     +  709.0926
      3    [36m-1115.6658[0m    -1121.4488        708.9413
      4    [36m-1147.3325[0m    [32m-1165.7302[0m     +  708.6442
      5    [36m-1148.7047[0m    [32m-1184.2046[0m     +  708.3869
      6    [36m-1173.4050[0m    -1158.8143        708.2132
      7    [36m-1174.5241[0m    [32m-1211.9551[0m     +  708.1514
      8    [36m-1189.7616[0m    -1209.459

In [11]:
# Dump the captured standard error of every job, one ruled section per job.
separator = "--------------------------------"
for job in jobs:
    print(separator)
    print(job.stderr())

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------

--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None
--------------------------------
None


In [9]:
# Cancel every job of the sweep. Only run this cell deliberately: it calls
# `cancel()` on each handle in `jobs`, whatever its current state.
for job in jobs:
    job.cancel()