In [1]:
import hashlib
import os
import sys

import gin
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader

In [2]:
%cd /home/ubuntu/

/home/ubuntu/RAVE


In [3]:
import rave
import rave.core
import rave.dataset

In [4]:
import torch, time, gc

# Timing utilities
start_time = None  # wall-clock timestamp written by start_timer(); None until it has run

def start_timer():
    """Reset CUDA memory statistics and record the start time of a measurement.

    Stores the current wall-clock time in the module-level ``start_time`` so
    that ``end_timer_and_print`` can report the elapsed time.
    """
    global start_time
    # Drop unreferenced Python objects and cached CUDA blocks left over from
    # earlier cells before the measurement starts.
    gc.collect()
    torch.cuda.empty_cache()
    # reset_max_memory_allocated() is deprecated and merely forwards to
    # reset_peak_memory_stats() with a warning, so call the latter directly.
    torch.cuda.reset_peak_memory_stats()
    # Wait for pending GPU work so it is not attributed to the timed region.
    torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    """Print the time elapsed since ``start_timer()`` and the peak CUDA tensor memory.

    Args:
        local_msg: Label printed (after a blank line) above the measurements.

    Raises:
        RuntimeError: If ``start_timer()`` has not been called, so there is no
            start timestamp to measure from.
    """
    # Fail early with a clear message instead of a TypeError on `float - None`
    # when the cells were executed out of order.
    if start_time is None:
        raise RuntimeError("start_timer() must be called before end_timer_and_print()")
    # Wait for queued GPU work so it is included in the measured interval.
    torch.cuda.synchronize()
    end_time = time.time()
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated()))

In [5]:
# Run configuration read by setup(); edit these values before calling it.
NAME = "separate_phases_04"  # prefix of the run name; setup() appends a hash of the gin config
CONFIG = ["v2"]  # gin config names; add_gin_extension appends ".gin" when it is missing
DB_PATH = "/home/ubuntu/preprocessed/"  # dataset location handed to rave.dataset.get_dataset
MAX_STEPS = 1500  # forwarded to pl.Trainer(max_steps=...)
VAL_EVERY = 100  # validation interval in training batches; setup() converts it to epochs when the train loader is shorter
N_SIGNAL = 131072  # passed to rave.dataset.get_dataset; presumably samples per example -- TODO confirm
BATCH = 8  # batch size of both the train and validation DataLoader
ckpt = None  # passed to rave.core.search_for_run to locate a checkpoint to resume from
OVERRIDE = []  # gin bindings passed alongside the config files
WORKERS = 8  # DataLoader workers; setup() forces 0 on Windows and macOS
GPU = None  # device selection for the Trainer; [-1] leaves the accelerator unset, a falsy value defers to rave.core.setup_gpu()
DERIVATIVE = False  # forwarded to get_dataset(derivative=...); when True setup() also replaces model.integrator
NORMALIZE = False  # forwarded to get_dataset(normalize=...)
PROGRESS = True  # forwarded to pl.Trainer(enable_progress_bar=...)

In [6]:
def add_gin_extension(config_name: str) -> str:
    """Return ``config_name`` with a ``.gin`` suffix, appending it only when absent."""
    suffix = '.gin'
    return config_name if config_name.endswith(suffix) else config_name + suffix

In [7]:
def setup():
    """Build the trainer, model and data loaders from the notebook constants.

    Parses the gin configuration, instantiates the RAVE model, wraps the
    train/validation split in DataLoaders and creates a Lightning Trainer
    whose logger writes under ``runs/<NAME>_<gin hash>``. The operative gin
    configuration is written to ``config.gin`` in that directory.

    Returns:
        A ``(trainer, model, train, val, run)`` tuple. ``train`` and ``val``
        are DataLoaders; ``run`` is the value returned by
        ``rave.core.search_for_run(ckpt)`` and may be ``None``.

    Raises:
        RuntimeError: If CUDA is unavailable but MPS is, and ``GPU`` is not
            ``[-1]``.
    """
    torch.backends.cudnn.benchmark = True
    gin.parse_config_files_and_bindings(
        [add_gin_extension(config) for config in CONFIG],
        OVERRIDE,
    )
    model = rave.RAVE()
    if DERIVATIVE:
        model.integrator = rave.dataset.get_derivator_integrator(model.sr)[1]

    dataset = rave.dataset.get_dataset(
        DB_PATH, model.sr, N_SIGNAL, derivative=DERIVATIVE, normalize=NORMALIZE
    )
    train, val = rave.dataset.split_dataset(dataset, 98)

    # Worker processes are disabled on Windows and macOS.
    num_workers = WORKERS
    if os.name == "nt" or sys.platform == "darwin":
        num_workers = 0

    train = DataLoader(
        train, BATCH, True, drop_last=True, num_workers=num_workers
    )
    val = DataLoader(val, BATCH, False, num_workers=num_workers)

    # CHECKPOINT CALLBACKS
    validation_checkpoint = pl.callbacks.ModelCheckpoint(
        monitor="validation", filename="best")
    last_checkpoint = pl.callbacks.ModelCheckpoint(filename="last")

    # Validate every VAL_EVERY batches when an epoch is long enough; otherwise
    # express the same interval as a whole number of epochs.
    val_check = {}
    if len(train) >= VAL_EVERY:
        val_check["val_check_interval"] = VAL_EVERY
    else:
        nepoch = VAL_EVERY // len(train)
        val_check["check_val_every_n_epoch"] = nepoch

    # The run directory name embeds a short hash of the operative gin config.
    gin_hash = hashlib.md5(
        gin.operative_config_str().encode()).hexdigest()[:10]
    RUN_NAME = f'{NAME}_{gin_hash}'
    os.makedirs(os.path.join("runs", RUN_NAME), exist_ok=True)

    # Resolve the device selection once so that the value printed here is the
    # same one the Trainer receives below.
    if GPU == [-1]:
        gpu = 0
    else:
        gpu = GPU or rave.core.setup_gpu()
    print('selected gpu:', gpu)

    accelerator = None
    devices = None
    if GPU == [-1]:
        pass  # CPU requested: leave accelerator and devices unset
    elif torch.cuda.is_available():
        accelerator = "cuda"
        devices = gpu
    elif torch.backends.mps.is_available():
        # Raise instead of calling the interactive-only exit() helper, which
        # is not a reliable way to abort from inside a function (in this
        # notebook the equivalent of "--gpu -1" is GPU = [-1]).
        raise RuntimeError(
            "Training on mac is not available yet. Use --gpu -1 to train on CPU (not recommended)."
        )

    trainer = pl.Trainer(
        logger=pl.loggers.TensorBoardLogger(
            "runs",
            name=RUN_NAME,
        ),
        accelerator=accelerator,
        devices=devices,
        callbacks=[
            validation_checkpoint,
            last_checkpoint,
            rave.model.WarmupCallback(),
            rave.model.QuantizeCallback(),
            rave.core.LoggerCallback(rave.core.ProgressLogger(RUN_NAME)),
        ],
        max_epochs=100000,
        max_steps=MAX_STEPS,
        profiler="simple",
        enable_progress_bar=PROGRESS,
        **val_check,
    )

    # When a previous run is found, seed the trainer's step counter with the
    # global step stored in that checkpoint.
    run = rave.core.search_for_run(ckpt)
    if run is not None:
        step = torch.load(run, map_location='cpu')["global_step"]
        trainer.fit_loop.epoch_loop._batches_that_stepped = step

    # Persist the configuration that was actually used next to the run logs.
    with open(os.path.join("runs", RUN_NAME, "config.gin"), "w") as config_out:
        config_out.write(gin.operative_config_str())

    return trainer, model, train, val, run

In [8]:
# Build the trainer, model and data loaders; `run` is the checkpoint to resume from, if any.
trainer, model, train, val, run = setup()

ERROR:root:Path not found: v2.gin
ERROR:root:Path not found: /home/ubuntu/RAVE/rave/v2.gin
ERROR:root:Path not found: configs/v1.gin
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


train set: 3436 examples
val set: 71 examples
selected gpu: [0]


In [None]:
# Time the whole training run and report the peak CUDA tensor memory afterwards.
start_timer()
# ckpt_path=run passes on the checkpoint located by setup() (may be None).
trainer.fit(model, train, val, ckpt_path=run)
end_timer_and_print("Default precision:")

You are using a CUDA device ('NVIDIA A10') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                     | Type                  | Params
-------------------------------------------------------------------
0 | pqmf                     | CachedPQMF            | 16.7 K
1 | encoder                  | VariationalEncoder    | 16.1 M
2 | decoder                  | GeneratorV2           | 15.5 M
3 | discriminator            | CombineDiscriminators | 27.1 M
4 | audio_distance           | AudioDistanceV1       | 0     
5 | multiband_audio_distance | AudioDistanceV1       | 0     
-------------------------------------------------------------------
58.7 M    Trainable params
0        

Sanity Checking: 0it [00:00, ?it/s]

Computing receptive field for this configuration...
Compression ratio: 2048x (~21.5Hz @ 44100Hz)
Receptive field: 614.88ms <-- x --> 599.27ms


Training: 0it [00:00, ?it/s]



# Saving/Resuming

To save/resume Amp-enabled runs with bitwise accuracy, use
[scaler.state_dict](https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.state_dict) and
[scaler.load_state_dict](https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.load_state_dict).

When saving, store the `scaler` state dict alongside the usual model and optimizer state dicts.
Do this either at the beginning of an iteration, before any forward passes, or at the end of
an iteration, after `scaler.update()`.

```
checkpoint = {"model": net.state_dict(),
              "optimizer": opt.state_dict(),
              "scaler": scaler.state_dict()}
```

Write the checkpoint as desired, for example:

```
torch.save(checkpoint, "filename")
```

When resuming, load the `scaler` state dict alongside the model and optimizer state dicts.
Read the checkpoint as desired, for example:

```
dev = torch.cuda.current_device()
checkpoint = torch.load("filename",
                        map_location = lambda storage, loc: storage.cuda(dev))
```

If a checkpoint was created from a run *without* Amp, and you want to resume training *with* Amp,
load model and optimizer states from the checkpoint as usual.  The checkpoint won't contain a saved ``scaler`` state, so
use a fresh instance of ``GradScaler``.

If a checkpoint was created from a run *with* Amp and you want to resume training *without* Amp,
load model and optimizer states from the checkpoint as usual, and ignore the saved `scaler` state.

When both the checkpoint and the resumed run use Amp, restore all three state dicts:


```
net.load_state_dict(checkpoint["model"])
opt.load_state_dict(checkpoint["optimizer"])
scaler.load_state_dict(checkpoint["scaler"])
```