In [1]:
import os
import sys
import yaml
from collections import OrderedDict
from source.utils.logger import Logger, log_dict
from source.utils.wrappers import setup_data, train_model, calibrate_model, test_model
# Bootstrap logger built from Logger's default arguments, presumably so that
# `logger` is usable before the YAML config has been loaded.
# NOTE(review): a later cell rebinds `logger` to a `Logger(...)` instance
# without calling `.get_logger()` — confirm both objects expose the same API.
logger = Logger().get_logger()

2022-07-25 19:27:11.751682: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-25 19:27:11.751702: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def _prefix_dataset_paths(datasets, data_dir):
    """Prepend ``data_dir`` to each dataset filename not already inside it.

    Entries of ``datasets`` (a mapping of dataset name -> dict with a
    ``'filename'`` key) are updated in place.

    Directory names are compared after ``os.path.normpath`` so that a
    ``data_dir`` written with a trailing separator (e.g. ``'data/'``) is still
    recognised on a second pass and the prefix is not applied twice.
    """
    norm_data_dir = os.path.normpath(data_dir)
    for ds_config in datasets.values():
        filename = ds_config['filename']
        parent = os.path.dirname(filename)
        # A bare filename has no parent directory and always needs the prefix.
        if not parent or os.path.normpath(parent) != norm_data_dir:
            ds_config['filename'] = os.path.join(data_dir, filename)


def validate_config(conf):
    """Validate the config and root all dataset filenames in the data directory.

    For each of the optional sections ``data_opts``, ``classif_opts`` and
    ``calib_opts`` that is present in ``conf``:

    * ``classif_opts`` and ``calib_opts`` must contain a ``'datasets'`` key;
    * ``conf['global_opts']['data_dir']`` is created if it does not exist;
    * every ``conf[section]['datasets'][name]['filename']`` that is not
      already located in ``data_dir`` gets ``data_dir`` prepended.

    Args:
        conf: Nested configuration dict (as loaded from the YAML file).

    Returns:
        The same ``conf`` object, modified in place.

    Raises:
        AssertionError: if ``classif_opts`` or ``calib_opts`` is present but
            has no ``'datasets'`` entry.
        KeyError: if a required key (e.g. ``global_opts``/``data_dir``, or
            ``datasets`` in ``data_opts``) is missing.
    """
    # Sections whose 'datasets' entry is mandatory, with the assertion message
    # to show when it is missing. Each section is checked against ITSELF
    # (the calibration check previously looked at classif_opts by mistake).
    required_datasets_msg = {
        'classif_opts': 'datasets for train and valid must be '
                        'specified in classif_opts',
        'calib_opts': 'datasets for calibration must be '
                      'specified in calib_opts',
    }

    for section in ('data_opts', 'classif_opts', 'calib_opts'):
        if section not in conf:
            continue

        if section in required_datasets_msg:
            assert 'datasets' in conf[section], required_datasets_msg[section]

        # global_opts is only needed (and only read) when a section is present.
        data_dir = conf['global_opts']['data_dir']
        os.makedirs(data_dir, exist_ok=True)
        _prefix_dataset_paths(conf[section]['datasets'], data_dir)

    return conf

In [3]:
# Load the run configuration from the YAML file in the working directory.
# NOTE(review): FullLoader is used here; if the config only contains plain
# YAML types, consider yaml.safe_load — confirm before changing.
with open('training_config.yaml', 'rb') as fin:
    config = yaml.load(fin, Loader=yaml.FullLoader)

In [4]:
# Rebuild the logger from the loaded config: log files go to
# <results_dir>/logs, with separate levels for console and file output.
logger = Logger(
    output_dir=os.path.join(config['global_opts']['results_dir'], 'logs'),
    c_level=config['logging_opts']['console_level'],
    f_level=config['logging_opts']['file_level'],
)

In [5]:
# Check the loaded config and root its dataset filenames in the data directory.
logger.info('Validating config')
# validate_config modifies `config` in place and returns the same dict, so
# this rebinding keeps pointing at the same object.
config = validate_config(config)
# NOTE(review): log_dict is a project helper; presumably it writes the config
# contents to the log — confirm.
log_dict(config)

2022-07-25 19:27:12 - INFO   - 2578880424 - Validating config


In [6]:
# Pipeline stages keyed by name, inserted in the order they must run.
# Each value is a zero-argument callable; `config` is looked up when the
# callable is invoked, not when this cell is executed.
steps = OrderedDict()
steps['setup_data'] = lambda: setup_data(config)
steps['train_model'] = lambda: train_model(config)
steps['calibrate_model'] = lambda: calibrate_model(config)
steps['test_model'] = lambda: test_model(config)

In [7]:
# Run every pipeline step in insertion order and stop at the first failure.
for step, fct in steps.items():
    try:
        logger.info('Running step: {}'.format(step))
        fct()
    except Exception as exc:
        # Log the actual exception (bound as `exc`); referencing an undefined
        # name here would raise NameError and hide the real failure.
        logger.error(exc)
        # Bare raise re-raises the active exception with its original traceback.
        raise

2022-07-25 19:27:13 - INFO   - 3646807458 - Running step: setup_data
2022-07-25 19:27:13 - INFO   - wrappers   - Setting up dataset train_data
2022-07-25 19:27:14 - INFO   - wrappers   - Setting up dataset valid_data
2022-07-25 19:27:15 - INFO   - wrappers   - setup_data took 0:00:02
2022-07-25 19:27:15 - INFO   - 3646807458 - Running step: train_model
2022-07-25 19:27:15 - INFO   - trainer    - Setting random seed: 42
2022-07-25 19:27:15 - INFO   - trainer    - Setting model: DerivNet


2022-07-25 19:27:15.209187: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-25 19:27:15.209209: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-25 19:27:15.209227: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DS1OBS): /proc/driver/nvidia/version does not exist
2022-07-25 19:27:15.209420: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


2022-07-25 19:27:15 - INFO   - trainer    - Setting optimizer: adam
2022-07-25 19:27:15 - INFO   - trainer    - Setting loss function: CrossEnt
2022-07-25 19:27:15 - INFO   - layer_utils - Model: "DerivNet"
2022-07-25 19:27:15 - INFO   - layer_utils - __________________________________________________________________________________________________
2022-07-25 19:27:15 - INFO   - layer_utils - Layer (type)                    Output Shape         Param #     Connected to                     
2022-07-25 19:27:15 - INFO   - layer_utils - input (InputLayer)              [(None, 50, 1)]      0                                            
2022-07-25 19:27:15 - INFO   - layer_utils - __________________________________________________________________________________________________
2022-07-25 19:27:15 - INFO   - layer_utils - mvn (MVN)                       (None, 50, 1)        202         input[0][0]                      
2022-07-25 19:27:15 - INFO   - layer_utils - ____________________________

2022-07-25 19:27:17.204089: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


2022-07-25 19:27:17 - INFO   - trainer    - MVN updated with 29696.0 samples, disabling updates for layer mvn
Epoch 1/20
2022-07-25 19:27:20 - INFO   - metrics    - 	train_loss: 1.0789	valid_loss: 1.0773
2022-07-25 19:27:21 - INFO   - metrics    - 	train_prec: 0.5670	train_rec: 0.6676
2022-07-25 19:27:21 - INFO   - metrics    - 	valid_prec: 0.5558	valid_rec: 0.6667


  prec_[lab] = np.nan_to_num(np.sum(preds_[pos_idx] == lab) /


Epoch 2/20
2022-07-25 19:27:22 - INFO   - metrics    - 	train_loss: 1.0570	valid_loss: 1.0546
2022-07-25 19:27:22 - INFO   - metrics    - 	train_prec: 0.5793	train_rec: 0.6696
2022-07-25 19:27:22 - INFO   - metrics    - 	valid_prec: 0.5973	valid_rec: 0.6668
Epoch 3/20
2022-07-25 19:27:23 - INFO   - metrics    - 	train_loss: 1.0218	valid_loss: 1.0192
2022-07-25 19:27:23 - INFO   - metrics    - 	train_prec: 0.5556	train_rec: 0.6667
2022-07-25 19:27:23 - INFO   - metrics    - 	valid_prec: 0.5556	valid_rec: 0.6667
Epoch 4/20
2022-07-25 19:27:24 - INFO   - metrics    - 	train_loss: 0.9718	valid_loss: 0.9679
2022-07-25 19:27:24 - INFO   - metrics    - 	train_prec: 0.5556	train_rec: 0.6667
2022-07-25 19:27:24 - INFO   - metrics    - 	valid_prec: 0.5556	valid_rec: 0.6667
Epoch 5/20
2022-07-25 19:27:25 - INFO   - metrics    - 	train_loss: 0.9071	valid_loss: 0.9000
2022-07-25 19:27:25 - INFO   - metrics    - 	train_prec: 0.5556	train_rec: 0.6667
2022-07-25 19:27:26 - INFO   - metrics    - 	valid

2022-07-25 19:27:45 - INFO   - utils      - Loading data/train_data.pkl.gz
2022-07-25 19:27:45 - INFO   - utils      - Loaded data: 10000 (linear class), 10000 (quadratic class), 10000 (cubic class)
2022-07-25 19:27:45 - INFO   - utils      - 0 features contain nan values
2022-07-25 19:27:45 - INFO   - utils      - After removing nans: 10000 (linear class), 10000 (quadratic class), 10000 (cubic class)
2022-07-25 19:27:45 - INFO   - utils      - Renamed {features: features, label: label}
2022-07-25 19:27:45 - INFO   - utils      - OneHotEncoding: [1. 0. 0.] (linear class), [0. 1. 0.] (quadratic class), [0. 0. 1.] (cubic class)
2022-07-25 19:27:45 - INFO   - utils      - Loading data/train_data.pkl.gz
2022-07-25 19:27:46 - INFO   - utils      - Loaded data: 10000 (linear class), 10000 (quadratic class), 10000 (cubic class)
2022-07-25 19:27:46 - INFO   - utils      - 0 features contain nan values
2022-07-25 19:27:46 - INFO   - utils      - After removing nans: 10000 (linear class), 10000 