In [1]:
import os
import urllib
import urllib.request
import zipfile

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

# Reading Data

In [2]:
def _download_if_missing(url, path):
    """Fetch ``url`` into ``path`` unless a file is already cached there."""
    if not os.path.isfile(path):
        urllib.request.urlretrieve(url, filename=path)


def _standardise_split(train, test):
    """Z-score ``train`` and ``test`` column-wise using statistics of ``train`` only.

    Columns whose standard deviation is below 1e-10 get a divisor of 1 so that
    constant columns do not produce a division by zero.
    Returns ``(train_norm, test_norm, means, stds)``; the normalised arrays are float32.
    """
    means, stds = train.mean(axis=0), train.std(axis=0)
    stds[stds < 1e-10] = 1
    train_norm = ((train - means) / stds).astype(np.float32)
    test_norm = ((test - means) / stds).astype(np.float32)
    return train_norm, test_norm, means, stds


def load_UCI(dset_name, splits=10, seed=0, separate_targets=True, save_dir='data/'):
    """Download (if not cached), load, shuffle, split and standardise a tabular dataset.

    Parameters
    ----------
    dset_name : str
        One of 'boston', 'concrete', 'energy', 'power', 'wine', 'yatch'
        (the spelling 'yacht' is accepted too), 'kin8nm', 'naval', 'protein',
        'default_credit'.
    splits : int
        Number of K-fold splits. Only the FIRST fold is returned, so the test
        set holds roughly ``1 / splits`` of the rows.
    seed : int
        Seed passed to ``np.random.seed`` before the rows are permuted
        (this reseeds NumPy's global RNG as a side effect).
    separate_targets : bool
        If True, the target column(s) are split off from the features.
    save_dir : str
        Directory used as a download cache; created when missing.

    Returns
    -------
    tuple
        ``(x_train, x_test, x_means, x_stds, y_train, y_test, y_means, y_stds)``
        when ``separate_targets`` is True, otherwise
        ``(x_train, x_test, x_means, x_stds)``. Means/stds are computed on the
        training fold only.

    Raises
    ------
    ValueError
        If ``dset_name`` is not one of the known datasets.

    NOTE(review): several ``header=`` values below look like they swallow a
    data row as the header (e.g. ``header=1`` on CSVs whose header is row 0,
    ``header=0`` on header-less files). They are deliberately left unchanged so
    that previously reported shapes/results stay reproducible -- confirm
    against the raw files before changing them.
    """
    mkdir(save_dir)

    def local(fname):
        # os.path.join works whether or not save_dir ends with a separator.
        return os.path.join(save_dir, fname)

    if dset_name == 'boston':
        path = local('housing.data')
        _download_if_missing(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", path)
        data = pd.read_csv(path, header=0, delimiter=r"\s+").values
        y_idx = [-1]

    elif dset_name == 'concrete':
        path = local('Concrete_Data.xls')
        _download_if_missing(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls",
            path)
        # read_excel takes no text delimiter; passing one is rejected by current pandas.
        data = pd.read_excel(path, header=0).values
        y_idx = [-1]

    elif dset_name == 'energy':
        path = local('ENB2012_data.xlsx')
        _download_if_missing(
            "http://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx", path)
        data = pd.read_excel(path, header=0).values
        y_idx = [-2, -1]

    elif dset_name == 'power':
        path = local('CCPP.zip')
        _download_if_missing(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00294/CCPP.zip", path)
        with zipfile.ZipFile(path) as zipped, zipped.open('CCPP/Folds5x2_pp.xlsx') as member:
            data = pd.read_excel(member, header=0).values
        y_idx = [-1]

    elif dset_name == 'wine':
        path = local('winequality-red.csv')
        _download_if_missing(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
            path)
        data = pd.read_csv(path, header=1, delimiter=';').values
        y_idx = [-1]

    elif dset_name in ('yatch', 'yacht'):
        path = local('yacht_hydrodynamics.data')
        _download_if_missing(
            "http://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data", path)
        data = pd.read_csv(path, header=1, delimiter=r'\s+').values
        y_idx = [-1]

    elif dset_name == 'kin8nm':
        path = local('dataset_2175_kin8nm.csv')
        _download_if_missing(
            "https://www.openml.org/data/get_csv/3626/dataset_2175_kin8nm.csv", path)
        data = pd.read_csv(path, header=1, delimiter=',').values
        y_idx = [-1]

    elif dset_name == 'naval':
        # The cached file keeps the URL-encoded name so existing caches are still found.
        path = local('UCI%20CBM%20Dataset.zip')
        _download_if_missing(
            "http://archive.ics.uci.edu/ml/machine-learning-databases/00316/UCI%20CBM%20Dataset.zip", path)
        with zipfile.ZipFile(path) as zipped, zipped.open('UCI CBM Dataset/data.txt') as member:
            data = pd.read_csv(member, header='infer', delimiter=r"\s+").values
        y_idx = [-2, -1]

    elif dset_name == 'protein':
        path = local('CASP.csv')
        _download_if_missing(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv", path)
        data = pd.read_csv(path, header=1, delimiter=',').values
        y_idx = [0]

    elif dset_name == 'default_credit':
        path = local('default of credit card clients.xls')
        _download_if_missing(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls",
            path)
        data = pd.read_excel(path, header=[0, 1], index_col=0).values
        y_idx = [-1]

    else:
        raise ValueError('Dataset name doesnt match any known datasets.')

    # Shuffle rows once with a fixed seed, then take the first (unshuffled) K-fold split.
    np.random.seed(seed)
    data = data[np.random.permutation(np.arange(len(data)))]

    train_index, test_index = next(iter(KFold(n_splits=splits).split(data)))
    train, test = data[train_index, :], data[test_index, :]

    if not separate_targets:
        return _standardise_split(train, test)

    # Resolve (possibly negative) target indices to absolute columns first so the
    # feature set does not depend on the order in which targets are listed.
    n_cols = data.shape[1]
    target_cols = {e % n_cols for e in y_idx}
    x_idx = np.array([c for c in range(n_cols) if c not in target_cols])
    y_idx = np.array(y_idx)

    x_train, x_test, x_means, x_stds = _standardise_split(train[:, x_idx], test[:, x_idx])
    y_train, y_test, y_means, y_stds = _standardise_split(train[:, y_idx], test[:, y_idx])

    return x_train, x_test, x_means, x_stds, y_train, y_test, y_means, y_stds

def mkdir(paths):
    """Create one or several directories (including parents) if they do not exist.

    Parameters
    ----------
    paths : str or list/tuple of str
        A single directory path or a collection of them.

    Uses ``exist_ok=True`` instead of a separate ``isdir`` check so that a
    directory created by someone else between the check and the creation does
    not raise. A path that exists but is a regular file still raises
    ``FileExistsError``.
    """
    if not isinstance(paths, (list, tuple)):
        paths = [paths]
    for path in paths:
        os.makedirs(path, mode=0o777, exist_ok=True)
            
def unnormalise_cat_vars(x, x_means, x_stds, input_dim_vec):
    """Restore integer category codes in the categorical columns of ``x``.

    Every column is de-standardised (``x * std + mean``), rounded and shifted so
    that its minimum becomes 0. For columns whose entry in ``input_dim_vec`` is
    greater than 1 the result is additionally capped at ``dims - 1`` and written
    back into ``x``. Columns with ``dims == 1`` are left untouched.

    ``x`` is modified IN PLACE and also returned.
    """
    dims_per_col = np.array(input_dim_vec)
    categorical = dims_per_col > 1

    codes = (np.multiply(x, x_stds) + x_means).round()
    # shift every column so its smallest value is 0
    codes = codes - codes.min(axis=0, keepdims=True)

    for col in np.flatnonzero(categorical):
        codes[:, col] = np.minimum(codes[:, col], dims_per_col[col] - 1)

    x[:, categorical] = codes[:, categorical]
    return x

In [3]:
# Per-dataset hyper-parameters. The four lists are parallel: position i in
# `widths`, `depths` and `latent_dims` belongs to the dataset `names[i]`
# (they are looked up later via `names.index(dname)`).
names = ['wine', 'default_credit', 'compas', 'lsat']
widths = [300, 300, 300, 300] # [200, 200, 200, 200]
depths = [3, 3, 3, 3] # We go deeper because we are using residual models
latent_dims = [6, 8, 4, 4]

In [4]:
# Load the wine dataset (first of 10 folds, seed 42) and print the split shapes.
# Note: these variables are overwritten by the default_credit load in the next cell.
x_train, x_test, x_means, x_stds, y_train, y_test, y_means, y_stds = \
    load_UCI(dset_name='wine', splits=10, seed=42, separate_targets=True, save_dir='../data/')
print('Wine', x_train.shape, x_test.shape)

Wine (1438, 11) (160, 11)


In [5]:
# Load the default_credit dataset (first of 10 folds, seed 42); this replaces the wine arrays.
x_train, x_test, x_means, x_stds, y_train, y_test, y_means, y_stds = \
load_UCI(dset_name='default_credit', splits=10, seed=42, separate_targets=True, save_dir='../data/')
# One entry per feature column: 1 = continuous, k > 1 = categorical with k classes
# (entries > 1 are one-hot encoded by gauss_cat_to_flat).
input_dim_vec = [1, 2, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ,1 ,1 ,1 ,1 ,1]

In [6]:
# Turn the standardised categorical feature columns back into integer class codes.
# unnormalise_cat_vars modifies its first argument in place, so re-running this
# cell applies the transformation again to already-converted columns.
x_train = unnormalise_cat_vars(x_train, x_means, x_stds, input_dim_vec)
x_test = unnormalise_cat_vars(x_test, x_means, x_stds, input_dim_vec)
# target unnormalisation: the single target column is treated as a 2-class categorical
y_train = unnormalise_cat_vars(y_train, y_means, y_stds, [2])
y_test = unnormalise_cat_vars(y_test, y_means, y_stds, [2])

In [7]:
# Name of the dataset currently held in x_train/x_test; used below to index `names`.
dname = 'default_credit'
print(dname)

default_credit


In [8]:
# Sanity check: (rows, feature columns) of the training split.
x_train.shape

(27000, 23)

In [9]:
# Sanity check: (rows, target columns) of the training split.
y_train.shape

(27000, 1)

In [10]:
# Same per-column dimension vector as defined when loading default_credit
# (repeated here); its length is the number of feature columns.
input_dim_vec = [1, 2, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ,1 ,1 ,1 ,1 ,1]
len(input_dim_vec)

23

In [11]:
# Width of the flattened (one-hot expanded) representation fed to the encoder.
sum(input_dim_vec)

29

# Defining model

## imports

In [12]:
import torch
import sys
cuda = torch.cuda.is_available()
from torch.nn import MSELoss,CrossEntropyLoss
import torch.backends.cudnn as cudnn
from torch.distributions.normal import Normal

import torch.nn as nn
from torch.autograd import Variable
from torch.nn.functional import softplus
from torch.distributions import kl_divergence

from torch.optim.optimizer import Optimizer
import math

## Util functions

In [13]:
def cprint(color, text, **kwargs):
    """Print ``text`` wrapped in an ANSI colour escape sequence and flush stdout.

    ``color`` is a one-letter code: a(black) r(ed) g(reen) y(ellow) b(lue)
    p(urple/magenta) c(yan) w(hite). Prefix it with ``*`` (e.g. ``'*r'``) for
    the bold variant. Extra keyword arguments are forwarded to ``print``.
    An unknown colour letter raises ``KeyError``.
    """
    bold = color[0] == '*'
    if bold:
        color = color[1:]
    style_prefix = '1;' if bold else ''

    ansi_codes = dict(a='30', r='31', g='32', y='33', b='34', p='35', c='36', w='37')
    print("\x1b[%s%sm%s\x1b[0m" % (style_prefix, ansi_codes[color], text), **kwargs)
    sys.stdout.flush()

# (used in sub network)
def normal_parse_params(params, min_sigma=1e-3):
    """
    Build a component-wise independent ``torch.distributions.Normal`` from a
    2-D tensor of raw parameters (e.g. the output of a network).

    Along dimension 1 the first ``d // 2`` channels are used as the mean; the
    remaining channels are passed through softplus to obtain the standard
    deviation, so the input tensor is unconstrained. The standard deviation is
    then clamped from below at ``min_sigma`` for numerical stability.
    """
    half = params.shape[1] // 2
    loc, raw_scale = params[:, :half], params[:, half:]
    scale = softplus(raw_scale).clamp(min=min_sigma)
    return Normal(loc, scale)

## (used in the next function)
def torch_onehot(y, Nclass):
    """One-hot encode a 1-D tensor of class indices.

    ``y`` is cast to a LongTensor (CUDA variant if ``y`` is on the GPU) and the
    result is a ``(len(y), Nclass)`` tensor of that same integer type with a 1
    at each row's class index.
    """
    long_type = torch.cuda.LongTensor if y.is_cuda else torch.LongTensor
    labels = y.type(long_type)
    encoded = torch.zeros((labels.shape[0], Nclass)).type(labels.type())
    # scatter_ works in place and returns the same tensor
    return encoded.scatter_(1, labels.unsqueeze(1), 1)

## (used in the fit of the main network)
def gauss_cat_to_flat(x, input_dim_vec):
    """Expand a mixed continuous/categorical tensor into its flat representation.

    Column ``i`` of ``x`` is copied as-is when ``input_dim_vec[i] == 1`` and is
    replaced by a one-hot block of width ``input_dim_vec[i]`` (cast to the type
    of ``x``) when it is greater than 1. Any other value raises ``ValueError``.
    """
    def expand(col, width):
        if width == 1:
            return x[:, col].unsqueeze(1)
        if width > 1:
            return torch_onehot(x[:, col], width).type(x.type())
        raise ValueError('Error, invalid dimension value')

    pieces = [expand(col, width) for col, width in enumerate(input_dim_vec)]
    return torch.cat(pieces, dim=1)

## (also used in the fit of the main network)
def flat_to_gauss_cat(x, input_dim_vec):
    """Collapse a flat (one-hot / logit) tensor back to one column per variable.

    Walks over ``input_dim_vec`` while tracking the current offset into ``x``:
    a width of 1 copies the single column, a width ``k > 1`` replaces the next
    ``k`` columns by the index of their maximum (cast to the type of ``x``).
    Any other width raises ``ValueError``.
    """
    columns = []
    offset = 0
    for width in input_dim_vec:
        if width == 1:
            column = x[:, offset]
        elif width > 1:
            block = x[:, offset:offset + width]
            column = block.max(dim=1)[1].type(x.type())
        else:
            raise ValueError('Error, invalid dimension value')
        columns.append(column.unsqueeze(1))
        offset += width

    return torch.cat(columns, dim=1)

## (also used in the fit of the main network)
def to_variable(var=(), cuda=True, volatile=False):
    """Convert every item of ``var`` to a torch tensor/Variable and return them as a list.

    NumPy arrays become FloatTensors, items are moved to the GPU when ``cuda``
    is set and they are not already there, and anything that is not yet a
    ``Variable`` is wrapped in one (``volatile`` is forwarded to that wrapper).
    """
    def convert(item):
        if isinstance(item, np.ndarray):
            item = torch.from_numpy(item).type(torch.FloatTensor)
        if not item.is_cuda and cuda:
            item = item.cuda()
        if isinstance(item, Variable):
            return item
        return Variable(item, volatile=volatile)

    return [convert(item) for item in var]

## Skip connection

In [14]:
class SkipConnection(nn.Module):
    """
    Residual wrapper: the layers given to the constructor are chained into an
    ``nn.Sequential`` (stored as ``inner_net``) and the module returns
    ``input + inner_net(input)``.
    """

    def __init__(self, *args):
        super().__init__()
        self.inner_net = nn.Sequential(*args)

    def forward(self, input):
        transformed = self.inner_net(input)
        return input + transformed

def preact_leaky_MLPBlock(width):
    """Return a residual block of constant ``width``: LeakyReLU -> BatchNorm1d -> Linear, plus skip."""
    layers = [
        nn.LeakyReLU(),
        nn.BatchNorm1d(num_features=width),
        nn.Linear(width, width),
    ]
    return SkipConnection(*layers)

## Encoder

In [15]:
class MLP_preact_recognition_net(nn.Module):
    """Encoder MLP mapping ``input_dim`` features to ``2 * latent_dim`` outputs.

    Layout: Linear(input_dim, width) -> (depth - 1) residual blocks ->
    LeakyReLU -> BatchNorm1d -> Linear(width, 2 * latent_dim).
    """

    def __init__(self, input_dim, width, depth, latent_dim):
        super().__init__()
        stem = [nn.Linear(input_dim, width)]
        body = [preact_leaky_MLPBlock(width) for _ in range(depth - 1)]  ## *dependency
        head = [
            nn.LeakyReLU(),
            nn.BatchNorm1d(num_features=width),
            nn.Linear(width, latent_dim * 2),
        ]
        self.block = nn.Sequential(*(stem + body + head))

    def forward(self, x):
        return self.block(x)

## Generator

In [16]:
class MLP_preact_generator_net(nn.Module):
    """Decoder MLP mapping a ``latent_dim`` code to ``input_dim`` outputs.

    Layout: Linear(latent_dim, width) -> LeakyReLU -> BatchNorm1d ->
    (depth - 1) residual blocks -> Linear(width, input_dim).
    """

    def __init__(self, input_dim, width, depth, latent_dim):
        super().__init__()
        stem = [
            nn.Linear(latent_dim, width),
            nn.LeakyReLU(),
            nn.BatchNorm1d(num_features=width),
        ]
        body = [preact_leaky_MLPBlock(width) for _ in range(depth - 1)]  ## *dependency
        head = [nn.Linear(width, input_dim)]
        self.block = nn.Sequential(*(stem + body + head))

    def forward(self, x):
        return self.block(x)

## rms cat loglike

In [43]:
class rms_cat_loglike(nn.Module):
    """Per-variable reconstruction score for mixed continuous/categorical data.

    ``input_dim_vec`` lists one width per variable. For a width of 1 the score
    is the negative squared error between prediction and target; for a width
    ``k > 1`` it is the negative cross-entropy between the ``k`` predicted
    logits and the integer class target.

    Parameters
    ----------
    input_dim_vec : sequence of int
        Width of each variable in the flat prediction.
    reduction : str
        ``'none'`` returns the ``(batch, n_variables)`` matrix, ``'sum'`` its
        sum, ``'average'`` (or ``'mean'``) its mean.
    """

    def __init__(self, input_dim_vec, reduction='none'):
        super(rms_cat_loglike, self).__init__()
        self.reduction = reduction
        self.input_dim_vec = input_dim_vec
        self.mse = MSELoss(reduction='none')  # takes(input, target)
        self.ce = CrossEntropyLoss(reduction='none')

    def forward(self, x, y):
        """Score predictions ``x`` (flat layout) against targets ``y`` (one column per variable).

        Raises
        ------
        Exception
            If a categorical variable is present but ``x`` and ``y`` have the
            same number of columns (targets look one-hot instead of integer).
        ValueError
            If a width in ``input_dim_vec`` is < 1 or ``reduction`` is unknown.
        """
        log_prob_vec = []
        cum_dims = 0  # offset of the current variable inside the flat prediction
        for idx, dims in enumerate(self.input_dim_vec):
            if dims == 1:
                # Gaussian_case
                log_prob_vec.append(-self.mse(x[:, cum_dims], y[:, idx]).unsqueeze(1))
                cum_dims += 1

            elif dims > 1:
                if x.shape[1] == y.shape[1]:
                    raise Exception('Input and target seem to be in flat format. Need integer cat targets.')

                # .long() keeps the tensor on whatever device it already lives on
                tget = y[:, idx].long()

                log_prob_vec.append(-self.ce(x[:, cum_dims:cum_dims + dims], tget).unsqueeze(1))
                cum_dims += dims

            else:
                raise ValueError('Error, invalid dimension value')

        log_prob_vec = torch.cat(log_prob_vec, dim=1)

        if self.reduction == 'none':
            return log_prob_vec
        if self.reduction == 'sum':
            return log_prob_vec.sum()
        if self.reduction in ('average', 'mean'):
            return log_prob_vec.mean()
        # Previously an unknown reduction silently returned None.
        raise ValueError("Unknown reduction %r; expected 'none', 'sum' or 'average'" % (self.reduction,))

## sub network

In [44]:
class VAE_gauss_cat(nn.Module):
    """VAE over mixed continuous/categorical inputs.

    The encoder and decoder operate on the flat (one-hot expanded) layout whose
    width is ``sum(input_dim_vec)``; the reconstruction term is scored per
    variable by ``rms_cat_loglike``. ``pred_sig=True`` is not implemented.
    """

    def __init__(self, input_dim_vec, width, depth, latent_dim, pred_sig=False):
        super().__init__()

        self.input_dim_vec = input_dim_vec
        flat_dim = sum(input_dim_vec)

        self.encoder = MLP_preact_recognition_net(flat_dim, width, depth, latent_dim) ## *dependency
        if pred_sig:
            raise NotImplementedError()
        self.decoder = MLP_preact_generator_net(flat_dim, width, depth, latent_dim)
        self.rec_loglike = rms_cat_loglike(self.input_dim_vec, reduction='none') ## *dependency
        self.pred_sig = pred_sig

    def encode(self, x):
        """Map a flat input batch to the approximate posterior (a Normal distribution)."""
        return normal_parse_params(self.encoder(x), 1e-3)

    def decode(self, z_sample):
        """Map latent samples to flat reconstruction parameters."""
        return self.decoder(z_sample)

    def vlb(self, prior, approx_post, x, rec_params):
        """Per-sample lower bound: reconstruction score minus KL(approx_post || prior)."""
        batch = x.shape[0]
        if not self.pred_sig:
            rec = self.rec_loglike(rec_params, x).view(batch, -1).sum(-1)
        kl = kl_divergence(approx_post, prior).view(batch, -1).sum(-1)
        return rec - kl

    def iwlb(self, prior, approx_post, x, K=50):
        """Per-sample importance-weighted bound using ``K`` samples from ``approx_post``."""
        batch = x.shape[0]
        estimates = []
        for _ in range(K):
            latent = approx_post.rsample()
            rec_params = self.decode(latent)
            if not self.pred_sig:
                rec_loglike = self.rec_loglike(rec_params, x).view(batch, -1).sum(-1)

            log_prior = prior.log_prob(latent).view(batch, -1).sum(-1)
            log_proposal = approx_post.log_prob(latent).view(batch, -1).sum(-1)

            weight = rec_loglike + log_prior - log_proposal
            estimates.append(weight[:, None])

        # log of the average importance weight
        return torch.logsumexp(torch.cat(estimates, 1), 1) - np.log(K)

## Base network

In [45]:
class BaseNet(object):
    """Shared bookkeeping for model wrappers.

    Subclasses are expected to provide ``self.model``, ``self.optimizer``,
    ``self.lr``, ``self.epoch`` and ``self.schedule``; the methods here only
    read or update those attributes.
    """

    def __init__(self):
        cprint('c', '\nNet:')

    def get_nb_parameters(self):
        """Return the total number of scalar parameters in ``self.model``."""
        # Builtin sum: calling np.sum on a generator is deprecated in NumPy.
        return sum(p.numel() for p in self.model.parameters())

    def set_mode_train(self, train=True):
        """Put ``self.model`` in training mode (``train=True``) or evaluation mode."""
        if train:
            self.model.train()
        else:
            self.model.eval()

    def update_lr(self, epoch, gamma=0.99):
        """Advance the epoch counter and, if scheduled, decay the learning rate by ``gamma``.

        With ``self.schedule`` set to ``None`` nothing is decayed. An empty
        schedule decays on every call; otherwise only when ``epoch`` is listed.
        """
        self.epoch += 1
        if self.schedule is not None:
            if len(self.schedule) == 0 or epoch in self.schedule:
                self.lr *= gamma
                print('learning rate: %f  (%d)\n' % (self.lr, epoch))
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr

    def save(self, filename):
        """Serialise epoch, lr, the whole model object and the optimizer to ``filename``."""
        cprint('c', 'Writting %s\n' % filename)
        torch.save({
            'epoch': self.epoch,
            'lr': self.lr,
            'model': self.model,
            'optimizer': self.optimizer}, filename)

    def load(self, filename):
        """Restore the state written by ``save`` and return the stored epoch.

        NOTE(review): the checkpoint stores full pickled objects, so it must
        only be loaded from trusted files; recent PyTorch versions may require
        ``weights_only=False`` for such checkpoints -- confirm against the
        installed version.
        """
        cprint('c', 'Reading %s\n' % filename)
        state_dict = torch.load(filename)
        self.epoch = state_dict['epoch']
        self.lr = state_dict['lr']
        self.model = state_dict['model']
        self.optimizer = state_dict['optimizer']
        print('  restoring epoch: %d, lr: %f' % (self.epoch, self.lr))
        return self.epoch

## Rectified Adam (RAdam) Optimizer

In [46]:
class RAdam(Optimizer):
    """Rectified Adam (RAdam) optimizer.

    Keeps Adam-style running averages of the gradient and its square. While the
    variance-rectification quantity ``N_sma`` is below 5 (the first few steps)
    the update is a bias-corrected momentum step; afterwards it is an adaptive
    step scaled by the rectification factor computed in ``step``.

    Parameters
    ----------
    params : iterable
        Parameters or parameter groups to optimise.
    lr : float
        Learning rate (must be >= 0).
    betas : tuple of float
        Decay rates for the first and second moment averages, each in [0, 1).
    eps : float
        Added to the denominator of the adaptive step (must be >= 0).
    weight_decay : float
        Multiplicative weight decay applied to the parameters before the update.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Small cache of [step, N_sma, step_size] keyed by step % 10, shared by all parameters.
        self.buffer = [[None, None, None] for ind in range(10)]
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
        """Perform one optimisation step.

        ``closure`` (optional) is called once at the start and its return value
        is returned; otherwise ``None`` is returned. Sparse gradients raise
        ``RuntimeError``.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # All arithmetic is done on an fp32 copy, written back at the end.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # Keyword forms replace the deprecated (scalar, tensor, ...) overloads.
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1
                buffered = self.buffer[int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        step_size = math.sqrt(
                            (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
                                        N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = step_size

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                # more conservative since it's an approximated value
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])

                p.data.copy_(p_data_fp32)

        return loss

## Main network

In [47]:
class VAE_gauss_cat_net(BaseNet):
    """Training/evaluation wrapper around ``VAE_gauss_cat``.

    Parameters
    ----------
    input_dim_vec : sequence of int
        Width of each input variable (1 = continuous, k > 1 = categorical).
    width, depth, latent_dim : int
        Architecture settings forwarded to ``VAE_gauss_cat``.
    pred_sig : bool
        Forwarded to the model; ``True`` is not implemented there.
    lr : float
        Learning rate of the RAdam optimizer.
    cuda : bool
        Whether model, prior and inputs are moved to the GPU.
    flatten : bool
        If True, inputs arrive with one column per variable and are expanded to
        the flat layout internally; if False, inputs are already flat and are
        collapsed to obtain the per-variable targets.
    """

    def __init__(self, input_dim_vec, width, depth, latent_dim, pred_sig=False, lr=1e-3, cuda=True, flatten=True):
        super(VAE_gauss_cat_net, self).__init__()
        cprint('y', 'VAE_gauss_net')

        self.cuda = cuda
        self.input_dim_vec = input_dim_vec
        self.input_dim = 0
        for e in self.input_dim_vec:
            self.input_dim += e
        self.flatten = flatten

        self.width = width
        self.depth = depth
        self.latent_dim = latent_dim
        self.lr = lr
        self.pred_sig = pred_sig

        # Here create the network
        self.create_net()

        # Here create the optimizer
        self.create_opt()

        self.epoch = 0
        self.schedule = None

        # Standard normal prior over the latent code, on the same device as the model.
        loc, scale = torch.zeros(latent_dim), torch.ones(latent_dim)
        if self.cuda:
            loc, scale = loc.cuda(), scale.cuda()
        self.prior = Normal(loc=loc, scale=scale)
        self.vlb_scale = 1 / len(self.input_dim_vec)  # scale for dimensions of input so we can use same LR always

    def create_net(self):
        """Build the model with a fixed seed (42) and report its parameter count."""
        torch.manual_seed(42)
        torch.cuda.manual_seed(42)
        self.model = VAE_gauss_cat(self.input_dim_vec, self.width, self.depth, self.latent_dim, self.pred_sig)
        if self.cuda:
            self.model = self.model.cuda()
            cudnn.benchmark = True
        print('    Total params: %.2fM' % (self.get_nb_parameters() / 1000000.0))

    def create_opt(self):
        """Create the RAdam optimizer over all model parameters."""
        self.optimizer = RAdam(self.model.parameters(), lr=self.lr)

    def _prepare_inputs(self, x):
        """Return ``(x, x_flat)``: per-variable targets and flat encoder input, as tensors on the right device."""
        if self.flatten:
            x_flat = gauss_cat_to_flat(x, self.input_dim_vec)
        else:
            x_flat = x
            x = flat_to_gauss_cat(x, self.input_dim_vec)
        x, x_flat = to_variable(var=(x, x_flat), cuda=self.cuda)
        return x, x_flat

    def fit(self, x):
        """Run one optimisation step on batch ``x``; return ``(mean vlb, rec_params)``."""
        self.set_mode_train(train=True)

        x, x_flat = self._prepare_inputs(x)
        self.optimizer.zero_grad()

        approx_post = self.model.encode(x_flat)
        z_sample = approx_post.rsample()
        rec_params = self.model.decode(z_sample)

        vlb = self.model.vlb(self.prior, approx_post, x, rec_params)
        # The loss is scaled by 1 / n_variables; the returned value is the unscaled bound.
        loss = (- vlb * self.vlb_scale).mean()

        loss.backward()
        self.optimizer.step()

        return vlb.mean().item(), rec_params

    def eval(self, x, sample=False):
        """Evaluate batch ``x`` in eval mode; return ``(mean vlb, rec_params)``.

        With ``sample=False`` the posterior mean is decoded, otherwise a sample.
        """
        self.set_mode_train(train=False)

        x, x_flat = self._prepare_inputs(x)
        approx_post = self.model.encode(x_flat)
        if sample:
            z_sample = approx_post.sample()
        else:
            z_sample = approx_post.loc
        rec_params = self.model.decode(z_sample)

        vlb = self.model.vlb(self.prior, approx_post, x, rec_params)

        return vlb.mean().item(), rec_params

    def eval_iw(self, x, k=50):
        """Return the mean importance-weighted bound of batch ``x`` using ``k`` samples."""
        self.set_mode_train(train=False)

        x, x_flat = self._prepare_inputs(x)

        # Only a float is returned, so no autograd graph is needed for the k decoder passes.
        with torch.no_grad():
            # The model exposes `encode` (there is no `recognition_encode`), and it
            # expects the flat representation.
            approx_post = self.model.encode(x_flat)
            iw_lb = self.model.iwlb(self.prior, approx_post, x, k)
        return iw_lb.mean().item()

    def recongnition(self, x, grad=False, flatten=None):
        """Encode ``x`` and return the approximate posterior distribution.

        ``flatten`` defaults to the instance setting. ``grad=True`` marks ``x``
        as requiring gradients (and is incompatible with ``flatten``);
        otherwise the encoder runs without building an autograd graph.
        """
        if flatten is None:
            flatten = self.flatten
        if flatten and grad:
            raise Exception('flatten and grad options are not compatible')
        self.set_mode_train(train=False)
        if flatten:
            x = gauss_cat_to_flat(x, self.input_dim_vec)
        if grad:
            if not x.requires_grad:
                x.requires_grad = True
            return self.model.encode(x)

        x, = to_variable(var=(x,), cuda=self.cuda)
        # `volatile=True` no longer has any effect in PyTorch; no_grad is its replacement.
        with torch.no_grad():
            return self.model.encode(x)

    def regenerate(self, z, grad=False, unflatten=False):
        """Decode latent codes ``z``.

        With ``unflatten=True`` the output is collapsed to one column per
        variable; otherwise ``selective_softmax`` is applied to the flat output.
        ``grad=True`` marks ``z`` as requiring gradients (and is incompatible
        with ``unflatten``); otherwise decoding runs without an autograd graph.
        """
        if unflatten and grad:
            raise Exception('flatten and grad options are not compatible')
        self.set_mode_train(train=False)
        if grad:
            if not z.requires_grad:
                z.requires_grad = True
            out = self.model.decode(z)
        else:
            z, = to_variable(var=(z,), cuda=self.cuda)
            with torch.no_grad():
                out = self.model.decode(z)

        if unflatten:
            out = flat_to_gauss_cat(out, self.input_dim_vec)
        else:
            # NOTE(review): `selective_softmax` is not defined in the part of the
            # notebook reviewed here -- confirm it is defined before this is called.
            out = selective_softmax(out, self.input_dim_vec, grad=grad)

        if self.pred_sig:
            raise Exception('Not implemented')
        else:
            return out

In [48]:
# Pick the architecture settings that belong to the current dataset `dname`.
width = widths[names.index(dname)]
depth = depths[names.index(dname)] # number of hidden layers
latent_dim = latent_dims[names.index(dname)]

# Training hyper-parameters.
batch_size = 128
nb_epochs = 2500
lr = 1e-4
early_stop = 200

# `cuda` was already set in the imports cell; kept here for reference only.
# cuda = torch.cuda.is_available()

In [49]:
# Build the VAE wrapper; flatten=True means inputs have one column per variable
# and are one-hot expanded inside the network wrapper.
net = VAE_gauss_cat_net(input_dim_vec, width, depth, latent_dim, pred_sig=False, lr=lr, cuda=cuda, flatten=True)

[36m
Net:[0m
[33mVAE_gauss_net[0m
    Total params: 0.39M


  return np.sum(p.numel() for p in self.model.parameters())


# Data

In [50]:
import torch.utils.data as data

class Datafeed(data.Dataset):
    """Minimal in-memory dataset over an indexable collection.

    Indexing returns the (optionally transformed) sample, or a
    ``(sample, target)`` pair when targets were supplied. The transform is
    applied to the sample only.
    """

    def __init__(self, x_train, y_train=None, transform=None):
        self.data, self.targets, self.transform = x_train, y_train, transform

    def __getitem__(self, index):
        sample = self.data[index]
        if self.transform is not None:
            sample = self.transform(sample)
        if self.targets is None:
            return sample
        return sample, self.targets[index]

    def __len__(self):
        return len(self.data)

In [51]:
# The features are passed as both data and targets, so each item is an (x, x) pair.
trainset = Datafeed(x_train, x_train, transform=None)
valset = Datafeed(x_test, x_test, transform=None)

# Training

In [52]:
def _prepare_vae_batch(x, y, flat_ims, Nclass):
    """Apply the optional flattening / one-hot label concatenation shared by the train and dev loops."""
    if flat_ims:
        x = x.view(x.shape[0], -1)
    if Nclass is not None:
        y_oh = torch_onehot(y, Nclass).type(x.type())
        x = torch.cat([x, y_oh], 1)
    return x


def _show_image_grid(grid, out_path, script_mode):
    """Render an image grid tensor; save it to ``out_path`` in script mode, otherwise display it."""
    import matplotlib.pyplot as plt

    fig = plt.figure()
    plt.imshow(np.transpose(grid.numpy(), (1, 2, 0)), interpolation='nearest')
    if script_mode:
        plt.savefig(out_path)
        # Two figures are produced per epoch; close them so they do not pile up.
        plt.close(fig)
    else:
        plt.show()


def train_VAE(net, name, batch_size, nb_epochs, trainset, valset, cuda, flat_ims=False,
              train_plot=False, Nclass=None, early_stop=None, script_mode=False):
    """Train ``net`` on ``trainset``, evaluating on ``valset`` after every epoch.

    Args:
        net: model wrapper. The loop uses ``set_mode_train``, ``fit``, ``eval``,
            ``update_lr``, ``save`` and ``get_nb_parameters``; with
            ``train_plot`` it also uses ``recongnition``, ``regenerate`` and
            ``latent_dim``.
        name: path prefix; checkpoints go to ``name + '_models'`` and figures
            to ``name + '_results'`` (both directories are created).
        batch_size: batch size for both data loaders.
        nb_epochs: maximum number of epochs.
        trainset, valset: datasets yielding ``(x, y)`` pairs.
        cuda: truthy to enable ``pin_memory`` in the data loaders.
        flat_ims: flatten each batch to ``(batch, -1)`` before feeding the net.
        train_plot: plot reconstructions and prior samples every epoch, and
            the cost curves at the end.
        Nclass: if given, ``y`` is one-hot encoded with this many classes and
            concatenated to ``x``.
        early_stop: stop once more than this many epochs have passed since the
            best dev value; ``None`` disables early stopping.
        script_mode: with ``train_plot``, save the per-epoch figures instead of
            showing them.

    Returns:
        ``(vlb_train, vlb_dev)``: arrays of length ``nb_epochs`` holding the
        per-epoch sample-weighted mean cost. Entries for epochs skipped by
        early stopping stay at zero.
    """
    models_dir = name + '_models'
    results_dir = name + '_results'
    mkdir(models_dir)
    mkdir(results_dir)

    # The loaders differ between CPU and GPU runs only in whether memory is pinned.
    pin_memory = bool(cuda)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True,
                                              pin_memory=pin_memory, num_workers=3)
    valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False,
                                            pin_memory=pin_memory, num_workers=3)

    ## ---------------------------------------------------------------------------------------------------------------------
    # net dims
    cprint('c', '\nNetwork:')

    epoch = 0

    ## ---------------------------------------------------------------------------------------------------------------------
    # train
    cprint('c', '\nTrain:')

    print('  init cost variables:')
    vlb_train = np.zeros(nb_epochs)
    vlb_dev = np.zeros(nb_epochs)
    best_vlb = -np.inf
    best_vlb_train = -np.inf
    best_epoch = 0

    nb_its_dev = 1  # evaluate on the dev set every epoch
    epochs_run = 0  # epochs actually executed (smaller than nb_epochs after early stopping)

    tic0 = time.time()
    for i in range(epoch, nb_epochs):
        epochs_run += 1
        net.set_mode_train(True)

        tic = time.time()
        nb_samples = 0
        for x, y in trainloader:
            x = _prepare_vae_batch(x, y, flat_ims, Nclass)

            cost, _ = net.fit(x)

            # Weight by batch size so the epoch value is a per-sample mean.
            vlb_train[i] += cost * len(x)
            nb_samples += len(x)

        vlb_train[i] /= nb_samples

        toc = time.time()

        # ---- print
        print("it %d/%d, vlb %f, " % (i, nb_epochs, vlb_train[i]), end="")
        cprint('r', '   time: %f seconds\n' % (toc - tic))
        net.update_lr(i)

        if vlb_train[i] > best_vlb_train:
            best_vlb_train = vlb_train[i]

        # ---- dev
        if i % nb_its_dev == 0:
            nb_samples = 0
            for x, y in valloader:
                x = _prepare_vae_batch(x, y, flat_ims, Nclass)

                cost, _ = net.eval(x)

                vlb_dev[i] += cost * len(x)
                nb_samples += len(x)

            vlb_dev[i] /= nb_samples

            cprint('g', '    vlb %f (%f)\n' % (vlb_dev[i], best_vlb))

            if train_plot:
                # Reconstructions of the last dev batch.
                zz = net.recongnition(x).sample()
                o = net.regenerate(zz)
                try:
                    o = o.cpu()
                except AttributeError:
                    # No `.cpu()` on the output: fall back to its `.loc` attribute.
                    o = o.loc.cpu()
                if len(x.shape) == 2:
                    side = int(np.sqrt(x.shape[1]))
                    x = x.view(-1, 1, side, side).data
                    o = o.view(-1, 1, side, side).data

                # save_image(torch.cat([x[:8], o[:8]]), results_dir + '/rec_%d.png' % i, nrow=8)
                _show_image_grid(make_grid(torch.cat([x[:10], o[:10]]), nrow=10),
                                 results_dir + '/rec%d.png' % i, script_mode)

                # Decodings of 36 draws from normal(loc=0, scale=1).
                z_sample = normal(loc=0.0, scale=1.0, size=(36, net.latent_dim))
                x_rec = net.regenerate(z_sample)
                try:
                    x_rec = x_rec.cpu()
                except AttributeError:
                    x_rec = x_rec.loc.cpu()
                if len(x_rec.shape) == 2:
                    side = int(np.sqrt(x_rec.shape[1]))
                    x_rec = x_rec.view(-1, 1, side, side)
                _show_image_grid(make_grid(x_rec, nrow=6),
                                 results_dir + '/sample%d.png' % i, script_mode)

            # Only compare against the best value on epochs where the dev set was
            # evaluated; otherwise vlb_dev[i] is still its zero initial value.
            if vlb_dev[i] > best_vlb:
                best_vlb = vlb_dev[i]
                best_epoch = i
                net.save(models_dir + '/theta_best.dat')

        if early_stop is not None and (i - best_epoch) > early_stop:
            break

    net.save(models_dir + '/theta_last.dat')
    toc0 = time.time()
    # Average over the epochs that really ran, not the requested maximum.
    runtime_per_it = (toc0 - tic0) / float(max(epochs_run, 1))
    cprint('r', '   average time: %f seconds\n' % runtime_per_it)

    ## ---------------------------------------------------------------------------------------------------------------------
    # results
    cprint('c', '\nRESULTS:')
    nb_parameters = net.get_nb_parameters()
    best_cost_dev = best_vlb
    best_cost_train = best_vlb_train

    print('  best_vlb_dev: %f' % best_cost_dev)
    print('  best_vlb_train: %f' % best_cost_train)
    print('  nb_parameters: %d (%s)\n' % (nb_parameters, humansize(nb_parameters)))

    ## ---------------------------------------------------------------------------------------------------------------------
    # fig cost vs its
    if not train_plot:
        # NOTE(review): this switches the global matplotlib backend although nothing is
        # drawn on this path; kept for backward compatibility -- confirm it is still wanted.
        import matplotlib
        matplotlib.use('agg')
    import matplotlib.pyplot as plt
    if train_plot:
        plt.figure()
        plt.plot(np.clip(vlb_train, -1000, 1000), 'r')
        plt.plot(np.clip(vlb_dev[::nb_its_dev], -1000, 1000), 'b')
        plt.legend(['cost_train', 'cost_dev'])
        plt.ylabel('vlb')
        plt.xlabel('it')
        plt.grid(True)
        plt.savefig(results_dir+'/train_cost.png')
        plt.show()
    return vlb_train, vlb_dev


In [53]:
# Path prefix for this run; train_VAE appends '_models' and '_results' to it.
save_dir = '../saves/fc_preact_VAE_NEW(300)_' + dname

In [54]:
import time

In [56]:
# Run training; the per-epoch train / dev cost curves come back as arrays.
vlb_train, vlb_dev = train_VAE(
    net=net,
    name=save_dir,
    batch_size=batch_size,
    nb_epochs=nb_epochs,
    trainset=trainset,
    valset=valset,
    cuda=cuda,
    flat_ims=False,
    train_plot=False,
    early_stop=early_stop,
)

[36m
Network:[0m
[36m
Train:[0m
  init cost variables:
it 0/2500, vlb -16.290074, [31m   time: 2.343825 seconds
[0m
[32m    vlb -11.441558 (-inf)
[0m
[36mWritting ../saves/fc_preact_VAE_NEW(300)_default_credit_models/theta_best.dat
[0m
it 1/2500, vlb -14.814891, [31m   time: 2.291842 seconds
[0m
[32m    vlb -10.454740 (-11.441558)
[0m
[36mWritting ../saves/fc_preact_VAE_NEW(300)_default_credit_models/theta_best.dat
[0m
it 2/2500, vlb -14.097355, [31m   time: 2.231448 seconds
[0m
[32m    vlb -10.100600 (-10.454740)
[0m
[36mWritting ../saves/fc_preact_VAE_NEW(300)_default_credit_models/theta_best.dat
[0m
it 3/2500, vlb -13.544256, [31m   time: 2.344557 seconds
[0m
[32m    vlb -9.498764 (-10.100600)
[0m
[36mWritting ../saves/fc_preact_VAE_NEW(300)_default_credit_models/theta_best.dat
[0m
it 4/2500, vlb -13.129443, [31m   time: 2.267055 seconds
[0m
[32m    vlb -9.556649 (-9.498764)
[0m
it 5/2500, vlb -12.812818, [31m   time: 2.539868 seconds
[0m
[32m    v

[32m    vlb -7.879159 (-7.879782)
[0m
[36mWritting ../saves/fc_preact_VAE_NEW(300)_default_credit_models/theta_best.dat
[0m
it 57/2500, vlb -10.152980, [31m   time: 2.482724 seconds
[0m
[32m    vlb -7.956826 (-7.879159)
[0m
it 58/2500, vlb -10.162180, [31m   time: 2.617347 seconds
[0m
[32m    vlb -7.926289 (-7.879159)
[0m
it 59/2500, vlb -10.143148, [31m   time: 2.491330 seconds
[0m
[32m    vlb -7.918534 (-7.879159)
[0m
it 60/2500, vlb -10.141480, [31m   time: 2.712121 seconds
[0m
[32m    vlb -7.917398 (-7.879159)
[0m
it 61/2500, vlb -10.100748, [31m   time: 2.459818 seconds
[0m
[32m    vlb -7.854641 (-7.879159)
[0m
[36mWritting ../saves/fc_preact_VAE_NEW(300)_default_credit_models/theta_best.dat
[0m
it 62/2500, vlb -10.107993, [31m   time: 2.572758 seconds
[0m
[32m    vlb -7.880467 (-7.854641)
[0m
it 63/2500, vlb -10.106003, [31m   time: 2.512296 seconds
[0m
[32m    vlb -7.863739 (-7.854641)
[0m
it 64/2500, vlb -10.102460, [31m   time: 2.536662 secon

Exception in thread Thread-168:
Traceback (most recent call last):
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
    fd = df.detach()
  File "/home/ali/anaconda3/envs/py38/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/home/ali/anaconda3/envs/py38/lib

KeyboardInterrupt: 