In [1]:
#Import all the necessary packages
import numpy as np
import tensorflow as tf
import edward as ed
from edward.models import Normal, BernoulliWithSigmoidProbs, Bernoulli, Dirichlet, Empirical
from edward.util import Progbar
import time
import math
import matplotlib.pyplot as pyplot
from sklearn.metrics import roc_curve, auc
import random

  return f(*args, **kwds)


# Probabilistic PCA with variations

In [None]:
#Every dimension of the input data lies in [-1, 1],
#which motivates the use of non-Gaussian observation models
#Input: 
#DATA: the data whose dimension is to be reduced, shape [sample_size, correlation_count]
#latent_dim: the latent dimension
#dimension_reduction_coef_var: spread of the prior on the coefficients (passed as the Normal `scale`)
#observation_var: spread of the Gaussian observation noise (passed as the Normal `scale`)
#sigmoid: 'last' (sigmoid applied after the noise is added), 'first' (sigmoid applied before the noise is added), or 'no_sigmoid'
#inference_method: the method to infer the latent variables; currently unused, KLqp is always run
#Output:
#qw: the "posterior" of dimension reduction coefficients
#qz: the "posterior" of latent variables
def unsupervised_dimension_reduction(DATA, latent_dim, dimension_reduction_coef_var, 
                                    observation_var, sigmoid='last', inference_method='KLqp'):
    """Fit a probabilistic-PCA-style model to DATA with Edward's KLqp.

    Args:
        DATA: 2-D array; DATA.shape[0] is the number of samples and
            DATA.shape[1] the number of observed dimensions. It is bound to
            the model transposed (DATA.T).
        latent_dim: number of latent dimensions.
        dimension_reduction_coef_var: multiplier used as the `scale` of the
            Normal prior on the projection coefficients w.
        observation_var: multiplier used as the `scale` of the Normal
            observation noise.
        sigmoid: one of 'last' (noise added, then sigmoid rescaled to
            [-1, 1]), 'first' (sigmoid rescaled to [-1, 1], then noise) or
            'no_sigmoid' (plain Gaussian observation).
        inference_method: accepted for interface compatibility but not used;
            KLqp is always run.

    Returns:
        (qw, qz): the Normal variational approximations for the coefficients
        (shape [correlation_count, latent_dim]) and for the latent variables
        (shape [sample_size, latent_dim]).

    Raises:
        ValueError: if `sigmoid` is not one of the three supported modes.
    """
    # Validate first so a typo in the mode fails with a clear message instead
    # of reaching Edward with `targets = None`.
    valid_modes = ('last', 'no_sigmoid', 'first')
    if sigmoid not in valid_modes:
        raise ValueError("sigmoid must be one of %s, got %r" % (valid_modes, sigmoid))

    sample_size = DATA.shape[0]
    correlation_count = DATA.shape[1]
    # The original code reset observation_var to None whenever `sigmoid` was
    # truthy (always, for a non-empty string), which made every branch below
    # fail on `None * tf.ones(...)`. All three modes need the noise scale.
    w = Normal(loc=tf.zeros([correlation_count, latent_dim]), 
               scale=dimension_reduction_coef_var * tf.ones([correlation_count, latent_dim]))
    z = Normal(loc=tf.zeros([sample_size, latent_dim]), 
               scale=tf.ones([sample_size, latent_dim]))
    targets = None
    if sigmoid == 'last':
        logits = Normal(loc=tf.matmul(w, z, transpose_b=True), 
                   scale=observation_var * tf.ones([correlation_count, sample_size]))
        t = tf.sigmoid(logits)
        # NOTE(review): `targets` here is a deterministic tensor, not a random
        # variable. Presumably KLqp only adds likelihood terms for random
        # variables in `data`, in which case DATA would not influence the fit
        # in this mode -- TODO confirm against Edward and revisit the model.
        targets = 2 * t - 1
    elif sigmoid == 'no_sigmoid':
        targets = Normal(loc=tf.matmul(w, z, transpose_b=True), 
                   scale=observation_var * tf.ones([correlation_count, sample_size]))
    elif sigmoid == 'first':
        t = tf.sigmoid(tf.matmul(w, z, transpose_b=True))
        t = 2 * t - 1
        targets = Normal(loc=t, 
                         scale=observation_var * tf.ones([correlation_count, sample_size]))
    # Fully factorised Normal approximations; softplus keeps the scales positive.
    qw = Normal(loc=tf.Variable(tf.random_normal([correlation_count, latent_dim])),
            scale=tf.nn.softplus(tf.Variable(tf.random_normal([correlation_count, latent_dim]))))
    qz = Normal(loc=tf.Variable(tf.random_normal([sample_size, latent_dim])),
            scale=tf.nn.softplus(tf.Variable(tf.random_normal([sample_size, latent_dim]))))
    inference = ed.KLqp({w: qw, z: qz}, data={targets: DATA.T})
    inference.run(n_iter=500, n_print=100, n_samples=100)
    return qw, qz

In [None]:
#Every dimension of the input data lies in [-1, 1],
#which motivates the use of non-Gaussian observation models
#Input: 
#DATA: the data whose latent representation is to be inferred, shape [sample_size, correlation_count]
#qw: the probability distribution of the dimension reduction coefficients
#latent_dim: the latent dimension
#observation_var: spread of the Gaussian observation noise (passed as the Normal `scale`)
#sigmoid: 'last' (sigmoid applied after the noise is added), 'first' (sigmoid applied before the noise is added), or 'no_sigmoid'
#Inference is always run with KLqp.
#Output:
#qz: the "posterior" of latent variables
def learn_latent_given_w(DATA, qw, latent_dim, observation_var, sigmoid=False):
    """Infer latent variables for DATA while reusing given coefficients qw.

    Args:
        DATA: 2-D array; DATA.shape[0] is the number of samples and
            DATA.shape[1] the number of observed dimensions. It is bound to
            the model transposed (DATA.T).
        qw: distribution (or tensor) of projection coefficients with shape
            [correlation_count, latent_dim]; it is used directly in the
            matrix product and is not updated by the inference, which only
            has z as a latent variable.
        latent_dim: number of latent dimensions.
        observation_var: multiplier used as the `scale` of the Normal
            observation noise.
        sigmoid: one of 'last', 'first' or 'no_sigmoid'. The default (False)
            is kept for interface compatibility but is not a valid mode.

    Returns:
        qz: the Normal variational approximation for the latent variables,
        shape [sample_size, latent_dim].

    Raises:
        ValueError: if `sigmoid` is not one of the three supported modes.
    """
    # Validate first: with an unknown mode no branch below would run and
    # `targets` would stay None.
    valid_modes = ('last', 'no_sigmoid', 'first')
    if sigmoid not in valid_modes:
        raise ValueError("sigmoid must be one of %s, got %r" % (valid_modes, sigmoid))

    sample_size = DATA.shape[0]
    correlation_count = DATA.shape[1]
    z = Normal(loc=tf.zeros([sample_size, latent_dim]), 
               scale=tf.ones([sample_size, latent_dim]))
    targets = None
    # The original referenced an undefined name `w` in every branch; the
    # coefficients come from the `qw` argument.
    if sigmoid == 'last':
        logits = Normal(loc=tf.matmul(qw, z, transpose_b=True), 
                   scale=observation_var * tf.ones([correlation_count, sample_size]))
        t = tf.sigmoid(logits)
        # NOTE(review): `targets` is a deterministic tensor in this mode;
        # presumably KLqp adds no likelihood term for it -- TODO confirm.
        targets = 2 * t - 1
    elif sigmoid == 'no_sigmoid':
        targets = Normal(loc=tf.matmul(qw, z, transpose_b=True), 
                   scale=observation_var * tf.ones([correlation_count, sample_size]))
    elif sigmoid == 'first':
        t = tf.sigmoid(tf.matmul(qw, z, transpose_b=True))
        t = 2 * t - 1
        targets = Normal(loc=t, 
                         scale=observation_var * tf.ones([correlation_count, sample_size]))
    # Softplus keeps the variational scale positive.
    qz = Normal(loc=tf.Variable(tf.random_normal([sample_size, latent_dim])),
            scale=tf.nn.softplus(tf.Variable(tf.random_normal([sample_size, latent_dim]))))
    inference = ed.KLqp({z: qz}, data={targets: DATA.T})
    inference.run(n_iter=500, n_print=100, n_samples=100)
    return qz

In [None]:
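# Bayesian logistic regression
# Input:
# qz: the input variables (latent representation or raw features), shape [sample_size, latent_dim]
# y_labels: the binary labels for this supervised learning
# latent_dim: the number of columns of qz
# regression_coef_var: spread of the prior on the coefficients (passed as the Normal `scale`); acts like a regularization term
# Output:
# qcoeff: the "posterior" of the regression coefficients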
def bayesian_logistics_regression(qz, y_labels, latent_dim, regression_coef_var):
    """Fit a Bayesian logistic regression on qz with Edward's KLqp.

    Args:
        qz: inputs multiplied against the coefficient row vector; the last
            dimension must be latent_dim.
        y_labels: observed values bound to the Bernoulli output.
        latent_dim: number of input columns / regression coefficients.
        regression_coef_var: multiplier used as the `scale` of the Normal
            prior on the coefficients.

    Returns:
        qcoeff: the Normal variational approximation for the coefficients,
        shape [1, latent_dim].
    """
    coef_shape = [1, latent_dim]

    # Prior over a single row of coefficients.
    regression_coef = Normal(loc=tf.zeros(coef_shape),
                             scale=regression_coef_var * tf.ones(coef_shape))

    # Bernoulli likelihood on the linear predictor qz . coef^T.
    linear_predictor = tf.matmul(qz, regression_coef, transpose_b=True)
    y = Bernoulli(logits=linear_predictor)

    # Variational family; softplus keeps the scale positive.
    posterior_loc = tf.Variable(tf.random_normal(coef_shape))
    posterior_scale = tf.nn.softplus(tf.Variable(tf.random_normal(coef_shape)))
    qcoeff = Normal(loc=posterior_loc, scale=posterior_scale)

    inference = ed.KLqp({regression_coef: qcoeff}, data={y: y_labels})
    inference.run(n_iter=500, n_print=100, n_samples=10)
    return qcoeff

In [None]:
#A wrapper that is created to automatically sample training and test set
#and compare different models
#return the area under the roc curve
def dimension_reduction_and_logistics_regression(latent_dim, 
                                                 dimension_reduction_coef_var, 
                                                 regression_coef_var, 
                                                 sigmoid, 
                                                 observation_var, 
                                                 DATA, 
                                                 labels):
    """Run dimension reduction followed by logistic regression and score it.

    Splits DATA/labels with split_train_test, learns the projection and the
    training latents, fits the regression on the training latents, infers
    test latents with the learned projection, and returns the area under the
    ROC curve on the test split.

    Args:
        latent_dim: number of latent dimensions.
        dimension_reduction_coef_var: forwarded to
            unsupervised_dimension_reduction.
        regression_coef_var: forwarded to bayesian_logistics_regression.
        sigmoid: observation mode forwarded to both latent-variable models.
        observation_var: observation noise scale forwarded to both
            latent-variable models.
        DATA: full feature matrix.
        labels: labels aligned with DATA.

    Returns:
        roc_area: area under the ROC curve on the test split.
    """
    train_DATA, train_label, test_DATA, test_label = split_train_test(DATA, labels)
    # roc_curve expects a 1-D label vector.
    test_label = test_label.reshape((test_label.shape[0],))
    qw, qz_train = unsupervised_dimension_reduction(train_DATA, latent_dim, 
                                              dimension_reduction_coef_var, 
                                              observation_var, sigmoid)
    qcoef = bayesian_logistics_regression(qz_train, train_label, latent_dim, regression_coef_var)
    qz_test = learn_latent_given_w(test_DATA, qw, latent_dim, observation_var, sigmoid)
    n_samples = 100
    # Monte Carlo estimate of the predictive probability: average the sigmoid
    # output over joint draws of test latents and coefficients. (The original
    # expression had unbalanced brackets and did not parse.)
    sampled_probas = [tf.sigmoid(tf.matmul(qz_test.sample(), qcoef.sample(), 
                                           transpose_b=True))
                      for _ in range(n_samples)]
    # Each draw is [n_test, 1]; average over draws, then take column 0.
    probas = tf.gather(tf.reduce_mean(tf.stack(sampled_probas), axis=0), 0, axis=1)
    score = probas.eval()
    fpr, tpr, thresholds = roc_curve(test_label, score)
    roc_area = auc(fpr, tpr)
    
    return roc_area

In [None]:
#A wrapper for raw bayesian logistics regression
#return the area under the roc curve
def raw_bayesian_logistics_regression(regression_coef_var, DATA, labels):
    """Fit Bayesian logistic regression on the raw features and score it.

    Args:
        regression_coef_var: forwarded to bayesian_logistics_regression.
        DATA: full feature matrix; DATA.shape[1] is the number of features.
        labels: labels aligned with DATA.

    Returns:
        roc_area: area under the ROC curve on the test split produced by
        split_train_test.
    """
    train_DATA, train_label, test_DATA, test_label = split_train_test(DATA, labels)
    dimension = DATA.shape[1]
    # roc_curve expects a 1-D label vector.
    test_label = test_label.reshape((test_label.shape[0],))
    # Cast so the features match the float32 coefficients in tf.matmul.
    # Assumes the splits are numeric arrays -- TODO confirm against
    # split_train_test.
    train_features = tf.cast(train_DATA, tf.float32)
    test_features = tf.cast(test_DATA, tf.float32)
    # The original passed an undefined name `qz` here; the regression inputs
    # are the raw training features.
    qcoef = bayesian_logistics_regression(train_features, train_label, dimension, regression_coef_var)
    n_samples = 100
    # Monte Carlo estimate of the predictive probability over coefficient
    # draws, mirroring dimension_reduction_and_logistics_regression.
    sampled_probas = [tf.sigmoid(tf.matmul(test_features, qcoef.sample(), 
                                           transpose_b=True))
                      for _ in range(n_samples)]
    # Each draw is [n_test, 1]; average over draws, then take column 0.
    probas = tf.gather(tf.reduce_mean(tf.stack(sampled_probas), axis=0), 0, axis=1)
    score = probas.eval()
    fpr, tpr, thresholds = roc_curve(test_label, score)
    roc_area = auc(fpr, tpr)
    return roc_area

In [None]:
regression_coef_var = 1
# NOTE(review): DATA and labels are not assigned anywhere in the visible
# notebook; the placeholder below must be completed (or the data loaded in an
# earlier cell) before this cell can run on a fresh kernel.
#DATA, labels = 
# Demo: fit the regression on the raw features and report the test ROC area.
roc_area = raw_bayesian_logistics_regression(regression_coef_var, DATA, labels)
print('The roc area achieved by raw bayesian logistics regression is %f' 
      % roc_area)

In [None]:
# Demo configuration for dimension reduction followed by logistic regression.
# (A duplicate `latent_dim = 20` that was immediately overwritten has been
# removed; the effective value was already 10.)
latent_dim = 10
dimension_reduction_coef_var = 1 
regression_coef_var = 1 
sigmoid = 'last'
observation_var = 0.01
roc_area = dimension_reduction_and_logistics_regression(latent_dim, 
                                                        dimension_reduction_coef_var,
                                                        regression_coef_var, 
                                                        sigmoid,
                                                        observation_var, 
                                                        DATA, 
                                                        labels)
print('The roc area achieved by bayesian logistics regression followed by dimension reduction is %f' 
      % roc_area)


In [None]:
#Parameter tuning for dimension reduction followed by bayesian logistic regression
# Exhaustive grid: 5 * 4 * 4 * 3 * 4 = 960 configurations, each evaluated 5
# times, so this cell is expensive to run.
latent_dims = [10, 20, 30, 40, 50]
dimension_reduction_coef_vars = [0.01, 0.1, 1, 2]
regression_coef_vars = [0.01, 0.1, 1, 2]
# The modes must match the branches in the model functions: 'last', 'first',
# 'no_sigmoid'. The grid previously listed 'before', which no branch handles.
sigmoids = ['last', 'first', 'no_sigmoid']
observation_vars = [0.01, 0.1, 1, 2]
counter = 0
# performance_dict[i] is the average AUC of the configuration in config_dict[i].
performance_dict = {}
config_dict = {}
for latent_dim in latent_dims:
    for dimension_reduction_coef_var in dimension_reduction_coef_vars:
        for regression_coef_var in regression_coef_vars:
            for sigmoid in sigmoids:
                for observation_var in observation_vars:
                    config_dict[counter] = [latent_dim, dimension_reduction_coef_var, 
                                            regression_coef_var, sigmoid, observation_var]
                    avg_auc = 0
                    # Each call re-splits the data itself, so these are 5
                    # repeated evaluations of the same configuration.
                    for cv_fold in range(5):
                        avg_auc += dimension_reduction_and_logistics_regression(latent_dim, 
                                                        dimension_reduction_coef_var,
                                                        regression_coef_var, 
                                                        sigmoid,
                                                        observation_var, 
                                                        DATA, 
                                                        labels)
                    avg_auc /= 5
                    performance_dict[counter] = avg_auc
                    counter += 1
                    

In [None]:
# Parameter tuning for the raw Bayesian logistic regression baseline.
regression_coef_vars = [0.01, 0.1, 1, 2]
for regression_coef_var in regression_coef_vars:
    avg_auc = 0
    print('regression coefficient variance=%f' % regression_coef_var)
    for cv_fold in range(5):
        # Accumulate across repeats; the original used `=` here, so the
        # reported value was the last run's AUC divided by 5.
        avg_auc += raw_bayesian_logistics_regression(regression_coef_var, DATA, labels)
    avg_auc /= 5
    print('Performance: roc=%f' % avg_auc)