In [None]:
# !pip install SciencePlots

import matplotlib.pyplot as plt

# plt.style.reload_library()

# plt.style.use('science')
# plt.style.use(['science','no-latex', 'high-vis', 'notebook'])
# plt.rcParams["figure.figsize"] = [6,4]

import numpy as np
import matplotlib.pyplot as plt

import csv

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from sklearn.linear_model import LinearRegression

import sklearn
from sklearn import datasets

def compute_gram_matrix_nocoeff(distance_matrix, sigma):
    """Evaluate an unnormalised Gaussian kernel on squared distances.

    Returns ``exp(-distance_matrix / (2 * sigma**2))`` element-wise, i.e. the
    Gaussian kernel without the ``1 / sqrt(2 * pi * sigma**2)`` coefficient.
    """
    two_sigma_sq = 2*sigma**2
    return np.exp(-distance_matrix/two_sigma_sq)

def compute_gram_matrix(distance_matrix, sigma):
    """Evaluate a normalised 1-D Gaussian kernel on squared distances.

    Returns ``exp(-d / (2 * sigma**2)) / sqrt(2 * pi * sigma**2)`` for every
    entry ``d`` of ``distance_matrix``.
    """
    variance = sigma**2
    unnormalised = np.exp(-distance_matrix/(2*variance))
    return unnormalised/np.sqrt(2*np.pi*variance)

def delete_diagonal(A):
    """Drop the main diagonal of ``A`` and return the remaining entries.

    The off-diagonal entries are kept in row-major order and reshaped to
    ``A.shape[0]`` rows.
    """
    n_rows = A.shape[0]
    off_diagonal = np.logical_not(np.eye(n_rows, dtype=bool))
    return A[off_diagonal].reshape(n_rows, -1)

def compute_QMI(x, z, step = 1):
    """Estimate a quadratic mutual information (QMI) from all sample pairs.

    Builds pairwise Gaussian Gram matrices for ``x`` and ``z`` and returns
    ``-log2(V_x) - log2(V_y) + log2(V_f)`` where ``V_x``/``V_y`` are the means
    of the individual Gram matrices and ``V_f`` is the mean of their
    element-wise product.

    Parameters
    ----------
    x, z : array-like
        Paired samples (presumably 1-D vectors of equal length; verify
        against the caller).
    step : float, default 1
        Kernel width, passed as ``sigma`` to ``compute_gram_matrix``.  It was
        previously overwritten with 1 inside the function and is now honoured.

    Returns
    -------
    float
        The QMI estimate in bits.
    """
    x = np.expand_dims(x, 1)
    z = np.expand_dims(z, 1)

    distance_x = (np.expand_dims(x, 1) - np.expand_dims(x, 0))**2
    distance_z = (np.expand_dims(z, 1) - np.expand_dims(z, 0))**2

    # NOTE(review): the differences above are already squared, so squaring
    # them again feeds fourth-power distances to the kernel, unlike the
    # compute_E_QMI* variants.  Kept as-is to preserve existing results;
    # confirm whether this is intended.
    distance_x_1_2 = np.sum(distance_x**2, 2)
    distance_y_1_2 = np.sum(distance_z**2, 2)

    G_matrix_x_1_2 = compute_gram_matrix(distance_x_1_2, step)
    G_matrix_y_1_2 = compute_gram_matrix(distance_y_1_2, step)

    V_f = np.mean(G_matrix_x_1_2*G_matrix_y_1_2)
    V_x = np.mean(G_matrix_x_1_2)
    V_y = np.mean(G_matrix_y_1_2)

    QMI = (-np.log2(V_x)) + (-np.log2(V_y)) - (-np.log2(V_f))

    return QMI

def compute_E_QMI(x, z, step = 1, batch_size = 300):
    """Estimate QMI from randomly paired samples.

    Two index batches are drawn with replacement from the global NumPy RNG;
    the squared differences between the two batches are passed through a
    Gaussian kernel and combined as ``-log2(V_x) - log2(V_y) + log2(V_f)``.

    Parameters
    ----------
    x, z : np.ndarray
        Paired samples; both are flattened to a single column.
    step : float, default 1
        Kernel width, passed as ``sigma`` to ``compute_gram_matrix``.  It was
        previously overwritten with 1 inside the function and is now honoured.
    batch_size : int, default 300
        Number of sample pairs drawn.

    Returns
    -------
    float
        The QMI estimate in bits.
    """
    x = x.reshape(-1, 1)
    z = z.reshape(-1, 1)

    n_samples = x.shape[0]

    # Draw order matters for reproducibility under a fixed seed.
    first = np.random.choice(n_samples, batch_size)
    second = np.random.choice(n_samples, batch_size)

    diff_x = x[first, :] - x[second, :]
    diff_y = z[first, :] - z[second, :]

    distance_x_1_2 = np.sum(diff_x**2, 1)
    distance_y_1_2 = np.sum(diff_y**2, 1)

    G_matrix_x_1_2 = compute_gram_matrix(distance_x_1_2, step)
    G_matrix_y_1_2 = compute_gram_matrix(distance_y_1_2, step)

    V_f = np.mean(G_matrix_x_1_2*G_matrix_y_1_2)
    V_x = np.mean(G_matrix_x_1_2)
    V_y = np.mean(G_matrix_y_1_2)

    return (-np.log2(V_x)) + (-np.log2(V_y)) - (-np.log2(V_f))

def normalize_tensor(tensor):
    """Standardise ``tensor`` along axis 0 (zero mean, unit standard deviation)."""
    centered = tensor - np.mean(tensor, 0)
    return centered/np.std(centered, 0)

def compute_E_QMI_type1(x, z, step = 1, batch_size = 300):
    """Estimate QMI from random sample pairs with input normalisation.

    Differs from ``compute_E_QMI`` in two ways visible here: ``z`` is
    standardised and divided by ``sqrt(2)`` before sampling, and the pairwise
    ``x`` differences are standardised with ``normalize_tensor``.

    Parameters
    ----------
    x, z : np.ndarray
        Paired samples; both are flattened to a single column.
    step : float, default 1
        Kernel width, passed as ``sigma`` to ``compute_gram_matrix``.  It was
        previously overwritten with 1 inside the function and is now honoured.
    batch_size : int, default 300
        Number of sample pairs drawn (with replacement, global NumPy RNG).

    Returns
    -------
    float
        The QMI estimate in bits.
    """
    x = x.reshape(-1, 1)
    z = z.reshape(-1, 1)

    z = ((z-np.mean(z))/np.std(z))/np.sqrt(2)

    n_samples = x.shape[0]

    # Draw order matters for reproducibility under a fixed seed.
    first = np.random.choice(n_samples, batch_size)
    second = np.random.choice(n_samples, batch_size)

    diff_x = normalize_tensor(x[first, :] - x[second, :])
    diff_y = z[first, :] - z[second, :]

    distance_x_1_2 = np.sum(diff_x**2, 1)
    distance_y_1_2 = np.sum(diff_y**2, 1)

    G_matrix_x_1_2 = compute_gram_matrix(distance_x_1_2, step)
    G_matrix_y_1_2 = compute_gram_matrix(distance_y_1_2, step)

    V_f = np.mean(G_matrix_x_1_2*G_matrix_y_1_2)
    V_x = np.mean(G_matrix_x_1_2)
    V_y = np.mean(G_matrix_y_1_2)

    return (-np.log2(V_x)) + (-np.log2(V_y)) - (-np.log2(V_f))

def compute_E_QMI_type2(x, z, step = 1, batch_size = 300):
    """Estimate QMI from random sample pairs with shifted kernel values.

    Same sampling scheme as ``compute_E_QMI``, but each kernel vector is
    replaced by ``G - mean(G) + std(G)`` before the three averages are taken.

    Parameters
    ----------
    x, z : np.ndarray
        Paired samples; both are flattened to a single column.
    step : float, default 1
        Kernel width, passed as ``sigma`` to ``compute_gram_matrix``.  It was
        previously overwritten with 1 inside the function and is now honoured.
    batch_size : int, default 300
        Number of sample pairs drawn (with replacement, global NumPy RNG).

    Returns
    -------
    float
        The QMI estimate in bits.
    """
    x = x.reshape(-1, 1)
    z = z.reshape(-1, 1)

    n_samples = x.shape[0]

    # Draw order matters for reproducibility under a fixed seed.
    first = np.random.choice(n_samples, batch_size)
    second = np.random.choice(n_samples, batch_size)

    diff_x = x[first, :] - x[second, :]
    diff_y = z[first, :] - z[second, :]

    distance_x_1_2 = np.sum(diff_x**2, 1)
    distance_y_1_2 = np.sum(diff_y**2, 1)

    G_matrix_x_1_2 = compute_gram_matrix(distance_x_1_2, step)
    G_matrix_y_1_2 = compute_gram_matrix(distance_y_1_2, step)

    # Re-centre each kernel vector so that its mean equals its standard deviation.
    G_matrix_x_1_2 = G_matrix_x_1_2 - np.mean(G_matrix_x_1_2) + np.std(G_matrix_x_1_2)
    G_matrix_y_1_2 = G_matrix_y_1_2 - np.mean(G_matrix_y_1_2) + np.std(G_matrix_y_1_2)

    V_f = np.mean(G_matrix_x_1_2*G_matrix_y_1_2)
    V_x = np.mean(G_matrix_x_1_2)
    V_y = np.mean(G_matrix_y_1_2)

    return (-np.log2(V_x)) + (-np.log2(V_y)) - (-np.log2(V_f))

def compute_E_QMI_coeff(x, z, step = 1, batch_size = 300):
    """Return a QMI estimate rescaled between a lower and an upper reference.

    Kernel averages are computed at width ``step`` (``V_x``, ``V_y``) and at
    width ``step * sqrt(2)`` (``V_x_d2``, ``V_y_d2`` and the joint ``V_f``).
    The raw value ``-log2(V_x) - log2(V_y) + log2(V_f)`` is then mapped to
    ``(QMI - lower) / (upper - lower)``.

    Parameters
    ----------
    x, z : np.ndarray
        Paired samples; both are flattened to a single column.
    step : float, default 1
        Base kernel width.  It was previously overwritten with 1 inside the
        function and is now honoured.
    batch_size : int, default 300
        Number of sample pairs drawn with replacement.

    Returns
    -------
    float
        The rescaled QMI value.

    Notes
    -----
    The function reseeds the global NumPy RNG with 4 on every call, so
    repeated calls draw the same index batches and the caller's RNG state is
    reset as a side effect.
    """
    x = x.reshape(-1, 1)
    z = z.reshape(-1, 1)

    np.random.seed(4)

    n_samples = x.shape[0]
    first = np.random.choice(n_samples, batch_size)
    second = np.random.choice(n_samples, batch_size)

    diff_x = x[first, :] - x[second, :]
    diff_y = z[first, :] - z[second, :]

    distance_x_1_2 = np.sum(diff_x**2, 1)
    distance_y_1_2 = np.sum(diff_y**2, 1)

    # Marginal terms at the base kernel width.
    V_x = np.mean(compute_gram_matrix_nocoeff(distance_x_1_2, step))
    V_y = np.mean(compute_gram_matrix_nocoeff(distance_y_1_2, step))

    # Marginal and joint terms at the widened kernel.
    wide_step = step*np.sqrt(2)
    G_matrix_x_1_2 = compute_gram_matrix_nocoeff(distance_x_1_2, wide_step)
    G_matrix_y_1_2 = compute_gram_matrix_nocoeff(distance_y_1_2, wide_step)
    V_x_d2 = np.mean(G_matrix_x_1_2)
    V_y_d2 = np.mean(G_matrix_y_1_2)

    V_f = np.mean(G_matrix_x_1_2*G_matrix_y_1_2)

    QMI = ((-np.log2(V_x)) + (-np.log2(V_y)) - (-np.log2(V_f)))
    # Local names avoid shadowing the built-in min()/max().
    lower = (-np.log2(V_x)) + (-np.log2(V_y)) - (-np.log2(V_x_d2)) - (-np.log2(V_y_d2))
    upper = (-np.log2(V_x)+-np.log2(V_y))/2

    return (QMI-lower)/(upper-lower)

def compute_E_CSQMI_nd(x, z, A, B, batch_size = 300, runs = 1000):
    """Monte-Carlo estimate of ``-log2(V_f) + 0.5*log2(V_x) + 0.5*log2(V_y)``.

    On every run four index batches are drawn with replacement from the
    global NumPy RNG, Gaussian kernels with covariances ``2*A`` (for ``x``)
    and ``2*B`` (for ``z``) are evaluated between the batches, and three
    running averages of kernel products are updated.

    Parameters
    ----------
    x, z : np.ndarray
        2-D sample arrays indexed row-wise by the same indices.
    A, B : np.ndarray
        Covariance matrices handed (doubled) to ``gaussian_nd``.
    batch_size : int
        Number of rows drawn per batch.
    runs : int
        Number of Monte-Carlo repetitions.
    """
    n_samples = x.shape[0]
    V_f = V_x = V_y = 0

    def draw_indices():
        return np.random.choice(n_samples, batch_size)

    for run in range(runs):
        # Four independent draws, in this order, per repetition.
        idx_1, idx_2, idx_3, idx_4 = draw_indices(), draw_indices(), draw_indices(), draw_indices()

        x_1, z_1 = x[idx_1, :], z[idx_1, :]
        x_2, z_2 = x[idx_2, :], z[idx_2, :]
        z_3 = z[idx_3, :]
        z_4 = z[idx_4, :]

        kernel_x_12 = gaussian_nd(x_1, x_2, 2*A)
        kernel_z_12 = gaussian_nd(z_1, z_2, 2*B)
        kernel_z_13 = gaussian_nd(z_1, z_3, 2*B)
        kernel_z_34 = gaussian_nd(z_3, z_4, 2*B)

        # Incremental means over the runs seen so far.
        V_f = (V_f*run + np.mean(kernel_x_12*kernel_z_13))/(run+1)
        V_x = (V_x*run + np.mean(kernel_x_12*kernel_z_12))/(run+1)
        V_y = (V_y*run + np.mean(kernel_x_12*kernel_z_34))/(run+1)

    return (-np.log2(V_f)) + (1/2)*np.log2(V_x) + (1/2)*np.log2(V_y)

def compute_TRUE_CSQMI_nd(center_x, center_y, A, B):
    """Closed-form ``-log2(V_f) + 0.5*log2(V_x) + 0.5*log2(V_y)`` from centres.

    Gaussian kernels with covariances ``2*A`` and ``2*B`` are evaluated on
    all pairwise differences of the ``K`` centres.  ``V_x`` averages the
    element-wise product of the two ``K x K`` kernels, ``V_y`` multiplies
    their separate averages, and ``V_f`` averages the broadcast product
    ``G_x[i, j] * G_y[j, k]``.

    The per-centre dimension is read from ``center_y.shape[1]`` and is used
    for both inputs.
    """
    K = center_x.shape[0]
    dim = center_y.shape[1]

    def pairwise_kernel(centers, cov):
        gaps = (centers.reshape(K, 1, dim) - centers.reshape(1, K, dim)).reshape(-1, dim)
        return gaussian_nd(gaps, 0, 2*cov).reshape(K, K)

    kernel_x = pairwise_kernel(center_x, A)
    kernel_y = pairwise_kernel(center_y, B)

    joint_term = np.mean(kernel_x*kernel_y)
    marginal_term = np.mean(kernel_x)*np.mean(kernel_y)
    cross_term = np.mean(kernel_x.reshape(K, K, 1)*kernel_y.reshape(K, K))

    return (-np.log2(cross_term)) + (1/2)*np.log2(joint_term) + (1/2)*np.log2(marginal_term)

def gaussian_1d(input, m, sigma):
    """Density of a 1-D Gaussian with mean ``m`` and standard deviation ``sigma``."""
    variance = sigma**2
    exponent = -(input-m)**2/(2*variance)
    return np.exp(exponent)/np.sqrt(2*np.pi*variance)

def gaussian_nd(input, m, sigma):
    """Row-wise density of a multivariate Gaussian.

    ``input`` holds one point per row, ``m`` is the mean (broadcast against
    ``input``) and ``sigma`` is the square covariance matrix.  The inverse is
    taken with ``np.linalg.pinv``.
    """
    k = sigma.shape[0]
    centered = input - m
    precision = np.linalg.pinv(sigma)
    mahalanobis = np.sum((centered @ precision)*centered, 1)
    normaliser = ((2*np.pi)**(-k/2))*np.linalg.det(sigma)**(-1/2)
    return normaliser*np.exp(-(1/2)*mahalanobis)

def identity(x):
    """Return the identity matrix whose size is the number of columns of ``x``."""
    n_columns = x.shape[1]
    return np.eye(n_columns)

def generate_gauss_samples_nd(center_x, center_y, COV, samples_per_class=3000):
    """Sample a mixture of Gaussians that share one covariance.

    The component means are the rows of ``[center_x, center_y]`` joined along
    axis 1.  ``samples_per_class`` points are drawn per component from the
    global NumPy RNG and stacked into an array with ``COV.shape[0]`` columns.
    """
    class_means = np.concatenate([center_x, center_y], 1)
    per_class = [
        np.random.multivariate_normal(mu, COV, samples_per_class)
        for mu in class_means
    ]
    return np.array(per_class).reshape(-1, COV.shape[0])

In [None]:
#### GENERATE A LOT DATASET
# Lower/upper bounds of the evaluation grid; construct_contour reads these
# two names as module-level globals.
# NOTE: they shadow the built-in min()/max() for everything that runs after
# this cell, so the builtins are unavailable under these names from here on.
min = 0
max = 1

import numpy as np
import matplotlib.pyplot as plt

def laplacian_pdf(input, mean, scale):
    """Density of a Laplace distribution with location ``mean`` and scale ``scale``."""
    normaliser = 1/(2*scale)
    return normaliser*np.exp(-np.abs(input-mean)/scale)

def gaussian_nd(input, m, sigma):
    """Evaluate a multivariate normal density at every row of ``input``.

    ``m`` is subtracted from ``input`` by broadcasting; ``sigma`` is the
    covariance matrix, inverted with the pseudo-inverse.
    """
    n_dims = sigma.shape[0]
    offset = input - m
    sigma_inv = np.linalg.pinv(sigma)
    quad_form = np.sum((offset @ sigma_inv)*offset, 1)
    scale = ((2*np.pi)**(-n_dims/2))*np.linalg.det(sigma)**(-1/2)
    return scale*np.exp(-(1/2)*quad_form)

def construct_contour(centers, weights, learned_variance, interp=100):
    """Rasterise a weighted 2-D diagonal-Gaussian mixture on a square grid.

    The grid spans the module-level bounds ``min``..``max`` on both axes with
    ``interp`` points per axis.  Each component ``i`` contributes
    ``weights[i] * N(. ; centers[i], diag(learned_variance[i, 0],
    learned_variance[i, 1]))``; the components are averaged and multiplied by
    the squared grid spacing ``delta``.

    Parameters
    ----------
    centers : np.ndarray
        Component means, reshaped internally to ``(-1, 2)``.
    weights : sequence
        One weight per component.
    learned_variance : np.ndarray
        Per-component variances; columns 0 and 1 are used.
    interp : int, default 100
        Grid resolution per axis.  It was previously overwritten with 100
        inside the function and is now honoured.

    Returns
    -------
    np.ndarray
        Array of shape ``(interp, interp)``.
    """
    # `min` / `max` are the module-level grid bounds, not the builtins.
    delta = (max-min)/interp

    axis = np.linspace(min, max, interp)
    xv, yv = np.meshgrid(axis, axis)
    grid_points = np.array((xv, yv)).reshape(2, -1).T

    # difference[i] holds (grid point - centre i) for every grid point.
    difference = grid_points.reshape(1, -1, 2) - centers.reshape(-1, 1, 2)

    components = []
    for i in range(centers.shape[0]):
        cov_i = np.array([[learned_variance[i, 0], 0], [0, learned_variance[i, 1]]])
        density_i = gaussian_nd(difference[i], 0, cov_i).reshape(interp, interp)
        components.append(weights[i]*density_i)

    return np.mean(np.array(components), 0)*delta*delta

def construct_pdf_mix_mixture(center_x, center_y, COV, samples_per_class=3000):
    """Sample a random 2-D Gaussian mixture and rasterise its density.

    For every row of ``[center_x, center_y]`` the two diagonal variances are
    drawn uniformly from ``[0.0002, 0.0008)`` and a weight from
    ``[0.5, 1.5)``; ``int(weight * samples_per_class)`` points are sampled
    and the weighted density is accumulated on a 1000 x 1000 grid over
    ``[0, 1]^2``.  All randomness comes from the global NumPy RNG.

    Parameters
    ----------
    center_x, center_y : np.ndarray
        2-D arrays joined along axis 1 to form the component means.
    COV : np.ndarray
        Template covariance matrix.  Only its diagonal entries ``[0, 0]`` and
        ``[1, 1]`` are re-drawn per component.  The function now works on a
        private copy, so the caller's array is no longer modified.
    samples_per_class : int
        Base number of samples per component before weighting.

    Returns
    -------
    tuple
        ``(samples, MEAN, covariances, weights, pdf_map, delta)`` where
        ``covariances`` and ``weights`` are stacked per component and
        ``delta`` is the grid spacing.
    """
    MEAN = np.concatenate([center_x, center_y], 1)
    # Copy so the per-component variance draws do not leak into the caller.
    COV = np.copy(COV)
    COV_ = []
    weights_ = []
    component_ = []
    num_class = MEAN.shape[0]

    interp = 1000
    # Local grid bounds; named so they do not shadow the built-in min()/max().
    grid_max = 1
    grid_min = 0
    delta = (grid_max-grid_min)/interp

    axis = np.linspace(grid_min, grid_max, interp)
    xv, yv = np.meshgrid(axis, axis)

    grid_points = np.array((xv, yv)).reshape(2, -1).T
    pdf_map = np.zeros((interp, interp))

    for i in range(0, num_class):
        COV[0, 0] = np.random.uniform(0.0002, 0.0008, 1)[0]
        COV[1, 1] = np.random.uniform(0.0002, 0.0008, 1)[0]
        weights = np.random.uniform(0, 1, 1)[0]+0.5

        # The component-type selector is currently unused (only the Gaussian
        # branch is active), but the draw is kept so that the random stream,
        # and therefore every seeded dataset, stays unchanged.
        rv = np.random.choice(3, 1)[0]

        # if rv==0:
        samples = np.random.multivariate_normal(MEAN[i], COV, int(weights*samples_per_class))
        pdf_map = pdf_map + (weights*gaussian_nd(grid_points - MEAN[i], 0, COV)).reshape(interp, interp)

        # if rv==1:
        #     dim_1 = np.random.uniform(MEAN[i, 0]-COV[0, 0]*10, MEAN[i, 0]+COV[0, 0]*10, int(weights*samples_per_class))
        #     dim_2 = np.random.uniform(MEAN[i, 1]-COV[1, 1]*10, MEAN[i, 1]+COV[1, 1]*10, int(weights*samples_per_class))
        #     samples = np.array((dim_1, dim_2)).T
        #     pdf = (input[:, 0]>(MEAN[i, 0]-COV[0, 0]*10))*(input[:, 0]<(MEAN[i, 0]+COV[0, 0]*10))
        #     pdf = pdf*(input[:, 1]>(MEAN[i, 1]-COV[1, 1]*10))*(input[:, 1]<(MEAN[i, 1]+COV[1, 1]*10))
        #     pdf_map = pdf_map + (pdf/(400*COV[0, 0]*COV[1, 1])).reshape(interp, interp)

        # if rv==2:
        #     dim_1 = np.random.laplace(MEAN[i, 0], scale=COV[0, 0]*10, size=int(weights*samples_per_class))
        #     dim_2 = np.random.laplace(MEAN[i, 1], scale=COV[1, 1]*10, size=int(weights*samples_per_class))
        #     samples = np.array((dim_1, dim_2)).T
        #     pdf_dim_1 = laplacian_pdf(input[:, 0], MEAN[i, 0], COV[0, 0]*10)
        #     pdf_dim_2 = laplacian_pdf(input[:, 1], MEAN[i, 1], COV[1, 1]*10)

        #     pdf_map = pdf_map + weights*(pdf_dim_1*pdf_dim_2).reshape(interp, interp)

        component_.append(samples)
        COV_.append(np.copy(COV))
        weights_.append(np.copy(weights))
    component_ = np.concatenate(component_).reshape(-1, COV.shape[0])

    return component_, MEAN, np.array(COV_), np.array(weights_), pdf_map, delta

#### GENERATING DATASET!
# Builds one synthetic mixture dataset per seed and stores the samples,
# per-component covariances, means and normalised weights in parallel lists.
K = 5    # number of mixture components per dataset
dim = 1  # dimensionality of each of the two variables

seed_ = [4, 5, 6, 7, 8]
samples_list = []
cov_list = []
mean_list = []
weight_list = []

for seed in seed_:

    np.random.seed(seed)

    # K centre coordinates per variable, drawn uniformly from [0.2, 0.8);
    # the same draw is repeated `dim` times before transposing.
    center_x = np.array([np.random.uniform(0.2, 0.8, K)]*dim).T
    center_y = np.array([np.random.uniform(0.2, 0.8, K)]*dim).T

    how_many_samples = 100000

    var = 0.001

    dim = 1
    # JOINT DISTRIBUTION
    COV = np.eye(dim*2)*var

    QMI_LIST = []
    # Reseed so the mixture construction is reproducible independently of
    # the centre draws above.
    np.random.seed(seed)
    samples, MEAN, COV_, weights_, pdf_map, delta = construct_pdf_mix_mixture(center_x, center_y, COV, samples_per_class=how_many_samples)
    # Normalise the rasterised density so that its entries sum to one.
    pdf_map = pdf_map/np.sum(pdf_map)

    samples_list.append(samples)
    cov_list.append(COV_)
    mean_list.append(MEAN)
    # Mixture weights are stored normalised to sum to one.
    weight_list.append(weights_/np.sum(weights_))

    # true_entropy = np.sqrt(np.sum(pdf_map*pdf_map)/(delta*delta))
    # print('entropy:', true_entropy)
    # plt.imshow(np.log(pdf_map+1e-5), origin='lower', extent=[min, max, min, max])
    # plt.show()

In [None]:
def gaussian_nd(input, m, sigma):
    """Multivariate Gaussian density, one value per row of ``input``.

    ``sigma`` is the covariance matrix (pseudo-inverted internally) and
    ``m`` the mean, broadcast against ``input``.
    """
    dims = sigma.shape[0]
    delta_rows = input - m
    inverse_cov = np.linalg.pinv(sigma)
    exponent_arg = np.sum((delta_rows @ inverse_cov)*delta_rows, 1)
    coefficient = ((2*np.pi)**(-dims/2))*np.linalg.det(sigma)**(-1/2)
    return coefficient*np.exp(-(1/2)*exponent_arg)

# def gaussian_nd_save_some(input, m, sigma):
#     det = (sigma[:, 0]*sigma[:, 1])
#     inv = 1/sigma
    
#     return ((2*np.pi)**(-2/2))*det**(-1/2)*np.exp(-(1/2)*np.sum((input-m)*inv*(input-m), 1))

def gaussian_nd_save_some(MEAN, VARIANCE):
    """Zero-centred diagonal Gaussian density evaluated row by row.

    ``VARIANCE`` holds one diagonal covariance per row; ``MEAN`` holds the
    matching offsets (reshaped to the same number of columns).  Returns one
    density value per row.
    """
    dim = VARIANCE.shape[1]

    # Determinant of a diagonal covariance: product of its entries.
    det = VARIANCE[:, 0]
    for column in VARIANCE[:, 1:].T:
        det = det*column

    offsets = MEAN.reshape(-1, dim)
    inverse_var = (1/VARIANCE).reshape(-1, dim)
    quad_form = np.sum((offsets*inverse_var)*offsets, 1)

    return ((2*np.pi)**(-dim/2))*det**(-1/2)*np.exp(-(1/2)*quad_form)

def gaussian_1d(input, m, sigma):
    """1-D Gaussian density of every input against every mean.

    ``input`` becomes a column and ``m`` a row, so the result has shape
    ``(len(input), len(m))``.  In this version ``sigma`` is used directly as
    the variance (it is neither squared nor square-rooted before use).
    """
    points = input.reshape(-1, 1)
    centers = m.reshape(1, -1)
    inverse_var = 1/sigma

    coefficient = ((2*np.pi)**(-1/2))*sigma**(-1/2)
    return coefficient*np.exp(-(1/2)*((points-centers)**2*inverse_var))

def compute_E_CSQMI_nd(gmmmm, gmvvv, gmwww):
    """Closed-form ``-log2(V_f) + 0.5*log2(V_x) + 0.5*log2(V_y)`` for a mixture.

    Parameters
    ----------
    gmmmm : np.ndarray
        Component means; column 0 is used for the first variable and
        column 1 for the second.
    gmvvv : np.ndarray
        Component covariances; entries ``[:, 0, 0]`` and ``[:, 1, 1]`` are
        used.
    gmwww : np.ndarray
        Component weights.

    For each variable, a weighted Gaussian is evaluated on every pair of
    components (mean difference, summed variances, product of weights).
    """
    pair_weights = (gmwww.reshape(-1, 1)*gmwww.reshape(1, -1)).reshape(-1)

    def weighted_pair_kernel(axis):
        centers = gmmmm[:, axis]
        variances = gmvvv[:, axis, axis]
        mean_gap = (centers.reshape(-1, 1) - centers.reshape(1, -1)).reshape(-1, 1)
        var_total = (variances.reshape(-1, 1) + variances.reshape(1, -1)).reshape(-1, 1)
        return pair_weights*gaussian_nd_save_some(mean_gap, var_total)

    kernel_x = weighted_pair_kernel(0)
    kernel_y = weighted_pair_kernel(1)

    # The flattened kernels have K*K entries.
    K = int(np.sqrt(kernel_x.shape[0]))

    joint_term = np.mean(kernel_x*kernel_y)
    marginal_term = np.mean(kernel_x)*np.mean(kernel_y)
    cross_term = np.mean(kernel_x.reshape(K, K, 1)*kernel_y.reshape(K, K))

    return (-np.log2(cross_term)) + (1/2)*np.log2(joint_term) + (1/2)*np.log2(marginal_term)

# Print the closed-form value for each of the five generated datasets,
# using the means, covariances and normalised weights stored per seed.
for i in range(0, 5):
  csqmi = compute_E_CSQMI_nd(mean_list[i], cov_list[i], weight_list[i])
  print('True QMI (EXP #{0})'.format(i+1), csqmi)

### (a) SGM - QMI

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import csv

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from sklearn.linear_model import LinearRegression

import sklearn
from sklearn import datasets

#torch.cuda.set_device(1)

def GD(net, lr):
    """Apply one plain gradient-descent step to ``net`` and clear its gradients.

    Every parameter with ``requires_grad`` set is replaced by
    ``param - lr * param.grad``.  Always returns 0.
    """
    trainable = (p for p in net.parameters() if p.requires_grad)
    for p in trainable:
        p.data = p.data - lr*p.grad

    net.zero_grad()
    return 0

def sample_uniform_data(min=-5, max=5, batch=300):
    """Draw ``batch`` values uniformly from ``[min, max)`` with the global NumPy RNG."""
    draws = np.random.uniform(low=min, high=max, size=batch)
    return draws

def gaussian_nd_pytorch(MEAN, VARIANCE):
    """Torch version of the zero-centred diagonal Gaussian density.

    ``VARIANCE`` carries one diagonal covariance per row and ``MEAN`` the
    matching offsets; one density value is returned per row.
    """
    dim = VARIANCE.shape[1]

    # Determinant of a diagonal covariance: product over its columns.
    det = VARIANCE[:, 0]
    for col in range(1, dim):
        det = det*VARIANCE[:, col]

    offsets = MEAN.reshape(-1, dim)
    inverse_var = (1/VARIANCE).view(-1, dim)
    quad_form = torch.sum((offsets*inverse_var)*offsets, 1)

    return ((2*np.pi)**(-dim/2))*det**(-1/2)*torch.exp(-(1/2)*quad_form)

def sample_c(batchsize=32, dis_category=5):
    """Sample uniformly distributed one-hot category codes.

    Returns ``(rand_c, label_c)``: a float32 tensor of one-hot rows with
    shape ``(batchsize, dis_category)`` and a LongTensor with the index of
    the hot entry in each row.  Uses the global NumPy RNG, one multinomial
    draw per row.
    """
    uniform_probs = dis_category*[1/float(dis_category)]
    one_hot = np.zeros((batchsize, dis_category), dtype='float32')
    for row in range(batchsize):
        one_hot[row] = np.random.multinomial(1, uniform_probs, size=1)

    hot_index = np.argmax(one_hot, axis=1)
    label_c = torch.LongTensor(hot_index.astype('int'))
    rand_c = torch.from_numpy(one_hot.astype('float32'))
    return rand_c, label_c

def generate_laplacian(center_x, center_y,  scale=0.01, samples_per_class=3000):
    """Sample a 2-D mixture of independent Laplace components.

    Fix: the component centres are now built from ``center_x`` and
    ``center_y``, joined along axis 1 in the same way as
    ``generate_gauss_samples_nd``.  Previously the arguments were ignored and
    an unrelated module-level ``MEAN`` was read.

    Parameters
    ----------
    center_x, center_y : np.ndarray
        2-D arrays with one row per component; the first column of each is
        used as the x / y location when both have a single column.
    scale : float
        Laplace scale shared by both coordinates.
    samples_per_class : int
        Number of points drawn per component (global NumPy RNG).

    Returns
    -------
    np.ndarray
        Array of shape ``(num_components * samples_per_class, 2)``.
    """
    centers = np.concatenate([center_x, center_y], 1)

    component_ = []
    for center in centers:
        dim_1 = np.random.laplace(center[0], scale=scale, size=samples_per_class)
        dim_2 = np.random.laplace(center[1], scale=scale, size=samples_per_class)
        component_.append(np.array((dim_1, dim_2)).T)

    # Each component block has exactly two columns (dim_1, dim_2).
    return np.array(component_).reshape(-1, 2)

def generate_uniform(center_x, center_y,  length=0.5, samples_per_class=3000):
    """Sample a 2-D mixture of axis-aligned uniform squares.

    Component ``i`` is uniform on
    ``[center_x[i], center_x[i]+length) x [center_y[i], center_y[i]+length)``.

    Fix: the number of components is now taken from ``center_x`` rather than
    from an unrelated module-level ``MEAN``.

    Parameters
    ----------
    center_x, center_y : sequence
        Lower corners of the squares, one entry per component.
    length : float
        Side length of every square.
    samples_per_class : int
        Number of points drawn per component (global NumPy RNG).

    Returns
    -------
    np.ndarray
        Array of shape ``(num_components * samples_per_class, 2)``.
    """
    component_ = []

    num_class = len(center_x)
    for i in range(num_class):
        x_1 = np.random.uniform(center_x[i], center_x[i]+length, samples_per_class)
        x_3 = np.random.uniform(center_y[i], center_y[i]+length, samples_per_class)
        component_.append(np.array((x_1, x_3)).T)

    return np.concatenate(component_, 0)


class DIS_MOG(nn.Module):
    """Network emitting per-sample mixture parameters (weight, mean, variance).

    Three independent sigmoid/BatchNorm trunks map the ``rand``-dimensional
    input to hidden features: attributes without suffix feed the variance
    head, ``_`` the mean head and ``_w`` the weight head.
    """

    def __init__(self, rand, HIDDEN, dim):
        super().__init__()
        self.dim = dim

        # Layers are registered in the same order as three hand-written
        # trunks ("" then "_" then "_w"), so seeded initialisation and the
        # state_dict layout are unchanged.
        for suffix in ("", "_", "_w"):
            setattr(self, "fc1" + suffix, nn.Linear(rand, HIDDEN, bias=True))
            setattr(self, "bn1" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc2" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))
            setattr(self, "bn2" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc3" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))
            setattr(self, "bn3" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc33" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))

        # Number of raw outputs pooled into each emitted value.
        self.sum_dim_mean = 1
        self.sum_dim_var = 1
        self.sum_dim_weights = 30

        self.fc5 = nn.Linear(HIDDEN, dim*self.sum_dim_mean, bias=True)
        self.fc6 = nn.Linear(HIDDEN, dim*self.sum_dim_var, bias=True)
        self.fcw = nn.Linear(HIDDEN, self.sum_dim_weights, bias=True)

    def _trunk(self, x, suffix):
        # Three (linear -> sigmoid -> batch-norm) stages, then linear -> sigmoid.
        hidden = x
        for stage in ("1", "2", "3"):
            linear = getattr(self, "fc" + stage + suffix)
            norm = getattr(self, "bn" + stage + suffix)
            hidden = norm(torch.sigmoid(linear(hidden)))
        return torch.sigmoid(getattr(self, "fc33" + suffix)(hidden))

    def forward(self, x):
        """Return ``(weights, mean, variance)`` for a batch ``x``.

        ``weights`` has shape ``(batch, 1)`` and is ``1 +`` a sum of ReLU
        outputs; ``mean`` and ``variance`` have shape ``(batch, dim)``, with
        ``mean`` a sigmoid output and ``variance`` a sigmoid output plus 1e-5.
        """
        batch = x.shape[0]

        hidden_mean = self._trunk(x, "_")
        hidden_var = self._trunk(x, "")
        hidden_weight = self._trunk(x, "_w")

        mean = torch.sigmoid(self.fc5(hidden_mean)).view(batch, self.dim, self.sum_dim_mean)
        mean = torch.mean(mean, 2)

        variance = torch.sigmoid(self.fc6(hidden_var)).view(batch, self.dim, self.sum_dim_var)
        variance = torch.mean(variance, 2) + 1e-5

        weights = torch.relu(self.fcw(hidden_weight)).view(batch, 1, self.sum_dim_weights)
        weights = torch.sum(weights, 2) + 1

        return weights, mean, variance

class DIS_MOG_new(nn.Module):
    """Mixture-parameter network with a shared two-stage sigmoid trunk.

    The forward pass shares the ``fc1_``/``bn1_``/``fc2_``/``bn2_`` stages
    and then branches into three heads (``_`` mean, unsuffixed variance,
    ``_w`` weights).  The first two stages of the unsuffixed and ``_w``
    families are constructed but not used by ``forward``.
    """

    def __init__(self, rand, HIDDEN, dim):
        super().__init__()
        self.dim = dim

        # Registration order ("" then "_" then "_w") is kept so seeded
        # initialisation and the state_dict layout are unchanged.
        for suffix in ("", "_", "_w"):
            setattr(self, "fc1" + suffix, nn.Linear(rand, HIDDEN, bias=True))
            setattr(self, "bn1" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc2" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))
            setattr(self, "bn2" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc3" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))
            setattr(self, "bn3" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc33" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))

        # Number of raw outputs pooled into each emitted value.
        self.sum_dim_mean = 1
        self.sum_dim_var = 1
        self.sum_dim_weights = 1

        self.fc5 = nn.Linear(HIDDEN, dim*self.sum_dim_mean, bias=True)
        self.fc6 = nn.Linear(HIDDEN, dim*self.sum_dim_var, bias=True)
        self.fcw = nn.Linear(HIDDEN, self.sum_dim_weights, bias=True)

    def _head(self, shared, suffix):
        # linear -> sigmoid -> batch-norm -> linear -> sigmoid on the shared features.
        linear = getattr(self, "fc3" + suffix)
        norm = getattr(self, "bn3" + suffix)
        final = getattr(self, "fc33" + suffix)
        return torch.sigmoid(final(norm(torch.sigmoid(linear(shared)))))

    def forward(self, x):
        """Return ``(weights, mean, variance)`` for a batch ``x``.

        ``weights`` has shape ``(batch, 1)``; ``mean`` and ``variance`` have
        shape ``(batch, dim)``.  All three are sigmoid outputs; ``variance``
        is offset by 1e-6.
        """
        shared = self.bn1_(torch.sigmoid(self.fc1_(x)))
        shared = self.bn2_(torch.sigmoid(self.fc2_(shared)))
        batch = shared.shape[0]

        hidden_mean = self._head(shared, "_")
        hidden_var = self._head(shared, "")
        hidden_weight = self._head(shared, "_w")

        mean = torch.sigmoid(self.fc5(hidden_mean)).view(batch, self.dim, self.sum_dim_mean)
        mean = torch.mean(mean, 2)

        variance = torch.sigmoid(self.fc6(hidden_var)).view(batch, self.dim, self.sum_dim_var)
        variance = torch.mean(variance, 2) + 1e-6

        weights = torch.sigmoid(self.fcw(hidden_weight)).view(batch, 1, self.sum_dim_weights)
        weights = torch.mean(weights, 2)

        return weights, mean, variance

def generate_fix_discrete(bs, d_class):
    """Tile the ``d_class`` one-hot vectors to fill a batch.

    Returns an integer tensor made of ``int(bs / d_class)`` stacked copies of
    the ``d_class x d_class`` one-hot matrix.
    """
    repeats = int(bs/d_class)
    one_hot_block = torch.nn.functional.one_hot(torch.arange(0, d_class))
    return torch.cat([one_hot_block for _ in range(repeats)], 0)

def compute_VR(learned_weights, learned_mean, MEAN):
    """Weighted fraction of learned centres lying close to a reference centre.

    The weights are divided by their mean; a learned 2-D centre counts when
    its squared distance to the nearest row of ``MEAN`` is below 1e-4.  The
    mean of ``weight * indicator`` is returned.
    """
    relative_weights = learned_weights/np.mean(learned_weights)
    gaps = learned_mean.reshape(-1, 1, 2) - MEAN.reshape(1, -1, 2)
    nearest_sq_dist = (gaps**2).sum(2).min(1)
    return np.mean(relative_weights*(nearest_sq_dist < 1e-4))

################################################## SGM

class DIS_MOG_relu(nn.Module):
    """ReLU variant of the mixture-parameter network with wide pooled heads.

    ``forward`` uses a shared two-stage ReLU/BatchNorm trunk (``fc1_``,
    ``bn1_``, ``fc2_``, ``bn2_``) followed by one ReLU layer per head
    (``fc33_`` mean, ``fc33`` variance, ``fc33_w`` weights).  The remaining
    layers are constructed but not used by ``forward``.  Each emitted value
    is the mean of 1024 sigmoid outputs.
    """

    def __init__(self, rand, HIDDEN, dim):
        super().__init__()
        self.dim = dim

        # Registration order ("" then "_" then "_w") is kept so seeded
        # initialisation and the state_dict layout are unchanged.
        for suffix in ("", "_", "_w"):
            setattr(self, "fc1" + suffix, nn.Linear(rand, HIDDEN, bias=True))
            setattr(self, "bn1" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc2" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))
            setattr(self, "bn2" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc3" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))
            setattr(self, "bn3" + suffix, torch.nn.BatchNorm1d(HIDDEN))
            setattr(self, "fc33" + suffix, nn.Linear(HIDDEN, HIDDEN, bias=True))

        # Number of raw outputs pooled into each emitted value.
        self.sum_dim_mean = 1024
        self.sum_dim_var = 1024
        self.sum_dim_weights = 1024

        self.fc5 = nn.Linear(HIDDEN, dim*self.sum_dim_mean, bias=True)
        self.fc6 = nn.Linear(HIDDEN, dim*self.sum_dim_var, bias=True)
        self.fcw = nn.Linear(HIDDEN, self.sum_dim_weights, bias=True)

    def forward(self, x):
        """Return ``(weights, mean, variance)`` for a batch ``x``.

        ``weights`` has shape ``(batch, 1)``; ``mean`` and ``variance`` have
        shape ``(batch, dim)``.  ``variance`` is offset by 1e-5.
        """
        shared = self.bn1_(torch.relu(self.fc1_(x)))
        shared = self.bn2_(torch.relu(self.fc2_(shared)))
        batch = shared.shape[0]

        hidden_mean = torch.relu(self.fc33_(shared))
        hidden_var = torch.relu(self.fc33(shared))
        hidden_weight = torch.relu(self.fc33_w(shared))

        mean = torch.sigmoid(self.fc5(hidden_mean)).view(batch, self.dim, self.sum_dim_mean)
        mean = torch.mean(mean, 2)

        variance = torch.sigmoid(self.fc6(hidden_var)).view(batch, self.dim, self.sum_dim_var)
        variance = torch.mean(variance, 2) + 1e-5

        weights = torch.sigmoid(self.fcw(hidden_weight)).view(batch, 1, self.sum_dim_weights)
        weights = torch.mean(weights, 2)

        return weights, mean, variance

def compute_pair_wise(input, MEAN_3, VARIANCE_3, WEIGHTS_3, c_t, beta_3= 0.9):
    """Update a bias-corrected running average of a data/mixture overlap.

    Evaluates the weighted diagonal-Gaussian kernel between every row of
    ``input`` and every mixture component, averages it, and folds the result
    into the exponential moving average ``c_t`` with decay ``beta_3``.

    Reads the module-level ``bs``, ``VAR_ZERO``, ``WEIGHT_ZERO`` and the
    iteration counter ``i`` (used for the ``1 - beta_3**i`` bias correction).

    Returns ``(bias_corrected_average, updated_c_t)`` as Python floats.
    """
    mean_gap = (input.view(bs, 1, 2) - MEAN_3.view(1, bs, 2)).view(-1, 2)
    # Adding the zero tensors broadcasts the component parameters to all pairs.
    var_pairs = (VAR_ZERO + VARIANCE_3.view(1, bs, 2)).view(-1, 2)
    weight_pairs = (WEIGHT_ZERO + WEIGHTS_3.view(1, bs)).view(-1)

    density = gaussian_nd_pytorch(mean_gap, var_pairs)
    overlap = torch.mean(weight_pairs*density)

    c_t = beta_3*c_t + (1-beta_3)*overlap.detach()
    corrected = c_t/(1-beta_3**i)

    return corrected.item(), c_t.item()

def QMI_NETWORK(c1, c2, c3, c4):
    """Estimate QMI from the current mixture and update four running averages.

    Reads the module-level ``MEAN_3``, ``VARIANCE_3``, ``WEIGHTS_3``,
    ``joint_`` and ``disjoint_``.  A "product of marginals" mixture is built
    by permuting the x-parameters and the y-parameters of the components
    independently; four overlaps (joint/disjoint data against joint/shuffled
    mixture) are then tracked through ``compute_pair_wise``.

    Returns ``(qmi, c1, c2, c3, c4)`` with the updated running averages.
    """
    n_components = MEAN_3.size()[0]
    perm_x = torch.randperm(n_components)
    perm_y = torch.randperm(n_components)

    def column(values):
        return values.reshape(-1, 1)

    # Independently permuted x- and y-parameters.
    shuffled_mean = torch.cat((column(MEAN_3[:, 0][perm_x]), column(MEAN_3[:, 1][perm_y])), 1)
    shuffled_var = torch.cat((column(VARIANCE_3[:, 0][perm_x]), column(VARIANCE_3[:, 1][perm_y])), 1)
    shuffled_weight = column(WEIGHTS_3[:][perm_x])*column(WEIGHTS_3[:][perm_y])

    PXY_GXY, c1 = compute_pair_wise(joint_, MEAN_3, VARIANCE_3, WEIGHTS_3, c1)
    PXY_GXGY, c2 = compute_pair_wise(joint_, shuffled_mean, shuffled_var, shuffled_weight, c2)
    PXPY_GXY, c3 = compute_pair_wise(disjoint_, MEAN_3, VARIANCE_3, WEIGHTS_3, c3)
    PXPY_GXGY, c4 = compute_pair_wise(disjoint_, shuffled_mean, shuffled_var, shuffled_weight, c4)

    ratio = (PXPY_GXY*PXY_GXGY)/(PXPY_GXGY*PXY_GXY)
    return -np.log2(np.sqrt(ratio)), c1, c2, c3, c4

# SGM experiment: for each training-set size, fit the mixture network on a
# subset of the fourth generated dataset and track the QMI estimate.
# Requires a CUDA device (all tensors and the network are moved with .cuda()).
for samplesss in [10, 50, 500, 1000, 3000, 5000, 10000, 30000, 50000, 100000]:
  print('Samples Size:', samplesss)
  x = samples_list[-2]
  np.random.seed(4)
  # NOTE: shuffle() permutes samples_list[-2] in place, so the stored dataset
  # is re-shuffled on every pass of this outer loop.
  np.random.shuffle(x)
  x = x[0:samplesss]

  QMI_LIST = []

  SEED = 4

  torch.manual_seed(SEED)
  np.random.seed(SEED)

  rand = 1      # number of continuous uniform noise inputs
  HIDDEN = 256
  dim = 1       # dimensionality of each of the two variables

  d_class = 300   # number of discrete one-hot codes
  num = 3         # repetitions of each code per batch
  bs = d_class*num

  d_howmany = 1

  MOG_NET = DIS_MOG_relu(rand+d_class*d_howmany, HIDDEN, dim*2).cuda()

  optimizer = optim.Adam([
              {'params': MOG_NET.parameters(), 'lr': 0.0001, 'betas': (0.9, 0.999)},
          ])

  entropy_list = []
  discrete_prob = torch.ones((d_class,)).float().cuda()

  # Exponential-moving-average state for the mixture self-overlap (v_t) and
  # the data/mixture overlap (c_t).
  beta = 0.999
  v_t = 0.

  beta_2 = 0.999
  c_t = 0.

  # Zero tensors used to broadcast component parameters across data rows;
  # also read as globals by compute_pair_wise.
  VAR_ZERO = torch.zeros((bs, 1, 2)).cuda()
  WEIGHT_ZERO = torch.zeros((bs, 1)).cuda()

  # Running averages consumed and returned by QMI_NETWORK.
  c1 = 0
  c2 = 0
  c3 = 0
  c4 = 0

  QMI_LIST = []

  discrete_vec = generate_fix_discrete(bs, d_class).float().cuda()

  # `i` starts at 1 because it is used in the 1 - beta**i bias corrections
  # (here and, as a global, inside compute_pair_wise).
  for i in range(1, 20000):
      uniform_vector_3 = torch.cat((torch.rand(bs, rand).cuda(), discrete_vec), 1)
      WEIGHTS_3, MEAN_3, VARIANCE_3 = MOG_NET(uniform_vector_3)
      
      b1 = np.random.choice(x.shape[0], bs)
      b2 = np.random.choice(x.shape[0], bs)
      b3 = np.random.choice(x.shape[0], bs)
      
      # joint_: rows of x as sampled; disjoint_: x-part and y-part taken from
      # independently drawn rows.
      joint_ = torch.from_numpy(x[b1, :]).float().cuda()
      disjoint_ = torch.from_numpy(np.concatenate((x[b2, :dim], x[b3, dim:]), 1)).float().cuda()
      
      input = joint_
      
      # Pairwise overlap of the mixture with itself.
      MEAN_DIFF = (MEAN_3.view(bs, 1, 2) - MEAN_3.view(1, bs, 2)).view(-1, 2)
      VARIANCE_SUM = (VARIANCE_3.view(bs, 1, 2) + VARIANCE_3.view(1, bs, 2)).view(-1, 2)
      WEIGHT_DIFF = (WEIGHTS_3.view(bs, 1)*WEIGHTS_3.view(1, bs)).view(-1)

      square_term = torch.mean(WEIGHT_DIFF*gaussian_nd_pytorch(MEAN_DIFF, VARIANCE_SUM))
      v_t = beta*v_t + (1-beta)*square_term.detach()
      square_term_unbiased = torch.sqrt(v_t/(1-beta**i))
      
      # Overlap between the data batch and the mixture components.
      MEAN_DATA = (input.view(bs, 1, 2) - MEAN_3.view(1, bs, 2)).view(-1, 2)
      VARIANCE_DATA = (VAR_ZERO + VARIANCE_3.view(1, bs, 2)).view(-1, 2)
      WEIGHT_DATA = (WEIGHT_ZERO + WEIGHTS_3.view(1, bs)).view(-1)
            
      numerator = torch.mean(WEIGHT_DATA*gaussian_nd_pytorch(MEAN_DATA, VARIANCE_DATA))
      c_t = beta_2*c_t + (1-beta_2)*numerator.detach()
      numerator_unbiased = c_t/(1-beta_2**i)
      
      # NOTE(review): presumably a first-order surrogate of
      # numerator / sqrt(square_term) built from the running averages so that
      # gradients flow only through the current-batch terms; confirm.
      corr_ = (numerator/square_term_unbiased) - 0.5*numerator_unbiased*square_term/(square_term_unbiased)**3

      # Maximise corr_ by minimising its negative.
      (-corr_).backward()
      
      optimizer.step()
      optimizer.zero_grad()
      entropy_list.append((numerator_unbiased/square_term_unbiased).item())

      with torch.no_grad():
        QMI, c1, c2, c3, c4 = QMI_NETWORK(c1, c2, c3, c4)
        QMI_LIST.append(QMI)

      if i%100 == 0:

          # The ground-truth value printed here is a hard-coded constant.
          print('Iteration:{0}, Learned QMI:{1}, Ground Truth QMI:{2}, Cross-Entropy:{3}'.format(i, QMI_LIST[-1], 0.5357792715466267, entropy_list[-1]))
          
          plt.rcParams["figure.figsize"] = [4,4]

          learned_mean = MEAN_3.detach().cpu().numpy().reshape(-1, 2)
          learned_variance = VARIANCE_3.detach().cpu().numpy().reshape(-1, 2)
          learned_weights = WEIGHTS_3.detach().cpu().numpy()

  # Snapshot of the mixture parameters from the final iteration.
  learned_mean = MEAN_3.detach().cpu().numpy().reshape(-1, 2)
  learned_variance = VARIANCE_3.detach().cpu().numpy().reshape(-1, 2)
  learned_weights = WEIGHTS_3.detach().cpu().numpy()

  # Summary statistics over the last 3000 QMI estimates.
  print('Sample Size:{0}, Cross-Entropy:{1}, QMI-MEAN:{2}, QMI-STD:{3}'.format(samplesss, entropy_list[-1], np.mean(QMI_LIST[-3000:]), np.std(QMI_LIST[-3000:])))

### (b) MINE - SHANNON'S MI 
(Warning: MINE training can diverge, so it is stopped early at 50000 iterations.)

In [None]:
############ MINE

# The QMI_LIST variable name is kept only for convenience; MINE estimates Shannon's MI, not QMI.

class NET(nn.Module):
    """Fully connected ReLU network that reduces each input row to one scalar.

    Three hidden layers of width ``HIDDEN`` feed an ``M``-unit output layer.
    The rectified outputs are summed over dimension 1, so every returned
    value is non-negative.
    """

    def __init__(self, N, HIDDEN, M):
        super().__init__()
        # Layers are created in the order fc1, fc2, fc4, fc3 so that a seeded
        # run initialises each layer with the same random draws.
        self.fc1 = nn.Linear(N, HIDDEN)
        self.fc2 = nn.Linear(HIDDEN, HIDDEN)
        self.fc4 = nn.Linear(HIDDEN, HIDDEN)
        self.fc3 = nn.Linear(HIDDEN, M)

    def forward(self, x):
        """Apply fc1 -> fc2 -> fc4 -> fc3 (ReLU after each) and sum over dim 1."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc4):
            hidden = torch.relu(layer(hidden))
        return torch.relu(self.fc3(hidden)).sum(dim=1)

class BWM(nn.Module):
    """Single linear layer whose sigmoid activations are averaged over dim 1."""

    def __init__(self, N, M):
        super().__init__()
        self.fc3 = nn.Linear(N, M)

    def forward(self, x):
        """Return the mean sigmoid activation of the ``M`` units for each row."""
        activations = torch.sigmoid(self.fc3(x))
        return activations.mean(dim=1)

def GD(net, lr):
    """Apply one plain gradient-descent step to ``net`` and clear its gradients.

    Every trainable parameter ``p`` that currently has a gradient is updated
    in place as ``p <- p - lr * p.grad``. Parameters whose ``grad`` is still
    ``None`` (for example, not reached by the last backward pass) are skipped
    instead of raising a ``TypeError``.

    Args:
        net: ``nn.Module`` whose parameters are updated.
        lr: step size.

    Returns:
        Always ``0``.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in net.parameters():
            if param.requires_grad and param.grad is not None:
                param -= lr * param.grad

    net.zero_grad()
    return 0

def sample_uniform_data(min=-5, max=5, batch=300):
    """Draw ``batch`` samples from NumPy's global RNG, uniform on [min, max)."""
    lower, upper = min, max
    return np.random.uniform(low=lower, high=upper, size=batch)

def run_MINE(x):
    """Train a MINE critic on ``x`` and return the per-iteration MI estimates.

    Each row of ``x`` is one joint sample; its first ``dim`` columns belong to
    the first variable and the remaining columns to the second one. At every
    iteration the critic ``NET`` is evaluated on

    * a "positive" batch: a joint row concatenated with a row whose two
      halves come from independently resampled rows, and
    * a "reference" batch: four independently resampled halves concatenated,

    and Adam maximises ``mean(T(positive)) - log(mean(exp(T(reference))))``.
    The ``log(mean(exp(.)))`` term is evaluated with ``torch.logsumexp`` so
    that large critic outputs do not overflow to ``inf``.

    Relies on the module-level names ``seed`` (seed for the torch and NumPy
    RNGs), ``dim`` (column split point), ``samplesss`` (only used in the
    progress message) and ``NET``.

    Args:
        x: 2-D NumPy array of joint samples, one sample per row.

    Returns:
        list of float: the objective value at each of the 49,999 iterations.
    """
    # Use the GPU when one is present; otherwise run on the CPU instead of
    # crashing on ``.cuda()``.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    torch.manual_seed(seed)
    np.random.seed(seed)

    # The critic sees two concatenated samples, hence twice the column count.
    net = NET(x.shape[1]*2, 256, 256).to(device)

    optimizer = optim.Adam([
        {'params': net.parameters(), 'lr': 0.0001},
    ])

    bs = 2000
    n_rows = x.shape[0]

    QMI_LIST = []

    for i in range(1, 50000):

        # Index draws keep the order b1..b7 so seeded runs consume the NumPy
        # RNG in the same sequence.
        b1 = np.random.choice(n_rows, bs)
        b2 = np.random.choice(n_rows, bs)
        b3 = np.random.choice(n_rows, bs)

        joint_ = torch.from_numpy(x[b1, :]).float().to(device)
        # Halves taken from independent rows: a sample from the product of marginals.
        disjoint_ = torch.from_numpy(np.concatenate((x[b2, 0:dim], x[b3, dim:]), 1)).float().to(device)

        input_1 = torch.cat((joint_, disjoint_), 1)

        b4 = np.random.choice(n_rows, bs)
        b5 = np.random.choice(n_rows, bs)
        b6 = np.random.choice(n_rows, bs)
        b7 = np.random.choice(n_rows, bs)

        uniform_samples = torch.from_numpy(np.concatenate((x[b4, 0:dim], x[b5, dim:], x[b6, 0:dim], x[b7, dim:]), 1)).float().to(device)

        output_joint = net(input_1)
        output_uniform = net(uniform_samples)

        # log(mean(exp(t))) == logsumexp(t) - log(n), computed without overflow.
        log_mean_exp = torch.logsumexp(output_uniform, dim=0) - float(np.log(output_uniform.shape[0]))
        corr_ = torch.mean(output_joint) - log_mean_exp
        (-corr_).backward()

        optimizer.step()
        optimizer.zero_grad()
        QMI_LIST.append(corr_.item())

        if i % 1000 == 0:
            print('Sample Size:{0}, Iteration:{1}, MI:{2}'.format(samplesss, i, QMI_LIST[-1]))

    return QMI_LIST

# Run MINE on increasingly large subsets of the second-to-last sample set.
for samplesss in [10, 50, 500, 1000, 3000, 5000, 10000, 30000, 50000, 100000]:

    # Shuffle a copy: an in-place shuffle would permanently reorder the shared
    # ``samples_list[-2]`` and make each subset depend on how often this cell ran.
    np.random.seed(4)
    x = np.random.permutation(samples_list[-2])[:samplesss]

    QMI_LIST = run_MINE(x)
    # Summary over the last 10,000 iterations. The placeholder indices must
    # match the three positional arguments (0, 1, 2).
    print('Sample Size:{0}, QMI-MEAN:{1}, QMI-STD:{2}'.format(samplesss, np.mean(QMI_LIST[-10000:]), np.std(QMI_LIST[-10000:])))

### (c) MINE - RENYI'S MI (ORDER 2)

In [None]:
### MINE JPQ

class NET(nn.Module):
    """ReLU multilayer perceptron that outputs one non-negative score per row.

    Architecture: ``N -> HIDDEN -> HIDDEN -> HIDDEN -> M``, with a ReLU after
    every linear layer; the ``M`` rectified outputs are summed over dim 1.
    """

    def __init__(self, N, HIDDEN, M):
        super().__init__()
        # Keep the creation order fc1, fc2, fc4, fc3: it fixes which random
        # draws initialise which layer under a given seed.
        self.fc1 = nn.Linear(in_features=N, out_features=HIDDEN)
        self.fc2 = nn.Linear(in_features=HIDDEN, out_features=HIDDEN)
        self.fc4 = nn.Linear(in_features=HIDDEN, out_features=HIDDEN)
        self.fc3 = nn.Linear(in_features=HIDDEN, out_features=M)

    def forward(self, x):
        """Run the four layers in the order fc1, fc2, fc4, fc3 and sum the outputs."""
        first = torch.relu(self.fc1(x))
        second = torch.relu(self.fc2(first))
        third = torch.relu(self.fc4(second))
        rectified_out = torch.relu(self.fc3(third))
        return rectified_out.sum(dim=1)

class BWM(nn.Module):
    """One linear layer (``N -> M``) with sigmoid outputs averaged over dim 1."""

    def __init__(self, N, M):
        super().__init__()
        self.fc3 = nn.Linear(in_features=N, out_features=M)

    def forward(self, x):
        """Return, for every row, the mean of the ``M`` sigmoid activations."""
        logits = self.fc3(x)
        return torch.sigmoid(logits).mean(dim=1)

def GD(net, lr):
    """Apply one plain gradient-descent step to ``net`` and clear its gradients.

    Every trainable parameter ``p`` that currently has a gradient is updated
    in place as ``p <- p - lr * p.grad``. Parameters whose ``grad`` is still
    ``None`` (for example, not reached by the last backward pass) are skipped
    instead of raising a ``TypeError``.

    Args:
        net: ``nn.Module`` whose parameters are updated.
        lr: step size.

    Returns:
        Always ``0``.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in net.parameters():
            if param.requires_grad and param.grad is not None:
                param -= lr * param.grad

    net.zero_grad()
    return 0

def sample_uniform_data(min=-5, max=5, batch=300):
    """Return ``batch`` uniform draws on [min, max) from NumPy's global RNG."""
    draws = np.random.uniform(low=min, high=max, size=batch)
    return draws


def run_JPQ(x):
    """Train a critic with a ratio objective and return per-iteration estimates.

    Each row of ``x`` is one joint sample; its first ``dim`` columns belong to
    the first variable and the remaining columns to the second one. At every
    iteration the critic ``NET`` is evaluated on a batch of joint rows and on
    a batch whose two halves come from independently resampled rows. Two
    bias-corrected exponential moving averages are tracked:

    * ``joint_m``: mean critic output on the joint batch;
    * ``variance``: mean squared critic output on the independent batch
      (``std`` is its square root).

    The optimised surrogate
    ``mean(T_joint)/std - joint_m*mean(T_indep**2)/(2*std**3)`` has the same
    gradient as the ratio ``mean(T_joint)/sqrt(mean(T_indep**2))`` when the
    moving averages are treated as constants.

    Relies on the module-level names ``seed`` (seed for the torch and NumPy
    RNGs), ``dim`` (column split point), ``samplesss`` (only used in the
    progress message) and ``NET``.

    Args:
        x: 2-D NumPy array of joint samples, one sample per row.

    Returns:
        list of float: ``-2*log2(joint_m/std)`` for each of the 49,999
        iterations.
    """
    # Use the GPU when one is present; otherwise run on the CPU instead of
    # crashing on ``.cuda()``.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    torch.manual_seed(seed)
    np.random.seed(seed)

    net = NET(x.shape[1], 256, 256).to(device)

    optimizer = optim.Adam([
        {'params': net.parameters(), 'lr': 0.0001},
    ])

    bs = 2000
    n_rows = x.shape[0]

    # Decay rates of the two moving averages.
    beta_1 = 0.999
    beta_2 = 0.999

    m_t = 0
    v_t = 0

    QMI_LIST = []

    for i in range(1, 50000):

        b1 = np.random.choice(n_rows, bs)
        b2 = np.random.choice(n_rows, bs)
        b3 = np.random.choice(n_rows, bs)

        joint_ = torch.from_numpy(x[b1, :]).float().to(device)
        # Halves taken from independent rows: a sample from the product of marginals.
        disjoint_ = torch.from_numpy(np.concatenate((x[b2, 0:dim], x[b3, dim:]), 1)).float().to(device)

        output_joint = net(joint_)
        output_uniform = net(disjoint_)

        # Moving averages are built from detached values, so they act as
        # constants in the surrogate below.
        m_t = beta_1*m_t + (1-beta_1)*torch.mean(output_joint.detach())
        joint_m = m_t/(1-beta_1**i)

        v_t = beta_2*v_t + (1-beta_2)*torch.mean(output_uniform.detach()**2)
        variance = v_t/(1-beta_2**i)
        std = torch.sqrt(variance)

        #### RENYI DIVERGENCE
        corr_ = torch.mean(output_joint)/std - joint_m*torch.mean(output_uniform**2)/(2*std**3)
        (-corr_).backward()

        optimizer.step()
        optimizer.zero_grad()

        QMI_LIST.append(-2*np.log2((joint_m/std).item()))

        if i % 100 == 0:
            print('Sample Size:{0}, Iteration:{1}, Renyi MI:{2}'.format(samplesss, i, -QMI_LIST[-1]/2))

    return QMI_LIST

# Run the Renyi (order 2) estimator on increasingly large subsets of the
# second-to-last sample set.
for samplesss in [10, 50, 500, 1000, 3000, 5000, 10000, 30000, 50000, 100000]:
  # Shuffle a copy: an in-place shuffle would permanently reorder the shared
  # ``samples_list[-2]`` and make each subset depend on how often this cell ran.
  np.random.seed(4)
  x = np.random.permutation(samples_list[-2])[:samplesss]

  QMI_LIST = run_JPQ(x)
  # Summary over the last 10,000 iterations. The placeholder indices must
  # match the three positional arguments (0, 1, 2).
  print('Sample Size:{0}, QMI-MEAN:{1}, QMI-STD:{2}'.format(samplesss, np.mean(QMI_LIST[-10000:]), np.std(QMI_LIST[-10000:])))

### (d) High Dimensional Comparison (MINE)

In [None]:
### MINE

class NET(nn.Module):
    """Critic network: three ReLU hidden layers plus a rectified, summed output.

    ``N`` is the input width, ``HIDDEN`` the width of every hidden layer and
    ``M`` the number of output units that are summed over dim 1.
    """

    def __init__(self, N, HIDDEN, M):
        super().__init__()
        # Creation order (fc1, fc2, fc4, fc3) determines the seeded initial
        # weights of each layer, so it is kept as is.
        self.fc1 = nn.Linear(N, HIDDEN)
        self.fc2 = nn.Linear(HIDDEN, HIDDEN)
        self.fc4 = nn.Linear(HIDDEN, HIDDEN)
        self.fc3 = nn.Linear(HIDDEN, M)

    def _hidden_features(self, x):
        """Pass ``x`` through fc1, fc2 and fc4, applying a ReLU after each."""
        return torch.relu(self.fc4(torch.relu(self.fc2(torch.relu(self.fc1(x))))))

    def forward(self, x):
        """Return the sum over dim 1 of the rectified fc3 outputs."""
        features = self._hidden_features(x)
        return torch.sum(torch.relu(self.fc3(features)), dim=1)

class BWM(nn.Module):
    """Linear map to ``M`` units, squashed by a sigmoid and averaged over dim 1."""

    def __init__(self, N, M):
        super().__init__()
        self.fc3 = nn.Linear(N, M)

    def forward(self, x):
        """Return the per-row mean of the sigmoid activations."""
        squashed = torch.sigmoid(self.fc3(x))
        per_row_mean = torch.mean(squashed, dim=1)
        return per_row_mean

def GD(net, lr):
    """Apply one plain gradient-descent step to ``net`` and clear its gradients.

    Every trainable parameter ``p`` that currently has a gradient is updated
    in place as ``p <- p - lr * p.grad``. Parameters whose ``grad`` is still
    ``None`` (for example, not reached by the last backward pass) are skipped
    instead of raising a ``TypeError``.

    Args:
        net: ``nn.Module`` whose parameters are updated.
        lr: step size.

    Returns:
        Always ``0``.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in net.parameters():
            if param.requires_grad and param.grad is not None:
                param -= lr * param.grad

    net.zero_grad()
    return 0

def sample_uniform_data(min=-5, max=5, batch=300):
    """Sample ``batch`` values uniformly from [min, max) via NumPy's global RNG."""
    bounds = (min, max)
    return np.random.uniform(*bounds, size=batch)

#### SAMPLE EFFICIENCY
# For each dimensionality ``dim`` this cell draws samples with
# ``generate_gauss_samples_nd``, trains a fresh MINE critic on them and records
# the mean/std of the objective over the last 3000 recorded iterations.

total_dim = 10000
K = 20 # how many Gaussian

np.random.seed(4)
# ``[row]*total_dim`` repeats a single draw of K values, so every dimension
# shares the same K centre coordinates.
center_x_total = np.array([np.random.uniform(0, 1, K)]*total_dim).T
center_y_total = np.array([np.random.uniform(0, 1, K)]*total_dim).T

investigate_SAMPLE_EFFICIENCY = []
list_TRUE_VALUE = []

# Use the GPU when one is present; otherwise run on the CPU instead of
# crashing on ``.cuda()``.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE: this name shadows the builtin ``list``. It is kept unchanged in case
# later cells read the collected (mean, std) pairs from it.
list = []
for dim in [1, 5, 10, 20, 50, 100, 200, 500]:

    # CENTER OF X AND Y
    center_x = center_x_total[:, :dim]
    center_y = center_y_total[:, :dim]

    seed = 4
    bs = 1000

    var = 0.001

    # JOINT DISTRIBUTION
    COV = np.eye(dim*2)*var

    # Ground-truth computation (disabled in the original cell):
    #   TRUE_VALUE = compute_TRUE_CSQMI_nd(center_x, center_y, COV[:dim, :dim], COV[dim:, dim:])
    #   print('TRUE VALUE:', TRUE_VALUE)
    #   list_TRUE_VALUE.append(TRUE_VALUE)

    how_many_samples = 1000

    QMI_LIST = []
    np.random.seed(4)
    samples = generate_gauss_samples_nd(center_x, center_y, COV, samples_per_class=how_many_samples)
    x_1 = samples[:, :dim]
    x_3 = samples[:, dim:]

    x = np.concatenate((x_1, x_3), 1)

    torch.manual_seed(seed)
    np.random.seed(seed)

    # NOTE: shadows the builtin ``iter``; the name is kept for compatibility.
    iter = 10000

    net = NET(x.shape[1], 256, 256).to(device)

    optimizer = optim.Adam([
        {'params': net.parameters(), 'lr': 0.0001},
    ])

    beta_1 = 0.999
    beta_2 = 0.999

    m_t = 0
    v_t = 0

    for i in range(1, iter):

        b1 = np.random.choice(x.shape[0], bs)
        b2 = np.random.choice(x.shape[0], bs)
        b3 = np.random.choice(x.shape[0], bs)

        joint_ = torch.from_numpy(x[b1, :]).float().to(device)
        # Halves taken from independent rows: a sample from the product of marginals.
        disjoint_ = torch.from_numpy(np.concatenate((x[b2, 0:dim], x[b3, dim:]), 1)).float().to(device)

        # get normal output
        output_joint = net(joint_)
        output_uniform = net(disjoint_)

        # Running statistics; they are not used by the MINE loss below and are
        # only kept so the same module-level names stay defined.
        m_t = beta_1*m_t + (1-beta_1)*torch.mean(output_joint.detach())
        joint_m = m_t/(1-beta_1**i)

        v_t = beta_2*v_t + (1-beta_2)*torch.mean(output_uniform.detach()**2)
        variance = v_t/(1-beta_2**i)
        std = torch.sqrt(variance)

        # log(mean(exp(t))) == logsumexp(t) - log(n), computed without the
        # float32 overflow that exp() hits for large critic outputs.
        log_mean_exp = torch.logsumexp(output_uniform, dim=0) - float(np.log(output_uniform.shape[0]))
        corr_ = torch.mean(output_joint) - log_mean_exp
        (-corr_).backward()
        QMI_LIST.append(corr_.item())

        optimizer.step()
        optimizer.zero_grad()

    # Each iteration's value is already recorded inside the loop; appending the
    # final one again would duplicate it in the statistics below.
    investigate_SAMPLE_EFFICIENCY.append(QMI_LIST)

    print('Dimension:{0}, MI-MEAN:{1}, MI-STD:{2}'.format(dim, np.mean(QMI_LIST[-3000:]), np.std(QMI_LIST[-3000:])))
    list.append((np.mean(QMI_LIST[-3000:]), np.std(QMI_LIST[-3000:])))


### (e) High Dimensional Comparison (Renyi's MI)

In [None]:
class NET(nn.Module):
    """Feed-forward critic with ReLU activations and a summed, rectified output.

    The input of width ``N`` passes through three ``HIDDEN``-wide layers and a
    final ``M``-unit layer; the rectified ``M`` outputs are added up over dim 1.
    """

    def __init__(self, N, HIDDEN, M):
        super().__init__()
        # fc4 is deliberately created before fc3: the creation order decides
        # which seeded random draws initialise which layer.
        self.fc1 = nn.Linear(N, HIDDEN)
        self.fc2 = nn.Linear(HIDDEN, HIDDEN)
        self.fc4 = nn.Linear(HIDDEN, HIDDEN)
        self.fc3 = nn.Linear(HIDDEN, M)

    def forward(self, x):
        """Apply fc1, fc2, fc4, fc3 in that order (ReLU after each), then sum."""
        out = x
        for name in ('fc1', 'fc2', 'fc4', 'fc3'):
            out = torch.relu(getattr(self, name)(out))
        return out.sum(dim=1)

class BWM(nn.Module):
    """Sigmoid-activated linear layer whose ``M`` outputs are averaged per row."""

    def __init__(self, N, M):
        super().__init__()
        self.fc3 = nn.Linear(N, M)

    def forward(self, x):
        """Return the mean over dim 1 of ``sigmoid(fc3(x))``."""
        return self.fc3(x).sigmoid().mean(dim=1)

def GD(net, lr):
    """Apply one plain gradient-descent step to ``net`` and clear its gradients.

    Every trainable parameter ``p`` that currently has a gradient is updated
    in place as ``p <- p - lr * p.grad``. Parameters whose ``grad`` is still
    ``None`` (for example, not reached by the last backward pass) are skipped
    instead of raising a ``TypeError``.

    Args:
        net: ``nn.Module`` whose parameters are updated.
        lr: step size.

    Returns:
        Always ``0``.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in net.parameters():
            if param.requires_grad and param.grad is not None:
                param -= lr * param.grad

    net.zero_grad()
    return 0

def sample_uniform_data(min=-5, max=5, batch=300):
    """Generate ``batch`` uniform samples on [min, max) using NumPy's global RNG."""
    n_samples = batch
    return np.random.uniform(min, max, size=n_samples)

#### SAMPLE EFFICIENCY
# For each dimensionality ``dim`` this cell draws samples with
# ``generate_gauss_samples_nd``, trains a fresh ``NET`` critic with the ratio
# surrogate below and prints the mean/std of log2(joint_m/std) over the last
# 3000 iterations.

total_dim = 10000
K = 20 # how many Gaussian

np.random.seed(4)
# ``[row]*total_dim`` repeats a single draw of K values, so every dimension
# shares the same K centre coordinates.
center_x_total = np.array([np.random.uniform(0, 1, K)]*total_dim).T
center_y_total = np.array([np.random.uniform(0, 1, K)]*total_dim).T

# NOTE(review): neither list is appended to in this cell.
investigate_SAMPLE_EFFICIENCY = []
list_TRUE_VALUE = []

for dim in [5, 20, 50, 200, 500]:

  # CENTER OF X AND Y
  center_x = center_x_total[:, :dim]
  center_y = center_y_total[:, :dim]

  seed = 4
  bs = 1000

  var = 0.001

  # JOINT DISTRIBUTION
  # Isotropic covariance over the 2*dim concatenated coordinates.
  COV = np.eye(dim*2)*var

#   TRUE_VALUE = compute_TRUE_CSQMI_nd(center_x, center_y, COV[:dim, :dim], COV[dim:, dim:])
#   print('TRUE VALUE:', TRUE_VALUE)
#   list_TRUE_VALUE.append(TRUE_VALUE)

# for how_many_samples in [1, 10, 100, 1000, 10000, 100000]:

  how_many_samples = 1000

  QMI_LIST = []
  np.random.seed(4)
  samples = generate_gauss_samples_nd(center_x, center_y, COV, samples_per_class=how_many_samples)
  # First ``dim`` columns are treated as one variable, the rest as the other.
  x_1 = samples[:, :dim]
  x_3 = samples[:, dim:]

  x = np.concatenate((x_1, x_3), 1)

  ### NORMALIZATION
  #x = x-x.min(0)
  #x = x/x.max(0)

  #plt.scatter(x_1, x_3)
  #plt.show()

  torch.manual_seed(seed)
  np.random.seed(seed)

  #max = np.max(x)
  #min = np.min(x)

  # max = 1
  # min = 0
  # NOTE: this name shadows the builtin ``iter`` for the rest of the session.
  iter = 10000

  # Requires a CUDA device.
  net = NET(x.shape[1], 256, 256).cuda()
  #net = BWM(x.shape[1]*2, 500).cuda()

  optimizer = optim.Adam([
                {'params': net.parameters(), 'lr': 0.0001},
            ])
  
  # Decay rates of the two exponential moving averages below.
  beta_1 = 0.999
  beta_2 = 0.999

  m_t = 0
  v_t = 0

  # range(1, iter) runs iter-1 iterations; starting at 1 keeps the bias
  # correction 1-beta**i non-zero.
  for i in range(1, iter):

    # Three independent index draws (with replacement) of size bs.
    b1 = np.random.choice(x.shape[0], bs)
    b2 = np.random.choice(x.shape[0], bs)
    b3 = np.random.choice(x.shape[0], bs)

    joint_ = torch.from_numpy(x[b1, :]).float().cuda()
    # Halves taken from independently drawn rows: a sample from the product of marginals.
    disjoint_ = torch.from_numpy(np.concatenate((x[b2, 0:dim], x[b3, dim:]), 1)).float().cuda()

    # get normal output
    output_joint = net(joint_)
    output_uniform = net(disjoint_)

    # Bias-corrected moving average of the mean critic output on joint samples.
    m_t = beta_1*m_t + (1-beta_1)*torch.mean(output_joint.detach())
    joint_m = m_t/(1-beta_1**i)

    # Bias-corrected moving average of the mean squared critic output on the
    # independent samples; std is its square root.
    v_t = beta_2*v_t + (1-beta_2)*torch.mean(output_uniform.detach()**2)
    variance = v_t/(1-beta_2**i)
    std = torch.sqrt(variance)
    
    #corr_ = (torch.mean(output_joint)/(torch.mean(output_uniform**2)**(1/2)))
    # Surrogate with the same gradient as the ratio in the commented line above
    # when joint_m and std (both detached) are treated as constants.
    corr_ = torch.mean(output_joint)/std - joint_m*torch.mean(output_uniform**2)/(2*std**3)
    (-corr_).backward()

    #GD(net, 1)

    optimizer.step()
    optimizer.zero_grad()

    # NOTE(review): run_JPQ records -2*log2(joint_m/std) instead; confirm which
    # scaling is intended here.
    QMI_LIST.append(np.log2((joint_m/std).item()))

  print('Dimension:{0}, MI-MEAN:{1}, MI-STD:{2}'.format(dim, np.mean(QMI_LIST[-3000:]), np.std(QMI_LIST[-3000:])))
