In [1]:
import jax.numpy as jnp
from jax import grad, jit, vmap
from jax import random
import numpy as np

# Generazione Dati

In [10]:
from IPython.utils.sysinfo import num_cpus

# number of clusters
K=5

# dimension of each sample
d=1

# number of samples
N=1000

# standard deviation of the Gaussian prior on the cluster means
sigma=1

# Cluster means, one row per cluster: mu has shape (K, d).
key = random.PRNGKey(2023)
mu=random.normal(key,(K,d))*sigma

# A JAX PRNG key must be consumed only once: drawing the assignments and the
# observation noise from the same key would make the two draws statistically
# dependent. Split the key into one independent sub-key per draw instead.
key = random.PRNGKey(2)
assign_key, noise_key = random.split(key)

# Cluster assignment: uniform over the K clusters.
# random.categorical expects (unnormalised) log-probabilities, hence the log.
c=random.categorical(assign_key,jnp.log(jnp.full((K,),1.0/K)),axis=0,shape=(N,))

# One-hot encoding of the assignments: C[i, c[i]] = 1, all other entries 0.
C=np.zeros(shape=(N,K))
C[np.arange(N),np.asarray(c)]=1


# Data: the mean of the assigned cluster plus unit-variance Gaussian noise.
X=np.matmul(C,mu)+random.normal(noise_key,(N,d))

# Variational Inference

We construct the model for our VI algorithm, sticking to the notation of the paper. Our model is described by:

\begin{align*}
    \mu_k \mid x &\stackrel{\tiny\mbox{approx}}{\sim} \mathcal{N}\left(m_k, s^2_k\right) \\
    x_i \mid c_i,\mu &\sim \mathcal{N}\left(c^T_i\mu, 1\right) \\
    c_i &\stackrel{\tiny\mbox{iid}}{\sim} \mathrm{Categorical}\left(\tfrac{1}{K},\dots,\tfrac{1}{K}\right) \\
    \mu_k &\stackrel{\tiny\mbox{iid}}{\sim} \mathcal{N}\left(0, \sigma^2\right)
\end{align*}

For this first attempt we work within the family of Gaussian distributions; later we could extend it to the exponential family.


Il paper descrive l'algoritmo in un caso 1D, dobbiamo adattarlo in un generico caso multidimensionale, io ho iniziato implementando il caso unidimensionale nell'algoritmo copiandolo dal paper, consapevole che andrà adattato

In [29]:
def update_phi(data,phi,m,s2):
  """Coordinate-ascent update of the cluster responsibilities.

  For every sample i and cluster k the unnormalised weight is

      exp( m_k . x_i - (d * s2_k + m_k . m_k) / 2 )

  and every row is then normalised to sum to one. The term
  d * s2_k + m_k . m_k is E[mu_k^T mu_k] under a Gaussian with mean m_k and
  covariance s2_k * I_d; for d = 1 it is the s2_k + m_k^2 of the 1-D formula.

  The computation is vectorised and free of in-place writes, so the function
  can be traced by jax.jit.

  Args:
    data: samples, shape (N, d).
    phi: previous responsibilities, shape (N, K). Not read; kept so that the
      signature is unchanged. The array is NOT overwritten: use the returned
      value.
    m: variational means, shape (K, d).
    s2: variational variances, shape (K,).

  Returns:
    JAX array of shape (N, K) whose rows sum to one.
  """
  data=jnp.asarray(data)
  m=jnp.asarray(m)
  s2=jnp.asarray(s2)
  d=data.shape[1]
  second_moment=jnp.sum(m*m,axis=1)+d*s2
  scores=data@m.T-0.5*second_moment
  # Subtracting the row maximum leaves the normalised result unchanged but
  # keeps exp() from overflowing (inf/inf would give NaN).
  scores=scores-jnp.max(scores,axis=1,keepdims=True)
  weights=jnp.exp(scores)
  return weights/jnp.sum(weights,axis=1,keepdims=True)

update_phi_jit=jit(update_phi)

def update_mean_and_variance(data,phi,m,s2,sigma):
  """Coordinate-ascent update of the variational means and variances.

  For every cluster k:

      s2_k = 1 / (1/sigma^2 + sum_i phi[i,k])
      m_k  = (sum_i phi[i,k] * x_i) * s2_k

  The computation is vectorised and free of in-place writes, so the function
  can be traced by jax.jit.

  Args:
    data: samples, shape (N, d).
    phi: responsibilities, shape (N, K).
    m: previous means, shape (K, d). Not read; kept so that the signature is
      unchanged. The array is NOT overwritten: use the returned value.
    s2: previous variances, shape (K,). Not read, same remark as for m.
    sigma: standard deviation of the Gaussian prior on the cluster means.

  Returns:
    Tuple (m, s2) of JAX arrays with shapes (K, d) and (K,).
  """
  data=jnp.asarray(data)
  phi=jnp.asarray(phi)
  # Posterior precision of each cluster mean: prior precision plus the
  # (soft) number of samples assigned to the cluster.
  precision=1.0/sigma**2+jnp.sum(phi,axis=0)
  new_m=(phi.T@data)/precision[:,None]
  new_s2=1.0/precision
  return new_m,new_s2

update_mean_and_variance_jit=jit(update_mean_and_variance)

def compute_ELBO(m,s2,phi,data,sigma=None):
  """Evidence lower bound of the Gaussian-mixture model, up to constants.

  With e2_k = d * s2_k + m_k . m_k (the expected squared norm of mu_k under
  the variational Gaussian) the returned value is the sum of

    expected log prior of the means:   - sum_k e2_k / (2 sigma^2)
    expected log likelihood:           sum_ik phi[i,k] (x_i . m_k - e2_k / 2)
    entropy of the assignments:        - sum_ik phi[i,k] log phi[i,k]
    entropy of the means:              (d / 2) sum_k log s2_k

  Terms that do not depend on m, s2 or phi (for instance -N log K from the
  uniform prior on the assignments) are dropped because they cancel when two
  ELBO values are subtracted.

  Each contribution is counted exactly once and the two E[log q] terms enter
  with a minus sign, i.e. as entropies.

  Args:
    m: variational means, shape (K, d).
    s2: variational variances, shape (K,).
    phi: responsibilities, shape (N, K).
    data: samples, shape (N, d).
    sigma: standard deviation of the prior on the cluster means. When omitted
      the module-level variable `sigma` is used (1.0 if it is not defined),
      which keeps four-argument calls working.

  Returns:
    Scalar JAX array.
  """
  if sigma is None:
    sigma=globals().get("sigma",1.0)
  m=jnp.asarray(m)
  s2=jnp.asarray(s2)
  phi=jnp.asarray(phi)
  data=jnp.asarray(data)
  d=data.shape[1]
  second_moment=jnp.sum(m*m,axis=1)+d*s2
  log_prior=-jnp.sum(second_moment)/(2.0*sigma**2)
  log_likelihood=jnp.sum(phi*(data@m.T-0.5*second_moment))
  # phi * log(phi) -> 0 as phi -> 0; evaluating log at 1 where phi == 0 yields
  # exactly that limit instead of 0 * (-inf) = NaN.
  safe_phi=jnp.where(phi>0,phi,1.0)
  entropy_assignments=-jnp.sum(phi*jnp.log(safe_phi))
  entropy_means=0.5*d*jnp.sum(jnp.log(s2))
  return log_prior+log_likelihood+entropy_assignments+entropy_means

compute_ELBO_jit=jit(compute_ELBO)



In [39]:
# FUNCTION FOR VARIATIONAL INFERENCE
# Notation of the paper
def VI(data,K,sigma,nMAX,tol=10**-5,seed=None):
  """Coordinate-ascent variational inference for a Gaussian mixture.

  Alternates update_phi and update_mean_and_variance and stops when the
  absolute change of the value returned by compute_ELBO is not larger than
  `tol`, or after `nMAX` iterations. At least one iteration is always run.

  Args:
    data: samples, shape (N, d).
    K: number of clusters.
    sigma: standard deviation of the prior on the cluster means; forwarded to
      update_mean_and_variance.
    nMAX: maximum number of iterations.
    tol: convergence threshold on the absolute ELBO change.
    seed: optional seed for the random initialisation. When None the global
      NumPy random state is used.

  Returns:
    Tuple (m, s2, phi) of NumPy arrays with shapes (K, d), (K,) and (N, K).
  """
  import warnings

  data=np.asarray(data)
  N=data.shape[0]
  d=data.shape[1]
  # The state starts as writable NumPy arrays and is always rebound from the
  # return values, so the loop works both with update functions that write
  # into their arguments and with ones that return fresh arrays.
  rng=np.random if seed is None else np.random.RandomState(seed)
  phi=np.ones((N,K))/K
  # There is no a-priori best starting point, so means and variances are
  # initialised at random.
  m=rng.normal(size=(K,d))
  s2=rng.uniform(0,10,size=(K,))
  # -inf makes the first improvement infinite, so the first convergence check
  # never depends on an arbitrary initial ELBO value.
  ELBO_old=-np.inf
  nit=0 # number of iterations
  while True:
    phi=update_phi(data,phi,m,s2)
    m,s2=update_mean_and_variance(data,phi,m,s2,sigma)
    # compute_ELBO is called with four arguments, so the prior scale it uses
    # is whatever it resolves on its own.
    ELBO_new=float(compute_ELBO(m,s2,phi,data))
    nit+=1
    if not np.isfinite(ELBO_new):
      # A NaN would fail the `improvement > tol` test and look like
      # convergence; stop, but say why.
      warnings.warn("VI stopped at iteration %d: non-finite ELBO (%r)"%(nit,ELBO_new),RuntimeWarning)
      break
    improvement=abs(ELBO_new-ELBO_old)
    ELBO_old=ELBO_new
    if nit>=nMAX or not improvement>tol:
      break
  return np.asarray(m),np.asarray(s2),np.asarray(phi)




In [None]:
# Fit the mixture with at most 100 iterations, then print the estimates next
# to the quantities used to generate the data. Cluster indices in a mixture
# are arbitrary, so row k of the estimates need not match row k of the truth.
m, s2, phi = VI(data=X, K=K, sigma=sigma, nMAX=100)
print(m, mu, sep=' \n ')      # estimated means, then the generating means
print(s2)                     # estimated variances
print(phi, C, sep=' \n ')     # responsibilities, then the one-hot assignments
