In [1]:
import numpy as np
from numba import njit, float64, int64, types
from sklearn.mixture import GaussianMixture as GMM
import time

## Naive Python code (Multivariate Normal)

In [2]:
def multi_ll(X, mu, cov):
    n = X.shape[0]
    p = X.shape[1]
    res = np.zeros(n)
    
    for i in range(n):
        exp_inter = np.dot(np.dot((X[i, :] - mu).T, np.linalg.inv(cov)), 
                           (X[i, :] - mu)) / 2.
        res[i] = (2*np.pi)**(-p/2) * np.linalg.det(cov)**(-0.5)*np.exp(-exp_inter)
    
    return res

In [3]:
def GMM_EM_multi(X, mu, cov, max_iter, tau, q, tol = 1e-15):
    n = X.shape[0]
    p = X.shape[1]
    K = mu.shape[0]
    
    for iteration in range(max_iter):
        for k in range(K):
            ll = multi_ll(X, mu[k, :], cov[:, :, k])
            q[:, k] = tau[k] * ll
            
        for i in range(n):
            q[i, :] /= np.sum(q[i, :])
        
        mu_before = mu
        cov_before = cov
        tau_before = tau
        
        for k in range(K):
            q_k = np.sum(q[:, k])
            mu[k, :] = np.sum(q[:, k].reshape(n,1)*X, axis=0) / q_k
            cov[:, :, k] = np.dot((q[:, k].reshape(n,1) * (X - mu[k, :])).T, 
                                 (X - mu[k, :])) / q_k
            tau[k] = q_k / n
        
        mu_diff = np.max(np.abs(mu - mu_before))
        cov_diff = np.max(np.abs(cov - cov_before))
        tau_diff = np.max(np.abs(tau - tau_before))
        diff = np.max(np.array([np.abs(mu_diff), np.abs(cov_diff), np.abs(tau_diff)]))
        
        if ( (iteration > 1) & (diff < tol)):
            break
        
    return mu, cov, tau, iteration

## Numba Python code (Multivariate Normal)

In [4]:
@njit('float64[:](float64[:,:],float64[:],float64[:,:])')
def multi_ll_njit(X, mu, cov):
    n = X.shape[0]
    p = X.shape[1]
    res = np.zeros(n)
    
    for i in range(n):
        exp_inter = np.dot(np.dot((X[i, :] - mu).T, np.linalg.inv(cov)), 
                           (X[i, :] - mu)) / 2.
        res[i] = (2*np.pi)**(-p/2) * np.linalg.det(cov)**(-0.5)*np.exp(-exp_inter)
    
    return res

In [5]:
@njit('float64(float64[:])')
def nb_sum(X):
    res = 0.0
    for i in range(X.shape[0]):
        res += X[i]
    
    return res

In [6]:
r_sig = types.Tuple([float64[:,:],float64[:,:,:],float64[:],int64])
sig = r_sig(float64[:,:],float64[:,:],float64[:,:,:],int64,float64[:],float64[:,:],float64)

In [7]:
@njit(sig)
def GMM_EM_multi_njit(X, mu, cov, max_iter, tau, q, tol = 1e-08):
    n = X.shape[0]
    p = X.shape[1]
    K = mu.shape[0]
    
    for iteration in range(max_iter):
        for k in range(K):
            ll = multi_ll_njit(X, mu[k, :], cov[:, :, k])
            q[:, k] = tau[k] * ll
            
        for i in range(n):
            q[i, :] /=  nb_sum(q[i, :])
        
        mu_before = mu
        cov_before = cov
        tau_before = tau
        
        for k in range(K):
            q_k = nb_sum(q[:, k])
            q = np.ascontiguousarray(q[:, k]).reshape(n,1)
            mu[k, :] = np.sum(q * X, axis = 0) / q_k
            cov[:, :, k] = np.dot((q*(X - mu[k, :])).T, (X - mu[k, :])) / q_k
            tau[k] = q_k / n
        
        mu_diff = np.max(np.abs(mu - mu_before))
        cov_diff = np.max(np.abs(cov - cov_before))
        tau_diff = np.max(np.abs(tau - tau_before))
        
        diff = np.max(np.array([np.abs(mu_diff), np.abs(cov_diff), np.abs(tau_diff)]))
        
        if ( (iteration > 1 ) & (diff < tol)):
            break
        
    return mu, cov, tau, iteration

## Time Check

In [8]:
time_list_naive_multi_large = []
np.random.seed(42)

for i in range(10):
    
    x1 = np.random.multivariate_normal([1.5, 2.5], [[1.2,0.4],[0.4,1.1]],size= 20000)
    x2 = np.random.multivariate_normal([7.3, 10.2], [[1.5,0.5],[0.5,2.1]],size= 20000)
    X_tot = np.vstack((x1,x2))
    
    mu = np.array([[1.,2.],[6.,8.]])
    cov = np.array([[[1.,1.2],[0.,0.2]],[[0.,0.2],[1.,1.5]]])
    q= np.zeros((len(X_tot),2))
    tau = np.array([1/2,1/2])
    
    t1 = time.time()
    GMM_EM_multi(X_tot, mu, cov, 10000 , tau , q)
    t2 = time.time()
    
    time_list_naive_multi_large.append(t2-t1)

In [9]:
time_list_njit_multi_large = []

np.random.seed(42)

for i in range(10):
    
    x1 = np.random.multivariate_normal([1.5, 2.5], [[1.2,0.4],[0.4,1.1]],size= 20000)
    x2 = np.random.multivariate_normal([7.3, 10.2], [[1.5,0.5],[0.5,2.1]],size= 20000)
    X_tot = np.vstack((x1,x2))
    
    mu = np.array([[1.,2.],[6.,8.]])
    cov = np.array([[[1.,1.2],[0.,0.2]],[[0.,0.2],[1.,1.5]]])
    q= np.zeros((len(X_tot),2))
    tau = np.array([1/2,1/2])
    
    t1 = time.time()
    test=GMM_EM_multi_njit(X_tot, mu, cov, 10000 , tau , q, 1e-08)
    t2 = time.time()
    
    time_list_njit_multi_large.append(t2-t1)

In [10]:
cov = np.array([[[1.,1.2],[0.,0.2]],[[0.,0.2],[1.,1.5]]])
precision = np.array([np.linalg.inv(cov[:,:,0]),np.linalg.inv(cov[:,:,1])])
precision.shape

(2, 2, 2)

In [11]:
time_list_sklearn_multi_large = []
np.random.seed(42)
for i in range(10):
    
    x1 = np.random.multivariate_normal([1.5, 2.5], [[1.2,0.4],[0.4,1.1]],size= 20000)
    x2 = np.random.multivariate_normal([7.3, 10.2], [[1.5,0.5],[0.5,2.1]],size= 20000)
    X_tot = np.vstack((x1,x2))
    
    t1 = time.time()
    gmm = GMM(n_components= 2, random_state= 42 , covariance_type="full" 
              ,means_init=np.array([[1.5,2.5],[7.3,10.2]]), precisions_init= precision
              ,max_iter=10000)
    gmm.fit(X_tot)
    t2 = time.time()
    
    time_list_sklearn_multi_large.append(t2-t1)

In [12]:
print(np.mean(time_list_naive_multi_large))
print(time_list_njit_multi_large[0])
print(np.mean(time_list_njit_multi_large[1:]))
print(np.mean(time_list_sklearn_multi_large))

22.590715217590333
0.6996550559997559
0.6929588847690158
0.11344707012176514


In [13]:
print(np.std(time_list_naive_multi_large)/np.sqrt(10))
print(np.std(time_list_njit_multi_large[1:])/np.sqrt(9))
print(np.std(time_list_sklearn_multi_large)/np.sqrt(10))

0.15637913983578974
0.022916451976872265
0.0011782519058522488
