In [None]:
import numpy as np
import scipy as sp
from scipy.stats import norm


In [51]:
# 1. Generate Synthetic Data
N = 7
K = 3

In [123]:
r = np.zeros((K, N))  # will store the responsibilities

np.random.seed(44)

data = np.array([-3, -2.5, -1, 0, 2, 4, 5])

means = np.array([-4, 0, 8])
covs = np.array([1, 0.2, 3])
weights = np.array([1 / 3, 1 / 3, 1 / 3])

i = 0
j = 0

# for i in range(N):
#     for j in range(K):
#         foo = weights[j] * norm.pdf(loc=means[j], scale=covs[j], x=data[i])
#         bar = 0
#         for k in range(K):
#             bar += weights[k] * norm.pdf(loc=means[k], scale=covs[k], x=data[i])
#         r[i, j] = foo / bar
#         # print r[i, j] with 2 decimal places
#         print("{:.2f}".format(r[i, j]), end=" ")


# foo = weights[0] * norm.pdf(loc=means[0], scale=covs[0], x=data[index])

# sum = 0

for k in range(K):
    r[k] = norm.pdf(loc=means[k], scale=np.sqrt(covs[k]), x=data.T)

r = r / np.sum(r, axis=0)

r.T
# foo / sum


array([[1.      , 0.      , 0.      ],
       [0.999999, 0.000001, 0.      ],
       [0.057069, 0.942926, 0.000004],
       [0.00015 , 0.999844, 0.000006],
       [0.00001 , 0.066237, 0.933753],
       [0.      , 0.      , 1.      ],
       [0.      , 0.      , 1.      ]])

In [121]:
n = 3 - 1
k = 3 - 1
q = weights[k] * norm.pdf(loc=means[k], scale=np.sqrt(covs[k]), x=data[n])
q

np.float64(1.0525740966894856e-07)

In [109]:
q1 = weights[0] * norm.pdf(loc=means[0], scale=covs[0], x=data[n])
q2 = weights[1] * norm.pdf(loc=means[1], scale=covs[1], x=data[n])
q3 = weights[2] * norm.pdf(loc=means[2], scale=covs[2], x=data[n])
qq = q1 + q2 + q3
q1, q2, q3, qq

(np.float64(0.0014772828039793357),
 np.float64(2.4778658578904965e-06),
 np.float64(0.0004924276013264453),
 np.float64(0.0019721882711636715))

In [110]:
q / qq

np.float64(0.24968589892074194)

In [120]:
K = 3  # Number of components
N = 7  # Number of observations

weights = np.array([1 / 3] * K)  # Equal weights for all components
means = np.array([-4, 0, 8])  # Mean vector
scales = np.array([1, 0.2, 3])  # Standard deviations
variances = scales  # Variances

# Observations
observations = np.array([-3, -2.5, -1, 0, 2, 4, 5])

# Initialize the responsibility matrix (N x K)
responsibilities = np.zeros((N, K))

# Precompute constants
sqrt_2pi = np.sqrt(2 * np.pi)

# Compute the responsibilities
for n in range(N):
    y_n = observations[n]
    # Compute the Gaussian densities for all components
    p_nk = np.zeros(K)
    for k in range(K):
        mu_k = means[k]
        sigma_k2 = variances[k]
        coeff = 1 / (sqrt_2pi * np.sqrt(sigma_k2))
        exponent = -((y_n - mu_k) ** 2) / (2 * sigma_k2)
        p_nk[k] = norm.pdf(y_n, loc=mu_k, scale=np.sqrt(sigma_k2))
        print(p_nk[k])

    # Multiply by the weights
    weighted_p_nk = weights * p_nk

    # Compute the responsibilities by normalizing
    total = np.sum(weighted_p_nk)
    responsibilities[n, :] = weighted_p_nk / total

# Print the responsibility matrix
np.set_printoptions(precision=6, suppress=True)
print("Responsibility Matrix (N x K):")
print(responsibilities)

0.24197072451914337
1.5092779429032093e-10
4.01862412158526e-10
0.12951759566589174
1.4606420129620445e-07
2.4109507201666824e-09
0.0044318484119380075
0.07322491280963242
3.157722290068457e-07
0.00013383022576488537
0.8920620580763856
5.3687720495468876e-06
6.075882849823286e-09
4.049955478044552e-05
0.0005709295833517096
5.052271083536893e-15
3.789795640412959e-18
0.01600408392170322
1.0279773571668917e-18
6.411947371150563e-28
0.05139344326792309
Responsibility Matrix (N x K):
[[1.       0.       0.      ]
 [0.999999 0.000001 0.      ]
 [0.057069 0.942926 0.000004]
 [0.00015  0.999844 0.000006]
 [0.00001  0.066237 0.933753]
 [0.       0.       1.      ]
 [0.       0.       1.      ]]


In [128]:
class GMM(object):
    def __init__(self, X, k=2):
        # dimension
        X = np.asarray(X)
        self.m, self.n = X.shape
        self.data = X.copy()
        # number of mixtures
        self.k = k

    def _init(self):
        # init mixture means/sigmas
        self.mean_arr = np.asmatrix(np.random.random((self.k, self.n)))
        self.sigma_arr = np.array(
            [np.asmatrix(np.identity(self.n)) for i in range(self.k)]
        )
        self.phi = np.ones(self.k) / self.k
        self.w = np.asmatrix(np.empty((self.m, self.k), dtype=float))
        # print(self.mean_arr)
        # print(self.sigma_arr)

    def fit(self, tol=1e-4):
        self._init()
        num_iters = 0
        ll = 1
        previous_ll = 0
        while ll - previous_ll > tol:
            previous_ll = self.loglikelihood()
            self._fit()
            num_iters += 1
            ll = self.loglikelihood()
            print("Iteration %d: log-likelihood is %.6f" % (num_iters, ll))
        print("Terminate at %d-th iteration:log-likelihood is %.6f" % (num_iters, ll))

    def loglikelihood(self):
        ll = 0
        for i in range(self.m):
            tmp = 0
            for j in range(self.k):
                # print(self.sigma_arr[j])
                tmp += (
                    sp.stats.multivariate_normal.pdf(
                        self.data[i, :], self.mean_arr[j, :].A1, self.sigma_arr[j, :]
                    )
                    * self.phi[j]
                )
            ll += np.log(tmp)
        return ll

    def _fit(self):
        self.e_step()
        self.m_step()

    def e_step(self):
        # calculate w_j^{(i)}
        for i in range(self.m):
            den = 0
            for j in range(self.k):
                num = (
                    sp.stats.multivariate_normal.pdf(
                        self.data[i, :], self.mean_arr[j].A1, self.sigma_arr[j]
                    )
                    * self.phi[j]
                )
                den += num
                self.w[i, j] = num
            self.w[i, :] /= den
            assert self.w[i, :].sum() - 1 < 1e-4

    def m_step(self):
        for j in range(self.k):
            const = self.w[:, j].sum()
            self.phi[j] = 1 / self.m * const
            _mu_j = np.zeros(self.n)
            _sigma_j = np.zeros((self.n, self.n))
            for i in range(self.m):
                _mu_j += self.data[i, :] * self.w[i, j]
                _sigma_j += self.w[i, j] * (
                    (self.data[i, :] - self.mean_arr[j, :]).T
                    * (self.data[i, :] - self.mean_arr[j, :])
                )
                # print((self.data[i, :] - self.mean_arr[j, :]).T * (self.data[i, :] - self.mean_arr[j, :]))
            self.mean_arr[j] = _mu_j / const
            self.sigma_arr[j] = _sigma_j / const
        # print(self.sigma_arr)

In [156]:
X = sp.stats.norm.rvs(loc=means[0], scale=np.sqrt(covs[0]), size=20)
X = np.hstack((X, sp.stats.norm.rvs(loc=means[1], scale=np.sqrt(covs[1]), size=20)))
X = np.hstack((X, sp.stats.norm.rvs(loc=means[2], scale=np.sqrt(covs[2]), size=20)))
X = X.reshape(-1, 1)
X.shape

(60, 1)

In [190]:
data2 = data.reshape(-1, 1)

In [191]:
gmm = GMM(data2, 3)
gmm.fit()

Iteration 1: log-likelihood is -17.322663
Iteration 2: log-likelihood is -17.201741
Iteration 3: log-likelihood is -17.070497
Iteration 4: log-likelihood is -16.856126
Iteration 5: log-likelihood is -16.612013
Iteration 6: log-likelihood is -16.435516
Iteration 7: log-likelihood is -16.345426
Iteration 8: log-likelihood is -16.299685
Iteration 9: log-likelihood is -16.265928
Iteration 10: log-likelihood is -16.228111
Iteration 11: log-likelihood is -16.179418
Iteration 12: log-likelihood is -16.118677
Iteration 13: log-likelihood is -16.053517
Iteration 14: log-likelihood is -15.999474
Iteration 15: log-likelihood is -15.961880
Iteration 16: log-likelihood is -15.930958
Iteration 17: log-likelihood is -15.894492
Iteration 18: log-likelihood is -15.835504
Iteration 19: log-likelihood is -15.714461
Iteration 20: log-likelihood is -15.466288
Iteration 21: log-likelihood is -15.278695
Iteration 22: log-likelihood is -15.261741
Iteration 23: log-likelihood is -15.253832
Iteration 24: log-li

In [192]:
gmm.mean_arr

matrix([[ 4.505256],
        [ 0.282011],
        [-2.754311]])

In [193]:
gmm.sigma_arr

array([[[0.250013]],

       [[1.86222 ]],

       [[0.062481]]])

In [194]:
gmm.phi

array([0.282046, 0.444496, 0.273457])