In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns 

In [2]:
dataset_large = pd.read_csv("GMM_250_samples.csv", header=None).values
dataset_small = pd.read_csv("GMM_25_samples.csv", header=None).values

In [3]:
def parameter_initialization(data, K=3):
    np.random.shuffle(data)
    init_means = data[:K]
    init_cov = np.ones(K) * 0.1
    init_mixing_coeffs = np.ones(K) / K
    return init_means, init_cov, init_mixing_coeffs

In [4]:
def membership_calculation(data, means, covariances, mixing_coeffs):
    component_number, sample_number = len(means), len(data)

    member_weights = np.zeros((sample_number, component_number))

    for i in range(sample_number):
        for j in range(component_number):
            likelihood_component = mixing_coeffs[j] * \
                np.exp(-0.5 * ((data[i] - means[j]) ** 2) / covariances[j])
            member_weight = likelihood_component / np.sum(
                mixing_coeffs * np.exp(-0.5 * ((data[i] - means) ** 2) / covariances)
            )
            member_weights[i][j] = member_weight

    return member_weights

In [5]:
def parameter_estimation(data, member_weights):
    sample_count = len(data)
    means = np.sum(member_weights * data.reshape(sample_count, 1), axis=0) / np.sum(member_weights, axis=0)
    covariances = np.sum(member_weights * ((data.reshape(sample_count, 1) - means) ** 2), axis=0) / np.sum(member_weights, axis=0)
    mixing_coeffs = np.mean(member_weights, axis=0)
    return means, covariances, mixing_coeffs

In [6]:
def gmm_algorithm(data, K=3):
    starting_means, starting_covariances, starting_mixing_coeffs = parameter_initialization(data, K)
    member_weights = membership_calculation(data, starting_means, starting_covariances, starting_mixing_coeffs)
    estimated_params = parameter_estimation(data, member_weights)
    return estimated_params

In [7]:
gmm_algorithm(dataset_large, K=3)

(array([1.91830198, 3.95597775, 2.9419255 ]),
 array([0.13543884, 0.04563449, 0.06890781]),
 array([0.11870496, 0.06944617, 0.1451822 ]))

In [8]:
gmm_algorithm(dataset_small, K=3)

(array([2.32064701, 1.90205746, 3.11835658]),
 array([0.28631186, 0.07373786, 0.51488645]),
 array([0.06993242, 0.03569702, 0.22770389]))