In [1]:
import numpy as np
from sklearn import datasets
from sklearn.datasets import make_spd_matrix
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from scipy.stats import multivariate_normal

In [145]:
class GaussianMixture:

    def __init__(self, n_components=1, tol=0.001, init_params='random', max_iter=100):
        assert n_components > 0
        assert tol != 0
        assert init_params in {'random', 'kmeans'}
        assert max_iter > 0
        
        self.n_components = n_components
        self.tol = tol
        self.max_iter = max_iter
        self.init_params = init_params
          
    def __assertion(self, X):
        N, D = X.shape
        assert self.weights.shape == (self.n_components, )
        assert self.means.shape == (self.n_components, D)
        assert self.covs.shape == (self.n_components, D, D)
            
    def __init_params(self, X):
        N, D = X.shape
        
        # Responsibilities are initialized randomly
        if self.init_params == 'random':
            self.means = np.random.choice(X.flatten(), size=(self.n_components, D))
        
        # Responsibilities are initialized using KMeans
        elif self.init_params == 'kmeans':
            kmeans = KMeans(n_clusters=self.n_components, tol=self.tol, max_iter=self.max_iter)
            kmeans.fit(X)
            self.means = kmeans.cluster_centers_
        
        self.weights = np.ones(self.n_components) / self.n_components
        self.covs = np.array([make_spd_matrix(D) for _ in range(self.n_components)])
        self.__assertion(X)
    
    def predict_prob(self, X):
        return np.array([multivariate_normal.pdf(X, self.means[k], self.covs[k], allow_singular=True) * \
                         self.weights[k] for k in range(self.n_components)])
    
    def __expectation(self, X):
        N, D = X.shape
        self.probs = self.predict_prob(X)
        self.probs /= np.sum(self.probs, axis=0)
        assert self.probs.shape == (self.n_components, N)        
        
    def __maximization(self, X):
        N, D = X.shape     
        
        for k in range(self.n_components):
            k_raised_prob = self.probs[k].reshape(N, 1)        
            k_total_prob = np.sum(self.probs[k])
            self.weights[k] = k_total_prob / N
            self.means[k] = np.sum(k_raised_prob * X, axis=0) / k_total_prob           
            diff_k = (X - self.means[k])
            self.covs[k] = np.dot((k_raised_prob * diff_k).T, diff_k) / k_total_prob
        
        self.__assertion(X)
    
    def __em(self, X):
        self.__expectation(X)
        self.__maximization(X)  
    
    def fit(self, X):
        self.__init_params(X)
        for _ in range(self.max_iter):
            self.__em(X)
    
    def predict(self, X):
        return np.argmax(self.predict_prob(X), axis=0).reshape(-1, 1)
         

In [170]:
gmm = GaussianMixture(n_components=2, init_params='kmeans')
X = cardio_data['X']
y = cardio_data['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
gmm.fit(X_train)
y_predict = gmm.predict(X_test)

In [149]:
from sklearn.metrics import classification_report

In [171]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.97      0.85      0.90       168
         1.0       0.30      0.69      0.42        16

    accuracy                           0.83       184
   macro avg       0.63      0.77      0.66       184
weighted avg       0.91      0.83      0.86       184



In [116]:
cardio_data = loadmat('cardio.mat')

In [123]:
import pandas as pd
data = pd.DataFrame(cardio_data['X'])

In [130]:
(data < -10 ** 9).any().sum()

0