# Exercise 12.9

In [7]:
import numpy as np
import pandas as pd
from numpy import linalg as la
from scipy.stats import multivariate_normal as mn
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [70]:
def EM_GMM(zs, K, w0s=None, mu0s =None, Sigma0s = None, tol=1e-20, maxiter=200):
    
    # if the parameters aren't initialized, intialize them
    if w0s is None:
        w0s = np.random.uniform(0, 1, size = K)
        w0s /= la.norm(w0s, ord=1)
    if mu0s is None:
        indices = np.arange(0, zs.shape[0])
        choice = np.random.choice(indices, size=K, replace=False)
        mu0s = zs[choice]
    if Sigma0s is None:
        Sigma0s = np.array([np.eye(zs[0].size) for _ in range(K)])
        
    # helper function for computing q_i^t(k)
    def q(m, h):
        num = np.exp(-np.inner(zs[m] - mu0s[h], la.inv(Sigma0s[h])@(zs[m] - mu0s[h]))/2)*w0s[h]\
                /np.sqrt(la.det(Sigma0s[h]))
        
        denom = np.sum([np.exp(-np.inner((zs[m] - mu0s[hp]), la.inv(Sigma0s[hp])@(zs[m]-mu0s[hp]))/2)*w0s[hp]
                        /np.sqrt(la.det(Sigma0s[hp]))
                        for hp in range(K)])
        
        return num / denom

    # now we update
    n = zs.shape[0]
    for j in range(maxiter):
        # initalize updates
        w1s = np.empty_like(w0s)
        mu1s = np.empty_like(mu0s)
        Sigma1s = np.empty_like(Sigma0s)

        # populate the updates
        for k in range(K):
            #get current Q vector
            curr_Q = np.array([q(i, k) for i in range(n)])
            # vupdate weight
            w_update = np.sum(curr_Q) / n
            w1s[k] = w_update
            # update mu
            mu_update = np.sum(curr_Q.reshape(-1, 1) * zs, axis= 0) / (n*w_update)
            mu1s[k] = mu_update
            # update Sigma
            Sigma_update = curr_Q * (zs - mu_update).T @ (zs - mu_update) / (n*w_update)
            Sigma1s[k] = Sigma_update
            
        # check convergence
        if la.norm(w0s - w1s, ord=1) < tol:
            
            return w1s, mu1s, Sigma1s, True, j+1

        # if not converged, assign updates
        w0s = w1s.copy()
        mu0s = mu1s.copy()
        Sigma0s = Sigma1s.copy()
        
    return w0s, mu0s, Sigma0s, False, j+1

## Exercise 12.10 

### part 1

In [52]:
def problem12_10_1():
    
    iris = datasets.load_iris()
    
    X = iris.data
    y = iris.target
    
    weights, mus, Sigmas, converged, iterations = EM_GMM(X, 3)
    
    
    classification = np.empty(X.shape[0])
    for i, x in enumerate(X):
        curr_class = np.array([mn.pdf(x, mean=mus[k], cov=Sigmas[k]) for k in range(3)])
        classification[i] = np.argmax(curr_class)
        
    print('EM for GMM accuracy: ', np.sum(classification == y) / y.size)
        
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
    KM = KMeans(n_clusters=3)
    KM.fit(X_train)
    prediction = KM.predict(X_test)
    print('KMeans Algo:', np.sum(prediction == y_test)/y_test.size)
    return

problem12_10_1()

EM for GMM accuracy:  0.9666666666666667
KMeans Algo: 0.2222222222222222


### part 2

We will use the titanic dataset file used in the KMeans lab. 

In [83]:
def problem12_10_2():
    
    X = pd.read_csv('train.csv')
    #drop na
    X.dropna(inplace=True)
    #get label
    y = X['Survived']
    #drop columns that aren't filled or will affect the classifier
    X = X.drop(['Cabin', 'Name', 'PassengerId', 'Survived', 'Ticket'], axis=1)
    #one-hot-encode
    X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)
    
     
    weights, mus, Sigmas, converged, iterations = EM_GMM(X.values, 2)
    
    
    
    classification = np.empty(X.values.shape[0])
    for i, x in enumerate(X.values):
        curr_class = np.array([mn.pdf(x, mean=mus[k], cov=Sigmas[k]) for k in range(2)])
        classification[i] = np.argmax(curr_class)
        
    print('EM for GMM accuracy: ', np.sum(classification == y.values) / y.values.size)
        
        
    KM = KMeans(n_clusters=2)
    KM.fit(X)
    prediction = KM.predict(X)
    print('KMeans Algo:', np.sum(prediction == y)/y.size)
    return


problem12_10_2()

EM for GMM accuracy:  0.4426229508196721
KMeans Algo: 0.37158469945355194


Because we are using the titanic dataset the only clustering that really makese sense is 2. (whethere they survived or not). As we can see, neither the KMeans algorithm provided by sklearn or the EM for GMM algorithm I designed are terribly inaccurate. Although, mine is certainly more accurate than than KMeans. This is most likely because the assumptions made in the KMeans implementation are not valid, i.e. the $w_k$ are not all equal and the covariance matrices $\Sigma_k$ are not equal and diagonal. The GMM model may not be that accurate because the assumption that there is a Gaussian distribution underlying the data is probably erroneous. 