# EM-Clustering

In [2]:
import sys
import numpy as np
import math
import matplotlib.pyplot as plt
import random

## Gaussian Density Function $f$

$$ f(x_j | \mu_i, \Sigma_i) = \frac{1}{\sqrt{(2\pi)^k \text{det}(\Sigma)}} \text{exp}[-\frac{1}{2}(x_j - \mu_i)^T \Sigma^{-1}(x_j-\mu_i)]$$

In [3]:
def gaussian(x, mean, covar):
    det_covar = np.linalg.det(covar)
    inv_covar = np.linalg.inv(covar + np.eye(len(covar)) * .1)
    term1_den = ( ((2*3.14)**k ) * det_covar) **.5 
    term1 = 1/term1_den
    term2_in = -.5 * np.matmul( np.matmul((x - mean).T,  inv_covar), (x - mean) )
    term2_in = term2_in[0][0]

    return term1 * np.exp(term2_in)

## EM-Clustering Algorithm

![](EMalgo.png)


In [4]:
def EM(D, Y, k, eps):
    d = len(D[0])
    n = len(D)
    t = 0
    
    #randomized initialized mean vector with random values from dataset
    mean_vec_0 = np.zeros((k, d))
    for i in range (k):
        for j in range (d):
            rand = random.randint(0,n-1)
            mean_vec_0[i][j] = D[rand][j]
    
    #covariance matrix initialized to dxd identity matrix
    covar = []
    for i in range (k):
        covar.append(np.identity(d))
    
    #initialized probability of each class
    P = np.zeros(k)
    P.fill(1/k)
    
    
    mean_vec_curr = mean_vec_0
    mean_vec_prev = mean_vec_0
    
    
    while True:
        
        mean_vec_prev = mean_vec_curr
        t = t + 1
        w = np.zeros((k, n))
        
        #Expectation Step
        for i in range (k):
            for j in range (n):
                x_j = np.array([D[j,:]]).T
                mean_i = np.array( [ mean_vec_curr[i,:] ] ).T
                w[i][j] = gaussian(x_j, mean_i, covar[i]) * P[i]
                summation = 0
                for a in range (k):
                    mean_a = np.array( [ mean_vec_curr[a,:] ] ).T
                    summation = summation + gaussian(x_j, mean_a, covar[a]) * P[a]
                    
                w[i][j] = w[i][j] / summation
                
        #Maximization Step
        for i in range(k):
            
            #mean_vector update
            num_sum = 0
            den_sum = 0
            for j in range (n):                
                x_j = D[j,:]
                num_sum = num_sum + w[i][j] * x_j
                den_sum = den_sum + w[i][j]
                
            mean_i = num_sum / den_sum
            for x in range(len(mean_i)):
                mean_vec_curr[i][x] = mean_i[x]
            
            #covar update
            num_sum = 0
            den_sum = 0
            for j in range (n):
                x_j = np.array([D[j,:]]).T
                mean_i = np.array([mean_vec_curr[i,:]]).T
                num_sum = num_sum + w[i][j] * np.matmul((x_j - mean_i), (x_j - mean_i).T)
                den_sum = den_sum + w[i][j]

            covar[i] = num_sum / den_sum
            
            P[i] = den_sum / n
        
        
        if(np.allclose(mean_vec_curr, mean_vec_prev, rtol = 0.0, atol = eps) == True):
            return t, covar, mean_vec_curr
    
        


### Parse Categorical Data

In [5]:
def parse(y):
    n = len(y)
    for i in range (n):
        
        if(y[i] == "Iris-setosa"):
            y[i] = 0
        elif(y[i] == "Iris-versicolor"):
            y[i] = 1
        else:
            y[i] = 2
            
    return y

### Training Sets

In [6]:
train1 = np.genfromtxt("dancing_truth.txt", delimiter=',')
d = len(train1[0])
train1x = train1[:,np.arange(d-1)]
train1y = np.array([train1[:,d-1]]).T

train2 = np.genfromtxt("1R2RC_truth.txt", delimiter=',')
d = len(train2[0])
train2x = train2[:,np.arange(d-1)]
train2y = np.array([train2[:,d-1]]).T

train3x = np.loadtxt('iris.txt', delimiter=',', usecols=[0,1,2,3])
train3y = np.loadtxt('iris.txt', delimiter=',', usecols=[4],dtype=np.str)
train3y = np.array([parse(train3y)]).T

In [7]:
k = 13
eps = .001

In [8]:
t, covar, mean = EM(train1x, train1y, k, eps)

In [9]:
print(t)

1


In [27]:
print(t)
for i in range(k):
    print("Covariance of Cluster ", i+1)
    print(covar[i])
    print("Mean of Cluster", i+1 )
    print(mean[i,:])

Covariance of Cluster  0
[[ 9.32482507e-02  1.70743586e-02  9.34201116e-02 ...  2.41501955e-03
   8.98263292e-02  2.23262059e-04]
 [ 1.70743586e-02  5.97557318e-02  1.68167968e-02 ...  5.60534008e-02
   2.00122824e-02  5.52690944e-02]
 [ 9.34201116e-02  1.68167968e-02  9.36104629e-02 ...  2.16367994e-03
   8.99141784e-02 -1.87060072e-05]
 ...
 [ 2.41501955e-03  5.60534008e-02  2.16367994e-03 ...  5.58413169e-02
   5.79170596e-03  5.56054930e-02]
 [ 8.98263292e-02  2.00122824e-02  8.99141784e-02 ...  5.79170596e-03
   8.73362951e-02  3.61243239e-03]
 [ 2.23262059e-04  5.52690944e-02 -1.87060072e-05 ...  5.56054930e-02
   3.61243239e-03  5.54679993e-02]]
Mean of Cluster 0
[-0.03038596 -0.46493117 -0.03048521 -0.46576518 -0.03039981 -0.46676624
 -0.03003133 -0.4674334  -0.02945338 -0.46778983 -0.02900711 -0.46763912
 -0.02876161 -0.46730011 -0.02866057 -0.46686211 -0.02871804 -0.46633594
 -0.0289128  -0.46566731 -0.0288667  -0.46515622 -0.02895094 -0.46472341
 -0.02894977 -0.46481837 -0.0