## Mix the data

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal as mn


# Load the three training sets the same way: first CSV column is the index,
# the remaining values are converted to an array and transposed.
trn5, trn6, trn8 = (
    np.array(pd.read_csv(csv_name, index_col=0)).T
    for csv_name in ('trn5.csv', 'trn6.csv', 'trn8.csv')
)

# Stack the sets on top of each other into one unlabeled data matrix.
mix = np.vstack((trn5, trn6, trn8))

## Initialization
Vi bruger k-means til at finde initial means.

Vi bruger a priori-viden om, at der er 3 forskellige grupperinger.

Vi kunne have indført en tolerance $\varepsilon$ på ændringen i means, så algoritmen stopper, når ændringen mellem to iterationer er mindre end $\varepsilon$.

In [57]:
k = 3
# NOTE(review): `iter` shadows the builtin iter(); the name is kept because
# other cells may read it. It is the maximum number of k-means iterations.
iter = 100

# Pick k *distinct* rows of `mix` as starting means. Sampling with replacement
# could select the same row twice, which leaves one cluster empty.
# NOTE(review): no random seed is set, so the starting means differ per run.
idxs = np.random.choice(mix.shape[0], size=k, replace=False)
initials = [mix[idx] for idx in idxs]

means = initials

for _ in range(iter):
    # Euclidean distance from every mean (rows) to every point (columns);
    # the norm over axis 1 works for any number of features.
    distances = np.array([np.linalg.norm(mix - mean, axis=1) for mean in means])

    # Classify each point by its nearest mean.
    labels = np.argmin(distances, axis=0)
    collections = {f"c{j+1}": list(mix[labels == j]) for j in range(k)}

    # Recompute the means; a cluster that received no points keeps its
    # previous mean instead of turning into NaN.
    new_means = []
    for j in range(k):
        points = np.array(collections[f"c{j+1}"])
        new_means.append(np.mean(points, axis=0) if len(points) else means[j])

    # Once the means stop changing, every further iteration would reproduce
    # the same assignment, so stopping here gives the same final result.
    converged = all(np.array_equal(old, new) for old, new in zip(means, new_means))
    means = new_means
    if converged:
        break

def plot():
    """Scatter-plot every k-means cluster together with the cluster means.

    Reads the notebook-level ``collections`` (cluster label -> list of points
    assigned to that cluster) and ``means``. Only the first two coordinates
    of each point are drawn. Clusters without points are skipped.
    """
    plt.title("Classification based on euclidian distance to means")
    for key, members in collections.items():
        vals = np.array(members)
        if vals.size == 0:
            # An empty cluster has no columns to index; nothing to draw.
            continue
        plt.scatter(vals[:, 0], vals[:, 1], label=key)

    # Use a separate local name: assigning to `means` inside the function
    # would make it a local variable, and reading it on the right-hand side
    # would then raise UnboundLocalError.
    centers = np.array(means)
    plt.scatter(centers[:, 0], centers[:, 1], label="means")
    plt.legend()
    plt.show()

In [71]:
# Derive the starting covariances and priors from the k-means clusters.
total_points = len(mix)
variances, priors = [], []
for members in collections.values():
    cluster = np.array(members)

    # np.cov treats rows as variables, hence the transpose of the
    # (points x features) cluster array.
    variances.append(np.cov(cluster.T))

    # Prior of a cluster = share of all points that were assigned to it.
    priors.append(len(cluster) / total_points)

In [93]:
def E_step(data, means, variances, prios, k=3):
    """Compute the responsibilities of each mixture component for each point.

    For every point the weighted density ``pdf(point | mean, cov) * prior`` is
    evaluated per component and the row is normalised by its sum.

    Parameters
    ----------
    data : sequence of points, one point per row.
    means, variances, prios : per-component mean, covariance and prior
        weight; iterated together, so the shortest one decides how many
        components are used.
    k : number of columns in the result (default 3). Columns beyond the
        number of supplied components stay zero.

    Returns
    -------
    numpy.ndarray of shape ``(len(data), k)``. A row whose weighted densities
    are all zero yields NaN after normalisation.
    """
    data = np.asarray(data)
    h = np.zeros((len(data), k))
    if len(data) == 0:
        # Nothing to evaluate; return the empty (0, k) array.
        return h

    # Use the priors passed in as `prios`; the previous body read a global
    # named `priors` and silently ignored this argument.
    for j, (mean, cov, prior) in enumerate(zip(means, variances, prios)):
        # One pdf call per component evaluates all points at once.
        h[:, j] = mn.pdf(data, mean=mean, cov=cov) * prior

    # Normalise each row so the responsibilities of a point sum to one.
    h /= h.sum(axis=1, keepdims=True)
    return h


# Run the E-step once on the full data set, using the means from k-means and
# the covariances/priors computed from the k-means clusters.
test = E_step(mix, means, variances, priors)

In [None]:
def M_step(data, means, h):
    """Placeholder for the EM maximisation step; not implemented yet.

    Currently does nothing and returns None.

    TODO: presumably this should re-estimate the mixture parameters from
    `data` and the responsibilities `h` returned by E_step — confirm the
    intended return value before implementing.
    """
    pass