# Expectation-Maximization Clustering

### Maximization
$$
q_{mk} = \frac{\sum\limits_{n=1}^{N} r_{nk}I(t_m \in d_n)}{\sum\limits_{n=1}^{N}r_{nk}};
\alpha_{k} = \frac{1}{N} \sum\limits_{n=1}^{N} r_{nk}
$$

### Expectation
$$
r_{nk} = \frac{\alpha_k\left(\prod_{t_m \in d_n}q_{mk}\right)
\left(\prod_{t_m \not\in d_n}(1-q_{mk})\right)}
{\sum\limits_{k=1}^{K}\alpha_k\left(\prod_{t_m \in d_n}q_{mk}\right)
\left(\prod_{t_m \not\in d_n}(1-q_{mk})\right)}
$$

### Example
<table>
    <tr>
        <th>DocID</th>
        <th>Tokens</th>
        <th>Class</th>
    </tr>
    <tr>
        <td>0</td>
        <td>apple ios mac book fruit</td>
        <td>A</td>
    </tr>
    <tr>
        <td>1</td>
        <td>apple mac book apple store fruit</td>
        <td>A</td>
    </tr>
    <tr>
        <td>2</td>
        <td>microsoft ibm apple oracle</td>
        <td>A</td>
    </tr>
    <tr>
        <td>3</td>
        <td>apple banana mango fruit</td>
        <td>B</td>
    </tr>
    <tr>
        <td>4</td>
        <td>apple fruit</td>
        <td>B</td>
    </tr>
</table>

In [8]:
import numpy as np
from IPython.core.display import display, HTML, Image

docs = [
    ['apple', 'ios', 'mac', 'book'],
    ['apple', 'mac', 'book', 'apple', 'store'],
    ['microsoft', 'ibm', 'apple', 'oracle'],
    ['apple', 'banana', 'mango', 'fruit'],
    ['apple', 'fruit', 'mango']
]
terms = list(set([x for y in docs for x in y]))
M = np.array([[1 if x in y else 0 for x in terms] for y in docs])
N, K, m = len(docs), 2, len(terms)
A = np.zeros(K)
Q = np.zeros((m, K))

## Random init $r_{nk}$

In [9]:
R = np.zeros((N, K))
for doc in range(N):
    a = np.random.uniform()
    R[doc] = [a, 1-a]

## Functions

In [10]:
def maximization(K, R):
    for k in range(K):
        A[k] = R[:,k].sum() / N
        for word in range(m):
            sigma_doc = 0.0
            for doc in range(N):
                sigma_doc += R[doc,k] * M[doc,word]
            Q[word][k] = sigma_doc / R[:,k].sum()

In [11]:
def estimate(doc, k, Q, A):
    q_doc = np.zeros(m)
    for word in range(m):
        if M[doc,word] > 0:
            q_doc[word] = Q[word,k]
        else:
            q_doc[word] = 1 - Q[word,k]
    return A[k] * q_doc.prod()
    
def expectation(K, Q, A):
    for doc in range(N):
        k_estimation = np.array([estimate(doc, k, Q, A) for k in range(K)])
        for k in range(K):
            R[doc][k] = k_estimation[k] / k_estimation.sum()

In [12]:
def to_table(title, data, cols, rows):
    header = "<tr>" + "".join(["<th>{}</th>".format(x) for x in [''] + cols]) + "</tr>"
    trs = []
    for i, c in enumerate(rows):
        tr = "<tr>" + "<td>{}</td>".format(c)
        tr += "".join(["<td>{}</td>".format(round(x, 3)) for x in data[i]])
        tr += "</tr>"
        trs.append(tr)
    table = "<h3>{}</h3><table>{}{}</table>".format(
        title,
        header,
        "".join(trs)
    )
    return table

def show(r, q, a):
    table = "<table><tr><td style='vertical-align: top;'>{}</td><td style='vertical-align: top;'>{}</td><td style='vertical-align: top;'>{}</td></tr></table>".format(
        r, q, a
    )
    display(HTML(table))

## Start

In [13]:
TR = to_table('$r_{nk}$', R, range(K), range(N))
TQ = to_table('$q_{mk}$', Q, range(K), terms)
TA = to_table('$a_{k}$', [A], range(K), ['priors'])
show(TR, TQ, TA)

0,1,2
$r_{nk}$0100.3120.68810.3160.68420.2810.71930.4880.51240.3050.695,$q_{mk}$01apple0.00.0fruit0.00.0ios0.00.0mango0.00.0banana0.00.0mac0.00.0book0.00.0oracle0.00.0microsoft0.00.0store0.00.0ibm0.00.0,$a_{k}$01priors0.00.0

Unnamed: 0,0,1
0,0.312,0.688
1,0.316,0.684
2,0.281,0.719
3,0.488,0.512
4,0.305,0.695

Unnamed: 0,0,1
apple,0.0,0.0
fruit,0.0,0.0
ios,0.0,0.0
mango,0.0,0.0
banana,0.0,0.0
mac,0.0,0.0
book,0.0,0.0
oracle,0.0,0.0
microsoft,0.0,0.0
store,0.0,0.0

Unnamed: 0,0,1
priors,0.0,0.0


## Iterate

In [14]:
for iteration in range(10):
    maximization(K, R)
    expectation(K, Q, A)
    TR = to_table('$r_{nk}$', R, range(K), range(N))
    TQ = to_table('$q_{mk}$', Q, range(K), terms)
    TA = to_table('$a_{k}$', [A], range(K), ['priors'])
    display(HTML("<h2>ITERATION {}</h2>".format(iteration+1)))
    show(TR, TQ, TA)

0,1,2
$r_{nk}$0100.2110.78910.2150.78520.1420.85830.6990.30140.5150.485,$q_{mk}$01apple1.01.0fruit0.4660.366ios0.1830.209mango0.4660.366banana0.2870.155mac0.3690.416book0.3690.416oracle0.1650.218microsoft0.1650.218store0.1860.207ibm0.1650.218,$a_{k}$01priors0.340.66

Unnamed: 0,0,1
0,0.211,0.789
1,0.215,0.785
2,0.142,0.858
3,0.699,0.301
4,0.515,0.485

Unnamed: 0,0,1
apple,1.0,1.0
fruit,0.466,0.366
ios,0.183,0.209
mango,0.466,0.366
banana,0.287,0.155
mac,0.369,0.416
book,0.369,0.416
oracle,0.165,0.218
microsoft,0.165,0.218
store,0.186,0.207

Unnamed: 0,0,1
priors,0.34,0.66


0,1,2
$r_{nk}$0100.0170.98310.0180.98220.0050.99530.9910.00940.9450.055,$q_{mk}$01apple1.01.0fruit0.6810.245ios0.1180.245mango0.6810.245banana0.3920.094mac0.2390.489book0.2390.489oracle0.080.266microsoft0.080.266store0.1210.244ibm0.080.266,$a_{k}$01priors0.3560.644

Unnamed: 0,0,1
0,0.017,0.983
1,0.018,0.982
2,0.005,0.995
3,0.991,0.009
4,0.945,0.055

Unnamed: 0,0,1
apple,1.0,1.0
fruit,0.681,0.245
ios,0.118,0.245
mango,0.681,0.245
banana,0.392,0.094
mac,0.239,0.489
book,0.239,0.489
oracle,0.08,0.266
microsoft,0.08,0.266
store,0.121,0.244

Unnamed: 0,0,1
priors,0.356,0.644


0,1,2
$r_{nk}$0100.01.010.01.020.01.031.00.041.00.0,$q_{mk}$01apple1.01.0fruit0.980.021ios0.0090.325mango0.980.021banana0.5010.003mac0.0180.65book0.0180.65oracle0.0030.329microsoft0.0030.329store0.0090.325ibm0.0030.329,$a_{k}$01priors0.3950.605

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0

Unnamed: 0,0,1
apple,1.0,1.0
fruit,0.98,0.021
ios,0.009,0.325
mango,0.98,0.021
banana,0.501,0.003
mac,0.018,0.65
book,0.018,0.65
oracle,0.003,0.329
microsoft,0.003,0.329
store,0.009,0.325

Unnamed: 0,0,1
priors,0.395,0.605


0,1,2
$r_{nk}$0100.01.010.01.020.01.031.00.041.00.0,$q_{mk}$01apple1.01.0fruit1.00.0ios0.00.333mango1.00.0banana0.50.0mac0.00.667book0.00.667oracle0.00.333microsoft0.00.333store0.00.333ibm0.00.333,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0

Unnamed: 0,0,1
apple,1.0,1.0
fruit,1.0,0.0
ios,0.0,0.333
mango,1.0,0.0
banana,0.5,0.0
mac,0.0,0.667
book,0.0,0.667
oracle,0.0,0.333
microsoft,0.0,0.333
store,0.0,0.333

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.010.01.020.01.031.00.041.00.0,$q_{mk}$01apple1.01.0fruit1.00.0ios0.00.333mango1.00.0banana0.50.0mac0.00.667book0.00.667oracle0.00.333microsoft0.00.333store0.00.333ibm0.00.333,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0

Unnamed: 0,0,1
apple,1.0,1.0
fruit,1.0,0.0
ios,0.0,0.333
mango,1.0,0.0
banana,0.5,0.0
mac,0.0,0.667
book,0.0,0.667
oracle,0.0,0.333
microsoft,0.0,0.333
store,0.0,0.333

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.010.01.020.01.031.00.041.00.0,$q_{mk}$01apple1.01.0fruit1.00.0ios0.00.333mango1.00.0banana0.50.0mac0.00.667book0.00.667oracle0.00.333microsoft0.00.333store0.00.333ibm0.00.333,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0

Unnamed: 0,0,1
apple,1.0,1.0
fruit,1.0,0.0
ios,0.0,0.333
mango,1.0,0.0
banana,0.5,0.0
mac,0.0,0.667
book,0.0,0.667
oracle,0.0,0.333
microsoft,0.0,0.333
store,0.0,0.333

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.010.01.020.01.031.00.041.00.0,$q_{mk}$01apple1.01.0fruit1.00.0ios0.00.333mango1.00.0banana0.50.0mac0.00.667book0.00.667oracle0.00.333microsoft0.00.333store0.00.333ibm0.00.333,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0

Unnamed: 0,0,1
apple,1.0,1.0
fruit,1.0,0.0
ios,0.0,0.333
mango,1.0,0.0
banana,0.5,0.0
mac,0.0,0.667
book,0.0,0.667
oracle,0.0,0.333
microsoft,0.0,0.333
store,0.0,0.333

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.010.01.020.01.031.00.041.00.0,$q_{mk}$01apple1.01.0fruit1.00.0ios0.00.333mango1.00.0banana0.50.0mac0.00.667book0.00.667oracle0.00.333microsoft0.00.333store0.00.333ibm0.00.333,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0

Unnamed: 0,0,1
apple,1.0,1.0
fruit,1.0,0.0
ios,0.0,0.333
mango,1.0,0.0
banana,0.5,0.0
mac,0.0,0.667
book,0.0,0.667
oracle,0.0,0.333
microsoft,0.0,0.333
store,0.0,0.333

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.010.01.020.01.031.00.041.00.0,$q_{mk}$01apple1.01.0fruit1.00.0ios0.00.333mango1.00.0banana0.50.0mac0.00.667book0.00.667oracle0.00.333microsoft0.00.333store0.00.333ibm0.00.333,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0

Unnamed: 0,0,1
apple,1.0,1.0
fruit,1.0,0.0
ios,0.0,0.333
mango,1.0,0.0
banana,0.5,0.0
mac,0.0,0.667
book,0.0,0.667
oracle,0.0,0.333
microsoft,0.0,0.333
store,0.0,0.333

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.010.01.020.01.031.00.041.00.0,$q_{mk}$01apple1.01.0fruit1.00.0ios0.00.333mango1.00.0banana0.50.0mac0.00.667book0.00.667oracle0.00.333microsoft0.00.333store0.00.333ibm0.00.333,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0

Unnamed: 0,0,1
apple,1.0,1.0
fruit,1.0,0.0
ios,0.0,0.333
mango,1.0,0.0
banana,0.5,0.0
mac,0.0,0.667
book,0.0,0.667
oracle,0.0,0.333
microsoft,0.0,0.333
store,0.0,0.333

Unnamed: 0,0,1
priors,0.4,0.6
