# Expectation-Maximization Clustering

### Maximization
$$
q_{mk} = \frac{\sum\limits_{n=1}^{N} r_{nk}I(t_m \in d_n)}{\sum\limits_{n=1}^{N}r_{nk}};
\alpha_{k} = \frac{1}{N} \sum\limits_{n=1}^{N} r_{nk}
$$

### Expectation
$$
r_{nk} = \frac{\alpha_k\left(\prod_{t_m \in d_n}q_{mk}\right)
\left(\prod_{t_m \not\in d_n}(1-q_{mk})\right)}
{\sum\limits_{k=1}^{K}\alpha_k\left(\prod_{t_m \in d_n}q_{mk}\right)
\left(\prod_{t_m \not\in d_n}(1-q_{mk})\right)}
$$

### Example
<table>
    <tr>
        <th>DocID</th>
        <th>Tokens</th>
        <th>Class</th>
    </tr>
    <tr>
        <td>0</td>
        <td>apple ios mac book fruit</td>
        <td>A</td>
    </tr>
    <tr>
        <td>1</td>
        <td>apple mac book apple store fruit</td>
        <td>A</td>
    </tr>
    <tr>
        <td>2</td>
        <td>microsoft ibm apple oracle</td>
        <td>A</td>
    </tr>
    <tr>
        <td>3</td>
        <td>apple banana mango fruit</td>
        <td>B</td>
    </tr>
    <tr>
        <td>4</td>
        <td>apple fruit</td>
        <td>B</td>
    </tr>
</table>

In [8]:
import numpy as np
from IPython.core.display import display, HTML, Image

docs = [
    ['apple', 'ios', 'mac', 'book', 'fruit'],
    ['apple', 'mac', 'book', 'apple', 'store'],
    ['microsoft', 'ibm', 'apple', 'oracle'],
    ['apple', 'banana', 'mango', 'fruit'],
    ['apple', 'fruit', 'mango']
]
terms = list(set([x for y in docs for x in y]))
M = np.array([[1 if x in y else 0 for x in terms] for y in docs])
N, K, m = len(docs), 2, len(terms)
A = np.zeros(K)
Q = np.zeros((m, K))

## Random init $r_{nk}$

In [9]:
R = np.zeros((N, K))
for doc in range(N):
    a = np.random.uniform()
    R[doc] = [a, 1-a]

## Functions

In [10]:
def maximization(K, R):
    for k in range(K):
        A[k] = R[:,k].sum() / N
        for word in range(m):
            sigma_doc = 0.0
            for doc in range(N):
                sigma_doc += R[doc,k] * M[doc,word]
            Q[word][k] = sigma_doc / R[:,k].sum()

In [11]:
def estimate(doc, k, Q, A):
    q_doc = np.zeros(m)
    for word in range(m):
        if M[doc,word] > 0:
            q_doc[word] = Q[word,k]
        else:
            q_doc[word] = 1 - Q[word,k]
    return A[k] * q_doc.prod()
    
def expectation(K, Q, A):
    for doc in range(N):
        k_estimation = np.array([estimate(doc, k, Q, A) for k in range(K)])
        for k in range(K):
            R[doc][k] = k_estimation[k] / k_estimation.sum()

In [12]:
def to_table(title, data, cols, rows):
    header = "<tr>" + "".join(["<th>{}</th>".format(x) for x in [''] + cols]) + "</tr>"
    trs = []
    for i, c in enumerate(rows):
        tr = "<tr>" + "<td>{}</td>".format(c)
        tr += "".join(["<td>{}</td>".format(round(x, 3)) for x in data[i]])
        tr += "</tr>"
        trs.append(tr)
    table = "<h3>{}</h3><table>{}{}</table>".format(
        title,
        header,
        "".join(trs)
    )
    return table

def show(r, q, a):
    table = "<table><tr><td style='vertical-align: top;'>{}</td><td style='vertical-align: top;'>{}</td><td style='vertical-align: top;'>{}</td></tr></table>".format(
        r, q, a
    )
    display(HTML(table))

## Start

In [13]:
TR = to_table('$r_{nk}$', R, range(K), range(N))
TQ = to_table('$q_{mk}$', Q, range(K), terms)
TA = to_table('$a_{k}$', [A], range(K), ['priors'])
show(TR, TQ, TA)

0,1,2
$r_{nk}$0100.4960.50410.930.0720.8490.15130.9760.02440.0550.945,$q_{mk}$01mango0.00.0apple0.00.0ios0.00.0fruit0.00.0banana0.00.0mac0.00.0book0.00.0oracle0.00.0microsoft0.00.0store0.00.0ibm0.00.0,$a_{k}$01priors0.00.0

Unnamed: 0,0,1
0,0.496,0.504
1,0.93,0.07
2,0.849,0.151
3,0.976,0.024
4,0.055,0.945

Unnamed: 0,0,1
mango,0.0,0.0
apple,0.0,0.0
ios,0.0,0.0
fruit,0.0,0.0
banana,0.0,0.0
mac,0.0,0.0
book,0.0,0.0
oracle,0.0,0.0
microsoft,0.0,0.0
store,0.0,0.0

Unnamed: 0,0,1
priors,0.0,0.0


## Iterate

In [14]:
for iteration in range(10):
    maximization(K, R)
    expectation(K, Q, A)
    TR = to_table('$r_{nk}$', R, range(K), range(N))
    TQ = to_table('$q_{mk}$', Q, range(K), terms)
    TA = to_table('$a_{k}$', [A], range(K), ['priors'])
    display(HTML("<h2>ITERATION {}</h2>".format(iteration+1)))
    show(TR, TQ, TA)

0,1,2
$r_{nk}$0100.2850.71510.9850.01520.9930.00730.810.1940.1280.872,$q_{mk}$01mango0.3120.572apple1.01.0ios0.150.297fruit0.4620.87banana0.2950.014mac0.4310.339book0.4310.339oracle0.2570.089microsoft0.2570.089store0.2810.041ibm0.2570.089,$a_{k}$01priors0.6610.339

Unnamed: 0,0,1
0,0.285,0.715
1,0.985,0.015
2,0.993,0.007
3,0.81,0.19
4,0.128,0.872

Unnamed: 0,0,1
mango,0.312,0.572
apple,1.0,1.0
ios,0.15,0.297
fruit,0.462,0.87
banana,0.295,0.014
mac,0.431,0.339
book,0.431,0.339
oracle,0.257,0.089
microsoft,0.257,0.089
store,0.281,0.041

Unnamed: 0,0,1
priors,0.661,0.339


0,1,2
$r_{nk}$0100.0470.95311.00.021.00.030.2280.77240.0930.907,$q_{mk}$01mango0.2930.591apple1.01.0ios0.0890.398fruit0.3820.988banana0.2530.106mac0.3970.406book0.3970.406oracle0.310.004microsoft0.310.004store0.3080.008ibm0.310.004,$a_{k}$01priors0.640.36

Unnamed: 0,0,1
0,0.047,0.953
1,1.0,0.0
2,1.0,0.0
3,0.228,0.772
4,0.093,0.907

Unnamed: 0,0,1
mango,0.293,0.591
apple,1.0,1.0
ios,0.089,0.398
fruit,0.382,0.988
banana,0.253,0.106
mac,0.397,0.406
book,0.397,0.406
oracle,0.31,0.004
microsoft,0.31,0.004
store,0.308,0.008

Unnamed: 0,0,1
priors,0.64,0.36


0,1,2
$r_{nk}$0100.0040.99611.00.021.00.030.0010.99940.0050.995,$q_{mk}$01mango0.1360.638apple1.01.0ios0.020.362fruit0.1561.0banana0.0960.293mac0.4420.362book0.4420.362oracle0.4220.0microsoft0.4220.0store0.4220.0ibm0.4220.0,$a_{k}$01priors0.4740.526

Unnamed: 0,0,1
0,0.004,0.996
1,1.0,0.0
2,1.0,0.0
3,0.001,0.999
4,0.005,0.995

Unnamed: 0,0,1
mango,0.136,0.638
apple,1.0,1.0
ios,0.02,0.362
fruit,0.156,1.0
banana,0.096,0.293
mac,0.442,0.362
book,0.442,0.362
oracle,0.422,0.0
microsoft,0.422,0.0
store,0.422,0.0

Unnamed: 0,0,1
priors,0.474,0.526


0,1,2
$r_{nk}$0100.01.011.00.021.00.030.01.040.01.0,$q_{mk}$01mango0.0030.667apple1.01.0ios0.0020.333fruit0.0051.0banana0.0010.334mac0.4990.333book0.4990.333oracle0.4970.0microsoft0.4970.0store0.4970.0ibm0.4970.0,$a_{k}$01priors0.4020.598

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0

Unnamed: 0,0,1
mango,0.003,0.667
apple,1.0,1.0
ios,0.002,0.333
fruit,0.005,1.0
banana,0.001,0.334
mac,0.499,0.333
book,0.499,0.333
oracle,0.497,0.0
microsoft,0.497,0.0
store,0.497,0.0

Unnamed: 0,0,1
priors,0.402,0.598


0,1,2
$r_{nk}$0100.01.011.00.021.00.030.01.040.01.0,$q_{mk}$01mango0.00.667apple1.01.0ios0.00.333fruit0.01.0banana0.00.333mac0.50.333book0.50.333oracle0.50.0microsoft0.50.0store0.50.0ibm0.50.0,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0

Unnamed: 0,0,1
mango,0.0,0.667
apple,1.0,1.0
ios,0.0,0.333
fruit,0.0,1.0
banana,0.0,0.333
mac,0.5,0.333
book,0.5,0.333
oracle,0.5,0.0
microsoft,0.5,0.0
store,0.5,0.0

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.011.00.021.00.030.01.040.01.0,$q_{mk}$01mango0.00.667apple1.01.0ios0.00.333fruit0.01.0banana0.00.333mac0.50.333book0.50.333oracle0.50.0microsoft0.50.0store0.50.0ibm0.50.0,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0

Unnamed: 0,0,1
mango,0.0,0.667
apple,1.0,1.0
ios,0.0,0.333
fruit,0.0,1.0
banana,0.0,0.333
mac,0.5,0.333
book,0.5,0.333
oracle,0.5,0.0
microsoft,0.5,0.0
store,0.5,0.0

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.011.00.021.00.030.01.040.01.0,$q_{mk}$01mango0.00.667apple1.01.0ios0.00.333fruit0.01.0banana0.00.333mac0.50.333book0.50.333oracle0.50.0microsoft0.50.0store0.50.0ibm0.50.0,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0

Unnamed: 0,0,1
mango,0.0,0.667
apple,1.0,1.0
ios,0.0,0.333
fruit,0.0,1.0
banana,0.0,0.333
mac,0.5,0.333
book,0.5,0.333
oracle,0.5,0.0
microsoft,0.5,0.0
store,0.5,0.0

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.011.00.021.00.030.01.040.01.0,$q_{mk}$01mango0.00.667apple1.01.0ios0.00.333fruit0.01.0banana0.00.333mac0.50.333book0.50.333oracle0.50.0microsoft0.50.0store0.50.0ibm0.50.0,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0

Unnamed: 0,0,1
mango,0.0,0.667
apple,1.0,1.0
ios,0.0,0.333
fruit,0.0,1.0
banana,0.0,0.333
mac,0.5,0.333
book,0.5,0.333
oracle,0.5,0.0
microsoft,0.5,0.0
store,0.5,0.0

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.011.00.021.00.030.01.040.01.0,$q_{mk}$01mango0.00.667apple1.01.0ios0.00.333fruit0.01.0banana0.00.333mac0.50.333book0.50.333oracle0.50.0microsoft0.50.0store0.50.0ibm0.50.0,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0

Unnamed: 0,0,1
mango,0.0,0.667
apple,1.0,1.0
ios,0.0,0.333
fruit,0.0,1.0
banana,0.0,0.333
mac,0.5,0.333
book,0.5,0.333
oracle,0.5,0.0
microsoft,0.5,0.0
store,0.5,0.0

Unnamed: 0,0,1
priors,0.4,0.6


0,1,2
$r_{nk}$0100.01.011.00.021.00.030.01.040.01.0,$q_{mk}$01mango0.00.667apple1.01.0ios0.00.333fruit0.01.0banana0.00.333mac0.50.333book0.50.333oracle0.50.0microsoft0.50.0store0.50.0ibm0.50.0,$a_{k}$01priors0.40.6

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0

Unnamed: 0,0,1
mango,0.0,0.667
apple,1.0,1.0
ios,0.0,0.333
fruit,0.0,1.0
banana,0.0,0.333
mac,0.5,0.333
book,0.5,0.333
oracle,0.5,0.0
microsoft,0.5,0.0
store,0.5,0.0

Unnamed: 0,0,1
priors,0.4,0.6
