# Tutorial on collapsed Gibbs sampling

In [25]:
import numpy as np
import pandas as pd
from IPython.display import display

docs = [
    ['apple', 'ios', 'mac', 'book'],
    ['apple', 'mac', 'book', 'apple', 'store'],
    ['mac', 'book', 'ios', 'store'],
    ['banana', 'mango', 'fruit'],
    ['apple', 'fruit'],
    ['orange', 'strawberry', 'banana'],
    ['orange', 'mango', 'banana'],
    ['fruit', 'apple', 'mac', 'ios']
]
K = 2
words = list(set([x for y in docs for x in y]))
alpha = np.array([1]*K)
beta = np.array([1]*len(words))

## Init counters of assignments
- $n_{d,k}$ number of words of the document $d$ assigned to $k$
- $n_{k,w}$ number of times (instances of) word $w$ is assigned to $k$
- $n_k$ number of word instances assignements to $k$
- $z$ array of assignments to topics for each of the words instances

In [26]:
N = sum([len(x) for x in docs])
z = np.zeros(N)
ndk = np.zeros((len(docs), K))
nkw = np.zeros((K, len(words)))
nk = np.zeros(K)

### Randomly initialize

In [27]:
instance = 0
occurrences = []
for i, doc in enumerate(docs):
    for w in doc:
        occurrences.append(w)
        t = np.random.choice(range(0,K))
        z[instance] = t
        nk[t] += 1
        ndk[i,t] += 1
        nkw[t, words.index(w)] += 1
        instance += 1

In [35]:
display(pd.DataFrame(z, index=occurrences).T)
display(pd.DataFrame(ndk, index=range(len(docs)), columns=range(K)))
display(pd.DataFrame(nkw, index=range(K), columns=words))

Unnamed: 0,apple,ios,mac,book,apple.1,mac.1,book.1,apple.2,store,mac.2,...,orange,strawberry,banana,orange.1,mango,banana.1,fruit,apple.3,mac.3,ios.1
0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0


Unnamed: 0,0,1
0,2.0,2.0
1,1.0,4.0
2,1.0,3.0
3,2.0,1.0
4,1.0,1.0
5,0.0,3.0
6,1.0,2.0
7,1.0,3.0


Unnamed: 0,book,fruit,strawberry,mango,orange,apple,banana,ios,store,mac
0,1.0,3.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0,0.0
1,2.0,0.0,1.0,1.0,2.0,3.0,2.0,2.0,2.0,4.0


## Likelihood

In [36]:
def theta(ndk, alpha, d_i, k_i):
    return (ndk[d_i, k_i] + alpha[k_i]) / (np.sum(ndk[d_i,:]) + alpha[k_i])

def phi(nkw, beta, w_i, k_i):
    return (nkw[k_i, w_i] + beta[w_i]) / (np.sum(nkw[k_i,:] + beta))

In [37]:
print(theta(ndk, alpha, 1, 0))

0.3333333333333333


# Gibbs Sampling

In [38]:
def gibbs(docs, words, topics, ndk, nkw, nk, alpha, beta, iterations=10):
    for it in range(0, iterations):
        w_i = 0
        for doc_i, doc in enumerate(docs):
            for w in doc:
                word = [x for y in docs for x in y][w_i]
                topic = int(topics[w_i])
                p_z = np.zeros(len(alpha))
                for k_i in range(0, len(alpha)):
                    p_z[k_i] = theta(ndk, alpha, doc_i, k_i) * phi(nkw, beta, words.index(word), k_i)
                p_z = p_z / np.sum(p_z)
                # Sample from p_z
                new_topic = np.random.choice(len(p_z), 1, p=p_z)[0]
                # Update
                # Remove current assignment
                ndk[doc_i, topic] -= 1
                nkw[topic, words.index(word)] -= 1
                nk[topic] -= 1
                topics[w_i] = new_topic
                ndk[doc_i, new_topic] += 1
                nkw[new_topic, words.index(word)] += 1
                nk[new_topic] += 1
                w_i += 1


In [39]:
gibbs(docs, words, z, ndk, nkw, nk, alpha, beta, iterations=1000)

In [40]:
display(pd.DataFrame(z, index=occurrences).T)
display(pd.DataFrame(ndk, index=range(len(docs)), columns=range(K)))
display(pd.DataFrame(nkw, index=range(K), columns=words))

Unnamed: 0,apple,ios,mac,book,apple.1,mac.1,book.1,apple.2,store,mac.2,...,orange,strawberry,banana,orange.1,mango,banana.1,fruit,apple.3,mac.3,ios.1
0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


Unnamed: 0,0,1
0,2.0,2.0
1,1.0,4.0
2,0.0,4.0
3,2.0,1.0
4,1.0,1.0
5,1.0,2.0
6,3.0,0.0
7,1.0,3.0


Unnamed: 0,book,fruit,strawberry,mango,orange,apple,banana,ios,store,mac
0,1.0,3.0,0.0,1.0,1.0,2.0,3.0,0.0,0.0,0.0
1,2.0,0.0,1.0,1.0,1.0,3.0,0.0,3.0,2.0,4.0


# Sklearn implementation

In [41]:
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter


M = np.zeros((len(words), len(docs)))
for i, doc in enumerate(docs):
    for k, w in Counter(doc).most_common():
        M[words.index(k), i] = w
LDA = LatentDirichletAllocation(n_components=2, learning_method='batch', max_iter=100).fit(M.T)

In [42]:
A = LDA.components_

In [43]:
A.T

array([[3.49132793, 0.50867207],
       [2.41388648, 1.58611352],
       [0.50388521, 1.49611479],
       [0.50460754, 2.49539246],
       [0.50397213, 2.49602787],
       [5.48684037, 0.51315963],
       [0.5044269 , 3.4955731 ],
       [3.48999856, 0.51000144],
       [2.49166926, 0.50833074],
       [4.49064378, 0.50935622]])

Topic word distribution. components_[i, j] represents word j in topic i.

In [44]:
display(pd.DataFrame(A, index=range(K), columns=words))

Unnamed: 0,book,fruit,strawberry,mango,orange,apple,banana,ios,store,mac
0,3.491328,2.413886,0.503885,0.504608,0.503972,5.48684,0.504427,3.489999,2.491669,4.490644
1,0.508672,1.586114,1.496115,2.495392,2.496028,0.51316,3.495573,0.510001,0.508331,0.509356


In [45]:
Terms = np.zeros((len(words), K))
Docs = np.zeros((len(docs), K))

for w_i, w in enumerate(words):
    Terms[w_i] = A.T[w_i] / np.sum(A.T[w_i])

for d_i, d in enumerate(docs):
    dM = np.zeros((len(d), K))
    for j, w in enumerate(d):
        dM[j] = Terms[words.index(w)]
    Docs[d_i] = np.sum(dM, axis=0) / np.array([len(d)]*K)

In [46]:
display(pd.DataFrame(Terms, index=words, columns=range(0,K)))
display(pd.DataFrame(Docs, index=range(0, len(docs)), columns=range(0,K)))

Unnamed: 0,0,1
book,0.872832,0.127168
fruit,0.603472,0.396528
strawberry,0.251943,0.748057
mango,0.168203,0.831797
orange,0.167991,0.832009
apple,0.914473,0.085527
banana,0.126107,0.873893
ios,0.8725,0.1275
store,0.830556,0.169444
mac,0.898129,0.101871


Unnamed: 0,0,1
0,0.889483,0.110517
1,0.886093,0.113907
2,0.868504,0.131496
3,0.29926,0.70074
4,0.758973,0.241027
5,0.182013,0.817987
6,0.1541,0.8459
7,0.822143,0.177857


In [47]:
sigma_w, sigma_d = 4, 3
for t_k in range(0, K):
    print('topic', t_k)
    top_w = sorted([(i, wk) for i, wk in enumerate(Terms.T[t_k])], key=lambda y: -y[1])[:sigma_w]
    top_d = sorted([(i, dk) for i, dk in enumerate(Docs.T[t_k])], key=lambda y: -y[1])[:sigma_d]
    print([words[x[0]] for x in top_w])
    print([docs[x[0]] for x in top_d])
    print("")

topic 0
['apple', 'mac', 'book', 'ios']
[['apple', 'ios', 'mac', 'book'], ['apple', 'mac', 'book', 'apple', 'store'], ['mac', 'book', 'ios', 'store']]

topic 1
['banana', 'orange', 'mango', 'strawberry']
[['orange', 'mango', 'banana'], ['orange', 'strawberry', 'banana'], ['banana', 'mango', 'fruit']]

