# Tutorial on collapsed Gibbs sampling

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display

docs = [
    ['apple', 'ios', 'mac', 'book'],
    ['apple', 'mac', 'book', 'apple', 'store'],
    ['mac', 'book', 'ios', 'store'],
    ['banana', 'mango', 'fruit'],
    ['apple', 'fruit'],
    ['orange', 'strawberry', 'banana'],
    ['orange', 'mango', 'banana'],
    ['fruit', 'apple', 'mac', 'ios']
]
K = 2
words = list(set([x for y in docs for x in y]))
alpha = np.array([1]*K)
beta = np.array([1]*len(words))

## Init counters of assignments
- $n_{d,k}$ number of words of the document $d$ assigned to $k$
- $n_{k,w}$ number of times (instances of) word $w$ is assigned to $k$
- $n_k$ number of word instances assignements to $k$
- $z$ array of assignments to topics for each of the words instances

In [2]:
N = sum([len(x) for x in docs])
z = np.zeros(N)
ndk = np.zeros((len(docs), K))
nkw = np.zeros((K, len(words)))
nk = np.zeros(K)

### Randomly initialize

In [3]:
instance = 0
occurrences = []
for i, doc in enumerate(docs):
    for w in doc:
        occurrences.append(w)
        t = np.random.choice(range(0,K))
        z[instance] = t
        nk[t] += 1
        ndk[i,t] += 1
        nkw[t, words.index(w)] += 1
        instance += 1

In [4]:
display(pd.DataFrame(z, index=occurrences).T)
display(pd.DataFrame(ndk, index=range(len(docs)), columns=range(K)).T)
display(pd.DataFrame(nkw, index=range(K), columns=words))

Unnamed: 0,apple,ios,mac,book,apple.1,mac.1,book.1,apple.2,store,mac.2,...,orange,strawberry,banana,orange.1,mango,banana.1,fruit,apple.3,mac.3,ios.1
0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3,4,5,6,7
0,3.0,3.0,2.0,1.0,1.0,1.0,1.0,3.0
1,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0


Unnamed: 0,banana,orange,ios,store,book,mac,apple,strawberry,mango,fruit
0,1.0,0.0,3.0,2.0,1.0,1.0,4.0,1.0,1.0,1.0
1,2.0,2.0,0.0,0.0,2.0,3.0,1.0,0.0,1.0,2.0


## Likelihood

In [5]:
def theta(ndk, alpha, d_i, k_i):
    return (ndk[d_i, k_i] + alpha[k_i]) / (np.sum(ndk[d_i,:]) + alpha[k_i])

def phi(nkw, beta, w_i, k_i):
    return (nkw[k_i, w_i] + beta[w_i]) / (np.sum(nkw[k_i,:] + beta))

In [6]:
print(theta(ndk, alpha, 1, 0))

0.6666666666666666


# Gibbs Sampling

In [7]:
from IPython.display import clear_output

In [8]:
def copy(a):
    if a.ndim == 1:
        new_a = np.zeros(a.shape)
        for i, v in enumerate(a):
            new_a[i] = v
    else:
        new_a = np.zeros(a.shape)
        for i, row in enumerate(a):
            for j, v in enumerate(row):
                new_a[i,j] = v
    return new_a

In [9]:
def gibbs(docs, words, topics, ndk, nkw, nk, alpha, beta, iterations=10):
    history = [(copy(ndk), copy(nkw), copy(topics))]
    for it in range(0, iterations):
        w_i = 0
        for doc_i, doc in enumerate(docs):
            for w in doc:
                word = [x for y in docs for x in y][w_i]
                topic = int(topics[w_i])
                p_z = np.zeros(len(alpha))
                for k_i in range(0, len(alpha)):
                    p_z[k_i] = theta(ndk, alpha, doc_i, k_i) * phi(nkw, beta, words.index(word), k_i)
                p_z = p_z / np.sum(p_z)
                # Sample from p_z
                new_topic = np.random.choice(len(p_z), 1, p=p_z)[0]
                # Update
                # Remove current assignment
                ndk[doc_i, topic] -= 1
                nkw[topic, words.index(word)] -= 1
                nk[topic] -= 1
                topics[w_i] = new_topic
                ndk[doc_i, new_topic] += 1
                nkw[new_topic, words.index(word)] += 1
                nk[new_topic] += 1
                w_i += 1
            history.append((copy(ndk), copy(nkw), copy(topics)))
    return history

In [10]:
history = gibbs(docs, words, z, ndk, nkw, nk, alpha, beta, iterations=1000)

In [11]:
for ndk_h, nkw_h, z_h in history:
    clear_output(wait=True)
    display(pd.DataFrame(z_h, index=occurrences).T)
    display(pd.DataFrame(ndk_h, index=range(len(docs)), columns=range(K)).T)
    display(pd.DataFrame(nkw_h, index=range(K), columns=words))
    command = input()
    if command == 'quit':
        break

Unnamed: 0,apple,ios,mac,book,apple.1,mac.1,book.1,apple.2,store,mac.2,...,orange,strawberry,banana,orange.1,mango,banana.1,fruit,apple.3,mac.3,ios.1
0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,1.0,4.0,1.0,0.0,3.0,3.0,3.0
1,3.0,4.0,0.0,2.0,2.0,0.0,0.0,1.0


Unnamed: 0,banana,orange,ios,store,book,mac,apple,strawberry,mango,fruit
0,2.0,2.0,3.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0
1,1.0,0.0,0.0,1.0,2.0,2.0,4.0,0.0,0.0,2.0


quit


# Sklearn implementation

In [12]:
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter


M = np.zeros((len(words), len(docs)))
for i, doc in enumerate(docs):
    for k, w in Counter(doc).most_common():
        M[words.index(k), i] = w
LDA = LatentDirichletAllocation(n_components=2, learning_method='batch', max_iter=100).fit(M.T)

In [13]:
A = LDA.components_

In [14]:
A.T

array([[3.49557314, 0.50442686],
       [2.4960279 , 0.5039721 ],
       [0.51000163, 3.48999837],
       [0.50833086, 2.49166914],
       [0.50867213, 3.49132787],
       [0.50935644, 4.49064356],
       [0.51316524, 5.48683476],
       [1.49611483, 0.50388517],
       [2.49539249, 0.50460751],
       [1.58618976, 2.41381024]])

Topic word distribution. components_[i, j] represents word j in topic i.

In [15]:
display(pd.DataFrame(A, index=range(K), columns=words))

Unnamed: 0,banana,orange,ios,store,book,mac,apple,strawberry,mango,fruit
0,3.495573,2.496028,0.510002,0.508331,0.508672,0.509356,0.513165,1.496115,2.495392,1.58619
1,0.504427,0.503972,3.489998,2.491669,3.491328,4.490644,5.486835,0.503885,0.504608,2.41381


In [16]:
Terms = np.zeros((len(words), K))
Docs = np.zeros((len(docs), K))

for w_i, w in enumerate(words):
    Terms[w_i] = A.T[w_i] / np.sum(A.T[w_i])

for d_i, d in enumerate(docs):
    dM = np.zeros((len(d), K))
    for j, w in enumerate(d):
        dM[j] = Terms[words.index(w)]
    Docs[d_i] = np.sum(dM, axis=0) / np.array([len(d)]*K)

In [17]:
display(pd.DataFrame(Terms, index=words, columns=range(0,K)))
display(pd.DataFrame(Docs, index=range(0, len(docs)), columns=range(0,K)))

Unnamed: 0,0,1
banana,0.873893,0.126107
orange,0.832009,0.167991
ios,0.1275,0.8725
store,0.169444,0.830556
book,0.127168,0.872832
mac,0.101871,0.898129
apple,0.085528,0.914472
strawberry,0.748057,0.251943
mango,0.831797,0.168203
fruit,0.396547,0.603453


Unnamed: 0,0,1
0,0.110517,0.889483
1,0.113908,0.886092
2,0.131496,0.868504
3,0.700746,0.299254
4,0.241037,0.758963
5,0.817987,0.182013
6,0.8459,0.1541
7,0.177862,0.822138


In [18]:
sigma_w, sigma_d = 4, 3
for t_k in range(0, K):
    print('topic', t_k)
    top_w = sorted([(i, wk) for i, wk in enumerate(Terms.T[t_k])], key=lambda y: -y[1])[:sigma_w]
    top_d = sorted([(i, dk) for i, dk in enumerate(Docs.T[t_k])], key=lambda y: -y[1])[:sigma_d]
    print([words[x[0]] for x in top_w])
    print([docs[x[0]] for x in top_d])
    print("")

topic 0
['banana', 'orange', 'mango', 'strawberry']
[['orange', 'mango', 'banana'], ['orange', 'strawberry', 'banana'], ['banana', 'mango', 'fruit']]

topic 1
['apple', 'mac', 'book', 'ios']
[['apple', 'ios', 'mac', 'book'], ['apple', 'mac', 'book', 'apple', 'store'], ['mac', 'book', 'ios', 'store']]

