# Tutorial on collapsed Gibbs sampling

In [1]:
import numpy as np
from IPython.core.display import display, HTML, Image
import nbimporter
from utils import to_table


docs = [
    ['apple', 'ios', 'mac', 'book'],
    ['apple', 'mac', 'book', 'apple', 'store'],
    ['mac', 'book', 'ios', 'store'],
    ['banana', 'mango', 'fruit'],
    ['apple', 'fruit'],
    ['orange', 'strawberry', 'banana'],
    ['orange', 'mango', 'banana'],
    ['fruit', 'apple', 'mac', 'ios']
]
K = 2
words = list(set([x for y in docs for x in y]))
alpha = np.array([1]*K)
beta = np.array([1]*len(words))

## Init counters of assignments
- $n_{d,k}$ number of words of the document $d$ assigned to $k$
- $n_{k,w}$ number of times (instances of) word $w$ is assigned to $k$
- $n_k$ number of word instances assignements to $k$
- $z$ array of assignments to topics for each of the words instances

In [2]:
N = sum([len(x) for x in docs])
z = np.zeros(N)
ndk = np.zeros((len(docs), K))
nkw = np.zeros((K, len(words)))
nk = np.zeros(K)

### Randomly initialize

In [3]:
instance = 0
for i, doc in enumerate(docs):
    for w in doc:
        t = np.random.choice(range(0,K))
        z[instance] = t
        nk[t] += 1
        ndk[i,t] += 1
        nkw[t, words.index(w)] += 1
        instance += 1

In [4]:
initial_assignment = to_table(z, ['topic'], [x for y in docs for x in y])
initial_ndk = to_table(ndk, range(0,len(docs)), range(0,K))
initial_nkw = to_table(nkw, range(0,K), words)

In [5]:
display(HTML(initial_assignment))
display(HTML(initial_ndk))
display(HTML(initial_nkw))

Unnamed: 0,apple,ios,mac,book,apple.1,mac.1,book.1,apple.2,store,mac.2,book.2,ios.1,store.1,banana,mango,fruit,apple.3,fruit.1,orange,strawberry,banana.1,orange.1,mango.1,banana.2,fruit.2,apple.4,mac.3,ios.2
topic,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


Unnamed: 0,0,1
0,1.0,3.0
1,1.0,4.0
2,3.0,1.0
3,1.0,2.0
4,1.0,1.0
5,2.0,1.0
6,1.0,2.0
7,2.0,2.0


Unnamed: 0,mango,ios,apple,book,store,banana,strawberry,mac,fruit,orange
0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0
1,1.0,2.0,3.0,2.0,1.0,2.0,0.0,2.0,2.0,1.0


## Likelihood

In [6]:
def theta(ndk, alpha, d_i, k_i):
    return (ndk[d_i, k_i] + alpha[k_i]) / (np.sum(ndk[d_i,:]) + alpha[k_i])

def phi(nkw, beta, w_i, k_i):
    return (nkw[k_i, w_i] + beta[w_i]) / (np.sum(nkw[k_i,:] + beta))

In [7]:
print(theta(ndk, alpha, 1, 0))

0.3333333333333333


# Gibbs Sampling

In [8]:
def gibbs(docs, words, topics, ndk, nkw, nk, alpha, beta, iterations=10):
    for it in range(0, iterations):
        w_i = 0
        for doc_i, doc in enumerate(docs):
            for w in doc:
                word = [x for y in docs for x in y][w_i]
                topic = int(topics[w_i])
                p_z = np.zeros(len(alpha))
                for k_i in range(0, len(alpha)):
                    p_z[k_i] = theta(ndk, alpha, doc_i, k_i) * phi(nkw, beta, words.index(word), k_i)
                p_z = p_z / np.sum(p_z)
                # Sample from p_z
                new_topic = np.random.choice(len(p_z), 1, p=p_z)[0]
                # Update
                # Remove current assignment
                ndk[doc_i, topic] -= 1
                nkw[topic, words.index(word)] -= 1
                nk[topic] -= 1
                topics[w_i] = new_topic
                ndk[doc_i, new_topic] += 1
                nkw[new_topic, words.index(word)] += 1
                nk[new_topic] += 1
                w_i += 1


In [9]:
gibbs(docs, words, z, ndk, nkw, nk, alpha, beta, iterations=1000)

In [10]:
final_assignment = to_table(z, ['topic'], [x for y in docs for x in y])
final_ndk = to_table(ndk, range(0,len(docs)), range(0,K))
final_nkw = to_table(nkw, range(0,K), words)

In [11]:
display(HTML(initial_assignment))
display(HTML(final_assignment))

Unnamed: 0,apple,ios,mac,book,apple.1,mac.1,book.1,apple.2,store,mac.2,book.2,ios.1,store.1,banana,mango,fruit,apple.3,fruit.1,orange,strawberry,banana.1,orange.1,mango.1,banana.2,fruit.2,apple.4,mac.3,ios.2
topic,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


Unnamed: 0,apple,ios,mac,book,apple.1,mac.1,book.1,apple.2,store,mac.2,book.2,ios.1,store.1,banana,mango,fruit,apple.3,fruit.1,orange,strawberry,banana.1,orange.1,mango.1,banana.2,fruit.2,apple.4,mac.3,ios.2
topic,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
display(HTML(initial_ndk))
display(HTML(final_ndk))

Unnamed: 0,0,1
0,1.0,3.0
1,1.0,4.0
2,3.0,1.0
3,1.0,2.0
4,1.0,1.0
5,2.0,1.0
6,1.0,2.0
7,2.0,2.0


Unnamed: 0,0,1
0,1.0,3.0
1,1.0,4.0
2,1.0,3.0
3,3.0,0.0
4,1.0,1.0
5,2.0,1.0
6,3.0,0.0
7,3.0,1.0


In [13]:
display(HTML(initial_nkw))
display(HTML(final_nkw))

Unnamed: 0,mango,ios,apple,book,store,banana,strawberry,mac,fruit,orange
0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0
1,1.0,2.0,3.0,2.0,1.0,2.0,0.0,2.0,2.0,1.0


Unnamed: 0,mango,ios,apple,book,store,banana,strawberry,mac,fruit,orange
0,2.0,0.0,1.0,0.0,0.0,3.0,0.0,4.0,3.0,2.0
1,0.0,3.0,4.0,3.0,2.0,0.0,1.0,0.0,0.0,0.0


# Sklearn implementation

In [15]:
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter


M = np.zeros((len(words), len(docs)))
for i, doc in enumerate(docs):
    for k, w in Counter(doc).most_common():
        M[words.index(k), i] = w
LDA = LatentDirichletAllocation(n_components=2, learning_method='batch', max_iter=100).fit(M.T)

In [16]:
A = LDA.components_

In [19]:
A.T

array([[0.50460805, 2.49539195],
       [3.48999824, 0.51000176],
       [5.48683803, 0.51316197],
       [3.49132849, 0.50867151],
       [2.49166989, 0.50833011],
       [0.50442722, 3.49557278],
       [0.50388516, 1.49611484],
       [4.49064386, 0.50935614],
       [2.41385953, 1.58614047],
       [0.5039721 , 2.4960279 ]])

Topic word distribution. components_[i, j] represents word j in topic i.

In [20]:
display(HTML(to_table(A, range(0,K), words)))

Unnamed: 0,mango,ios,apple,book,store,banana,strawberry,mac,fruit,orange
0,0.505,3.49,5.487,3.491,2.492,0.504,0.504,4.491,2.414,0.504
1,2.495,0.51,0.513,0.509,0.508,3.496,1.496,0.509,1.586,2.496


In [21]:
Terms = np.zeros((len(words), K))
Docs = np.zeros((len(docs), K))

for w_i, w in enumerate(words):
    Terms[w_i] = A.T[w_i] / np.sum(A.T[w_i])

for d_i, d in enumerate(docs):
    dM = np.zeros((len(d), K))
    for j, w in enumerate(d):
        dM[j] = Terms[words.index(w)]
    Docs[d_i] = np.sum(dM, axis=0) / np.array([len(d)]*K)

In [22]:
display(HTML(to_table(Terms, words, range(0,K))))
display(HTML(to_table(Docs, range(0, len(docs)), range(0,K))))

Unnamed: 0,0,1
mango,0.168,0.832
ios,0.872,0.128
apple,0.914,0.086
book,0.873,0.127
store,0.831,0.169
banana,0.126,0.874
strawberry,0.252,0.748
mac,0.898,0.102
fruit,0.603,0.397
orange,0.168,0.832


Unnamed: 0,0,1
0,0.889,0.111
1,0.886,0.114
2,0.869,0.131
3,0.299,0.701
4,0.759,0.241
5,0.182,0.818
6,0.154,0.846
7,0.822,0.178


In [25]:
sigma_w, sigma_d = 4, 3
for t_k in range(0, K):
    print('topic', t_k)
    top_w = sorted([(i, wk) for i, wk in enumerate(Terms.T[t_k])], key=lambda y: -y[1])[:sigma_w]
    top_d = sorted([(i, dk) for i, dk in enumerate(Docs.T[t_k])], key=lambda y: -y[1])[:sigma_d]
    print([words[x[0]] for x in top_w])
    print([docs[x[0]] for x in top_d])
    print("")

topic 0
['apple', 'mac', 'book', 'ios']
[['apple', 'ios', 'mac', 'book'], ['apple', 'mac', 'book', 'apple', 'store'], ['mac', 'book', 'ios', 'store']]

topic 1
['banana', 'orange', 'mango', 'strawberry']
[['orange', 'mango', 'banana'], ['orange', 'strawberry', 'banana'], ['banana', 'mango', 'fruit']]

