# Chinese Restaurant Process

In [13]:
import numpy as np
from collections import Counter
from scipy.special import gammaln

The following implementation is based on the formula below:

![Image Description](../image/CRP%20formula.png)

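Note that in the following implementation the −1 in the denominator is omitted: because Python uses 0-based indexing, the loop index `i` already equals the number of customers seated so far (i.e. n − 1 for the n-th customer).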

In [2]:
def CRP(gamma, N):
    """
    Simulate the Chinese Restaurant Process (CRP).

    Customers are seated one at a time. With i customers already seated, the
    next customer joins existing table k with probability n_k / (i + gamma),
    where n_k is the number of customers at that table, and opens a new table
    with probability gamma / (i + gamma).

    Sampling uses NumPy's global random state (np.random.choice), so call
    np.random.seed(...) beforehand for reproducible results.

    Parameters:
    - gamma: concentration parameter. Measures how likely it is for a new table. Range = [0, infinity).
    - N: the number of customers to seat. Must be a non-negative integer.

    Returns:
    - tables: 1-D float array; entry k is the number of customers at table k.
    - probability: 1-D float array with the seating probabilities faced by the
      LAST customer: one entry per table that existed at that moment, followed
      by the probability of opening a new table. For N == 1 this is [1.0]
      (the first customer always opens the first table); for N == 0 both
      returned arrays are empty.

    Raises:
    - ValueError: if N or gamma is negative.
    """
    if N < 0:
        raise ValueError("N must be non-negative")
    if gamma < 0:
        raise ValueError("gamma must be non-negative")
    if N == 0:
        return (np.zeros(0), np.zeros(0))

    tables = np.zeros(N)
    tables[0] = 1  # root: the first customer always opens the first table
    num_tables = 1  # occupied tables always live in tables[:num_tables]
    probability = np.array([1.0])  # defined up front so N == 1 has a valid return value

    # i is both the 0-based index of the arriving customer and the number of
    # customers already seated, which is why the formula's "n - 1" becomes "i".
    for i in range(1, N):
        denominator = i + gamma
        prob_existing = tables[:num_tables] / denominator  # probability of joining each existing table
        prob_new = gamma / denominator                     # probability of creating a new table

        probability = np.append(prob_existing, prob_new)

        table_num = np.random.choice(len(probability), p=probability)  # making selection

        # The last entry of `probability` is the new-table option; its index is
        # the current table count, so the new table lands in the next free slot.
        if table_num == num_tables:
            num_tables += 1
        tables[table_num] += 1

    # Copy so the result does not keep the N-length work buffer alive.
    return (tables[:num_tables].copy(), probability)

In [3]:
# Seat 10 customers with concentration parameter gamma = 1.
# No random seed is set, so the result differs from run to run.
CRP(gamma=1, N=10)

(array([7., 2., 1.]), array([0.6, 0.2, 0.1, 0.1]))

## Original Gibbs Sampler from Griffiths & Steyvers

This implementation is designed for flat topic modeling as in traditional LDA. It assigns a single topic to each word in a document from a predefined number of topics. The output is the topic assignment for each word, and topics are treated independently of each other. There is no hierarchical structure.

- Flat Topic Model: Each document is a mixture of topics, but the topics themselves don’t have relationships with each other (i.e., no hierarchy).

In [11]:
def Z(corpus, T, alpha, beta, num_iterations):
    """
    Implements Gibbs sampling for topic assignment using the method from Griffiths & Steyvers.

    Every word token starts with a uniformly random topic. Each iteration then
    sweeps over all tokens: the token's current assignment is removed from the
    counts, a new topic is sampled proportionally to
    (n_wt + beta) / (n_t + W * beta) * (n_dt + alpha) / (len(doc) + T * alpha),
    and the counts are updated with the new assignment.

    Sampling uses NumPy's global random state, so call np.random.seed(...)
    beforehand for reproducible results.

    Parameters:
    - corpus: A list of documents, where each document is a list of (hashable) words.
    - T: Number of topics. Must be at least 1.
    - alpha: Dirichlet prior for document-topic distributions (expected to be > 0).
    - beta: Dirichlet prior for topic-word distributions (expected to be > 0).
    - num_iterations: Number of iterations to run the Gibbs sampler.

    Returns:
    - z: Topic assignments for each word in each document. A list of list of
      ints in range(T), with the same shape as corpus.

    Raises:
    - ValueError: if T is smaller than 1.
    """
    if T < 1:
        raise ValueError("T must be at least 1")

    # Map each distinct word to a column index. dict.fromkeys keeps
    # first-appearance order, so the mapping is the same on every run.
    vocabulary = dict.fromkeys(word for doc in corpus for word in doc)
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

    D = len(corpus)          # number of documents
    W = len(word_to_index)   # unique vocabulary size of the entire corpus

    # Translate words to indices once instead of on every sweep.
    doc_word_ids = [[word_to_index[word] for word in doc] for doc in corpus]

    # Random initial topic assignments. randint's upper bound is exclusive, so
    # it must be T (not T - 1) for every topic 0..T-1 to be reachable.
    z = [[np.random.randint(0, T) for _ in doc] for doc in corpus]

    # Initialize count matrices
    n_wt = np.zeros((T, W))  # word-topic counts
    n_dt = np.zeros((D, T))  # document-topic counts
    n_t = np.zeros(T)        # total word counts for each topic

    # Populate initial counts
    for d, word_ids in enumerate(doc_word_ids):
        for i, word_idx in enumerate(word_ids):
            topic = z[d][i]
            n_wt[topic, word_idx] += 1
            n_dt[d, topic] += 1
            n_t[topic] += 1

    # Gibbs sampling iterations
    for _ in range(num_iterations):
        for d, word_ids in enumerate(doc_word_ids):
            # Constant across topics, so it cancels on normalisation; kept to
            # mirror the formula.
            doc_norm = len(word_ids) + T * alpha

            for i, word_idx in enumerate(word_ids):
                current_topic = z[d][i]

                # Remove the current token from the counts
                n_wt[current_topic, word_idx] -= 1
                n_dt[d, current_topic] -= 1
                n_t[current_topic] -= 1

                # Topic probabilities for this token, for all topics at once
                word_topic_prob = (n_wt[:, word_idx] + beta) / (n_t + W * beta)
                doc_topic_prob = (n_dt[d] + alpha) / doc_norm
                p = word_topic_prob * doc_topic_prob

                # Normalize and sample a new topic
                p /= np.sum(p)
                new_topic = int(np.random.choice(T, p=p))

                # Update counts with the new topic
                z[d][i] = new_topic
                n_wt[new_topic, word_idx] += 1
                n_dt[d, new_topic] += 1
                n_t[new_topic] += 1

    return z
# TODO: change z to a better representation.

In [12]:
# Example: a toy corpus of three short documents.
corpus = [
    ['apple', 'banana', 'apple'],
    ['banana', 'cherry'],
    ['math', 'art', 'science'],
]

# Two topics, alpha = beta = 0.1, 100 Gibbs iterations; prints one topic id per word.
print(Z(corpus, T=2, alpha=0.1, beta=0.1, num_iterations=100))

[[0, 0, 0], [0, 0], [1, 1, 1]]
