In [1]:
import numpy as np

# Constructing nCRP tree

In [2]:
class Node:
    def __init__(self):
        self.children = {}  # Dictionary to store child nodes
        self.documents = 0  # Number of documents passing through this node (i.e. number of customers)
        self.vocab = {}

class nCRPTree:
    def __init__(self, alpha):
        self.root = Node()
        self.alpha = alpha  # Concentration parameter

    def sample_new_path(self, max_depth):
        """
        Sample a path through the tree using the nCRP.

        Parameters:
        - max_depth: the maximum depth (number of levels) of the tree (i.e. L-level tree). 
                    L >= 1 (if L == 1, it only contains the root.)
        Returns:
        - path: a list of topics representing the path through the tree
        """
        current_node = self.root
        current_node.documents += 1
        path = [] # not considering the root

        for level in range(1,max_depth):
            # Use the CRP function to sample a topic
            sampled_topic = self.CRP(current_node)
            path.append(sampled_topic)

            # create new table if needed
            if sampled_topic not in current_node.children.keys():
                current_node.children[sampled_topic] = Node()
            
            # Move to the next node in the path
            current_node = current_node.children[sampled_topic]
            current_node.documents += 1

        return path

    def CRP(self, node):
        """
        Basic CRP process.
        
        Returns:
        - sampled_topic: label of the sampled topic(Not the Node)
        """
        total_documents = node.documents #including the incoming document
        topic_probabilities = {}
        
        # There are no table, so we have to get a new table with probability 1
        if not node.children:
            return np.int64(1) # the new table has a key = 1
        
        else:
            # Calculating the probability of joining each of the existing tables (topics)
            for topic, child_node in node.children.items():
                topic_probabilities[topic] = child_node.documents / (self.alpha + total_documents - 1)

            # Probability of creating a new table (topic)
            new_table_key = np.max(topic) + 1
            topic_probabilities[new_table_key] = self.alpha / (self.alpha + total_documents - 1)

            topics = list(topic_probabilities.keys())
            probabilities = list(topic_probabilities.values())

            # Since probabilities sum to 1, normalization is not needed
            sampled_topic = np.random.choice(topics, p=probabilities)

            return sampled_topic
    
    def generate_initial_paths(self, corpus, max_depth):
        """
        Generate initial paths for each document in the corpus using the nCRP tree.

        Parameters:
        - corpus: A list of documents (each document is a list of words)
        - max_depth: Maximum depth (number of levels) of the tree

        Returns:
        - initial_paths: A dictionary mapping document index to its initial sampled path
        """
        paths = {}

        # Loop through each document in the corpus
        for i, doc in enumerate(corpus):
            path = self.sample_new_path(max_depth)  # Sample a path for the document
            paths[i] = path  # Store the path with document index as key

        return paths

Testing nCRP process and initialisation

In [3]:
# Example corpus
corpus = [
    ["apple", "banana"],            # Document 1
    ["cat", "dog", "fish"],         # Document 2
    ["car", "bike"],                # Document 3
]
corpus1 = [
    ["apple", "banana", "orange", "grape", "melon"],     # Document 1
    ["cat", "dog", "fish", "bird", "hamster"],           # Document 2
    ["car", "bike", "bus", "train", "plane", "ship"],    # Document 3
    ["house", "building", "apartment", "cabin"],         # Document 4
    ["sun", "moon", "stars", "galaxy"],                  # Document 5
    ["river", "lake", "ocean", "sea"],                   # Document 6
    ["earth", "mars", "jupiter", "saturn", "venus"],     # Document 7
]

# Initialize the tree and set parameters
alpha = 1.0
tree = nCRPTree(alpha)
max_depth = 4

In [5]:
# Generate initial paths for all documents in the corpus
initial_paths = tree.generate_initial_paths(corpus, max_depth)
initial_paths1 = tree.generate_initial_paths(corpus1, max_depth)

# Print the initial paths for each document
print("Initial Paths for Each Document:")
for doc_id, path in initial_paths.items():
    print(f"Document {doc_id + 1} Path: {path}")
    
print("Initial Paths for Each Document:")
for doc_id, path in initial_paths1.items():
    print(f"Document {doc_id + 1} Path: {path}")

Initial Paths for Each Document:
Document 1 Path: [np.int64(2), np.int64(3), np.int64(1)]
Document 2 Path: [np.int64(2), np.int64(3), np.int64(2)]
Document 3 Path: [np.int64(2), np.int64(2), np.int64(2)]
Initial Paths for Each Document:
Document 1 Path: [np.int64(3), np.int64(1), np.int64(1)]
Document 2 Path: [np.int64(4), np.int64(1), np.int64(1)]
Document 3 Path: [np.int64(2), np.int64(2), np.int64(3)]
Document 4 Path: [np.int64(2), np.int64(2), np.int64(3)]
Document 5 Path: [np.int64(2), np.int64(4), np.int64(2)]
Document 6 Path: [np.int64(2), np.int64(2), np.int64(3)]
Document 7 Path: [np.int64(2), np.int64(1), np.int64(1)]
