In [27]:
from transformers import BertModel, BertTokenizer
import pandas as pd

# Load the BERT model and tokenizer
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Extract the embedding matrix
embedding_matrix = model.embeddings.word_embeddings.weight

# Get the vocabulary as a dictionary: {token: index}
vocab = tokenizer.get_vocab()

# Reverse the vocab dictionary: {index: token}
index_to_token = {index: token for token, index in vocab.items()}

# Prepare a DataFrame for a preview
n = 10000  # Number of tokens to preview
embedding_table = {
    "Token": [index_to_token[i] for i in range(len(embedding_matrix))],
    "Embedding": [embedding_matrix[i].tolist() for i in range(len(embedding_matrix))]
}

# Convert to a Pandas DataFrame for better visualization
df = pd.DataFrame(embedding_table)

In [29]:
# Extract the token column
tokens = embedding_table['Token']  # Replace 'token' with the actual column name

# Save the tokens to a text file
with open("tokens.txt", "w") as f:
    for token in tokens:
        f.write(token + "\n")

In [3]:
import numpy as np
tree = {}  # Monte Carlo Tree: stores N(s, a), Q(s, a), P(s, a)

def select_node(tree, sentence):
    path = []
    current_node = tuple(sentence)
    print(current_node)

    while current_node in tree and tree[current_node]["children"]:
        children = tree[current_node]["children"]
        total_visits = sum(tree[child]["N"] for child in children)

        if not children:  # Handle empty children case
            print(f"No children for node {current_node}")
            break

        # Select the child with the highest UCB score
        best_child = max(
            children,
            key=lambda child: (
                tree[child]["Q"] +
                np.sqrt(total_visits) * tree[child]["P"] / (1 + tree[child]["N"])
            )
        )
        path.append((current_node, best_child))
        current_node = best_child

    return path, current_node

In [5]:
path, current_node = select_node(tree, "the quick brown dog")
print(path)
print(current_node)

('t', 'h', 'e', ' ', 'q', 'u', 'i', 'c', 'k', ' ', 'b', 'r', 'o', 'w', 'n', ' ', 'd', 'o', 'g')
[]
('t', 'h', 'e', ' ', 'q', 'u', 'i', 'c', 'k', ' ', 'b', 'r', 'o', 'w', 'n', ' ', 'd', 'o', 'g')


: 