# Target Word Sources

### List

In [1]:
# Hand-picked target words, grouped by theme. Some words legitimately belong to
# more than one theme (e.g. "river", "mountain" and "desert" are listed under
# both Nature and Geography), so the list is de-duplicated via set() before
# sorting; otherwise the same word would be processed more than once downstream.
# NOTE(review): "AI" is the only upper-case entry; if the vectorizer vocabulary
# is lower-cased it will never match — confirm.
words = sorted(set([
    # Emotions
    "happy", "sad", "joy", "anger", "fear", "love", "hate", "excited", "nervous", "calm",
    # Professions
    "doctor", "engineer", "teacher", "lawyer", "artist", "scientist", "nurse", "chef", "pilot", "writer",
    # Nature
    "tree", "river", "mountain", "ocean", "flower", "desert", "forest", "sky", "cloud", "animal",
    # Technology
    "computer", "internet", "robot", "AI", "software", "hardware", "phone", "camera", "network", "algorithm",
    # Relationships
    "father", "mother", "brother", "sister", "friend", "husband", "wife", "child", "partner", "neighbor",
    # Food
    "bread", "apple", "pizza", "coffee", "chocolate", "milk", "soup", "rice", "cake", "cheese",
    # Geography
    "city", "village", "country", "continent", "river", "lake", "mountain", "valley", "desert", "island",
    # Abstract Concepts
    "freedom", "justice", "peace", "war", "knowledge", "power", "truth", "beauty", "faith", "wealth",
    # Animals
    "cat", "dog", "lion", "tiger", "elephant", "bird", "fish", "whale", "dolphin", "butterfly",
    # Vehicles
    "car", "truck", "bicycle", "train", "airplane", "ship", "boat", "motorcycle", "subway", "helicopter",
    # Sports
    "soccer", "basketball", "tennis", "cricket", "baseball", "golf", "hockey", "boxing", "running", "swimming",
    # Royalty/Leadership
    "king", "queen", "prince", "princess", "leader", "president", "minister", "senator", "governor", "mayor",
    # Miscellaneous
    "book", "music", "movie", "art", "language", "history", "science", "medicine", "education", "philosophy"
]))

### Dataset

In [1]:
from tools import Tools

# Pick exactly one benchmark file; the alternatives are kept for quick switching.
# dataset = 'mturk-771.csv'
# dataset = 'mturk-287.csv'
# dataset = 'wordsim353-sim.csv'
dataset = 'rg-65.csv'

# Start from an empty list and copy in every word the dataset helper yields.
words = []
dataset_words = Tools.get_dataset_words(dataset)
words.extend(dataset_words)
print(f"Total words: {len(words)}")

Total words: 48


### All Vocabulary

In [5]:
from tools import Tools

# Use every term known to the pickled vectorizer as a target word.
vectorizer_X = Tools.read_pickle_data("vectorizer_X.pickle")

# Fetch the feature-name array once and derive the feature count from it,
# instead of building the same array twice.
words = vectorizer_X.get_feature_names_out()
number_of_features = words.shape[0]

print(f"Total words: {len(words)}")
print(f"First 10 words: {words[:10]}")

Total words: 18896
First 10 words: ['00' '000' '007' '01' '02' '05' '06' '10' '100' '1000']


# Omni TM-AE: Collecting Embedding Vectors for Target Words

In [3]:
from tmu.models.autoencoder.autoencoder import TMAutoEncoder
import numpy as np
import tqdm as tqdm
from tools import Tools


# --- Data -------------------------------------------------------------------
# Pickled training matrix and the vectorizer that produced it.
X_train = Tools.read_pickle_data("X.pickle")
vectorizer_X = Tools.read_pickle_data("vectorizer_X.pickle")
feature_names = vectorizer_X.get_feature_names_out()
number_of_features = feature_names.shape[0]

# --- Hyper-parameters -------------------------------------------------------
# Alternative preset ("1 Billion Parameters"), kept for reference:
#   clause_weight_threshold = 0
#   number_of_examples = 2000
#   accumulation = 24
#   clauses = 32
#   T = 20000
#   s = 1.0
#   epochs = 4
#   number_of_state_bits_ta = 8
#   all_features = True

# Active preset: IMDB Parameters
clause_weight_threshold = 0
number_of_examples = 2000   # passed to tm.fit on every epoch
accumulation = 24           # passed to the TMAutoEncoder constructor
clauses = 32                # number of clauses per auto-encoder
T = 3200
s = 1.0
epochs = 10                 # number of tm.fit calls per word
number_of_state_bits_ta = 8
all_features = True

# --- Target words -----------------------------------------------------------
# Keep only the words (taken from whichever source cell was run) that the
# vectorizer knows, pairing each with its vocabulary id.
vocabulary = vectorizer_X.vocabulary_
valid_words = [
    (candidate, vocabulary[candidate])
    for candidate in words
    if candidate in vocabulary
]

# Function to collect Omni embedding for a single word
def train_word(word_data):
    """Train a one-output TM auto-encoder for a word and derive its embedding.

    A fresh ``TMAutoEncoder`` whose only active output is the given word is
    fitted on ``X_train`` for ``epochs`` rounds. The embedding is then built
    from the Tsetlin-automaton states of every clause whose weight exceeds
    ``clause_weight_threshold``.

    Relies on the module-level settings ``clauses``, ``T``, ``s``,
    ``accumulation``, ``epochs``, ``number_of_examples``,
    ``clause_weight_threshold``, ``number_of_features`` and on ``X_train``.

    Args:
        word_data: ``(word, word_id)`` pair as stored in ``valid_words``;
            ``word_id`` is the word's index in the vectorizer vocabulary.

    Returns:
        numpy.ndarray of length ``number_of_features``. Entry ``k`` is the
        mean signed automaton state for feature ``k`` over the selected
        clauses, truncated to an integer value; all zeros if no clause
        passes the weight threshold.
    """
    _, word_id = word_data
    single_output_active = np.array([word_id], dtype=np.uint32)

    tm = TMAutoEncoder(
        number_of_clauses=clauses,
        T=T,
        s=s,
        output_active=single_output_active,
        max_included_literals=3,
        accumulation=accumulation,
        feature_negation=True,
        platform='CPU',
        output_balancing=0.5
    )

    for _ in range(epochs):
        tm.fit(X_train, number_of_examples=number_of_examples)
    # Index 0 because there is exactly one active output.
    clauses_weights = tm.get_weights(0)

    literal_sums = np.zeros(number_of_features)
    literal_counts = np.zeros(number_of_features)
    number_of_literals = tm.clause_bank.number_of_literals

    for j in range(clauses):
        # Honour the configured threshold rather than a hard-coded 0.
        if clauses_weights[j] <= clause_weight_threshold:
            continue
        for i in range(number_of_literals):
            state = tm.get_ta_state(j, i)
            if i < number_of_features:
                # First half of the literals: state counts positively.
                literal_sums[i] += state
                literal_counts[i] += 1
            else:
                # Second half (presumably the negated features, since
                # feature_negation=True): state counts negatively for the
                # corresponding feature.
                literal_sums[i - number_of_features] -= state
                literal_counts[i - number_of_features] += 1

    # Average only where at least one clause contributed, to avoid dividing by 0.
    non_zero_counts = literal_counts > 0
    embedding = np.zeros(number_of_features)
    embedding[non_zero_counts] = (literal_sums[non_zero_counts] / literal_counts[non_zero_counts]).astype(int)
    return embedding

if __name__ == "__main__":
    # Embeddings are keyed by vocabulary id, so a consumer can look one up
    # with e.g. omni_embeddings.get(word_id, None).
    all_embeddings = {}
    for word_data in tqdm.tqdm(valid_words, desc="Training words", unit="word"):
        embedding = train_word(word_data)
        all_embeddings[word_data[1]] = embedding

    # Save embeddings as pickle. This stays inside the guard because
    # all_embeddings only exists when the guarded block has run; saving
    # outside it would raise NameError on import.
    Tools.save_pickle_data(all_embeddings, "omni_embeddings.pickle")

Training words: 100%|██████████| 37/37 [02:55<00:00,  4.74s/word]
