In [1]:
import dotenv
# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials needed by the
# embedding function imported further down — confirm.
dotenv.load_dotenv()

True

In [2]:
import numpy as np
import inflect
# Shared inflect engine; the helper functions below use it to spell numbers
# out as words.
p = inflect.engine()

@np.vectorize
def number_to_string(number):
    """Spell ``number`` out in words using the shared inflect engine ``p``.

    The function is wrapped in ``np.vectorize``, so it accepts scalars or
    array-likes and returns a NumPy array of strings with the same shape.

    Args:
        number: The value to convert. It may arrive as a NumPy scalar (for
            example when ``np.vectorize`` probes the first element to infer
            the output dtype).

    Returns:
        The words for ``number`` as produced by ``p.number_to_words``.
    """
    # Hand inflect a native Python value: some inflect releases validate
    # argument types at runtime and reject NumPy scalar types such as
    # np.int64. ``.item()`` keeps the value and only changes the type.
    if isinstance(number, np.generic):
        number = number.item()
    return p.number_to_words(number)

@np.vectorize
def multiply_numbers(number_1, number_2):
    """Multiply two numbers and describe the multiplication in words.

    The function is wrapped in ``np.vectorize``; when called with arrays it
    returns three arrays (one per tuple element below).

    Args:
        number_1: First factor.
        number_2: Second factor.

    Returns:
        A tuple ``(expression, product_words, product)`` where ``expression``
        is ``"<number_1 in words> times <number_2 in words>"``,
        ``product_words`` is the product spelled out in words, and
        ``product`` is ``number_1 * number_2``.
    """
    res = number_1 * number_2

    def _spell(value):
        # Call the inflect engine directly instead of going through the
        # vectorized ``number_to_string``: invoking a np.vectorize wrapper on a
        # scalar re-runs its output-type probe (an extra conversion per value)
        # and yields a 0-d array rather than a plain string.
        # NumPy scalars are unwrapped first because some inflect releases
        # type-check their arguments and reject types such as np.int64.
        if isinstance(value, np.generic):
            value = value.item()
        return p.number_to_words(value)

    return (f'{_spell(number_1)} times {_spell(number_2)}', _spell(res), res)

In [3]:
# Draw n_samples pairs of random integer factors, each in the half-open range
# [lower_bound, upper_bound), i.e. 10,000 (inclusive) to 1,000,000 (exclusive).
# The bounds stay as floats because the comparator cell squares them as-is.
lower_bound = 1e4
upper_bound = 1e6
n_samples = 10_000

# Fixed seed so the sampled factors are reproducible.
np.random.seed(1337)
# Cast the bounds explicitly rather than relying on randint's implicit
# float -> int conversion, and pin the dtype to 64 bits: the products reach
# ~1e12, which does not fit in the 32-bit default integer some platforms use.
# On platforms whose default integer is already 64-bit the draws should be
# unchanged.
numbers = np.random.randint(int(lower_bound), int(upper_bound), (n_samples, 2), dtype=np.int64)

number_1 = numbers[:, 0]
# NOTE(review): number_1_string is not used in the cells shown here; kept in
# case a later cell reads it.
number_1_string = number_to_string(number_1)

number_2 = numbers[:, 1]

# Three parallel arrays: the worded expression, the worded product, the product.
expression_string, result_string, result = multiply_numbers(number_1, number_2)

In [4]:
# Evenly spaced reference values covering the range of possible products,
# [lower_bound**2, upper_bound**2), one every 10 million.
# Cast to an explicit 64-bit integer: the values reach ~1e12, which overflows
# the platform-dependent `int` dtype where that is 32 bits wide.
comparator_numbers = np.arange(lower_bound ** 2, upper_bound ** 2, 10_000_000).astype(np.int64)
# The same reference values spelled out in words.
comparator_number_strings = number_to_string(comparator_numbers)

In [5]:
import chromadb
from inflection.encoding.openai import ada_embedding_function

# Directory of the on-disk Chroma store (a relative path).
CHROMA_PATH = '../chroma/'

# Persistent Chroma client with anonymized telemetry switched off.
client = chromadb.PersistentClient(
    path=CHROMA_PATH,
    settings=chromadb.Settings(anonymized_telemetry=False),
)

In [6]:
# Open (or create on first run) the three collections, all sharing the same
# embedding function.
# NOTE(review): the third name is misspelled ("evaulation"). It is kept as-is
# because it is the persisted collection name; renaming it would point at a
# new, empty collection.
expression_collection, solution_collection, evaluation_collection = (
    client.get_or_create_collection(collection_name, embedding_function=ada_embedding_function)
    for collection_name in (
        "expression_collection",
        "solution_collection",
        "evaulation_collection",
    )
)

#### Populate Expression DB

In [7]:
# Expression Metadata
metadatas = [{'number_1_int': int(n1_int), 
              'number_2_int': int(n2_int),
              'product_int': int(prod)}
              for n1_int, n2_int, prod in zip(number_1, number_2, result)]

# Expression Documents
docs = expression_string.tolist()

# IDs
ids = [hex(i)[2:] for i in range(len(docs))]

expression_collection.add(ids, metadatas=metadatas, documents=docs)

#### Populate Solution DB

In [8]:
# Solution Metadata
metadatas = [{'number_1_int': int(n1_int), 
              'number_2_int': int(n2_int),
              'product_int': int(prod)}
              for n1_int, n2_int, prod in zip(number_1, number_2, result)]

# Solution Documents
docs = result_string.tolist()

# IDs
ids = [hex(i)[2:] for i in range(len(docs))]

solution_collection.add(ids, metadatas=metadatas, documents=docs)

#### Populate Evaluation DB

In [9]:
# Evaluation metadata: the integer value of each comparator number.
# NOTE(review): the key here is 'prod_int', whereas the expression and solution
# collections use 'product_int'. Left unchanged because it is part of the
# stored schema — confirm which spelling downstream queries expect.
metadatas = [{'prod_int': int(value)} for value in comparator_numbers]

# Evaluation documents: the comparator numbers spelled out in words.
docs = comparator_number_strings.tolist()

# IDs are the row index in lowercase hexadecimal, without the "0x" prefix.
ids = [f'{row:x}' for row in range(len(docs))]

evaluation_collection.add(ids, metadatas=metadatas, documents=docs)