In [1]:
using PyCall
using DataFrames

# Import the tokenizer and model classes (ExtendedGPT2Tokenizer, GPT2Tokenizer, GPT2LMHeadModel) from the local SGS_Tokenizer Python module
py"""
import sys
sys.path.append(".")
from SGS_Tokenizer import ExtendedGPT2Tokenizer, GPT2Tokenizer, GPT2LMHeadModel
"""

"""
    tensor_to_array(tensor::PyObject)

Return the elements of the Python tensor `tensor` as a Julia `Vector{Int64}`.

The tensor's `numpy` method is invoked through `pycall` with `PyArray` as the
requested return type, and the resulting array is then converted to `Vector{Int64}`.

NOTE(review): assumes `tensor` is one-dimensional with integer-valued entries —
confirm against callers.
"""
function tensor_to_array(tensor::PyObject)
    return Vector{Int64}(pycall(tensor.numpy, PyArray))
end

tensor_to_array (generic function with 1 method)

In [2]:
# Sample sentence that is tokenized and pushed through the tokenizer's tensor pipeline below.
text = "When the distance between two unit-length vectors is defined to be the length of their vector difference then"

vocab_file = "JLD2/vocab.json"      # Path to the vocab file
merges_file = "JLD2/merges.txt"     # Path to the merges file

# Build the Python-side tokenizer; `p=4` is forwarded to the constructor as a keyword argument.
# NOTE(review): the HLL tensor printed by this cell has 16 (= 2^4) entries, so `p` presumably
# controls the number of registers — confirm in SGS_Tokenizer.
tokenizer = py"ExtendedGPT2Tokenizer"(vocab_file, merges_file, p=4)

# text = "When the distance between two unit-length vectors is defined to be the length of their vector difference then"

# Update tensors
# Tokenize the sentence, then feed the token ids to `update_tensors`, whose result is
# unpacked into two values.
token_ids = tokenizer.tokenize_text(text)
new_tensor_1, double_value = tokenizer.update_tensors(token_ids)

# println("new_tensor_1:", new_tensor_1)
# println("double_value:", double_value)

# `tensor_to_hlltensor` yields three values: an id, a SHA-1 hex string, and the HLL tensor itself.
id, sha1, hll_tensor = tokenizer.tensor_to_hlltensor(new_tensor_1)
println("HLLSet:", id, "; ", sha1, "; ", hll_tensor)

# Julia-side copy of the HLL tensor as a Vector{Int64} (see `tensor_to_array`).
hll_vector = tensor_to_array(hll_tensor)

# println("HLLSet (Vector{Int64}):", hll_vector)

# Ask the tokenizer for the tensor rows associated with the HLL tensor and print them.
# NOTE(review): presumably the inverse lookup of `tensor_to_hlltensor` — confirm in SGS_Tokenizer.
tensor_slice = tokenizer.hlltensor_to_tensor(hll_tensor)
println("Tensor Slice:", tensor_slice)

# tokenizer.print_tensor_1(tensor=tokenizer.tensor_1)

Generated double_value: 13.1
Generated double_value: 1.4
Generated double_value: 9.1
Generated double_value: 0.1
Generated double_value: 1.1
Generated double_value: 5.3
Generated double_value: 0.1
Generated double_value: 9.2
Generated double_value: 1.1
Generated double_value: 2.2
Generated double_value: 9.4
Generated double_value: 5.2
Generated double_value: 7.3
Generated double_value: 1.4
Generated double_value: 4.1
Generated double_value: 15.1
Generated double_value: 1.1
Generated double_value: 5.4
Generated double_value: 2.3
Generated double_value: 3.1
HLLSet:1; 4da9de3e80bcb65ee6169a411b0206ce45ba68bc; PyObject tensor([ 2, 18, 12,  2,  2, 28,  0,  8,  0, 22,  0,  0,  0,  2,  0,  2])
Tensor Slice:



PyObject tensor([[1.0220e+03, 4.0546e+07, 1.0000e-01],
        [1.2000e+01, 1.6594e+08, 1.0000e-01],
        [7.3400e+02, 3.3153e+08, 1.1000e+00],
        [3.0104e+04, 5.2051e+08, 1.1000e+00],
        [5.1100e+02, 4.0169e+08, 1.1000e+00],
        [2.6200e+02, 3.9209e+08, 1.4000e+00],
        [2.6200e+02, 3.9209e+08, 1.4000e+00],
        [3.1800e+02, 5.7578e+08, 2.2000e+00],
        [3.5800e+03, 6.3241e+08, 2.3000e+00],
        [7.8800e+02, 9.3641e+08, 3.1000e+00],
        [4.1290e+03, 1.1767e+09, 4.1000e+00],
        [2.8400e+02, 1.3680e+09, 5.2000e+00],
        [4.3260e+03, 1.4954e+09, 5.3000e+00],
        [1.5879e+04, 1.5664e+09, 5.4000e+00],
        [3.0700e+02, 1.8849e+09, 7.3000e+00],
        [5.2530e+03, 2.4650e+09, 9.1000e+00],
        [1.3664e+04, 2.6029e+09, 9.2000e+00],
        [5.4470e+03, 2.5253e+09, 9.4000e+00],
        [2.2150e+03, 3.5021e+09, 1.3100e+01],
        [2.8600e+02, 4.2398e+09, 1.5100e+01]], dtype=torch.float64)


In [3]:
# Dump the tokenizer's current tensors; the printing is done by the Python method itself.
tokenizer.print_tensors()

tensor_0: {'4da9de3e80bcb65ee6169a411b0206ce45ba68bc': 1}
tensor_1:
[2215,	3502125683,	13.1]
[262,	392093352,	1.4]
[5253,	2464985007,	9.1]
[1022,	40545897,	0.1]
[734,	331532800,	1.1]
[4326,	1495415092,	5.3]
[12,	165936533,	0.1]
[13664,	2602854234,	9.2]
[30104,	520508673,	1.1]
[318,	575781722,	2.2]
[5447,	2525292680,	9.4]
[284,	1368034586,	5.2]
[307,	1884863212,	7.3]
[262,	392093352,	1.4]
[4129,	1176673453,	4.1]
[286,	4239847945,	15.1]
[511,	401687415,	1.1]
[15879,	1566433000,	5.4]
[3580,	632412124,	2.3]
[788,	936414267,	3.1]


In [4]:
# Perform search
# Query text: a prefix of the sentence processed in the earlier cell.
query = "When the distance between two unit-length vectors is defined"

# Second argument to `tokenizer.search`.
# NOTE(review): presumably a minimum similarity score — confirm in SGS_Tokenizer.
threshold = 0.1
# Requesting `PyObject` as the return type keeps the result as a raw Python object
# instead of letting PyCall convert it to a Julia value.
related_hllsets = pycall(tokenizer.search, PyObject, query, threshold)
println("Related HLL sets: ", related_hllsets)

# Get related tokens
# Also kept as a PyObject; it is passed back to `tokenizer.generate_text` in a later cell.
related_tokens = pycall(tokenizer.get_related_tokens, PyObject, related_hllsets)
# println("Related tokens: ", related_tokens)

dummy_id: torch.Size([1, 1]); query_hllset: torch.Size([16])
query_hllset_with_id: tensor([[ 0,  0,  0,  4,  4,  2,  0, 32, 16, 10,  0,  0,  2,  0,  0,  2,  2]])
Related HLL sets: PyObject [0]
Processing hllset (1): tensor([ 1,  2, 18, 12,  2,  2, 28,  0,  8,  0, 22,  0,  0,  0,  2,  0,  2])
Processing hllset (2): tensor([ 2, 18, 12,  2,  2, 28,  0,  8,  0, 22,  0,  0,  0,  2,  0,  2])


2024-10-08 01:48:30.721548: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-08 01:48:30.739676: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-08 01:48:30.746136: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-08 01:48:30.760620: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


PyObject [' between', '-', ' two', ' vectors', ' their', ' the', ' the', ' is', ' difference', ' then', ' length', ' to', ' unit', ' vector', ' be', ' distance', 'length', ' defined', 'When', ' of']

In [5]:
# Generate meaningful text
# Ask the tokenizer for text suggestions built from `related_tokens`.
# NOTE(review): the trailing `3` is presumably the number of suggestions to produce — confirm
# in SGS_Tokenizer.
#
# The value of the `try` expression is assigned directly rather than assigning inside the
# `try` body: `try` opens its own (soft) local scope, so when this code runs outside an
# interactive session an assignment inside it would create a new local and leave the
# global `suggestions` untouched.
# On failure the error is reported and an empty list is used, so the formatting call
# below still runs (best-effort behaviour is kept).
suggestions = try
    pycall(tokenizer.generate_text, PyObject, related_tokens, 3)
catch e
    println("Error generating text suggestions: ", e)
    []
end


println(tokenizer.format_generated_texts(suggestions))

# tokenizer.print_tensors()


Generated text suggestions:
Suggestion 1:
-lengthWhenThis article is about a character, it is not about them. For other uses of the term "character", see Character (disambiguation).

Suggestion 2:
-lengthWhenThis article is about a character, it is not about them. For other uses of the term "character", see Character (disambiguation)

Suggestion 3:
-lengthWhenThis article is about a character, it is not about them. For other uses of the term "character", see Character.

"I'm






In [6]:
# Evaluate generated texts
# Community identifiers to evaluate against; the single entry is the SHA-1 string that the
# tokenize cell printed for its HLL set.
communities = [
    
    "4da9de3e80bcb65ee6169a411b0206ce45ba68bc"
    ]  # Load or define your communities of HLL sets here

# Evaluate the generated suggestions against `communities`; the result is kept as a raw
# PyObject and rendered with the same formatter used for the suggestions.
# NOTE(review): the printed entries look like (text, community id, score) tuples — confirm
# what the score measures in SGS_Tokenizer.
evaluation_results = pycall(tokenizer.evaluate_generated_texts, PyObject, suggestions, communities)
println("Evaluation results: ", tokenizer.format_generated_texts(evaluation_results))

hllset_id: 4da9de3e80bcb65ee6169a411b0206ce45ba68bc
index: 1
community_hllset: tensor([ 1,  2, 18, 12,  2,  2, 28,  0,  8,  0, 22,  0,  0,  0,  2,  0,  2])
Evaluation results: 
Generated text suggestions:
Suggestion 1:
('-lengthWhenThis article is about a character, it is not about them. For other uses of the term "character", see Character (disambiguation).', '4da9de3e80bcb65ee6169a411b0206ce45ba68bc', 0.21087891921820445)

Suggestion 2:
('-lengthWhenThis article is about a character, it is not about them. For other uses of the term "character", see Character (disambiguation)', '4da9de3e80bcb65ee6169a411b0206ce45ba68bc', 0.2988664113443373)

Suggestion 3:
('-lengthWhenThis article is about a character, it is not about them. For other uses of the term "character", see Character.\n\n"I\'m', '4da9de3e80bcb65ee6169a411b0206ce45ba68bc', 0.29280576695787364)


