In [1]:
include("src//entity.jl")
using ..Entity


In [None]:

# Generate text starting from a seed token: ask generate_text for up to 50
# follow-on tokens after "concept" and print the joined result.
# NOTE(review): generate_text is defined in a later cell of this notebook, so
# that cell has to be evaluated before this one.
seed_token = "concept"
generated_text = generate_text(seed_token, 50)
println("Generated Text: $generated_text")

In [None]:
# Column-oriented token table: each property lives in its own array, and row `i`
# of every array describes the same token.
num_tokens = 1000
num_bins = 1024

# Scalar properties are filled with random placeholder values.
hash_codes = rand(UInt32, num_tokens)
tf_frequencies = rand(UInt32, num_tokens)
# Draw bin numbers straight from 1:num_bins. Unlike `rand(UInt16) % num_bins + 1`
# this stays uniform when num_bins does not divide 65536.
bin_numbers = rand(UInt16(1):UInt16(num_bins), num_tokens)
trailing_zeros = rand(UInt8, num_tokens)
commit_ids = fill("", num_tokens)

# Per-token collections start out empty; every token owns a separate vector.
entity_instances = [Vector{Entity.Instance{10}}() for _ in 1:num_tokens]
preceding_tokens = [Vector{UInt32}() for _ in 1:num_tokens]
following_tokens = [Vector{UInt32}() for _ in 1:num_tokens]

In [None]:
"""
    split_by_bin_number(bin_numbers, num_bins)

Group the indices of `bin_numbers` by the bin each entry names.

Returns a vector of `num_bins` index lists; list `b` holds, in ascending order,
every index `i` for which `bin_numbers[i] == b`. Bins that receive no entry are
empty. An entry outside `1:num_bins` raises a `BoundsError`.
"""
function split_by_bin_number(bin_numbers, num_bins)
    bins = [Int[] for _ in 1:num_bins]
    # eachindex follows the container's own indexing instead of assuming 1:length.
    for i in eachindex(bin_numbers)
        push!(bins[bin_numbers[i]], i)
    end
    return bins
end

# Group the token indices by the bin each token was assigned to.
bins = split_by_bin_number(bin_numbers, num_bins)

# Choose one bin and fetch the indices of the tokens stored in it.
bin_index = 2
tokens_in_bin = bins[bin_index]

# Show the hash code of every token that landed in the chosen bin.
println("Hash codes of tokens in bin $bin_index:")
foreach(token_index -> println(hash_codes[token_index]), tokens_in_bin)

In [None]:
# Dump every stored hash code, one per line.
println("All token hashes:")
foreach(println, hash_codes)

In [None]:
# Hash code of the token whose properties should be shown.
target_hash_code = 2783398889

# Row of the first token carrying that hash code, or `nothing` when absent.
token_index = findfirst(code -> code == target_hash_code, hash_codes)

if token_index === nothing
    println("Token with hash code $target_hash_code not found.")
else
    # Read the same row out of every property array.
    token_hash_code = hash_codes[token_index]
    token_entity_instances = entity_instances[token_index]
    token_tf_frequency = tf_frequencies[token_index]
    token_bin_number = bin_numbers[token_index]
    token_trailing_zeros = trailing_zeros[token_index]
    token_preceding_tokens = preceding_tokens[token_index]
    token_following_tokens = following_tokens[token_index]
    token_commit_id = commit_ids[token_index]

    # Report the collected properties.
    println("Properties of token with hash code $target_hash_code:")
    println("Hash Code: $token_hash_code")
    println("Entity Instances: $token_entity_instances")
    println("TF Frequency: $token_tf_frequency")
    println("Bin Number: $token_bin_number")
    println("Trailing Zeros: $token_trailing_zeros")
    println("Preceding Tokens: $token_preceding_tokens")
    println("Following Tokens: $token_following_tokens")
    println("Commit ID: $token_commit_id")
end

In [None]:
using SHA, Random, StatsBase

# Token: one vocabulary entry. process_text creates one record per distinct
# token hash and stores it in tokens_dict. The struct is immutable, so scalar
# fields keep their creation-time values; only the vector fields grow.
struct Token
    # Value returned by token_to_hash for this token; also its key in tokens_dict.
    hash_code::UInt32
    # The token text.
    token::String
    # Created empty by process_text; nothing in this cell adds to it.
    entity_instances::Vector{Entity.Instance{10}}
    # Set to 1.0 at creation; select_next_token uses it as the sampling weight.
    tf_frequency::Float64
    # Filled with rand(UInt16) at creation.
    bin_number::UInt16
    # Set to 0 at creation. NOTE(review): intended meaning not shown here.
    trailing_zeros::Int
    # Hashes of the tokens seen immediately before each occurrence.
    preceding_tokens::Vector{UInt32}
    # Hashes of the tokens seen immediately after each occurrence.
    following_tokens::Vector{UInt32}
    # Set to "" at creation. NOTE(review): intended meaning not shown here.
    commit_id::String
end

# TokenContext: short history of the tokens emitted during text generation.
# generate_text appends each chosen hash to recent_tokens and drops the oldest
# one once the history is longer than max_length; select_next_token avoids
# hashes that are currently in the history when it can.
struct TokenContext
    # Hashes of the most recently generated tokens, oldest first.
    recent_tokens::Vector{UInt32}
    # Maximum number of hashes kept in recent_tokens.
    max_length::Int
end

# Global vocabulary: maps a token's UInt32 hash to its Token record.
# process_text fills it; generate_text and select_next_token read from it.
tokens_dict = Dict{UInt32, Token}()

"""
    tokenize(text::String)

Split `text` into tokens at every run of non-word characters (`\\W+`).

Empty pieces are dropped, so text that starts or ends with punctuation does not
produce an empty-string token. Returns a vector of substrings of `text`.
"""
function tokenize(text::String)
    return split(text, r"\W+"; keepempty=false)
end

"""
    token_to_hash(token::AbstractString)

Return a `UInt32` built from the first four bytes of the SHA-1 digest of `token`.
"""
function token_to_hash(token::AbstractString)
    digest = sha1(String(token))
    # Reinterpret the leading four digest bytes as a single 32-bit value.
    return first(reinterpret(UInt32, digest[1:4]))
end

"""
    process_text(text::String)

Tokenize `text` and record every token in the global `tokens_dict`.

The first occurrence of a token creates a `Token` record holding the hashes of
its immediate neighbours; later occurrences append their neighbours' hashes to
the existing record. Returns `nothing`.
"""
function process_text(text::String)
    tokens = tokenize(text)
    # Hash every token once instead of re-hashing each neighbour on every step.
    hashes = map(token_to_hash, tokens)

    for i in eachindex(tokens)
        token_hash = hashes[i]
        # Typed literals keep these Vector{UInt32} even when there is no neighbour.
        preceding = i > firstindex(tokens) ? UInt32[hashes[i - 1]] : UInt32[]
        following = i < lastindex(tokens) ? UInt32[hashes[i + 1]] : UInt32[]

        existing_token = get(tokens_dict, token_hash, nothing)
        if existing_token === nothing
            tokens_dict[token_hash] = Token(
                token_hash,
                tokens[i],
                Vector{Entity.Instance{10}}(),
                1.0,
                rand(UInt16),
                0,
                preceding,
                following,
                "",
            )
        else
            append!(existing_token.preceding_tokens, preceding)
            append!(existing_token.following_tokens, following)
        end
    end
    return nothing
end

"""
    generate_text(start_token::String, text_length::Int)

Build a token sequence that starts with `start_token` and is extended by at most
`text_length` tokens chosen through `select_next_token`.

Generation stops early when the current token is not in `tokens_dict` or has no
recorded following tokens. Returns the tokens joined by single spaces.
"""
function generate_text(start_token::String, text_length::Int)
    words = [start_token]
    context = TokenContext(Vector{UInt32}(), 3)
    word = start_token

    for _ in 1:text_length
        entry = get(tokens_dict, token_to_hash(word), nothing)
        # Nothing to continue from: unknown token or no recorded successor.
        (entry === nothing || isempty(entry.following_tokens)) && break

        chosen_hash = select_next_token(entry, context)
        word = tokens_dict[chosen_hash].token
        push!(words, word)

        # Remember the choice and keep the history within its length limit.
        push!(context.recent_tokens, chosen_hash)
        if length(context.recent_tokens) > context.max_length
            popfirst!(context.recent_tokens)
        end
    end

    return join(words, " ")
end

"""
    select_next_token(token_info::Token, context::TokenContext)

Pick the hash of the next token from `token_info.following_tokens`.

Hashes listed in `context.recent_tokens` are skipped unless that would leave no
candidate, in which case all following tokens are eligible. The choice is a
weighted random draw using each candidate's `tf_frequency` from `tokens_dict`.
"""
function select_next_token(token_info::Token, context::TokenContext)
    followers = token_info.following_tokens
    candidates = filter(h -> h ∉ context.recent_tokens, followers)

    # Every follower was used recently: fall back to the unfiltered list.
    if isempty(candidates)
        candidates = followers
    end

    frequencies = [tokens_dict[h].tf_frequency for h in candidates]
    return candidates[sample(Weights(frequencies))]
end

# Sample paragraph used to build the vocabulary.
text = "The concept behind ST-GNN is illustrated in Figure 2, where each time step is a graph and is passed through a GCN/GAT network to obtain the resultant encoded graph that embed the inter-relational spatial dependence. Subsequently, these encoded graphs can be modelled exactly like time series data as long as the integrity of the graphical structure of the data at each time step is preserved. Figure 2 demonstrates these two steps, the temporal model could be any sequential model ranging from ARIMA or simple recurrent neural network to transformers."

# Fill tokens_dict from the paragraph.
process_text(text)

# Show every record together with its neighbour hashes as plain integers.
for (hash_code, token) in tokens_dict
    println("Token Hash: $hash_code")
    println("Token: $(token.token)")
    println("Preceding Tokens: ", Int.(token.preceding_tokens))
    println("Following Tokens: ", Int.(token.following_tokens))
    println()
end

# Produce up to 50 further tokens starting from "concept".
seed_token = "concept"
generated_text = generate_text(seed_token, 50)
println("Generated Text: $generated_text")

In [None]:
using SHA, Random

# Define the Token struct
# struct Token
#     hash_code::UInt32
#     entity_instances::Vector{Entity.Instance{10}}
#     tf_frequency::Float64
#     bin_number::UInt16
#     trailing_zeros::UInt8
#     preceding_tokens::Vector{UInt32}
#     following_tokens::Vector{UInt32}
#     commit_id::String
# end

# Global vocabulary: maps a token's UInt32 hash to its Token record.
# Rebinding it here starts this cell with an empty dictionary.
tokens_dict = Dict{UInt32, Token}()

"""
    tokenize(text::String)

Split `text` into tokens at every run of non-word characters (`\\W+`).

Empty pieces are dropped, so text that starts or ends with punctuation does not
produce an empty-string token. Returns a vector of substrings of `text`.
"""
function tokenize(text::String)
    return split(text, r"\W+"; keepempty=false)
end

"""
    token_to_hash(token::AbstractString)

Return a `UInt32` built from the first four bytes of the SHA-1 digest of `token`.
"""
function token_to_hash(token::AbstractString)
    # `reinterpret` yields a one-element array; take its element so the result is
    # a plain UInt32 that can be used as a `Dict{UInt32, Token}` key.
    return reinterpret(UInt32, sha1(String(token))[1:4])[1]
end

"""
    process_text(text::String)

Tokenize `text` and record every token in the global `tokens_dict`.

The first occurrence of a token creates a `Token` record holding the hashes of
its immediate neighbours; later occurrences append their neighbours' hashes to
the existing record. Returns `nothing`.
"""
function process_text(text::String)
    tokens = tokenize(text)
    # Hash every token once instead of re-hashing each neighbour on every step.
    hashes = map(token_to_hash, tokens)

    for i in eachindex(tokens)
        token_hash = hashes[i]
        # Typed literals keep these Vector{UInt32} even when there is no neighbour.
        preceding = i > firstindex(tokens) ? UInt32[hashes[i - 1]] : UInt32[]
        following = i < lastindex(tokens) ? UInt32[hashes[i + 1]] : UInt32[]

        existing_token = get(tokens_dict, token_hash, nothing)
        if existing_token === nothing
            # NOTE(review): this 8-argument call matches the Token layout that is
            # commented out at the top of this cell (no `token` text field); a
            # Token type with that layout has to be in scope for it to work.
            tokens_dict[token_hash] = Token(
                token_hash,
                Vector{Entity.Instance{10}}(),
                1.0,
                rand(UInt16),
                0,
                preceding,
                following,
                "",
            )
        else
            append!(existing_token.preceding_tokens, preceding)
            append!(existing_token.following_tokens, following)
        end
    end
    return nothing
end


In [2]:
using SHA, Random, StatsBase

# Token: one vocabulary entry. process_text creates one record per distinct
# token hash and stores it in tokens_dict. The struct is immutable, so scalar
# fields keep their creation-time values; only the vector fields grow.
struct Token
    # Value returned by token_to_hash for this token; also its key in tokens_dict.
    hash_code::UInt32
    # The token text.
    token::String
    # Set to 1.0 at creation.
    tf_frequency::Float64
    # Filled with rand(UInt16) at creation.
    bin_number::UInt16
    # Set to 0 at creation. NOTE(review): intended meaning not shown here.
    trailing_zeros::Int
    # Hashes of the tokens seen immediately before each occurrence.
    preceding_tokens::Vector{UInt32}
    # Hashes of the tokens seen immediately after each occurrence.
    following_tokens::Vector{UInt32}
    # Set to "" at creation. NOTE(review): intended meaning not shown here.
    commit_id::String
    # 1-based index of every occurrence of the token in the tokenized text;
    # restore_text uses these to put the tokens back in order.
    positions::Vector{Int}
end

# Global vocabulary: maps a token's UInt32 hash to its Token record.
# process_text fills it; restore_text and get_token_positions read from it.
tokens_dict = Dict{UInt32, Token}()

"""
    tokenize(text::String)

Split `text` into tokens at every run of non-word characters (`\\W+`).

Empty pieces are dropped, so text that starts or ends with punctuation does not
produce an empty-string token. Returns a vector of substrings of `text`.
"""
function tokenize(text::String)
    return split(text, r"\W+"; keepempty=false)
end

"""
    token_to_hash(token::AbstractString)

Return a `UInt32` built from the first four bytes of the SHA-1 digest of `token`.
"""
function token_to_hash(token::AbstractString)
    digest = sha1(String(token))
    # Reinterpret the leading four digest bytes as a single 32-bit value.
    return first(reinterpret(UInt32, digest[1:4]))
end

"""
    process_text(text::String)

Tokenize `text` and record every token in the global `tokens_dict`.

The first occurrence of a token creates a `Token` record with its position and
the hashes of its immediate neighbours; later occurrences append their position
and neighbours' hashes to the existing record. Returns `nothing`.
"""
function process_text(text::String)
    tokens = tokenize(text)
    # Hash every token once instead of re-hashing each neighbour on every step.
    hashes = map(token_to_hash, tokens)

    for i in eachindex(tokens)
        token_hash = hashes[i]
        # Typed literals keep these Vector{UInt32} even when there is no neighbour.
        preceding = i > firstindex(tokens) ? UInt32[hashes[i - 1]] : UInt32[]
        following = i < lastindex(tokens) ? UInt32[hashes[i + 1]] : UInt32[]

        existing_token = get(tokens_dict, token_hash, nothing)
        if existing_token === nothing
            tokens_dict[token_hash] = Token(
                token_hash,
                tokens[i],
                1.0,
                rand(UInt16),
                0,
                preceding,
                following,
                "",
                [i],  # positions start with the current occurrence
            )
        else
            push!(existing_token.positions, i)
            append!(existing_token.preceding_tokens, preceding)
            append!(existing_token.following_tokens, following)
        end
    end
    return nothing
end

"""
    restore_text()

Rebuild the token sequence stored in `tokens_dict`.

Every recorded position of every token is collected, the tokens are ordered by
position, and the result is returned joined by single spaces.
"""
function restore_text()
    # Typed accumulator: (position, token text) for each recorded occurrence.
    placed = Tuple{Int,String}[]
    for token_info in values(tokens_dict)
        for pos in token_info.positions
            push!(placed, (pos, token_info.token))
        end
    end

    sort!(placed; by=first)
    return join(last.(placed), " ")
end

# Sample paragraph used to build the vocabulary.
text = "The concept behind ST-GNN is illustrated in Figure 2, where each time step is a graph and is passed through a GCN/GAT network to obtain the resultant encoded graph that embed the inter-relational spatial dependence. Subsequently, these encoded graphs can be modelled exactly like time series data as long as the integrity of the graphical structure of the data at each time step is preserved. Figure 2 demonstrates these two steps, the temporal model could be any sequential model ranging from ARIMA or simple recurrent neural network to transformers."

# Fill tokens_dict, recording the position of every token occurrence.
process_text(text)

# Rebuild the token sequence from the recorded positions. Punctuation is not
# restored because tokenize splits on, and discards, non-word characters.
restored_text = restore_text()
println("Restored Text: $restored_text")

Restored Text: The concept behind ST GNN is illustrated in Figure 2 where each time step is a graph and is passed through a GCN GAT network to obtain the resultant encoded graph that embed the inter relational spatial dependence Subsequently these encoded graphs can be modelled exactly like time series data as long as the integrity of the graphical structure of the data at each time step is preserved Figure 2 demonstrates these two steps the temporal model could be any sequential model ranging from ARIMA or simple recurrent neural network to transformers 


In [3]:
"""
    get_token_positions(token::String)

Look up `token` in `tokens_dict` by its hash and return its recorded positions.

Returns the string `"Token not found"` when the token has no entry.
"""
function get_token_positions(token::String)
    entry = get(tokens_dict, token_to_hash(token), nothing)
    return entry === nothing ? "Token not found" : entry.positions
end


Positions of 'concept': [2]


In [5]:
# Example usage: look up where the token "is" occurs and print the result.
# get_token_positions returns the stored positions vector, or the string
# "Token not found" when the token has no entry in tokens_dict.
token = "is"
positions = get_token_positions(token)
println("Positions of '$token': $positions")

Positions of 'is': [6, 15, 19, 67]


In [None]:
using SHA, Random, StatsBase

# Token: one vocabulary entry. process_text creates one record per distinct
# token hash and stores it in tokens_dict. The struct is immutable, so scalar
# fields keep their creation-time values; only vector contents change.
struct Token
    # Value returned by token_to_hash for this token; also its key in tokens_dict.
    hash_code::UInt32
    # The token text.
    token::String
    # Set to 1.0 at creation.
    tf_frequency::Float64
    # Filled with rand(UInt16) at creation.
    bin_number::UInt16
    # Set to 0 at creation. NOTE(review): intended meaning not shown here.
    trailing_zeros::Int
    # Hashes of neighbouring tokens inside the same chunk.
    preceding_tokens::Vector{UInt32}
    following_tokens::Vector{UInt32}
    # Set to "" at creation. NOTE(review): intended meaning not shown here.
    commit_id::String
    # Histogram over the slot a token occupies inside its chunk: entry j counts
    # occurrences at slot j. process_text creates it with chunk_size entries and
    # divides it by its sum at the end of the call.
    positions::Vector{Float64}
end

# Global vocabulary: maps a token's UInt32 hash to its Token record.
# process_text fills and normalizes it; restore_text reads from it.
tokens_dict = Dict{UInt32, Token}()

"""
    tokenize(text::String)

Split `text` into tokens at every run of non-word characters (`\\W+`).

Empty pieces are dropped, so text that starts or ends with punctuation does not
produce an empty-string token. Returns a vector of substrings of `text`.
"""
function tokenize(text::String)
    return split(text, r"\W+"; keepempty=false)
end

"""
    token_to_hash(token::AbstractString)

Return a `UInt32` built from the first four bytes of the SHA-1 digest of `token`.
"""
function token_to_hash(token::AbstractString)
    digest = sha1(String(token))
    # Reinterpret the leading four digest bytes as a single 32-bit value.
    return first(reinterpret(UInt32, digest[1:4]))
end

"""
    process_text(text::String, chunk_size::Int)

Tokenize `text`, walk it in consecutive chunks of `chunk_size` tokens, and
record in the global `tokens_dict` how often each token appears at each slot of
a chunk.

For every occurrence, the token's `positions` histogram is incremented at the
slot index and the hashes of its neighbours inside the same chunk are recorded.
After all chunks are processed, every histogram in `tokens_dict` is divided by
its sum. Returns `nothing`.
"""
function process_text(text::String, chunk_size::Int)
    tokens = tokenize(text)
    num_tokens = length(tokens)

    for chunk_start in 1:chunk_size:num_tokens
        chunk_stop = min(chunk_start + chunk_size - 1, num_tokens)
        chunk = view(tokens, chunk_start:chunk_stop)  # no copy of the slice
        chunk_hashes = map(token_to_hash, chunk)       # hash each token once

        for (slot, token) in enumerate(chunk)
            token_hash = chunk_hashes[slot]
            # Neighbours are taken from the same chunk only.
            preceding = slot > 1 ? UInt32[chunk_hashes[slot - 1]] : UInt32[]
            following = slot < length(chunk) ? UInt32[chunk_hashes[slot + 1]] : UInt32[]

            existing_token = get(tokens_dict, token_hash, nothing)
            if existing_token === nothing
                histogram = zeros(Float64, chunk_size)
                histogram[slot] = 1.0
                tokens_dict[token_hash] = Token(
                    token_hash,
                    token,
                    1.0,
                    rand(UInt16),
                    0,
                    preceding,
                    following,
                    "",
                    histogram,
                )
            else
                histogram = existing_token.positions
                # A record created with a smaller chunk size has a shorter
                # histogram; pad with zeros so the count lands in slot `slot`
                # instead of being appended at the wrong index.
                if slot > length(histogram)
                    append!(histogram, zeros(Float64, slot - length(histogram)))
                end
                histogram[slot] += 1.0
                # Record neighbours for repeated tokens too, as the other
                # process_text versions in this file do.
                append!(existing_token.preceding_tokens, preceding)
                append!(existing_token.following_tokens, following)
            end
        end
    end

    # Normalize the positions histogram of every token.
    # NOTE(review): this covers every entry in tokens_dict, so records left over
    # from an earlier call are normalized again together with the new counts.
    for token_info in values(tokens_dict)
        total = sum(token_info.positions)
        if total > 0
            token_info.positions ./= total
        end
    end
    return nothing
end

"""
    restore_text()

List the tokens in `tokens_dict` ordered by the chunk slots they occupy.

Each token contributes one entry for every slot of its `positions` histogram
with a non-zero value. Entries are sorted by slot index and the token texts are
returned joined by single spaces.
"""
function restore_text()
    # Typed accumulator: (slot index, token text) for every occupied slot.
    placed = Tuple{Int,String}[]
    for token_info in values(tokens_dict)
        for (pos, freq) in enumerate(token_info.positions)
            if freq > 0
                push!(placed, (pos, token_info.token))
            end
        end
    end

    sort!(placed; by=first)
    return join(last.(placed), " ")
end

# Sample paragraph used to build the vocabulary.
text = "The concept behind ST-GNN is illustrated in Figure 2, where each time step is a graph and is passed through a GCN/GAT network to obtain the resultant encoded graph that embed the inter-relational spatial dependence. Subsequently, these encoded graphs can be modelled exactly like time series data as long as the integrity of the graphical structure of the data at each time step is preserved. Figure 2 demonstrates these two steps, the temporal model could be any sequential model ranging from ARIMA or simple recurrent neural network to transformers."

# Process the text in chunks of 11 tokens
process_text(text, 11)

# Print the tokens ordered by chunk slot.
# NOTE(review): this cell's restore_text sorts by slot index within a chunk, not
# by position in the text, so the output is not the original sentence order.
restored_text = restore_text()
println("Restored Text: $restored_text")

In [None]:

# Sample paragraph used to build the vocabulary.
text = "The concept behind ST-GNN is illustrated in Figure 2, where each time step is a graph and is passed through a GCN/GAT network to obtain the resultant encoded graph that embed the inter-relational spatial dependence. Subsequently, these encoded graphs can be modelled exactly like time series data as long as the integrity of the graphical structure of the data at each time step is preserved. Figure 2 demonstrates these two steps, the temporal model could be any sequential model ranging from ARIMA or simple recurrent neural network to transformers."

# Fill tokens_dict from the paragraph.
# NOTE(review): this is the one-argument process_text(text); the cell directly
# above defines only process_text(text, chunk_size), so a one-argument
# definition from another cell has to be loaded first.
process_text(text)

# Print each record's hash followed by the hashes of its recorded neighbours.
for (hash_code, token) in tokens_dict
    println("Token Hash: $hash_code")
    println("Preceding Tokens: $(token.preceding_tokens)")
    println("Following Tokens: $(token.following_tokens)")
    println()
end