In [1]:
include("src/lisa_store.jl")
using ..Graph
using SQLite
using DBInterface
using MurmurHash3
using TextAnalysis
using JSON3
using PooledArrays
using UUIDs
using EasyConfig
using HDF5


# Domains and co-domains

Here we'll try to reduce ambiguity of tracing tokens from the HllSet presentation.  

The consistency of conversion of datasets to HllSets depends on two factors:

 1. The $p$ parameter that defines the precision of the conversion through the number of bins;
 2. The type of a hash function that we are using.

Specifically, the output from hash function depends on the seed values used in initiating of the hash function. Applying different seed values we can control the generated hashes.

Here is an idea of the algorithm:

 1. We are performing the dataset processing as usual, utilizing standard hash function:

$$F_{(std)}: X_{(std)} \to Y_{(std)}$$

 2. Then we are tracing original tokens by applying back processing:

$$G_{(std)}: Y_{(std)} \to X_{(std)}$$

 3. Now we can perform the same dataset processing using the same hash function but with different seed values:

$$F_{(seed)}: X_{(seed)} \to Y_{(seed)}$$

 4. And we will trace back the results from modified hash function:

$$G_{(seed)}: Y_{(seed)} \to X_{(seed)}$$

It is possible, that

 $$X_{(seed)} \not= X_{(std)}$$

But it is also obvious, that tokens from the original dataset should be in both results.

## Standard processing


In [2]:
db = Graph.DB("lisa_analytics.db")
db.sqlitedb

SQLite.DB("lisa_analytics.db")

In [3]:
Store.book_file(db, "/home/alexmy/JULIA/DEMO/sample/")

In [4]:
uuid = string(uuid4())
df = Graph.set_lock!(db, 
    "/home/alexmy/JULIA/DEMO/sample", 
    "csv", 
    "book_file", 
    "ingest_csv", 
    "waiting", 
    "waiting", 
    uuid; result=true)

println(df)

for row in eachrow(df)
    assign = Graph.Assignment(row) 
    println(assign)
    col_uuid = string(uuid4())
    Store.ingest_csv_by_column(db, assign, col_uuid) #; limit=10000, offset=10)
end

[1m2×7 DataFrame[0m
[1m Row [0m│[1m id                                [0m[1m parent                         [0m[1m item                              [0m[1m a_type [0m[1m processor_id [0m[1m lock_uuid                         [0m[1m status  [0m
     │[90m String                            [0m[90m String                         [0m[90m String                            [0m[90m String [0m[90m String       [0m[90m String                            [0m[90m String  [0m
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │ 0b90b1fee69c77ffa3efe57db7788112…  /home/alexmy/JULIA/DEMO/sample  /home/alexmy/JULIA/DEMO/sample/I…  csv     ingest_csv    5dd63530-7d0c-4afd-8514-0cd169f8…  waiting
   2 │ 6be12bee4edf7c96016907e44bb520be…  /home/alexmy/JULIA/DEMO/sample  /home/alexmy/JULIA/DEMO/sample/R…  csv     ingest_csv    5dd63530-7d0c-4afd-85

In [5]:
ds_id = "3f9526f8d331b9519b8632a11b2d344ab7c647b6"
try
    node = Graph.getnode(db, ds_id, :; table_name="t_nodes")
    println(node)
catch e
    println(e)
end

ArgumentError("collection must be non-empty")


In [6]:
result = Graph.gettokens(db, "3f9526f8d331b9519b8632a11b2d344ab7c647b6", :)
tokens = Store.collect_tokens(db, result)
println(tokens)

Set{String}()


## Processing with seeded hash

We'll go through the same steps with the same params except the database.
 - the db name would be "db_seed.db"
  

In [7]:
db_seed = Graph.DB("db_seed.db")

Graph.DB("db_seed.db") (34 assignments, 0 commits, 106 tokens, 0 nodes, 0 edges25 t_nodes, 23 t_edges)

In [8]:
Store.book_file(db_seed, "/home/alexmy/JULIA/DEMO/sample/"; seed=42, P=10)

In [9]:
uuid = string(uuid4())
df = Graph.set_lock!(db_seed, 
    "/home/alexmy/JULIA/DEMO/sample", 
    "csv", 
    "book_file", 
    "ingest_csv", 
    "waiting", 
    "waiting", 
    uuid; result=true)

for row in eachrow(df)
    assign = Graph.Assignment(row) 
    col_uuid = string(uuid4())
    # Important, do not forget to set HllSet precission parameter p to 8
    Store.ingest_csv_by_column(db_seed, assign, col_uuid; limit=10000, offset=10, p=10, seed=42)
end

["Easting", "Northing", "Severity_of_casualty", "Casualty Class", "Sex", "Age", "Date", "Day_of_week", "Hour_of_day", "Local_authority", "Vehicle_type", "Pedestrian_Locality"]
edge: Edge(0b90b1fee69c77ffa3efe57db7788112ef96dba6, 48d3028983aa5e6b7c3bedefeda686acacfb1257, "has_column"; source="/home/alexmy/JULIA/DEMO/sample/Isc_london-ksi-only-since2010.csv", target="Casualty Class", source_label="csv_file", target_label="csv_column")
edge: Edge(0b90b1fee69c77ffa3efe57db7788112ef96dba6, 593c66c6d82d797a818caee475cd32abe120db76, "has_column"; source="/home/alexmy/JULIA/DEMO/sample/Isc_london-ksi-only-since2010.csv", target="Sex", source_label="csv_file", target_label="csv_column")
edge: Edge(0b90b1fee69c77ffa3efe57db7788112ef96dba6, 5b3f4a68cdaf988b8f353b916ff9cfc9c95466f3, "has_column"; source="/home/alexmy/JULIA/DEMO/sample/Isc_london-ksi-only-since2010.csv", target="Date", source_label="csv_file", target_label="csv_column")
edge: Edge(0b90b1fee69c77ffa3efe57db7788112ef96dba6, 42d78f622

### We are getting the dataset directly from **nodes** table of the "db_seed.db" database.

We are utilizing the fact that SHA1 node ID is not affected by changing the hash function for the tokens encoding.

In [10]:
ds_id = "3f9526f8d331b9519b8632a11b2d344ab7c647b6"
node_seed = Graph.getnode(db_seed, ds_id, :; table_name="t_nodes")

Node(3f9526f8d331b9519b8632a11b2d344ab7c647b6; ["csv_column"]; props: column_name="Vehicle type", file_sha1="0b90b1fee69c77ffa3efe57db7788112ef96dba6", column_type="String")

In [11]:
result = Graph.gettokens(db, "3f9526f8d331b9519b8632a11b2d344ab7c647b6", :)
tokens_seed = Store.collect_tokens(db_seed, result)
println(tokens_seed)

Set{String}()


In [12]:
intersection = intersect(tokens, tokens_seed)

Set{String}()

In [13]:
println("tokens size: ", length(tokens), 
    ";\ntokens_seed size: ", length(tokens_seed), 
    ";\nintersection size: ", length(intersection))

tokens size: 0;
tokens_seed size: 0;
intersection size: 0


### Lets check how use of a seeded hash affected HllSets

In [22]:
try
    hll_std = SetCore.HllSet{10}()
    hll_seed = SetCore.HllSet{10}()

    dataset_std = node.dataset
    dataset_seed = node_seed.dataset

    println("dataset_std size: ", length(dataset_std), 
        ";\ndataset_seed size: ", length(dataset_seed))

    # Restore collect_hll_sets
    hll_std = SetCore.restore(hll_std, Vector{UInt64}(dataset_std))
    hll_seed = SetCore.restore(hll_seed, Vector{UInt64}(dataset_seed))

    println("hll_std size: ", SetCore.count(hll_std), 
        ";\nhll_seed size: ", SetCore.count(hll_seed))

    hll_intersection = intersect(hll_std, hll_seed)
    # SetCore.count(hll_intersection)

    println("hll_intersection size: ", SetCore.count(hll_intersection))
catch e
    println(e)
end

UndefVarError(:node)


### So, we are lucky , we got not empty intersection from two HllSets built using different hash functions. (Or may be not, because $1$ is small and could be within the range of an estimation error)

We also can see that the cardinality estimations in our case are not bad. The difference in both case is equal $2$, or about $2.63$%.

# Applying HllSets for Tabular data structures

In [15]:
include("src/lisa_store.jl")

using SQLite
using DBInterface
using MurmurHash3
using TextAnalysis
using JSON3
using PooledArrays
using UUIDs
using EasyConfig
using SparseArrays



In [16]:
db = Graph.DB("lisa_analytics.db")

Graph.DB("lisa_analytics.db") (34 assignments, 0 commits, 41 tokens, 0 nodes, 0 edges25 t_nodes, 23 t_edges)

In [17]:
Store.book_file(db, "/home/alexmy/JULIA/DEMO/sample/") #; column=false)

In [18]:
uuid = string(uuid4())
df = Graph.set_lock!(db, 
    "/home/alexmy/JULIA/DEMO/sample", 
    "csv", 
    "book_file", 
    "ingest_csv", 
    "waiting", 
    "waiting", 
    uuid; result=true)

for row in eachrow(df)
    assign = Graph.Assignment(row) 
    col_uuid = string(uuid4())    
    Store.ingest_csv_by_row(db, assign; limit=50, offset=10)
end

In [19]:
# Provide csv file sha1 id to extract row and column nodes
source_id = "0b90b1fee69c77ffa3efe57db7788112ef96dba6"
"""
    Here we are going to extract row and column nodes from the csv file.
    The resulting matrix will show the cardinality of intersection of row and column nodes.
"""
matrix = Store.get_card_matrix(db, source_id)
i = 0
for row in eachrow(matrix)
    i += 1
    if i < 20
        println(row)
    end
end

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [20]:
"""
    Running get_node_matrix function to extract row and column nodes from the csv file.
    Each cell of the resulting matrix will hold a node that would represent an intersection
    of corresponding row and column of the original csv file.
"""
node_matrix = Store.get_node_matrix(db, source_id)
i = 0
for row in eachrow(node_matrix)
    if i < 20
        println(row)
    end
    i += 1
end

row_nodes: [1m51×10 DataFrame[0m
[1m Row [0m│[1m sha1                              [0m[1m labels      [0m[1m d_sha1                            [0m[1m card  [0m[1m dataset                           [0m[1m props                             [0m[1m source                            [0m[1m target                            [0m[1m r_type  [0m[1m props_1                           [0m
     │[90m String                            [0m[90m String      [0m[90m String                            [0m[90m Int64 [0m[90m String                            [0m[90m String                            [0m[90m String                            [0m[90m String                            [0m[90m String  [0m[90m String                            [0m
─────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [21]:
"""
    Finally we are going to recreate the original csv file (a sample in our case) 
    from the node matrix.

    Important to keep in mind that the results of each cell in the matrix would not be the same 
    as in the original csv file.
    Possible discrepancies can include wrong order on tokens in multitoken cells, 
    missing cells, etc.
    It is a natural result of the probalistic approximation performed on original csv file.
    The original csv file was tokenized, compacted into HllSet, and then reconstructed.
"""
value_matrix = Store.get_value_matrix(db, source_id)
i = 0
for row in eachrow(value_matrix)
    if i < 20
        println(row)
    end
    i += 1
    # println(row)
end

["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
["[]", "[]", "[]", "[]", "[]", "[]", "[]", "[]"]
