In [1]:
# using Pkg
# Pkg.activate(".")
# Pkg.instantiate()
# Pkg.add("SQLite")
# Pkg.add("DBInterface")
# Pkg.add("JSON3")
# Pkg.add("EasyConfig")
# Pkg.add("MurmurHash3")
# Pkg.add("TextAnalysis")
# Pkg.add("UUIDs")
# Pkg.add("PooledArrays")
# Pkg.add("HDF5")
include("src/lisa_store.jl")

using SQLite
using DBInterface
using MurmurHash3
using TextAnalysis
using JSON3
using PooledArrays
using UUIDs
using EasyConfig
using HDF5


# Domains and co-domains

Here we'll try to reduce the ambiguity of tracing tokens back from their HllSet representation.  

The consistency of conversion of datasets to HllSets depends on two factors:

 1. The $p$ parameter that defines the precision of the conversion through the number of bins;
 2. The type of a hash function that we are using.

Specifically, the output of a hash function depends on the seed value used to initialize it. By applying different seed values, we can control the generated hashes.

Here is an idea of the algorithm:

 1. We process the dataset as usual, using the standard hash function:

$$F_{(std)}: X_{(std)} \to Y_{(std)}$$

 2. Then we trace the original tokens back by applying the reverse processing:

$$G_{(std)}: Y_{(std)} \to X_{(std)}$$

 3. Now we can perform the same dataset processing using the same hash function but with different seed values:

$$F_{(seed)}: X_{(seed)} \to Y_{(seed)}$$

 4. And we trace back the results from the modified (seeded) hash function:

$$G_{(seed)}: Y_{(seed)} \to X_{(seed)}$$

It is obvious that, in general,

 $$X_{(seed)} \not= X_{(std)}$$

But it is also obvious that the tokens from the original dataset should be present in both results, so intersecting the two results narrows down the candidates.

## Standard processing


In [7]:
# Database handle used by every cell of the standard (default-seed) run.
# NOTE(review): presumably Graph.DB creates the file when it is missing — confirm in src/lisa_store.jl.
db = Graph.DB("lisa_analytics.db")
# HllSet whose type parameter is the precision p = 10 described in the notes above.
hll = SetCore.HllSet{10}()

# One-time setup: create an empty HDF5 file. Keep this commented out after the
# first run, because opening with mode "w" overwrites the existing file.
# h5open("lisa_analytics.hdf5", "w") do f
#     # The file is now open, but it's empty
# end

# HDF5 file name reused by the commit and read-back cells below.
hdf5_file_name = "lisa_analytics.hdf5"

"lisa_analytics.hdf5"

In [None]:
# Precision parameter of the HllSet; it is used as the type parameter below.
p = 10
hll = SetCore.HllSet{p}()
# Book the sample directory for the standard run (no `seed` keyword is passed here,
# unlike the seeded run further down).
# NOTE(review): what book_file records in `db` is defined in src/lisa_store.jl — confirm there.
Store.book_file(db, "/home/alexmy/JULIA/DEMO/sample/", hll)

In [None]:
# Call Graph.set_lock! with a fresh UUID and keep the rows it returns (result=true).
# NOTE(review): the meaning of each positional string is defined by Graph.set_lock! —
# confirm there; note the path has no trailing slash here, unlike the book_file call.
uuid = string(uuid4())
df = Graph.set_lock!(
    db,
    "/home/alexmy/JULIA/DEMO/sample",
    "csv",
    "book_file",
    "ingest_csv",
    "waiting",
    "waiting",
    uuid;
    result=true,
)

# Ingest every returned row; each row is wrapped in a Graph.Assignment and gets
# its own freshly generated UUID.
foreach(eachrow(df)) do row
    Store._ingest_csv_by_column(
        db, Graph.Assignment(row), string(uuid4()); limit=10000, offset=10
    )
end

In [None]:
# Commit `db` against the HDF5 file.
# NOTE(review): the three string arguments look like author name, e-mail and
# commit message — confirm against Store.commit.
Store.commit(
    db,
    hdf5_file_name,
    "Alex Mylnikov",
    "alexmy@lisa-park.com",
    "commit 1",
    Config(),
)

In [None]:
# Open the HDF5 file for inspection; "r" (read-only) is h5open's default mode,
# spelled out here to match the read-back cell below. Close it with the next cell.
analytics = h5open(hdf5_file_name, "r")

In [None]:
# Release the HDF5 handle opened in the previous cell.
close(analytics)

In [8]:
# Hard-coded identifiers of the commit and of the column dataset to read back.
# NOTE(review): presumably copied from a previous commit/ingest run — update after re-running.
commit_id = "fa385668-f538-4771-bbca-3464c861620b"
ds_id = "3f9526f8d331b9519b8632a11b2d344ab7c647b6"
# HDF5 path of the dataset: /<commit_id>/nodes/<ds_id>/_csv_column_
dataset_path = string("/", commit_id, "/nodes/", ds_id, "/_csv_column_")
data_out = Dict{Any,Any}()
# Function-first form of h5open: the file is closed once the callback returns.
h5open(h5 -> Store.read_datasets(h5, data_out, dataset_path), hdf5_file_name, "r")
# Print the first element of the loaded column data.
println(data_out["_csv_column_"][1])

0


In [9]:
# Column data loaded from the HDF5 file by the previous cell.
dataset = data_out["_csv_column_"]
# Trace the dataset back to tokens using the standard-run database `db`.
# NOTE(review): the lookup itself is implemented in Store.collect_tokens — see src/lisa_store.jl.
tokens = Store.collect_tokens(dataset, ds_id, db)
println(tokens)

Set(["Taxi/Private", "vehicle", "seats", "coach", "more", "Motor", "horse", "motor", "cycle", "Motorcycle", "Private", "gross", "weight", "from", "mgw", "maximum", "Pedal"])


## Processing with seeded hash

We'll go through the same steps with the same parameters, except for the database and the HDF5 file:
 - the db name would be "db_seed.db"
 - the HDF5 name would be "hdf5_seed.hdf5" (only if we want to use commit)
  

In [2]:
# Separate database handle for the seeded run, so it does not mix with "lisa_analytics.db".
db_seed = Graph.DB("db_seed.db")
# Same precision (type parameter 10) as in the standard run.
hll = SetCore.HllSet{10}()

# One-time setup, only needed if the seeded run is committed: create an empty
# HDF5 file. Opening with mode "w" overwrites the existing file.
# h5open("hdf5_seed.hdf5", "w") do f
#     # The file is now open, but it's empty
# end

# File name for the seeded HDF5 store; left disabled because this run does not commit.
# hdf5_seed = "hdf5_seed.hdf5"

HllSet{10}()

In [3]:
# Same booking step as in the standard run, but against `db_seed` and with seed=42.
# NOTE(review): presumably `seed` is forwarded to the hash function — confirm in Store.book_file.
Store.book_file(db_seed, "/home/alexmy/JULIA/DEMO/sample/", hll; seed=42)

In [None]:
# Call Graph.set_lock! on the seeded database with a fresh UUID and keep the
# rows it returns (result=true).
# NOTE(review): the meaning of each positional string is defined by Graph.set_lock! — confirm there.
uuid = string(uuid4())
df = Graph.set_lock!(
    db_seed,
    "/home/alexmy/JULIA/DEMO/sample",
    "csv",
    "book_file",
    "ingest_csv",
    "waiting",
    "waiting",
    uuid;
    result=true,
)

# Ingest every returned row with seed=42, the same seed value passed to book_file
# for this run; each row gets its own freshly generated UUID.
foreach(eachrow(df)) do row
    Store._ingest_csv_by_column(
        db_seed,
        Graph.Assignment(row),
        string(uuid4());
        limit=10000,
        offset=10,
        seed=42,
    )
end

In [4]:
# Same dataset id as in the standard run, so both runs inspect the same column.
ds_id = "3f9526f8d331b9519b8632a11b2d344ab7c647b6"
# Fetch the node straight from the seeded database (table "t_nodes") instead of an HDF5 commit.
# NOTE(review): the role of the positional `:` argument is defined by Graph.getnode — confirm there.
node = Graph.getnode(db_seed, ds_id, :; table_name="t_nodes")

Node(3f9526f8d331b9519b8632a11b2d344ab7c647b6, "csv_column"; column_name="Vehicle type", file_sha1="0b90b1fee69c77ffa3efe57db7788112ef96dba6", column_type="String")

In [6]:
# Trace the node's dataset back to tokens using the seeded database `db_seed`.
tokens_seed = Store.collect_tokens(node.dataset, ds_id, db_seed)
println(tokens_seed)

Set(["Taxi/Private", "vehicle", "seats", "coach", "more", "Motor", "horse", "motor", "cycle", "Motorcycle", "Private", "gross", "weight", "from", "mgw", "maximum", "Pedal"])


In [11]:
# Tokens recovered by both the standard and the seeded run.
intersection = tokens ∩ tokens_seed

Set{String} with 17 elements:
  "Taxi/Private"
  "vehicle"
  "seats"
  "coach"
  "more"
  "Motor"
  "horse"
  "motor"
  "cycle"
  "Motorcycle"
  "Private"
  "gross"
  "weight"
  "from"
  "mgw"
  "maximum"
  "Pedal"

In [14]:
# Report how many tokens each run recovered and how many they share.
println(
    "tokens size: $(length(tokens))",
    ";\ntokens_seed size: $(length(tokens_seed))",
    ";\nintersection size: $(length(intersection))",
)

tokens size: 17;
tokens_seed size: 17;
intersection size: 17
