In [None]:
# using Pkg
# Pkg.activate(".")
# Pkg.instantiate()
# Pkg.add("CSV")
# Pkg.add("Arrow")
# Pkg.add("Tables")
# Pkg.add("JSON3")

In [1]:
using Random
using FilePathsBase: extension, Path

include("src/sets32.jl")

import .HllSets as set

# Create the test sketches: five plain HllSet{5} instances and five
# counterparts that are later filled with an explicit seed.
hll1, hll2, hll3, hll4, hll5 = (set.HllSet{5}() for _ in 1:5)
hll1_seeded, hll2_seeded, hll3_seeded, hll4_seeded, hll5_seeded =
    (set.HllSet{5}() for _ in 1:5)

# Build the datasets: sets of random 7-character strings, drawn in order
# s1..s5 so the random stream is consumed exactly as before.
s1, s2, s3, s4, s5 =
    (Set(randstring(7) for _ in 1:n) for n in (10, 15, 100, 20, 130))

# Load every dataset twice: once with the default hashing and once with
# seed=123, pairing each dataset with its two sketches.
for (plain, seeded, items) in zip(
    (hll1, hll2, hll3, hll4, hll5),
    (hll1_seeded, hll2_seeded, hll3_seeded, hll4_seeded, hll5_seeded),
    (s1, s2, s3, s4, s5),
)
    set.add!(plain, items)
    set.add!(seeded, items, seed=123)
end

# Show the raw `counts` field and the estimated cardinality of the first
# sketch, unseeded and then seeded.
for sketch in (hll1, hll1_seeded)
    println(sketch.counts, "\n", count(sketch))
end

# Report the size of both sketches as given by `set.sizeof`.
println(
    "Size of hll1: ", set.sizeof(hll1),
    "; \nSize of hll1_seeded: ", set.sizeof(hll1_seeded),
)

UInt32[0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000020, 0x00000000, 0x00000002, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000008, 0x00000000, 0x00000015, 0x00000008, 0x00000000, 0x00000000, 0x00000001, 0x00000001]
10
UInt32[0x00000001, 0x00000002, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000004, 0x00000040, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0000000a, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000002, 0x00000000, 0x00000000, 0x00000002, 0x00000000, 0x00000000, 0x00000000]
11
Size of hll1: 32; 
Size of hll1_seeded: 32


In [6]:
# For each dataset, print its exact size next to the estimate reported by
# the HllSet that was filled from it.
for (items, sketch) in zip((s1, s2, s3, s4, s5), (hll1, hll2, hll3, hll4, hll5))
    println(length(items), " : ", count(sketch))
end

# union: fold the datasets and the sketches left-to-right (the same
# association as a chain of binary `∪`) and compare the two cardinalities.
let exact_union = foldl(∪, (s1, s2, s3, s4, s5)),
    sketch_union = foldl(∪, (hll1, hll2, hll3, hll4, hll5))

    println("\nunion:\n", length(exact_union), " : ", count(sketch_union), "\n")
end

# intersection of the unseeded sketch with the seed=123 sketch built from
# the same dataset
println(
    "intersection (standard HllSet with seeded):\n",
    count(intersect(hll1, hll1_seeded)),
)


10 : 10
15 : 17
100 : 97
20 : 16
130 : 126

union:
275 : 240

intersection (standard HllSet with seeded):
0


In [7]:
# Three empty sketches for the overlapping test datasets below.
A, B, C = (set.HllSet{5}() for _ in 1:3)

# Overlapping datasets: items_t3 ⊂ items_t2, and both overlap items_t1.
items_t1 = Set([
    "string0", "string1", "string2", "string3", "string4", "string5",
    "string6", "string7", "string8", "string9", "string10",
])
items_t2 = Set([
    "string3", "string4", "string5", "string6", "string7", "string8",
    "string9", "string10", "string11",
])
items_t3 = Set([
    "string5", "string6", "string7", "string8", "string9", "string10",
    "string11",
])

# Fill each sketch from its dataset.
for (sketch, items) in zip((A, B, C), (items_t1, items_t2, items_t3))
    set.add!(sketch, items)
end

# Union of all three sketches, associated left-to-right.
U = union(union(A, B), C)

# Compute every estimate once, then print the cardinalities, pairwise
# intersections, and the ratios derived from them.
# NOTE(review): assumes `count`, `∩` and `∪` do not mutate their
# arguments, so computing the values up front gives the same numbers.
let nA = count(A), nB = count(B), nC = count(C), nU = count(U),
    nAB = count(A ∩ B), nAC = count(A ∩ C), nBC = count(B ∩ C)

    println("A: ", nA)
    println("B: ", nB)
    println("C: ", nC)
    println("U: ", nU, "\n")

    println("AB = A ∩ B: ", nAB)
    println("AC = A ∩ C: ", nAC)
    println("BC = B ∩ C: ", nBC, "\n")

    # Share of the union covered by each sketch.
    println("P(A) = A / U: ", nA / nU)
    println("P(B) = B / U: ", nB / nU)
    println("P(C) = C / U: ", nC / nU, "\n")

    # Conditional ratios: intersection estimate over the conditioning set.
    println("P(A | B) = AB / B: ", nAB / nB)
    println("P(B | A) = AB / A: ", nAB / nA)
    println("P(A | C) = AC / C: ", nAC / nC)
    println("P(C | A) = AC / A: ", nAC / nA, "\n")

    println("P(B | C) = BC / C: ", nBC / nC)
    println("P(C | B) = BC / B: ", nBC / nB, "\n")
end

# Result of `set.set_xor` on A and B.
hll_diff = set.set_xor(A, B)
println("HLL xor: ", count(hll_diff))

# Intersection of A and B kept as its own sketch.
hll_int = A ∩ B

println("hll_int: ", count(hll_int))

# Result of `set.set_comp(A, B)`, shown next to the estimate for A.
print("\n")
println("=====================================")
hll_comp_1 = set.set_comp(A, B)
println("Comp 1: ", count(hll_comp_1))
println("A: ", count(A))

# Same call with the arguments swapped, shown next to the estimate for B.
print("\n")
println("=====================================")
hll_comp_2 = set.set_comp(B, A)
println("Comp 2: ", count(hll_comp_2))
println("B: ", count(B))

A: 11
B: 10
C: 7
U: 12

AB = A ∩ B: 9
AC = A ∩ C: 6
BC = B ∩ C: 7

P(A) = A / U: 0.9166666666666666
P(B) = B / U: 0.8333333333333334
P(C) = C / U: 0.5833333333333334

P(A | B) = AB / B: 0.9
P(B | A) = AB / A: 0.8181818181818182
P(A | C) = AC / C: 0.8571428571428571
P(C | A) = AC / A: 0.5454545454545454

P(B | C) = BC / C: 1.0
P(C | B) = BC / B: 0.7

HLL xor: 4
hll_int: 9

Comp 1: 3
A: 11

Comp 2: 1
B: 10
