In [1]:
include("src/lisa_meta.jl")
include("src/lisa_util.jl")

using ..LisaNeo4j
using ..LisaMeta
using ..Store
using ..Graph
using ..Util

using CSV
using SQLite
using DBInterface
using MurmurHash3
using TextAnalysis
using JSON3
using PooledArrays
using UUIDs
using HDF5
using EasyConfig
using SHA
using Base64
using DataFrames

# Presumably a HyperLogLog set (SetCore.HllSet) with type parameter 10 — confirm.
# It is not referenced again in the cells shown here.
hll = SetCore.HllSet{10}()

# Source SQLite database; the `emails` table is queried from it further down.
db_source = SQLite.DB("lisa_enron.db")
# Make sure the HDF5 store file exists. Mode "cw" opens read-write, creates the file
# when it is missing and preserves existing contents, so re-running this cell does
# not wipe the file (mode "w" would).
h5open("hll_algebra.hdf5", "cw") do f
    # Nothing is written here; the file is only opened and closed again
end

# Connect to the SQLite meta database
db_meta = Graph.DB("hll_algebra.db")

# Transaction-commit endpoint of the local Neo4j HTTP API
url = "http://localhost:7474/db/neo4j/tx/commit"
# Request headers: JSON payload plus HTTP Basic auth built from the hard-coded
# "neo4j:neo4j" user:password pair.
# NOTE(review): credentials live in source here — consider reading them from ENV.
headers = ["Content-Type" => "application/json", "Authorization" => "Basic " * base64encode("neo4j:neo4j")]



2-element Vector{Pair{String, String}}:
  "Content-Type" => "application/json"
 "Authorization" => "Basic bmVvNGo6bmVvNGo="

In [2]:
# SQL column list handed to LisaMeta.get_emails_by_date below. "Message-ID" is
# aliased to message_id; "From" and "To" are double-quoted as identifiers.
fields = "\"Message-ID\" AS message_id, Date, \"From\", \"To\", Subject, content, user, labeled"

"\"Message-ID\" AS message_id, Date, \"From\", \"To\", Subject, content, user, labeled"

In [None]:
# Collect every distinct calendar day (formatted YYYY-MM-DD) found in emails.Date,
# ordered by date
df_dates = DBInterface.execute(db_source, "SELECT DISTINCT strftime('%Y-%m-%d', Date) AS Date FROM emails ORDER BY Date") |> DataFrame
# Keep the first 210 of those days (note: the variable name says 10, the call takes 210)
df_dates_10 = first(df_dates, 210)
# Uncomment to print the selected days
# print(df_dates_10)

In [None]:
#=
Reference listing (row index │ date), apparently rows 191–210 of the distinct-date
table, kept here to help pick a value for `date`. It is a block comment rather than
a string literal so that it is not attached to `date` as a docstring.

│ 191 │ 1999-04-01 │
│ 192 │ 1999-04-02 │
│ 193 │ 1999-04-03 │
│ 194 │ 1999-04-04 │
│ 195 │ 1999-04-05 │
│ 196 │ 1999-04-06 │
│ 197 │ 1999-04-07 │
│ 198 │ 1999-04-08 │
│ 199 │ 1999-04-09 │
│ 200 │ 1999-04-12 │
│ 201 │ 1999-04-13 │
│ 202 │ 1999-04-14 │
│ 203 │ 1999-04-15 │
│ 204 │ 1999-04-19 │
│ 205 │ 1999-04-20 │
│ 206 │ 1999-04-21 │
│ 207 │ 1999-04-22 │
│ 208 │ 1999-04-23 │
│ 209 │ 1999-04-26 │
│ 210 │ 1999-04-27 │
=#
# Day whose e-mails are processed by the following cells
date = "1999-04-13"
# Load that day's e-mails using the column list in `fields`; the trailing 100 is
# presumably a row limit — TODO confirm against LisaMeta.get_emails_by_date.
df_day = LisaMeta.get_emails_by_date(db_source, date, fields, 100)

In [5]:
# Ingest the day's DataFrame into the store twice, both with the "daily" argument:
# once through the by-column routine and once through the by-row routine.
columns_daily = LisaMeta.ingest_df_by_column(db_meta, df_day, "daily")
# p=10 presumably is the HLL precision (it matches HllSet{10}) — TODO confirm
row_daily = LisaMeta.ingest_df_by_row(db_meta, df_day, "daily"; p=10)

HllSet{10}()

In [None]:
# Commit the ingested data to the store; the commit message names the processed day
message = "Ingested data for $date"
Store.commit(
    db_meta,
    "hll_algebra.hdf5",
    "Alex Mylnikov",
    "alexmy@lisa-park.com",
    message,
    Config(),
)

In [7]:
# Open the HDF5 store read-only, print its tree, and close the file afterwards
HDF5.h5open(LisaMeta.print_hdf5_tree, "hll_algebra.hdf5", "r")

11b4bd95-40f9-46b2-876b-26e1dcc1a160
    nodes
        6927c490b1aac716b820616b2de8b7191ee9d68d
            ["column"]
        8342e461fb2cb55b121fca66cc8ca745ee2f72ae
            ["column"]
        aaf03de4d2597380806e6726f4dc17ce4b32ae74
            ["column"]
        bc018f6051278ee0732303d5197154dc61a6f167
            ["column"]
        d0d201c405adf0df65759c16de98f8ef4795d737
            ["column"]
        db3170975744a4abd9d401e15876f414f7988bba
            ["column"]
        e2e5a01ca8ea3e4800700f5d4a784ee75e6d1ad0
            ["column"]
        e65cbaf2e457609a0d9d3bb58bcd62a1cd0e73be
            ["column"]
405c4dd6-0353-4cc4-a47c-eef4e69c50e2
    nodes
        6927c490b1aac716b820616b2de8b7191ee9d68d
            ["column"]
        8342e461fb2cb55b121fca66cc8ca745ee2f72ae
            ["column"]
        aaf03de4d2597380806e6726f4dc17ce4b32ae74
            ["column"]
        bc018f6051278ee0732303d5197154dc61a6f167
            ["column"]
        d0d201c405adf0df65759c16de98f8ef47

In [None]:
# Row filter: nodes whose JSON `props` carry this sender under the From key
query = raw"SELECT * FROM nodes WHERE json_extract(props, '$.From') = 'jeffery.fawcett@enron.com'"

# References of all "column"-labelled nodes, echoed for inspection.
# The -1 argument is presumably "no limit" — TODO confirm.
refs_col = LisaMeta.select_sha1_by_label(db_meta, "column", "nodes", -1)
println(refs_col)
# References of the nodes matched by the row filter above
refs_rows = LisaMeta.select_sha1_by_query(db_meta, query, -1)

# Combined reference list: column nodes followed by any new row nodes
refs = refs_col ∪ refs_rows

# Collect the matching nodes into `nodes`, then push the referenced nodes to Neo4j
nodes = Any[]
LisaNeo4j.select_nodes(db_meta.sqlitedb, refs, nodes)
LisaNeo4j.add_neo4j_nodes_by_refs(db_meta.sqlitedb, Set(refs), url, headers)

In [None]:
# Gather the nodes behind the row references selected earlier
row_nodes = Any[]
LisaNeo4j.select_nodes(db_meta.sqlitedb, refs_rows, row_nodes)

# Only build a union node when at least one row node was found
if !isempty(row_nodes)
    # `refs` is rebound here to whatever Graph.node_union returns.
    # NOTE(review): if that is a single String, Set(refs) below would be a set of
    # characters rather than of references — confirm the return type.
    refs = Graph.node_union(db_meta, row_nodes)

    # Push the resulting node(s) first, then their edges, to Neo4j
    LisaNeo4j.add_neo4j_nodes_by_refs(db_meta.sqlitedb, Set(refs), url, headers)
    LisaNeo4j.add_neo4j_edges_by_refs(db_meta.sqlitedb, Set(refs), url, headers)
end

In [10]:
# Load the complete `nodes` table from the meta database into a DataFrame
query = "SELECT * FROM nodes"
df_nodes = DataFrame(DBInterface.execute(db_meta.sqlitedb, query))

# Send every node row to Neo4j
LisaNeo4j.add_neo4j_nodes(df_nodes, url, headers)

In [11]:
# Load the complete `edges` table from the meta database into a DataFrame
query = "SELECT * FROM edges"
df_edges = DataFrame(DBInterface.execute(db_meta.sqlitedb, query))

# Send every edge row to Neo4j
LisaNeo4j.add_neo4j_edges(df_edges, url, headers)

# References

1. https://github.com/rafaelmartinelli/Knapsacks.jl