# SGS simulation using Enron emails as a source: day-by-day processing

## Initialization

In [1]:
include("src/sgs_store.jl")

using ..Entity
using ..HllSets
using ..Util
using JSON3
using TextAnalysis

using PyCall
using CSV
using DataFrames
using WordTokenizers
using Base.Threads
using Dates

redis = pyimport("redis")
# Connect to the Redis server on localhost (port 6379, database 0)
r = redis.Redis(host="localhost", port=6379, db=0)

csv_file_path = "/home/alexmy/Downloads/POC/DATA/enron_05_17_2015_with_labels_v2.csv"
# Read only the columns used by the simulation from the CSV file
df = DataFrame(CSV.File(csv_file_path, header=true, select=[:Date, :From, :To, :Subject, :content, :user]))

"""
    first_quoted(x)

Return the text between the first pair of single quotes in `x`.

Return `""` when `x` is `missing` or contains no single-quoted segment.
"""
function first_quoted(x)
    ismissing(x) && return ""
    # Run the regex once and reuse the match object.
    m = match(r"'([^']*)'", x)
    return isnothing(m) ? "" : m.captures[1]
end

# Reformat fields :Date, :From, and :To
# The DateFormat objects are built once here; passing the format as a plain string
# would re-parse it for every row.
df.Date = let infmt = dateformat"yyyy-mm-dd HH:MM:SS", outfmt = dateformat"yyyy-mm-dd"
    map(x -> Dates.format(Dates.DateTime(x, infmt), outfmt), df.Date)
end
df.From = map(first_quoted, df.From)
df.To   = map(first_quoted, df.To)

# Replace all remaining missing values with "unknown"
for col in names(df)
    df[!, col] = coalesce.(df[!, col], "unknown")
end

# Extract distinct dates from the Date column, order them in ascending order, and convert to a vector.
# The "yyyy-mm-dd" strings sort chronologically under plain string ordering.
distinct_dates = unique(df.Date)
sorted_dates = sort(distinct_dates)
dates_vector = collect(sorted_dates)

tokenizer = WordTokenizers.Words

println(dates_vector)

# println(first(df, 10))

["1980-01-01", "1986-04-26", "1986-05-01", "1997-01-01", "1997-03-03", "1997-03-05", "1997-03-06", "1997-03-07", "1997-03-11", "1997-03-16", "1997-03-20", "1997-03-21", "1997-03-31", "1997-04-07", "1997-04-10", "1997-04-11", "1997-04-15", "1997-04-17", "1997-04-18", "1997-04-25", "1997-04-29", "1997-05-01", "1997-05-13", "1997-05-14", "1997-05-15", "1997-05-16", "1997-05-22", "1997-05-28", "1997-05-29", "1997-06-04", "1997-06-09", "1997-06-10", "1997-06-12", "1997-06-16", "1997-06-17", "1997-06-18", "1997-06-20", "1997-06-23", "1997-06-25", "1997-06-26", "1997-06-27", "1997-06-30", "1997-07-01", "1997-07-02", "1997-07-15", "1997-07-16", "1997-07-17", "1997-07-22", "1997-07-24", "1997-07-25", "1997-07-28", "1997-07-29", "1997-07-30", "1997-07-31", "1997-08-01", "1997-08-04", "1997-08-05", "1997-08-06", "1997-08-07", "1997-08-08", "1997-08-14", "1997-08-18", "1997-08-20", "1997-08-21", "1997-08-22", "1997-08-23", "1997-08-25", "1997-08-26", "1997-08-27", "1997-08-28", "1997-08-29", "1997

## Simulation loop

In [4]:
# Columns ingested for every processed date
cols = [:From, :To, :Subject, :content, :user]
# NOTE(review): `p` is used below as the HllSet / Entity.Instance type parameter, replacing
# the literal 10 that was previously repeated; presumably the HLL precision, confirm
# against HllSets.
p::Int=10
# Target size in bytes of one chunk of column values
chunk_size::Int=512000
_parent = csv_file_path

# Number of leading dates (ascending order) to process in this run
hist = 10

# `Iterators.take` bounds the run to the first `hist` dates without a global counter.
# Assigning to a global counter inside a top-level loop only works under the REPL/notebook
# soft-scope rule and raises UndefVarError when the same code runs as a script.
for the_date in Iterators.take(dates_vector, hist)
    # Select all rows with the specified date (this indexing creates a new DataFrame)
    filtered_df = df[df.Date .== the_date, :]

    for column in cols
        # `filtered_df` is already a per-date copy, so take the column without copying again
        col_values  = filtered_df[!, column]
        col_sha1    = Util.sha1_union([_parent, string(column)])
        # Split the column into chunks of roughly `chunk_size` bytes, based on its in-memory size
        column_size = Base.summarysize(col_values)
        num_chunks  = cld(column_size, chunk_size)
        chunks      = Store.chunk_array(col_values, num_chunks)

        println(col_sha1, "; num_chunks: ", num_chunks)
        dataset = Store.ingest_df_column(r, tokenizer, chunks, col_sha1)

        # Serialize the ingest result to JSON and restore it into a fresh HllSet
        dataset_json = JSON3.write(dataset)
        hll = HllSets.restore(HllSets.HllSet{p}(), dataset_json)

        entity = Entity.Instance{p}(r, hll)
        println("Column entity instance: ", entity)
    end
end

6f983ba3758e7233f7379a9c7b6ee565808a8de6; num_chunks: 1
Column entity instance: 
Instance(
 sha1: fc3b53f72ae425e68eae0661bd419cacc2453eaa
 card: 28
 hll: HllSet{10}()
 grad: 0.0
 op: nothing)


6bc47f481f9b458cf32e52dbd4d6731a5d198af5; num_chunks: 1
Column entity instance: 
Instance(
 sha1: 39f6298865a7e2191e564e7a11cb8b53e6318c27
 card: 71
 hll: HllSet{10}()
 grad: 0.0
 op: nothing)


f6c9fedfe796b71638efc125e924040013ef5234; num_chunks: 1
Column entity instance: 
Instance(
 sha1: 5e87e23a52e0c58717111bd94941d28d099d4c89
 card: 199
 hll: HllSet{10}()
 grad: 0.0
 op: nothing)


65875368cc6392683f42a0e2938b5c0789485b97; num_chunks: 2
Column entity instance: 
Instance(
 sha1: 29493fe194199b4eb504fcb3105890efd7e763f5
 card: 3103
 hll: HllSet{10}()
 grad: 0.0
 op: nothing)


981f459d81197edf542958361ef219372da6bd82; num_chunks: 1
Column entity instance: 
Instance(
 sha1: 67f781b4934ef6f8d6332ab7cc61bd521c4dd099
 card: 20
 hll: HllSet{10}()
 grad: 0.0
 op: nothing)


6f983ba3758e7233f7379a