# SGS simulation using Enron emails as a source. Day-by-day processing

## Initialization

In [1]:
include("src/sgs_store.jl")

using ..Entity
using ..HllSets
using ..Util
using JSON3
using TextAnalysis

using PyCall
using CSV
using DataFrames
# using WordTokenizers
using Base.Threads
using Dates


# Import the Hugging Face Transformers library and PyTorch through PyCall
transformers    = pyimport("transformers")
torch           = pyimport("torch")

# Import the tokenizer wrapper classes from the SGS_Transformers Python module
# so that later py"..." expressions can refer to them by name. "." is appended
# to sys.path before the import runs
# (assumes SGS_Transformers.py is in the working directory — TODO confirm).
py"""
import sys
sys.path.append(".")
from SGS_Transformers import BertTokenizerWrapper, RobertaTokenizerWrapper, GPT2TokenizerWrapper
"""

redis   = pyimport("redis")
# Connect to Redis on localhost:6379, database 0
r = redis.Redis(host="localhost", port=6379, db=0)

csv_file_path = "/home/alexmy/Downloads/POC/DATA/enron_05_17_2015_with_labels_v2.csv"
# Read only specific columns from the CSV file
df = DataFrame(CSV.File(csv_file_path, header=true, select=[:Date, :From, :To, :Subject, :content, :user]))

"""
    first_quoted(x)

Return the text between the first pair of single quotes in `x`.
Return `""` when `x` is `missing` or contains no single-quoted text.
"""
function first_quoted(x)
    ismissing(x) && return ""
    # Run the regex once and reuse the match object.
    m = match(r"'([^']*)'", x)
    return isnothing(m) ? "" : m.captures[1]
end

# Reformat fields :Date, :From, and :To
# :Date is reduced from a "yyyy-mm-dd HH:MM:SS" timestamp to its "yyyy-mm-dd" day.
# The DateFormat objects are built once instead of re-parsing the format
# strings for every row.
df.Date = let infmt = dateformat"yyyy-mm-dd HH:MM:SS", outfmt = dateformat"yyyy-mm-dd"
    map(x -> Dates.format(Dates.DateTime(x, infmt), outfmt), df.Date)
end
# :From and :To keep only their first single-quoted entry.
df.From = map(first_quoted, df.From)
df.To   = map(first_quoted, df.To)

# Replace all remaining missing values with "unknown"
for col in names(df)
    df[!, col] = coalesce.(df[!, col], "unknown")
end

# Build the list of dates to process: every distinct value of the Date column,
# in ascending order, as a plain vector.
distinct_dates = unique(df.Date)
sorted_dates   = sort!(copy(distinct_dates))   # sorted copy; distinct_dates keeps its order
dates_vector   = copy(sorted_dates)

println(dates_vector)

["1980-01-01", "1986-04-26", "1986-05-01", "1997-01-01", "1997-03-03", "1997-03-05", "1997-03-06", "1997-03-07", "1997-03-11", "1997-03-16", "1997-03-20", "1997-03-21", "1997-03-31", "1997-04-07", "1997-04-10", "1997-04-11", "1997-04-15", "1997-04-17", "1997-04-18", "1997-04-25", "1997-04-29", "1997-05-01", "1997-05-13", "1997-05-14", "1997-05-15", "1997-05-16", "1997-05-22", "1997-05-28", "1997-05-29", "1997-06-04", "1997-06-09", "1997-06-10", "1997-06-12", "1997-06-16", "1997-06-17", "1997-06-18", "1997-06-20", "1997-06-23", "1997-06-25", "1997-06-26", "1997-06-27", "1997-06-30", "1997-07-01", "1997-07-02", "1997-07-15", "1997-07-16", "1997-07-17", "1997-07-22", "1997-07-24", "1997-07-25", "1997-07-28", "1997-07-29", "1997-07-30", "1997-07-31", "1997-08-01", "1997-08-04", "1997-08-05", "1997-08-06", "1997-08-07", "1997-08-08", "1997-08-14", "1997-08-18", "1997-08-20", "1997-08-21", "1997-08-22", "1997-08-23", "1997-08-25", "1997-08-26", "1997-08-27", "1997-08-28", "1997-08-29", "1997

## Simulation loop

In [2]:
# Identifier combined with each column name when hashing in the simulation loop
_parent = csv_file_path
# Columns ingested for every date
cols = Symbol[:From, :To, :Subject, :content, :user]

# NOTE(review): `p` is not referenced in the visible cells; presumably the HLL
# precision (the simulation loop hard-codes 10) — confirm.
p::Int = 10
# Divisor applied to a column's in-memory size to decide how many chunks to cut
chunk_size::Int = 512_000

# Manage running daily, weekly, monthly etc. by setting batch
batch = 10
# Value of `i` above which the simulation loop stops the current run
threshold = batch
# Index into dates_vector of the next date to process
i = 1

1

In [3]:
# Instantiate the tokenizer wrapper (RoBERTa) imported from SGS_Transformers
tokenizer = py"RobertaTokenizerWrapper"()

# Process the dates in order, at most `batch` dates per execution of this cell.
# `i` and `threshold` live at global scope, so executing the cell again resumes
# with the next batch. The condition uses `<=` so that the last date in
# `dates_vector` is processed as well.
while i <= length(dates_vector)
    # Explicit `global` keeps the counter updates working when this code is run
    # outside the notebook's soft-scope rules (e.g. as a script).
    global i, threshold

    the_date = dates_vector[i]
    # Select all rows with the specified date
    filtered_df = df[df.Date .== the_date, :]

    for column in cols
        col_values  = filtered_df[:, column]
        # Hash built from the source identifier and the column name; it does
        # not depend on the date.
        col_sha1    = Util.sha1_union([_parent, string(column)])
        # Number of chunks is derived from the column's in-memory size
        column_size = Base.summarysize(col_values)
        num_chunks  = ceil(Int, column_size / chunk_size)
        chunks      = Store.chunk_array(col_values, num_chunks)

        println(col_sha1, "; num_chunks: ", num_chunks)
        dataset = Store.ingest_df_column(r, tokenizer, chunks, col_sha1)
        # Convert dataset to Vector{UInt32}
        dataset_vector = Vector{UInt32}(dataset)

        # Load the ingested data into a fresh HllSet
        hll = HllSets.HllSet{10}()
        _hll = HllSets.restore!(hll, dataset_vector)

        println("hll: ", HllSets.id(_hll), "; ", HllSets.count(_hll))

        # NOTE(review): `entity` is not used afterwards; presumably the
        # constructor registers the entity through the Redis connection — confirm.
        entity = Entity.Instance{10}(r, _hll)

        println("Current Date:", the_date)
    end

    i = i + 1
    println("i = ", i)
    # Stop after a full batch and move the threshold forward for the next run
    if i > threshold
        threshold = threshold + batch
        break
    end
end

6f983ba3758e7233f7379a9c7b6ee565808a8de6; num_chunks: 1




hll: 9748514db03a7c8affd6cff2f1597705f09a40a4; 45
Current Date:1980-01-01
6bc47f481f9b458cf32e52dbd4d6731a5d198af5; num_chunks: 1
hll: 911e0d4a984ad93372d1598ed34836edad38cac0; 86
Current Date:1980-01-01
f6c9fedfe796b71638efc125e924040013ef5234; num_chunks: 1
hll: 3cd34a9b0ad788ee8ebebee17f3e4f6ed84fdf5a; 231
Current Date:1980-01-01
65875368cc6392683f42a0e2938b5c0789485b97; num_chunks: 2
hll: a1966e48ce1bb578d1b63c1a3aff5b922981acef; 2289
Current Date:1980-01-01
981f459d81197edf542958361ef219372da6bd82; num_chunks: 1
hll: e4774580ffd7e39d75c731e24ad74c8f65b69829; 23
Current Date:1980-01-01
i = 2
6f983ba3758e7233f7379a9c7b6ee565808a8de6; num_chunks: 1
hll: f2a23e837cd0a755d58fd6258c714af11c07c52d; 2
Current Date:1986-04-26
6bc47f481f9b458cf32e52dbd4d6731a5d198af5; num_chunks: 1
hll: 1c4a447aec77086225c5490e8ee2ed6414876730; 3
Current Date:1986-04-26
f6c9fedfe796b71638efc125e924040013ef5234; num_chunks: 1
hll: da253ec912637266081fe1b3c90eab51c84e694d; 6
Current Date:1986-04-26
65875368cc

## Run garbage collection and free GPU memory

In [4]:
# using CUDA

# # Release memory from tokenizer
# tokenizer.release_memory()

# # Trigger garbage collection on the GPU from Julia as well
# GC.gc(true)
# # Check memory status again
# CUDA.memory_status()