# SGS simulation using Enron emails as a source. Day-by-day processing

## Initialization

In [1]:
include("src/sgs_store.jl")

using ..Entity
using ..HllSets
using ..Util
using JSON3
using TextAnalysis

using PyCall
using CSV
using DataFrames
# using WordTokenizers
using Base.Threads
using Dates


# Load the Hugging Face Transformers and PyTorch Python modules through PyCall
transformers    = pyimport("transformers")
torch           = pyimport("torch")

# Make the tokenizer wrapper classes from the SGS_Transformers Python module
# available to later `py"..."` expressions. The current directory is appended
# to Python's module search path first so that the module can be found there.
py"""
import sys
sys.path.append(".")
from SGS_Transformers import BertTokenizerWrapper, RobertaTokenizerWrapper, GPT2TokenizerWrapper
"""

redis   = pyimport("redis")
# Create the Redis client used by the ingestion code (local server, port 6379,
# database 0). NOTE(review): presumably no round-trip happens until the first
# command is issued — confirm if an early connectivity check is wanted.
r = redis.Redis(host="localhost", port=6379, db=0)

csv_file_path = "/home/alexmy/Downloads/POC/DATA/enron_05_17_2015_with_labels_v2.csv"
# Read only the columns used by the simulation
df = DataFrame(CSV.File(csv_file_path, header=true, select=[:Date, :From, :To, :Subject, :content, :user]))

"""
    day_string(x)

Return the calendar day of `x` formatted as `"yyyy-mm-dd"`.

A string is parsed with the `"yyyy-mm-dd HH:MM:SS"` layout first. A value that
is already a `Date`/`DateTime` (e.g. when the CSV reader detected the column
type on its own) is formatted directly instead of raising a `MethodError`.
"""
day_string(x::AbstractString) =
    Dates.format(Dates.DateTime(x, "yyyy-mm-dd HH:MM:SS"), "yyyy-mm-dd")
day_string(x::Union{Dates.Date,Dates.DateTime}) = Dates.format(x, "yyyy-mm-dd")

"""
    first_quoted(x)

Return the text between the first pair of single quotes in `x`.
Return `""` when `x` is `missing` or contains no single-quoted text.
"""
first_quoted(::Missing) = ""
function first_quoted(x::AbstractString)
    # Run the regex once and reuse the match object.
    m = match(r"'([^']*)'", x)
    return isnothing(m) ? "" : m.captures[1]
end

# Reformat fields :Date, :From, and :To
df.Date = map(day_string, df.Date)
df.From = map(first_quoted, df.From)
df.To   = map(first_quoted, df.To)

# Replace all remaining missing values with "unknown"
for col in names(df)
    df[!, col] = coalesce.(df[!, col], "unknown")
end

# Extract the distinct dates, order them ascending, and keep them as a vector;
# the simulation loop walks this vector by index.
distinct_dates = unique(df.Date)
sorted_dates = sort(distinct_dates)
dates_vector = collect(sorted_dates)

println(dates_vector)

["1980-01-01", "1986-04-26", "1986-05-01", "1997-01-01", "1997-03-03", "1997-03-05", "1997-03-06", "1997-03-07", "1997-03-11", "1997-03-16", "1997-03-20", "1997-03-21", "1997-03-31", "1997-04-07", "1997-04-10", "1997-04-11", "1997-04-15", "1997-04-17", "1997-04-18", "1997-04-25", "1997-04-29", "1997-05-01", "1997-05-13", "1997-05-14", "1997-05-15", "1997-05-16", "1997-05-22", "1997-05-28", "1997-05-29", "1997-06-04", "1997-06-09", "1997-06-10", "1997-06-12", "1997-06-16", "1997-06-17", "1997-06-18", "1997-06-20", "1997-06-23", "1997-06-25", "1997-06-26", "1997-06-27", "1997-06-30", "1997-07-01", "1997-07-02", "1997-07-15", "1997-07-16", "1997-07-17", "1997-07-22", "1997-07-24", "1997-07-25", "1997-07-28", "1997-07-29", "1997-07-30", "1997-07-31", "1997-08-01", "1997-08-04", "1997-08-05", "1997-08-06", "1997-08-07", "1997-08-08", "1997-08-14", "1997-08-18", "1997-08-20", "1997-08-21", "1997-08-22", "1997-08-23", "1997-08-25", "1997-08-26", "1997-08-27", "1997-08-28", "1997-08-29", "1997

## Simulation loop

In [2]:
# Parameters shared by the daily processing loop.

# Source identifier; it is combined with each column name to derive the
# per-column SHA1 key.
_parent = csv_file_path

# Columns ingested for every date.
cols = [:From, :To, :Subject, :content, :user]

# NOTE(review): presumably the HLL precision (the loop uses the literal 10 for
# the HllSet/Entity type parameter) — confirm.
p::Int = 10

# A column's in-memory size (Base.summarysize) is divided by this value to
# decide how many chunks it is split into.
chunk_size::Int = 512000

# `batch` is how many dates one run of the loop cell processes (set it to cover
# a day, a week, a month, ...); `threshold` is the value of `i` past which the
# current run stops.
threshold = batch = 100

# Position in `dates_vector` of the next date to process.
i = 1

1

In [3]:
# Instantiate the tokenizer wrapper used for ingestion (the RoBERTa wrapper
# imported from the SGS_Transformers Python module).
tokenizer = py"RobertaTokenizerWrapper"()

# Process dates one at a time, starting at position `i`, until either every
# date has been handled or `batch` dates have been processed in this run.
# Re-running the cell resumes where the previous run stopped.
# The bound is inclusive so that the last date in `dates_vector` is processed.
while i <= length(dates_vector)
    # `i` and `threshold` are globals that must survive between runs of this
    # cell; declaring them makes the loop behave the same in a notebook, the
    # REPL, and a plain script.
    global i, threshold

    the_date = dates_vector[i]
    # Select all rows with the specified date
    filtered_df = df[df.Date .== the_date, :]

    for column in cols
        col_values  = filtered_df[:, column]
        # Key derived from the data source and the column name.
        col_sha1    = Util.sha1_union([_parent, string(column)])
        # Split the column so that each chunk is roughly `chunk_size` in size.
        column_size = Base.summarysize(col_values)
        num_chunks  = ceil(Int, column_size / chunk_size)
        chunks      = Store.chunk_array(col_values, num_chunks)

        println(col_sha1, "; num_chunks: ", num_chunks)
        dataset = Store.ingest_df_column(r, tokenizer, chunks, col_sha1)
        # Convert dataset to Vector{UInt32}
        dataset_vector = Vector{UInt32}(dataset)
        hll = HllSets.HllSet{10}()
        hll = HllSets.restore(hll, dataset_vector)

        # NOTE(review): the instance is not used afterwards; presumably the
        # constructor registers the entity in Redis — confirm.
        entity = Entity.Instance{10}(r, hll)

        println("Current Date:", the_date)
    end

    i += 1
    println("i = ", i)
    # Stop after `batch` dates and move the stopping point for the next run.
    if i > threshold
        threshold += batch
        break
    end
end

6f983ba3758e7233f7379a9c7b6ee565808a8de6; num_chunks: 1




Current Date:1980-01-01
6bc47f481f9b458cf32e52dbd4d6731a5d198af5; num_chunks: 1
Current Date:1980-01-01
f6c9fedfe796b71638efc125e924040013ef5234; num_chunks: 1
Current Date:1980-01-01
65875368cc6392683f42a0e2938b5c0789485b97; num_chunks: 2
Current Date:1980-01-01
981f459d81197edf542958361ef219372da6bd82; num_chunks: 1
Current Date:1980-01-01
i = 2
6f983ba3758e7233f7379a9c7b6ee565808a8de6; num_chunks: 1
Current Date:1986-04-26
6bc47f481f9b458cf32e52dbd4d6731a5d198af5; num_chunks: 1
Current Date:1986-04-26
f6c9fedfe796b71638efc125e924040013ef5234; num_chunks: 1
Current Date:1986-04-26
65875368cc6392683f42a0e2938b5c0789485b97; num_chunks: 1
Current Date:1986-04-26
981f459d81197edf542958361ef219372da6bd82; num_chunks: 1
Current Date:1986-04-26
i = 3
6f983ba3758e7233f7379a9c7b6ee565808a8de6; num_chunks: 1
Current Date:1986-05-01
6bc47f481f9b458cf32e52dbd4d6731a5d198af5; num_chunks: 1
Current Date:1986-05-01
f6c9fedfe796b71638efc125e924040013ef5234; num_chunks: 1
Current Date:1986-05-01
6587

## Run garbage collection and free GPU memory

In [None]:
using CUDA

# Ask the tokenizer wrapper to release what it holds. `release_memory` is
# implemented on the Python side and is not visible here — presumably it frees
# the model's GPU tensors; confirm against SGS_Transformers.
tokenizer.release_memory()

# Run a full Julia garbage collection. This collects unreachable Julia objects;
# NOTE(review): GPU buffers are affected only insofar as Julia-side wrappers
# get finalized — confirm whether that is sufficient here.
GC.gc(true)
# Print CUDA.jl's report of the current device memory usage
CUDA.memory_status()

tokens: ["ĠUS","ĠAcad","ARK","05","Arch","ĠPorter","ly","24","Vol","ĠClaim","ĠLists","ĠCapacity","South","ĠVol","Ġ/","legal","or","[","14","AD","ĠOnline","Ġtermination","ĠArea","ĠBank","a","ĠWin","Supp","21","ĠOperations","ĠRock","ĠProduction","ĠRoe","West","Ġ4","avor","ĠServices","2001","-","ĠWorks","ment","iday","ĠPed","Ġ18","Summer","ĠJim","cer","Your","D","ĠDO","ĠThompson","Dog","Ġflood","Ġtrade","bec","Ġagreement","Ġweekend","sex","ĠAgreement","ange","ĠGREEN","Ġshe","Ġmanagement","Las","ĠL","wing","Ġtraders","ĠWork","ĠShe","Ġtechnical","#","Ġtransaction","Con","Ġfiles","ĠDepartment","08","ĠGenerator","Ġtoday","yer","FW","Re","ĠSiem","Ġ!","ĠCounty","ĠChange","ISSION","ĠAsset","RON","Ġinquiry","ĠMt","Ġ22","Ġcapacity","ĠVal","EST","ĠEn","More","ĠElectronic","V","Ġw","ĠBand","TH","Ġcompressor","Ġup","CE","ĠYour","JC","Nat","ĠBP","osition","Ġport","ĠManagement","ĠWHAT","ĠRem","ITED","AP","Ġ201","Ġgolf","&","ĠAnnual","ĠDecision","Ġletter","ic","36","ĠDawn","Gro","Ġissues","ro","ĠContact


tokens: ["32","Ġinsurance","1","78","Ġbest","!","Ġ30","ĠLife","ĠMISS","ĠDON","27","ĠOn","75","'","ĠHERE","'s","Ġvalues","Ġremoved","Ġcomparing","ĠTo","Ġa","Ġalso","ĠA","ĠFree","26","250","!!",")","ĠTHIS","000","24","ĠOut","87","oker","Ġminute","ED","ĠAge","ĠPrices","59","31","ĠTake","43","Ġsimple","ĠHUN","Ġtop","%","259","ANT","Ġ70","161","ĠThese","S","Ġ40","ĠHere","Ġ$","ĠINST","Ġnation","37","ĠYour","12","T","ĠFemale","ĠSave","Ġand","Ġmailing","Ġrates",",","ĠQuote","Ġour","Ġlist","Ġfrom","DR","!!!","ĠClick","Save","Ġto","ĠCheck","Ġform","19","ĠInsurance","Ġcompanies","Ġavailable","107","Ġreceive","Sm","Ġ50","Ġbelow","Ġ(","Ġ60","Ġout","Ġof","Ġbe","11","13","134","ĠFREE","Ġquote","ĠFor","15","ĠCLICK","500","21","38","46","Ġthe","Ġfill","ĠCustom","Ġamong","ĠMale"]
tokens: ["iam","-","j","will","s"]
tokens: ["gc","x","l","a","com","@","ol","gr","z","s","."]
tokens: ["d","1","jp","is","ip","co","inter","rec","@","ns","brand","closed","ients",".","und"]
tokens: ["ĠAll","54","!","Ġ130","ĠWh