From e09612524eeb4289603562d6598f83deb32adb3e Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Thu, 14 Jul 2022 09:04:22 -0700 Subject: [PATCH 01/25] Minor changes for sessions usability --- Banyan/src/requests.jl | 2 +- Banyan/src/sessions.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 1be181e6..0a398a05 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -361,8 +361,8 @@ function partitioned_computation_concrete( # require the last value to be merged simply because it is being evaluated. sessions = get_sessions_dict() - session_id = get_session_id() session = get_session() + session_id = get_session_id() resource_id = session.resource_id diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index 1ed9b78e..3f4c7e4f 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -531,7 +531,7 @@ function get_session_status(session_id::String=get_session_id(); kwargs...)::Str end response = send_request_get_response(:describe_sessions, params) if !haskey(response["sessions"], session_id) - @warn "Session with ID $session_id is assumed to still be creating" + @warn "Session with ID $session_id is assumed to have just started creating" return "creating" end session_status = response["sessions"][session_id]["status"] From 42173a9d2f4234174af0b49bc0584185cc928bdd Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Wed, 3 Aug 2022 08:23:12 -0700 Subject: [PATCH 02/25] Implement configure_sampling, get_sampling_config, get_sample_rate, has_metadata, configure_sampling --- Banyan/Project.toml | 3 +- Banyan/src/Banyan.jl | 5 ++ Banyan/src/location.jl | 123 ++++++++++++++++++++++++++++++ Banyan/src/locations.jl | 2 +- Banyan/src/requests.jl | 2 + Banyan/src/sample.jl | 16 +++- Banyan/src/samples.jl | 22 ++++++ Banyan/src/session.jl | 27 ++++++- Banyan/src/sessions.jl | 25 ++++-- BanyanDataFrames/src/locations.jl | 2 +- BanyanDataFrames/src/pfs.jl | 2 +- BanyanHDF5/src/locations.jl | 2 +- BanyanImages/src/locations.jl | 4 +- 13 files changed, 214 insertions(+), 21 deletions(-) diff --git a/Banyan/Project.toml b/Banyan/Project.toml index 8fae7de8..8551e6a2 100644 --- a/Banyan/Project.toml +++ b/Banyan/Project.toml @@ -4,6 +4,7 @@ authors = ["Banyan "] version = "0.4.1" [deps] +AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" AWSSQS = "6e80b5ca-5733-51f9-999e-c18680912812" @@ -17,8 +18,8 @@ HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" IniFile = "83e8ac13-25f8-5344-8a64-a9f2b223428f" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LibGit2 = "76f85450-5226-5b5a-8eaa-529ad045b433" -MethodAnalysis = "85b6ec6f-f7df-4429-9514-a64bcd9ee824" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +MethodAnalysis = "85b6ec6f-f7df-4429-9514-a64bcd9ee824" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 796f022b..97941e14 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -40,6 +40,9 @@ using AWSCore, Serialization, TOML +using S3: @service +@service S3 + global BANYAN_API_ENDPOINT # Account management @@ -84,6 +87,7 @@ export AbstractFuture, Future, partitioned_computation, compute_inplace, compute export Sample, ExactSample, sample, sample_for_grouping, SampleForGrouping, setsample! 
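The patch above adds a per-path sampling-configuration API (`SamplingConfig`, `configure_sampling`, `get_sampling_config`, `get_sample_rate`). A minimal usage sketch follows; it is not taken from the patch itself, it assumes a running session, the S3 path and cluster name are illustrative, and the keyword names follow the `configure_sampling` signature defined later in this series:

    using Banyan
    start_session(cluster_name = "my-cluster", nworkers = 4)
    # Request a sample rate of 100 for this dataset and force a fresh sample at that rate
    configure_sampling("s3://my-bucket/data.parquet"; sample_rate = 100, force_new_sample_rate = true)
    get_sample_rate("s3://my-bucket/data.parquet")            # returns 100
    get_sampling_config("s3://my-bucket/data.parquet").rate   # also 100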
export sample_memory_usage, total_memory_usage, sample_axes, sample_keys, sample_by_key export NOTHING_SAMPLE +export SamplingConfig # Locations export Location, LocationSource, LocationDestination, located, sourced, destined @@ -98,6 +102,7 @@ export get_remotepath_id, cache_location, get_max_exact_sample_length, set_max_exact_sample_length +export LocationPath # Serialization export from_jl_value_contents, to_jl_value_contents diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 2eb889af..60b0663f 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -38,4 +38,127 @@ mutable struct Location # sample # ) # end +end + +struct LocationPath + original_path::String + path::String + path_hash_uint::UInt + path_hash::String + format_name::String + format_version::String + + function LocationPath(path, format_name, format_version) + # This function is responsible for "normalizing" the path. + # If there are multiple path strings that are technically equivalent, + # this function should map them to the same string. + path_hash = hash(path) + new( + path, + path, + path_hash, + string(path_hash), + format_name, + format_version + ) + end +end + +global TABLE_FORMATS = ["csv", "parquet", "arrow"] +z +function get_location_path_with_format(p::String, kwargs...)::LocationPath + if isempty(p) + return NO_LOCATION_PATH + end + + format_name = get(kwargs, :format, "jl") + is_sample_format_arrow = format_name == "arrow" + if is_sample_format_arrow + return LocationPath(p, "arrow", get(kwargs, :format_version, "2")) + else + for table_format in TABLE_FORMATS + if occursin(table_format, p) || format_name == p + return LocationPath(p, "arrow", "2") + end + end + end + LocationPath(p, "jl", get_julia_version()) +end + +function get_sample_path_prefix(lp::LocationPath) + format_name_sep = !isempty(lp.format_name) ? "_" : "" + format_version_sep = !isempty(lp.format_version) ? "_" : "" + lp.path_hash * "_" * lp.format_name * format_name_sep * lp.format_version * format_version_sep +end +get_sample_path(lp::LocationPath, sample_rate::Int64) = + get_sample_path_prefix(lp) * string(sample_rate) +get_metadata_path(lp::LocationPath) = lp.path_hash + +Base.hash(lp::LocationPath) = lp.path_hash_uint + +const NO_LOCATION_PATH = LocationPath("", "", "") + +get_sampling_config(path="", kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) +function get_sampling_configs() + global session_sampling_configs + session_sampling_configs[_get_session_id_no_error()] +end +get_sampling_config(l_path::LocationPath)::SamplingConfig = + get(get_sampling_configs(), l_path, sampling_configs[NO_LOCATION_PATH]) + +get_sample_rate(p::String; kwargs...) = + get_sample_rate(get_location_path_with_format(p; kwargs...)) +function get_sample_rate(l_path::LocationPath) + # Get the desired sample rate + desired_sample_rate = get_sampling_config(l_path).rate + + # Find a cached sample with a similar sample ratBucket=e + # TODO: Just have a try/catch here so that if the bucket doesn't exist we just return the default + # TODO: Make the above code get used in location constructors for getting the desired sample rate + sc = get_sampling_config(l_path) + pre = sc.force_new_sample_rate ? 
get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) + banyan_samples_objects = try + S3.list_objects_v2(Bucket="banyan_samples", prefix=pre)["Contents"] + catch + return desired_sample_rate + end + sample_rate = -1 + for banyan_samples_object in banyan_samples_objects + object_key = banyan_samples_object["Key"] + if startswith(object_key, banyan_samples_object_prefix) + object_sample_rate = parse(Int64, object_key[(findlast("_", object_key).start+1):end]) + object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) + curr_sample_rate_diff = abs(object_sample_rate - sample_rate) + if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff + sample_rate = object_sample_rate + end + end + end + sample_rate != -1 ? sample_rate : desired_sample_rate +end + +# function get_location(l_path::LocationPath) +# sessions_dict = get_sessions_dict() +# session_id = _get_session_id_no_error() +# desired_sample_rate = if haskey(sessions_dict, session_id) +# sampling_configs = sessions_dict[session_id].sampling_configs +# get(sampling_configs, l_path, sampling_configs[NO_LOCATION_PATH]). +# end + +function has_metadata(l_path:: LocationPath)::Bool + try + !isempty(S3.list_objects_v2(Bucket="banyan_metadata", prefix=get_metadata_path(l_path))["Contents"]) + catch + false + end +end + +function has_sample(l_path:: LocationPath)::Bool + sc = get_sampling_config(l_path) + pre = sc.force_new_sample_rate ? get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) + try + !isempty(S3.list_objects_v2(Bucket="banyan_samples", prefix=pre)["Contents"]) + catch + false + end end \ No newline at end of file diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index c08bc71d..12cd3888 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -293,7 +293,7 @@ getsamplenrows(totalnrows::Int64)::Int64 = totalnrows else # Must have at least 1 row - cld(totalnrows, get_session().sample_rate) + cld(totalnrows, get_sample_rate()) end # We maintain a cache of locations and a cache of samples. 
Locations contain diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 0a398a05..9aba3d95 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -566,6 +566,7 @@ function send_evaluation(value_id::ValueId, session_id::SessionId) "organization_id" => get_session().organization_id, "cluster_instance_id" => get_session().cluster_instance_id, "cluster_name" => get_session().cluster_name, + "sampling_configs" => sampling_configs_to_jl(get_sampling_configs()) ), ) if isnothing(response) @@ -667,6 +668,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) "organization_id" => get_session().organization_id, "cluster_instance_id" => get_session().cluster_instance_id, "cluster_name" => get_session().cluster_name, + "sampling_configs" => sampling_configs_to_jl(get_sampling_configs()) ), ) if isnothing(response) diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index c98e1df8..105ab327 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -9,11 +9,11 @@ mutable struct Sample groupingkeys::Vector{<:Any} Sample() = - new(nothing, objectid(nothing), 0, get_session().sample_rate, Any[]) + new(nothing, objectid(nothing), 0, get_sample_rate(), Any[]) Sample(value::Any) = - new(value, objectid(value), sample_memory_usage(value), get_session().sample_rate, Any[]) + new(value, objectid(value), sample_memory_usage(value), get_sample_rate(), Any[]) function Sample(value::Any, memory_usage::Int64) - sample_rate = get_session().sample_rate + sample_rate = get_sample_rate() memory_usage = convert(Int64, round(memory_usage / sample_rate))::Int64 new(value, objectid(value), memory_usage, sample_rate, Any[]) end @@ -22,3 +22,13 @@ mutable struct Sample new(value, objectid(value), memory_usage, rate, Any[]) end end + +struct SamplingConfig + rate::Int64 + always_exact::Bool + max_num_bytes_exact::Int64 + force_new_sample_rate::Bool +end + +const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("256 MB"), false) +session_sampling_configs = Dict{SessionId,SamplingConfig}("" => DEFAULT_SAMPLING_CONFIG) \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 9daea353..a6210acd 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -1,3 +1,25 @@ +function configure_sampling( + path=""; + rate=nothing, + always_exact=nothing, + max_num_bytes_exact=nothing, + kwargs... +) + global session_sampling_configs + + sc = get_sampling_config(path; kwargs...) + nsc = SamplingConfig( + !isnothing(sc.rate) ? rate : sc.rate, + !isnothing(sc.always_exact) ? always_exact : sc.always_exact, + !isnothing(sc.max_num_bytes_exact) ? max_num_bytes_exact : sc.max_num_bytes_exact, + !isnothing(sc.force_new_sample_rate) ? force_new_sample_rate : sc.force_new_sample_rate, + ) + + session_id = _get_session_id_no_error() + lp = get_location_path_with_format(path; kwargs...) 
+ session_sampling_configs[session_id][lp] = nsc +end + ############################################################### # Sample that caches properties returned by an AbstractSample # ############################################################### diff --git a/Banyan/src/session.jl b/Banyan/src/session.jl index 8306fe45..cbec24ec 100644 --- a/Banyan/src/session.jl +++ b/Banyan/src/session.jl @@ -2,7 +2,6 @@ mutable struct Session id::SessionId resource_id::ResourceId nworkers::Int64 - sample_rate::Int64 locations::Dict{ValueId,Location} pending_requests::Vector{Request} # This is a `WeakKeyDict` so that futures can be GC-ed as long as all @@ -30,7 +29,6 @@ mutable struct Session session_id::SessionId, resource_id::ResourceId, nworkers::Int64, - sample_rate::Int64, organization_id::String = "", cluster_instance_id::String = "", not_using_modules::Vector{String} = NOT_USING_MODULES, @@ -44,7 +42,6 @@ mutable struct Session session_id, resource_id, nworkers, - sample_rate, Dict{ValueId,Location}(), [], Dict{ValueId,Future}(), @@ -58,7 +55,29 @@ mutable struct Session is_session_ready, scatter_queue_url, gather_queue_url, - execution_queue_url, + execution_queue_url ) end end + +function sampling_configs_to_jl(sampling_configs::Dict{LocationPath,SamplingConfig}) + res = Tuple{Tuple{String,String,String},Tuple{Int64,Bool,Int64,Bool}}[] + for (l::LocationPath, s::SamplingConfig) in sampling_configs + push!( + res, + ( + (l.original_path, l.format_name, l.format_version), + (s.rate, s.always_exact, s.max_num_bytes_exact, s.force_new_sample_rate), + ), + ) + end + res +end + +function sampling_configs_from_jl(sampling_configs) + res = Dict{LocationPath,SamplingConfig}() + for (l, s) in sampling_configs + res[LocationPath(l[1], l[2], l[3])] = SamplingConfig(s[1], s[2], s[3], s[4]) + end + res +end \ No newline at end of file diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index 3f4c7e4f..ab3fd091 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -86,7 +86,6 @@ function _start_session( store_logs_in_s3::Bool, store_logs_on_cluster::Bool, log_initialization::Bool, - sample_rate::Int64, session_name::String, files::Vector{String}, code_files::Vector{String}, @@ -109,7 +108,10 @@ function _start_session( no_email::Bool, for_running::Bool, sessions::Dict{String,Session}, + sampling_configs::Dict{LocationPath,SamplingConfig} ) + global session_sampling_configs + # Construct parameters for starting session cluster_name = if cluster_name == NOTHING_STRING running_clusters = get_running_clusters() @@ -129,7 +131,6 @@ function _start_session( session_configuration = Dict{String,Any}( "cluster_name" => cluster_name, "num_workers" => nworkers, - "sample_rate" => sample_rate, "release_resources_after" => release_resources_after == -1 ? 
nothing : release_resources_after, "return_logs" => print_logs, "store_logs_in_s3" => store_logs_in_s3, @@ -141,7 +142,8 @@ function _start_session( "using_modules" => using_modules, "reuse_resources" => !force_update_files, "estimate_available_memory" => estimate_available_memory, - "language" => "jl" + "language" => "jl", + "sampling_configs" => sampling_configs_to_jl(sampling_configs) ) if session_name != NOTHING_STRING session_configuration["session_name"] = session_name @@ -269,7 +271,6 @@ function _start_session( session_id, resource_id, nworkers, - sample_rate, organization_id, cluster_instance_id, not_using_modules, @@ -279,6 +280,7 @@ function _start_session( gather_queue_url=gather_queue_url, execution_queue_url=execution_queue_url ) + session_sampling_configs[session_id] = sampling_configs if !nowait wait_for_session(session_id) @@ -298,7 +300,6 @@ function start_session(; store_logs_in_s3::Bool = true, store_logs_on_cluster::Bool = false, log_initialization::Bool = false, - sample_rate::Int64 = nworkers, session_name::String = NOTHING_STRING, files::Vector{String} = String[], code_files::Vector{String} = String[], @@ -318,6 +319,10 @@ function start_session(; nowait::Bool = true, email_when_ready::Union{Bool,Nothing} = nothing, for_running::Bool = false, + always_exact=nothing, + sample_rate=nothing, + max_num_bytes_exact=nothing, + force_new_sample_rate=nothing, kwargs..., )::SessionId # Should save 5ms of overhead @@ -331,6 +336,12 @@ function start_session(; # Configure configure(; kwargs...) + configure_sampling(; + always_exact=always_exact, + sample_rate=sample_rate, + max_num_bytes_exact=max_num_bytes_exact, + force_new_sample_rate=force_new_sample_rate + ) current_session_id = _start_session( cluster_name, @@ -340,7 +351,6 @@ function start_session(; store_logs_in_s3, store_logs_on_cluster, log_initialization, - sample_rate, session_name, files, code_files, @@ -362,7 +372,8 @@ function start_session(; isnothing(email_when_ready) ? false : email_when_ready, isnothing(email_when_ready), for_running, - sessions + sessions, + get_sampling_configs() ) current_session_id end diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 73063560..9da81827 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -3,7 +3,7 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_invalid, invalidate_metadata, invalidate_sample, max_exact_sample_length)::Location - session_sample_rate = get_session().sample_rate + session_sample_rate = get_sample_rate() is_main = is_main_worker() # Get cached Location and if it has valid parameters and sample, return diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index a935365d..98027383 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -517,7 +517,7 @@ function WriteHelper(@nospecialize(format_value)) # Gather # of rows, # of bytes, empty sample, and actual sample nbytes = part_res isa Empty ? 0 : Banyan.total_memory_usage(part_res) - sample_rate = get_session().sample_rate + sample_rate = get_sample_rate() sampled_part = (part_res isa Empty || is_disk) ? empty_df : Banyan.get_sample_from_data(part_res, sample_rate, nrows) gathered_data = gather_across((nrows, nbytes, part_res isa Empty ? 
part_res : empty(part_res), sampled_part), comm) diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 1120e83b..537bf573 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -28,7 +28,7 @@ HDF5_getindex_retry = retry(HDF5.getindex; delays=Base.ExponentialBackOff(; n=5) function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sample_invalid, invalidate_metadata, invalidate_sample, max_exact_sample_length) # Get session information - session_sample_rate = get_session().sample_rate + session_sample_rate = get_sample_rate() worker_idx, nworkers = get_worker_idx(), get_nworkers() is_main = worker_idx == 1 diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 9ce67042..eb244b33 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -98,7 +98,7 @@ # if isnothing(remote_sample) -# samplesize = (nimages <= MAX_EXACT_SAMPLE_NUM_IMAGES) ? nimages : ceil(Int64, nimages / get_session().sample_rate) +# samplesize = (nimages <= MAX_EXACT_SAMPLE_NUM_IMAGES) ? nimages : ceil(Int64, nimages / get_sample_rate()) # nbytes_of_sample = 0 # progressbar = Progress(length(files_to_read_from), "Collecting sample from $remotepath") @@ -282,7 +282,7 @@ function _remote_image_source( add_channelview ) # Get session information - session_sample_rate = get_session().sample_rate + session_sample_rate = get_sample_rate() worker_idx, nworkers = get_worker_idx(), get_nworkers() is_main = worker_idx == 1 From d8ebaf20fcacece7f3127f0b34a66088af0c6d77 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Fri, 5 Aug 2022 09:51:07 -0700 Subject: [PATCH 03/25] Implement get_location_source and get_organization_id --- Banyan/Project.toml | 2 + Banyan/src/Banyan.jl | 1 + Banyan/src/location.jl | 187 +++++++++++++++++++++++++++++++++++++---- Banyan/src/utils.jl | 21 +++++ 4 files changed, 194 insertions(+), 17 deletions(-) diff --git a/Banyan/Project.toml b/Banyan/Project.toml index 8551e6a2..02ad86b3 100644 --- a/Banyan/Project.toml +++ b/Banyan/Project.toml @@ -8,6 +8,7 @@ AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" AWSSQS = "6e80b5ca-5733-51f9-999e-c18680912812" +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" @@ -33,6 +34,7 @@ TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53" AWSCore = "0.6" AWSS3 = "0.7" AWSSQS = "0.6" +Arrow = "2" DataStructures = "0.18" Downloads = "^1.4" FileIO = "1.9.1" diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 97941e14..5a34287c 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -40,6 +40,7 @@ using AWSCore, Serialization, TOML +using AWS.AWSServices: s3 using S3: @service @service S3 diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 60b0663f..006752f5 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -93,6 +93,8 @@ end get_sample_path(lp::LocationPath, sample_rate::Int64) = get_sample_path_prefix(lp) * string(sample_rate) get_metadata_path(lp::LocationPath) = lp.path_hash +banyan_samples_bucket_name() = "banyan-samples-$(get_organization_id())" +banyan_metadata_bucket_name() = "banyan-metadata-$(get_organization_id())" Base.hash(lp::LocationPath) = lp.path_hash_uint @@ -108,25 +110,30 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = get_sample_rate(p::String; 
kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) +parse_sample_rate(object_key) = + parse(Int64, object_key[(findlast("_", object_key).start+1):end]) function get_sample_rate(l_path::LocationPath) # Get the desired sample rate desired_sample_rate = get_sampling_config(l_path).rate - # Find a cached sample with a similar sample ratBucket=e - # TODO: Just have a try/catch here so that if the bucket doesn't exist we just return the default - # TODO: Make the above code get used in location constructors for getting the desired sample rate sc = get_sampling_config(l_path) - pre = sc.force_new_sample_rate ? get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) + if sc.force_new_sample_rate + return desired_sample_rate + end + + # Find a cached sample with a similar sample rate + pre = get_sample_path_prefix(l_path) banyan_samples_objects = try - S3.list_objects_v2(Bucket="banyan_samples", prefix=pre)["Contents"] + res = S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=pre)["Contents"] + res isa Base.Vector ? res : [res] catch return desired_sample_rate end sample_rate = -1 for banyan_samples_object in banyan_samples_objects object_key = banyan_samples_object["Key"] - if startswith(object_key, banyan_samples_object_prefix) - object_sample_rate = parse(Int64, object_key[(findlast("_", object_key).start+1):end]) + if startswith(object_key, pre) + object_sample_rate = parse_sample_rate(object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) curr_sample_rate_diff = abs(object_sample_rate - sample_rate) if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff @@ -137,17 +144,9 @@ function get_sample_rate(l_path::LocationPath) sample_rate != -1 ? sample_rate : desired_sample_rate end -# function get_location(l_path::LocationPath) -# sessions_dict = get_sessions_dict() -# session_id = _get_session_id_no_error() -# desired_sample_rate = if haskey(sessions_dict, session_id) -# sampling_configs = sessions_dict[session_id].sampling_configs -# get(sampling_configs, l_path, sampling_configs[NO_LOCATION_PATH]). -# end - function has_metadata(l_path:: LocationPath)::Bool try - !isempty(S3.list_objects_v2(Bucket="banyan_metadata", prefix=get_metadata_path(l_path))["Contents"]) + !isempty(S3.list_objects_v2(Bucket=banyan_metadata_bucket_name(), prefix=get_metadata_path(l_path))["Contents"]) catch false end @@ -157,8 +156,162 @@ function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) pre = sc.force_new_sample_rate ? get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) try - !isempty(S3.list_objects_v2(Bucket="banyan_samples", prefix=pre)["Contents"]) + !isempty(S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=pre)["Contents"]) catch false end +end + +twodigit(i::Int64) = i < 10 ? ("0" * string(i)) : string(i) + +get_src_params_dict(d::Union{Nothing,Base.ImmutableDict{String, String}}) = + isnothing(d) ? 
Dict{String,String}() : Dict{String,String}(d) + +get_src_params_dict_from_arrow(p) = Arrow.Table(p) |> Arrow.getmetadata |> get_src_params_dict + +struct AWSExceptionInfo + is_aws::Bool + unmodified_since::Bool + not_found::Bool + + function AWSExceptionInfo(e) + is_aws = e isa AWSException && e.cause isa AWS.HTTP.ExceptionRequest.StatusError + new(is_aws, is_aws && e.cause.status == 304, is_aws && e.cause.status == 404) + end +end + +function get_location_source(lp::LocationPath)::Tuple{Location,String} + # Load in metadata + metadata_path = get_metadata_path(lp) + metadata_local_path = joinpath(homedir(), ".banyan", "metadata", metadata_path) + metadata_s3_path = "/$(banyan_metadata_bucket_name())/$metadata_path" + src_params::Dict{String, String} = if exists(metadata_local_path) + lm = Dates.unix2datetime(mtime(metadata_local_path)) + if_modified_since_string = + "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" + try + get_src_params_dict_from_arrow(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string)))) + catch e + if is_debug_on() + show(e) + end + ei = AWSExceptionInfo(e) + if ei.not_found + Dict{String, String}() + elseif ei.unmodified_since + get_src_params_dict_from_arrow(metadata_local_path) + else + @warn "Assumming locally stored metadata is invalid because of following error in accessing the metadata copy in the cloud" + show(e) + Dict{String, String}() + end + end + else + try + get_src_params_dict_from_arrow(s3("GET", metadata_s3_path)) + catch e + if is_debug_on() + show(e) + end + if !AWSExceptionInfo(e).not_found + @warn "Assumming metadata isn't copied in the cloud because of following error in attempted access" + show(e) + end + Dict{String, String}() + end + end + + # Load in sample + + sc = get_sampling_config() + force_new_sample_rate = sc.force_new_sample_rate + desired_sample_rate = sc.rate + sample_path_prefix = get_sample_path_prefix(lp) + + # Find local samples + found_local_samples = Tuple{String,Int64}[] + found_local_sample_rate_diffs = Int64[] + samples_local_dir = joinpath(homedir(), ".banyan", "samples") + for local_sample_path in readdir(samples_local_dir, join=true) + if startswith(local_sample_path, sample_path_prefix) + local_sample_rate = parse_sample_rate(object_key) + diff_sample_rate = abs(local_sample_rate - desired_sample_rate) + if !force_new_sample_rate || sample_rate_diff == 0 + push!(found_local_samples, (local_sample_path, local_sample_rate)) + push!(found_local_sample_rate_diffs, diff_sample_rate) + end + end + end + + # Sort in descending suitability (the most suitable sample is the one with sample + # rate closest to the desired sample rate) + found_local_samples = found_local_samples[sortperm(found_local_sample_rate_diffs)] + + # Find a local sample that is up-to-date + final_local_sample_path = "" + for (sample_local_path, sample_rate) in found_local_samples + lm = Dates.unix2datetime(mtime(sample_local_path)) + if_modified_since_string = + "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" + sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix$sample_rate" + try + blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) + write(sample_local_path, blob) # This overwrites the existing file + final_local_sample_path = sample_local_path + break 
+ catch e + if is_debug_on() + show(e) + end + ei = AWSExceptionInfo(e) + if ei.not_found + @warn "Assumming locally stored metadata is invalid because it is not backed up to the cloud" + elseif ei.unmodified_since + final_local_sample_path = sample_local_path + break + else + @warn "Assumming locally stored metadata is invalid because of following error in accessing the metadata copy in the cloud" + show(e) + end + end + end + + # If no such sample is found, search the S3 bucket + banyan_samples_objects = try + res = S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=sample_path_prefix)["Contents"] + res isa Base.Vector ? res : [res] + catch e + if is_debug_on() + show(e) + end + [] + end + banyan_samples_object_sample_rate = -1 + for banyan_samples_object in banyan_samples_objects + object_key = banyan_samples_object["Key"] + if startswith(object_key, banyan_samples_object_prefix) + object_sample_rate = parse_sample_rate(object_key) + object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) + curr_sample_rate_diff = abs(object_sample_rate - sample_rate) + if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff + banyan_samples_object_sample_rate = object_sample_rate + end + end + end + if banyan_samples_object_sample_rate != -1 + sample_path_suffix = "$sample_path_prefix$banyan_samples_object_sample_rate" + blob = s3("GET", "/$(banyan_samples_bucket_name())/$sample_path_suffix") + final_local_sample_path = joinpath(samples_local_dir, sample_path_suffix) + write(final_local_sample_path, blob) + end + + res_location = LocationSource( + get(src_params, "name", "Remote"), + src_params, + get(src_params, "total_memory_usage", 0), + NOTHING_SAMPLE + ) + res_location.parameters_invalid = isempty(src_params) + res_location.sample_invalid = isempty(final_local_sample_path) + (res_location, final_local_sample_path) end \ No newline at end of file diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 53b96b74..3b02e192 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -199,6 +199,25 @@ function configure(user_id, api_key, ec2_key_pair_name, banyanconfig_path) return banyan_config end +# Getting organization IDs + +organization_ids = Dict{String,String} +function get_organization_id() + global organization_ids + global sessions + user_id = configure()["banyan"]["user_id"] + session_id = _get_session_id_no_error() + if haskey(organization_ids, user_id) + organization_ids[user_id] + elseif haskey(sessions, session_id) + sessions[session_id].organization_ids + else + organization_id = send_request_get_response(:describe_users, Dict())["organization_id"] + organization_ids[user_id] = organization_id + organization_id + end +end + @specialize """ @@ -293,6 +312,8 @@ method_to_string(method::Symbol)::String = begin "update-cluster" elseif method == :set_cluster_ready "set-cluster-ready" + elseif method == :describe_users + "describe-users" end end From fbf648170cd1e12a5c7f59a997dec66651434f59 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Sat, 6 Aug 2022 19:51:39 -0700 Subject: [PATCH 04/25] Make changes to location constructor and PFs for BDF.jl --- Banyan/src/Banyan.jl | 6 +- Banyan/src/location.jl | 45 +++- Banyan/src/locations.jl | 20 +- Banyan/src/precompile.jl | 2 +- Banyan/src/queues.jl | 4 +- Banyan/src/requests.jl | 14 +- Banyan/src/sample.jl | 19 +- Banyan/src/samples.jl | 27 ++- Banyan/src/sessions.jl | 11 +- Banyan/src/utils_pfs.jl | 6 +- BanyanDataFrames/src/locations.jl | 256 +++++++++++++-------- BanyanDataFrames/src/pfs.jl | 
100 ++++---- BanyanDataFrames/src/utils_pfs.jl | 9 + BanyanDataFrames/test/sample_collection.jl | 11 +- BanyanHDF5/src/locations.jl | 4 +- BanyanImages/src/locations.jl | 16 +- BanyanImages/src/pfs.jl | 2 +- BanyanImages/test/pfs.jl | 4 +- 18 files changed, 335 insertions(+), 221 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 5a34287c..f51bad16 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -106,7 +106,7 @@ export get_remotepath_id, export LocationPath # Serialization -export from_jl_value_contents, to_jl_value_contents +export from_jl_string, to_jl_string # Queues export receive_from_client, send_to_client, get_sqs_dict_from_url @@ -171,8 +171,8 @@ export is_debug_on, get_partition_idx_from_divisions, isoverlapping, to_jl_value, - to_jl_value_contents, - from_jl_value_contents, + to_jl_string, + from_jl_string, get_divisions, getpath, buftovbuf, diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 006752f5..93e41110 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -10,7 +10,7 @@ mutable struct Location dst_parameters::LocationParameters total_memory_usage::Int64 sample::Sample - parameters_invalid::Bool + metadata_invalid::Bool sample_invalid::Bool # function Location( @@ -106,9 +106,11 @@ function get_sampling_configs() session_sampling_configs[_get_session_id_no_error()] end get_sampling_config(l_path::LocationPath)::SamplingConfig = - get(get_sampling_configs(), l_path, sampling_configs[NO_LOCATION_PATH]) + let scs = get_sampling_configs() + get(scs, l_path, scs[NO_LOCATION_PATH]) + end -get_sample_rate(p::String; kwargs...) = +get_sample_rate(p::String=""; kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) parse_sample_rate(object_key) = parse(Int64, object_key[(findlast("_", object_key).start+1):end]) @@ -116,6 +118,11 @@ function get_sample_rate(l_path::LocationPath) # Get the desired sample rate desired_sample_rate = get_sampling_config(l_path).rate + # If we just want the default sample rate or if a new sample rate is being + # forced, then just return that. + if isempty(l_path.path) + return desired_sample_rate + end sc = get_sampling_config(l_path) if sc.force_new_sample_rate return desired_sample_rate @@ -180,17 +187,24 @@ struct AWSExceptionInfo end end -function get_location_source(lp::LocationPath)::Tuple{Location,String} +function get_location_source(lp::LocationPath)::Tuple{Location,String,String} + # This checks local cache and S3 cache for sample and metadata files. + # It then returns a Location object (with a null sample) and the local file names + # to read/write the metadata and sample from/to. 
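    # NOTE: The lookups below implement a conditional GET against S3. The local copy's
    # mtime is formatted as an HTTP If-Modified-Since header; a 200 response means the
    # cloud copy is newer (download it and overwrite the local copy), a 304 means the
    # local copy is still current (reuse it), and a 404 means there is no copy in the
    # cloud (treat the cached metadata or sample as invalid).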
+ # Load in metadata metadata_path = get_metadata_path(lp) metadata_local_path = joinpath(homedir(), ".banyan", "metadata", metadata_path) metadata_s3_path = "/$(banyan_metadata_bucket_name())/$metadata_path" + src_params_not_stored_locally = false src_params::Dict{String, String} = if exists(metadata_local_path) lm = Dates.unix2datetime(mtime(metadata_local_path)) if_modified_since_string = "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" try - get_src_params_dict_from_arrow(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string)))) + d = get_src_params_dict_from_arrow(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string)))) + src_params_not_stored_locally = true + d catch e if is_debug_on() show(e) @@ -208,7 +222,9 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String} end else try - get_src_params_dict_from_arrow(s3("GET", metadata_s3_path)) + d = get_src_params_dict_from_arrow(s3("GET", metadata_s3_path)) + src_params_not_stored_locally = true + d catch e if is_debug_on() show(e) @@ -220,6 +236,10 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String} Dict{String, String}() end end + # Store metadata locally + if src_params_not_stored_locally && !isempty(d) + Arrow.write(metadata_local_path, Arrow.Table(); metadata=src_params) + end # Load in sample @@ -304,14 +324,19 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String} final_local_sample_path = joinpath(samples_local_dir, sample_path_suffix) write(final_local_sample_path, blob) end - + + # Construct and return LocationSource res_location = LocationSource( get(src_params, "name", "Remote"), src_params, - get(src_params, "total_memory_usage", 0), + parse(Int64, get(src_params, "total_memory_usage", "0")), NOTHING_SAMPLE ) - res_location.parameters_invalid = isempty(src_params) + res_location.metadata_invalid = isempty(src_params) res_location.sample_invalid = isempty(final_local_sample_path) - (res_location, final_local_sample_path) + ( + res_location, + metaata_local_path, + isempty(final_local_sample_path) ? final_local_sample_path : "sample_path_prefix$desired_sample_rate" + ) end \ No newline at end of file diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 12cd3888..f1564ed5 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -68,7 +68,7 @@ function sourced(fut::Future, loc::Location) # Otherwise just make a fresh new sample. Sample() end, - loc.parameters_invalid, + loc.metadata_invalid, loc.sample_invalid ), ) @@ -92,7 +92,7 @@ function sourced(fut::Future, loc::Location) # location if there is one. 
fut_location.sample end, - loc.parameters_invalid, + loc.metadata_invalid, loc.sample_invalid ), ) @@ -116,7 +116,7 @@ function destined(fut::Future, loc::Location) loc.dst_parameters, fut_location.total_memory_usage, Sample(), - loc.parameters_invalid, + loc.metadata_invalid, loc.sample_invalid ), ) @@ -131,7 +131,7 @@ function destined(fut::Future, loc::Location) loc.dst_parameters, fut_location.total_memory_usage, fut_location.sample, - fut_location.parameters_invalid, + fut_location.metadata_invalid, fut_location.sample_invalid ), ) @@ -219,7 +219,7 @@ Size(val)::Location = LocationSource( "Value", Dict{String,Any}("value" => to_jl_value(val)), 0, - Sample(indexapply(getsamplenrows, val, 1)), + Sample(indexapply(getsamplenrows, val, 1), 1), ) function Client(val::T)::Location where {T} @@ -313,7 +313,7 @@ _invalidate_metadata(remotepath) = let p = get_location_path(remotepath) if isfile(p) loc = deserialize_retry(p) - loc.parameters_invalid = true + loc.metadata_invalid = true serialize(p, loc) end end @@ -368,10 +368,10 @@ function get_cached_location(remotepath, remotepath_id, metadata_invalid, sample INVALID_LOCATION end curr_location.sample_invalid = curr_location.sample_invalid || sample_invalid - curr_location.parameters_invalid = curr_location.parameters_invalid || metadata_invalid + curr_location.metadata_invalid = curr_location.metadata_invalid || metadata_invalid curr_sample_invalid = curr_location.sample_invalid - curr_parameters_invalid = curr_location.parameters_invalid - curr_location, curr_sample_invalid, curr_parameters_invalid + curr_metadata_invalid = curr_location.metadata_invalid + curr_location, curr_sample_invalid, curr_metadata_invalid end get_cached_location(remotepath, metadata_invalid, sample_invalid) = @@ -381,7 +381,7 @@ function cache_location(remotepath, remotepath_id, location_res::Location, inval location_path = get_location_path(remotepath, remotepath_id) location_to_write = deepcopy(location_res) location_to_write.sample_invalid = location_to_write.sample_invalid || invalidate_sample - location_to_write.parameters_invalid = location_to_write.parameters_invalid || invalidate_metadata + location_to_write.metadata_invalid = location_to_write.metadata_invalid || invalidate_metadata serialize(location_path, location_to_write) end cache_location(remotepath, location_res::Location, invalidate_sample, invalidate_metadata) = diff --git a/Banyan/src/precompile.jl b/Banyan/src/precompile.jl index 9be89220..e3e89a2f 100644 --- a/Banyan/src/precompile.jl +++ b/Banyan/src/precompile.jl @@ -296,7 +296,7 @@ function _precompile_() precompile(download_remote_path, (String,)) precompile(download_remote_s3_path, (String,)) Base.precompile(Tuple{typeof(sqs_get_queue_with_retries),Dict{Symbol, Any},Vararg{Any}}) # time: 0.24037404 - precompile(to_jl_value_contents, (Function,)) + precompile(to_jl_string, (Function,)) # futures.jl precompile(create_new_future, (Location, Future, String)) diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index 6e231a7d..2cc6c23c 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -132,7 +132,7 @@ function receive_from_client(value_id::ValueId) ) # Receive response from client m = JSON.parse(get_next_message(get_scatter_queue())[1]) - v = from_jl_value_contents(m["contents"]::String) + v = from_jl_string(m["contents"]::String) v end @@ -153,7 +153,7 @@ end function send_to_client(value_id::ValueId, value, worker_memory_used = 0) MAX_MESSAGE_LENGTH = 220_000 - message = to_jl_value_contents(value)::String + message = 
to_jl_string(value)::String i = 1 while true is_last_message = length(message) <= MAX_MESSAGE_LENGTH diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 9aba3d95..6be89660 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -24,7 +24,7 @@ )::Tuple{Union{Nothing,String},Union{Nothing,DateTime}} value_id = message["value_id"]::ValueId if value_id == "-2" && isnothing(error_for_main_stuck_time) - error_for_main_stuck_msg::String = from_jl_value_contents(message["contents"]::String) + error_for_main_stuck_msg::String = from_jl_string(message["contents"]::String) if contains(error_for_main_stuck_msg, "session $(get_session_id())") error_for_main_stuck = error_for_main_stuck_msg error_for_main_stuck_time = Dates.now() @@ -277,7 +277,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n JSON.json( Dict{String,Any}( "value_id" => value_id, - "contents" => to_jl_value_contents(f.value) + "contents" => to_jl_string(f.value) ), ), ) @@ -296,7 +296,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n contents = get(partial_gathers, value_id, "") * message["contents"]::String # @debug "Received gather request for $value_id" if haskey(session.futures_on_client, value_id) - value = from_jl_value_contents(contents) + value = from_jl_string(contents) f = session.futures_on_client[value_id]::Future f.value = value # TODO: Update stale/mutated here to avoid costly @@ -623,11 +623,11 @@ end # Make the `offloaded` function on the client side keep looping and # (1) checking receive_next_message and # (2) checking for message[“kind”] == "GATHER" and -# (3) `break`ing and `return`ing the value (using `from_jl_value_contents(message["contents"])`) +# (3) `break`ing and `return`ing the value (using `from_jl_string(message["contents"])`) # if value_id == -1 # Make `offloaded` function in Banyan.jl # which calls evaluate passing in a string of bytes -# by serializing the given function (just call to_jl_value_contents on it) +# by serializing the given function (just call to_jl_string on it) # and passing it in with the parameter offloaded_function_code # # Make `offloaded` function specify @@ -642,7 +642,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) # doesn't need information about memory usage from intiial package loading. # Get serialized function - serialized::String = to_jl_value_contents((given_function, args)) + serialized::String = to_jl_string((given_function, args)) # Submit evaluation request !isempty(get_session().organization_id) || error("Organization ID not stored locally for this session") @@ -713,7 +713,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) # recompute the initial available memory every time we start a session # and this should presumably include the offloaded memory usage. 
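                # NOTE: `from_jl_string` (the renamed `from_jl_value_contents`) base64-decodes the
                # gathered string and deserializes it back into the value returned by the offloaded
                # function.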
get_session().worker_memory_used = get_session().worker_memory_used + memory_used - stored_message = from_jl_value_contents(contents) + stored_message = from_jl_string(contents) end error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(message, error_for_main_stuck, error_for_main_stuck_time) elseif (message_type == "EVALUATION_END") diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index 105ab327..1837e244 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -10,16 +10,16 @@ mutable struct Sample Sample() = new(nothing, objectid(nothing), 0, get_sample_rate(), Any[]) - Sample(value::Any) = - new(value, objectid(value), sample_memory_usage(value), get_sample_rate(), Any[]) - function Sample(value::Any, memory_usage::Int64) - sample_rate = get_sample_rate() - memory_usage = convert(Int64, round(memory_usage / sample_rate))::Int64 + # Sample(value::Any) = + # new(value, objectid(value), sample_memory_usage(value), get_sample_rate(), Any[]) + function Sample(value::Any, total_memory_usage::Int64, sample_rate::Int64) + # sample_rate = get_sample_rate() + memory_usage = convert(Int64, round(total_memory_usage / sample_rate))::Int64 new(value, objectid(value), memory_usage, sample_rate, Any[]) end - function Sample(value::Any, memory_usage::Int64, rate::Int64) + function Sample(value::Any, sample_rate::Int64) # This is only for the NOTHING_SAMPLE and ExactSample - new(value, objectid(value), memory_usage, rate, Any[]) + new(value, objectid(value), sample_memory_usage(value), sample_rate, Any[]) end end @@ -28,7 +28,8 @@ struct SamplingConfig always_exact::Bool max_num_bytes_exact::Int64 force_new_sample_rate::Bool + assume_shuffled::Bool end -const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("256 MB"), false) -session_sampling_configs = Dict{SessionId,SamplingConfig}("" => DEFAULT_SAMPLING_CONFIG) \ No newline at end of file +const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("256 MB"), false, true) +session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index a6210acd..7d93572d 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -1,30 +1,41 @@ function configure_sampling( path=""; - rate=nothing, + sample_rate=nothing, always_exact=nothing, max_num_bytes_exact=nothing, + force_new_sample_rate=nothing, + assume_shuffled=nothing, + for_all_locations=false, kwargs... ) global session_sampling_configs sc = get_sampling_config(path; kwargs...) nsc = SamplingConfig( - !isnothing(sc.rate) ? rate : sc.rate, - !isnothing(sc.always_exact) ? always_exact : sc.always_exact, - !isnothing(sc.max_num_bytes_exact) ? max_num_bytes_exact : sc.max_num_bytes_exact, - !isnothing(sc.force_new_sample_rate) ? force_new_sample_rate : sc.force_new_sample_rate, + !isnothing(sample_rate) ? rate : sc.rate, + !isnothing(always_exact) ? always_exact : sc.always_exact, + !isnothing(max_num_bytes_exact) ? max_num_bytes_exact : sc.max_num_bytes_exact, + !isnothing(force_new_sample_rate) ? force_new_sample_rate : sc.force_new_sample_rate, + !isnothing(assume_shuffled) ? assume_shuffled : sc.assume_shuffled, ) session_id = _get_session_id_no_error() lp = get_location_path_with_format(path; kwargs...) 
- session_sampling_configs[session_id][lp] = nsc + sampling_configs = session_sampling_configs[session_id] + if for_all_locations + empty!(sampling_configs) + sampling_configs[NO_LOCATION_PATH] = nsc + else + sampling_configs[lp] = nsc + end + end ############################################################### # Sample that caches properties returned by an AbstractSample # ############################################################### -ExactSample(value::Any) = Sample(value, sample_memory_usage(value), 1) +ExactSample(value::Any) = Sample(value, 1) ExactSample(value::Any, memory_usage::Int64) = Sample(value, memory_usage, 1) function setsample!(fut::Future, value::Any) @@ -188,7 +199,7 @@ function sample_max(A::T, key::K) where {T,K} isempty(A) ? nothing : _maximum(orderinghashes(A, key)) end -const NOTHING_SAMPLE = Sample(nothing, -1, -1) +const NOTHING_SAMPLE = Sample(nothing, UInt(0), Int64(-1), Int64(-1), Int64[]) Base.isnothing(s::Sample) = s.rate == -1 diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index ab3fd091..ea1945e5 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -319,10 +319,6 @@ function start_session(; nowait::Bool = true, email_when_ready::Union{Bool,Nothing} = nothing, for_running::Bool = false, - always_exact=nothing, - sample_rate=nothing, - max_num_bytes_exact=nothing, - force_new_sample_rate=nothing, kwargs..., )::SessionId # Should save 5ms of overhead @@ -336,12 +332,7 @@ function start_session(; # Configure configure(; kwargs...) - configure_sampling(; - always_exact=always_exact, - sample_rate=sample_rate, - max_num_bytes_exact=max_num_bytes_exact, - force_new_sample_rate=force_new_sample_rate - ) + configure_sampling(; kwargs...) current_session_id = _start_session( cluster_name, diff --git a/Banyan/src/utils_pfs.jl b/Banyan/src/utils_pfs.jl index ef82e907..9def168c 100644 --- a/Banyan/src/utils_pfs.jl +++ b/Banyan/src/utils_pfs.jl @@ -194,10 +194,10 @@ isoverlapping(a::AbstractRange, b::AbstractRange) = a.start ≤ b.stop && b.star @nospecialize -to_jl_value(jl) = Dict{String,Any}("is_banyan_value" => true, "contents" => to_jl_value_contents(jl)) +to_jl_value(jl) = Dict{String,Any}("is_banyan_value" => true, "contents" => to_jl_string(jl)) # NOTE: This function is shared between the client library and the PT library -function to_jl_value_contents(jl)::String +function to_jl_string(jl)::String # Handle functions defined in a module # TODO: Document this special case # if jl isa Function && !(isdefined(Base, jl) || isdefined(Core, jl) || isdefined(Main, jl)) @@ -211,7 +211,7 @@ function to_jl_value_contents(jl)::String end # NOTE: This function is shared between the client library and the PT library -function from_jl_value_contents(jl_value_contents::String) +function from_jl_string(jl_value_contents::String) # Converty string to Julia object io = IOBuffer() iob64_decode = Base64DecodePipe(io) diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 9da81827..36610a09 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -2,23 +2,31 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) -function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_invalid, invalidate_metadata, invalidate_sample, max_exact_sample_length)::Location - session_sample_rate = get_sample_rate() +function _remote_table_source(lp::LocationPath, loc::Location, 
sample_rate::Int64)::Location + # Setup for sampling + remotepath = lp.path + sampling_config = get_sampling_config(lp) + shuffled, max_num_bytes_exact = sampling_config.assume_shuffled, sampling_config.max_num_bytes_exact + # TODO: Replace `max_exact_sample_length` with `max_num_bytes_exact` is_main = is_main_worker() # Get cached Location and if it has valid parameters and sample, return - curr_location, curr_sample_invalid, curr_parameters_invalid = get_cached_location(remotepath, metadata_invalid, sample_invalid) - if !curr_parameters_invalid && !curr_sample_invalid - return curr_location + curr_metadata_invalid, curr_sample_invalid = loc.metadata_invalid, loc.sample_invalid + if !curr_metadata_invalid && !curr_sample_invalid + return loc end # There are two things we cache for each call `to _remote_table_source`: - # 1. A `Location` serialized to a `location_path` - # 2. Metadata stored in an Arrow file at `meta_path` + # 1. sample + # 2. metadata + + # Get paths for writing sample and metadata + metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" + sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" # Get metadata if it is still valid - curr_meta::Arrow.Table = if !curr_parameters_invalid - Arrow_Table_retry(curr_location.src_parameters["meta_path"]::String) + curr_meta::Arrow.Table = if !curr_metadata_invalid + Arrow_Table_retry(metadata_path) else Arrow.Table() end @@ -31,7 +39,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # Get list of local paths. Note that in the future when we support a list of # Internet locations, we will want to only call getpath laterin this code when/if # we actually read stuff in. - localpaths::Base.Vector{String}, remotepaths::Base.Vector{String} = if !curr_parameters_invalid + localpaths::Base.Vector{String}, remotepaths::Base.Vector{String} = if !curr_metadata_invalid remotepaths_res = convert(Base.Vector{String}, curr_meta[:path]) map(getpath, remotepaths_res), remotepaths_res else @@ -52,7 +60,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv String[localpath], String[remotepath] end end - curr_meta_nrows::Base.Vector{Int64} = !curr_parameters_invalid ? convert(Base.Vector{Int64}, curr_meta[:nrows]) : Int64[] + curr_meta_nrows::Base.Vector{Int64} = !curr_metadata_invalid ? 
convert(Base.Vector{Int64}, curr_meta[:nrows]) : Int64[] local_paths_on_curr_worker::Base.Vector{String} = split_across(localpaths) # Get format @@ -61,14 +69,14 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv format_has_separate_metadata = has_separate_metadata(format_value) # Get nrows, nbytes for each file in local_paths_on_curr_worker - meta_nrows_on_worker::Base.Vector{Int64} = if curr_parameters_invalid + meta_nrows_on_worker::Base.Vector{Int64} = if curr_metadata_invalid meta_nrows_on_worker_res = Base.zeros(length(local_paths_on_curr_worker)) - if format_has_separate_metadata - for (i, local_path_on_curr_worker) in enumerate(local_paths_on_curr_worker) - path_nrows_on_worker = get_metadata(format_value, local_path_on_curr_worker) - meta_nrows_on_worker_res[i] = path_nrows_on_worker - end - end + # if format_has_separate_metadata + # for (i, local_path_on_curr_worker) in enumerate(local_paths_on_curr_worker) + # path_nrows_on_worker = get_metadata(format_value, local_path_on_curr_worker) + # meta_nrows_on_worker_res[i] = path_nrows_on_worker + # end + # end # If this format doesn't have separate metadata, we will have to # read it in later along with the sample itself. meta_nrows_on_worker_res @@ -77,36 +85,39 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv end if Banyan.INVESTIGATING_COLLECTING_SAMPLES - println("In _remote_table_source on get_worker_idx()=$(get_worker_idx()) with curr_sample_invalid=$curr_sample_invalid, curr_parameters_invalid=$curr_parameters_invalid, localpaths=$localpaths, remotepaths=$remotepaths, local_paths_on_curr_worker=$local_paths_on_curr_worker, meta_nrows_on_worker=$meta_nrows_on_worker") + println("In _remote_table_source on get_worker_idx()=$(get_worker_idx()) with curr_sample_invalid=$curr_sample_invalid, curr_metadata_invalid=$curr_metadata_invalid, localpaths=$localpaths, remotepaths=$remotepaths, local_paths_on_curr_worker=$local_paths_on_curr_worker, meta_nrows_on_worker=$meta_nrows_on_worker") end # Compute the total # of rows so that if the current sample is invalid # we can determine whether to get an exact or inexact sample and # otherwise so that we can update the sample rate. - total_nrows_res = if curr_parameters_invalid - if format_has_separate_metadata - reduce_and_sync_across(+, sum(meta_nrows_on_worker)) - else + total_nrows_res = if curr_metadata_invalid + # if format_has_separate_metadata + # reduce_and_sync_across(+, sum(meta_nrows_on_worker)) + # else # For formats with metadata stored with the data (CSV), we # determine the # of rows later in the below case where # `!is_metadata_valid``. -1 - end + # end else - curr_location.src_parameters["nrows"] + parse(Int64, loc.src_parameters["nrows"]) end - exact_sample_needed = total_nrows_res < max_exact_sample_length + total_nbytes = curr_metadata_invalid ? -1 : parse(Int64, loc.src_parameters["total_memory_usage"]) + exact_sample_needed = sampling_config.always_exact || total_nbytes <= max_num_bytes_exact # inv: (a) `meta_nrows_on_worker`, (b) `total_nrows_res`, and # (c) `exact_sample_needed` are only valid if either the format has # separate metadata (like Parquet and Arrow) or the metadata is already # stored and valid. 
- is_metadata_valid = format_has_separate_metadata || !curr_parameters_invalid + # NOTE: Actually - we changed this because we no longer use + # is_metadata_valid = format_has_separate_metadata || !curr_metadata_invalid + is_metadata_valid = !curr_metadata_invalid # If the metadata isn't valid then we anyway have to read in all the data # so we can't leverage the data being shuffled by only reading in some of the files shuffled = shuffled && is_metadata_valid && !exact_sample_needed - # Get sample and also metadata if not yet valid at this point + # Get sample and also metadata if not yet valid!curr_metadata_invalid at this point recollected_sample_needed = curr_sample_invalid || !is_metadata_valid if Banyan.INVESTIGATING_COLLECTING_SAMPLES println("In _remote_table_source on get_worker_idx()=$(get_worker_idx()) with is_metadata_valid=$is_metadata_valid, shuffled = $shuffled, recollected_sample_needed=$recollected_sample_needed") @@ -126,7 +137,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv perm_for_shuffling = randperm(length(meta_nrows_on_worker)) shuffled_meta_nrows_on_worker = meta_nrows_on_worker[perm_for_shuffling] nrows_on_worker_so_far = 0 - nrows_on_worker_target = cld(sum(meta_nrows_on_worker), session_sample_rate) + nrows_on_worker_target = cld(sum(meta_nrows_on_worker), sample_rate) nfiles_on_worker_res = 0 for nrows_on_worker in shuffled_meta_nrows_on_worker nrows_on_worker_so_far += nrows_on_worker @@ -151,11 +162,11 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv let df = get_sample( format_value, local_path_on_curr_worker, - (shuffled || exact_sample_needed) ? 1.0 : session_sample_rate, + (shuffled || exact_sample_needed) ? 1.0 : sample_rate, meta_nrows_for_worker[i]::Int64 ) if Banyan.INVESTIGATING_COLLECTING_SAMPLES - println("Sampling on get_worker_idx()=$(get_worker_idx()) from local_path_on_curr_worker=$local_path_on_curr_worker with session_sample_rate=$session_sample_rate with meta_nrows_for_worker[i]=$(meta_nrows_for_worker[i]) and i=$i with nrow(df)=$(DataFrames.nrow(df)) and nrows_extra_on_worker=$nrows_extra_on_worker") + println("Sampling on get_worker_idx()=$(get_worker_idx()) from local_path_on_curr_worker=$local_path_on_curr_worker with sample_rate=$sample_rate with meta_nrows_for_worker[i]=$(meta_nrows_for_worker[i]) and i=$i with nrow(df)=$(DataFrames.nrow(df)) and nrows_extra_on_worker=$nrows_extra_on_worker") end if shuffled && i == nfiles_on_worker && nrows_extra_on_worker > 0 df[1:(end-nrows_extra_on_worker), :] @@ -175,24 +186,28 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # just have been read from the Arrow metadata file. local_nrows = 0 - for exact_sample_needed_res in [false, true] + for exact_sample_needed_res in (sampling_config.always_exact ? [true] : [false, true]) # First see if we can get a random (inexact sample). empty!(local_samples) local_nrows = 0 + local_nbytes = 0 for (i, local_path_on_curr_worker) in enumerate(local_paths_on_curr_worker) + path_sample_rate = exact_sample_needed_res ? 1.0 : sample_rate path_sample, path_nrows = get_sample_and_metadata( format_value, local_path_on_curr_worker, - exact_sample_needed_res ? 
1.0 : session_sample_rate + path_sample_rate ) meta_nrows_on_worker[i] = path_nrows push!(local_samples, path_sample) local_nrows += path_nrows + local_nbytes += ceil(Int64, total_memory_usage(path_sample) * path_sample_rate) end total_nrows_res = reduce_and_sync_across(+, local_nrows) + total_nbytes_res = reduce_and_sync_across(+, local_nbytes) # If the sample is too small, redo it, getting an exact sample - if !exact_sample_needed_res && total_nrows_res < max_exact_sample_length + if !exact_sample_needed_res && total_nbytes_res < max_exact_sample_length exact_sample_needed = true exact_sample_needed_res = true else @@ -207,7 +222,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv local_sample::DataFrames.DataFrame = isempty(local_samples) ? DataFrames.DataFrame() : vcat(local_samples...) # Concatenate local samples and nrows together - remote_sample_value::DataFrames.DataFrame, meta_nrows_on_workers::Base.Vector{Int64} = if curr_parameters_invalid + remote_sample_value::DataFrames.DataFrame, meta_nrows_on_workers::Base.Vector{Int64} = if curr_metadata_invalid sample_and_meta_nrows_per_worker::Base.Vector{Tuple{DataFrames.DataFrame,Base.Vector{Int64}}} = gather_across((local_sample, meta_nrows_on_worker)) if is_main @@ -239,7 +254,7 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # Return final Sample on main worker now that we have gathered both the sample and metadata if is_main - empty_sample_value_serialized::String = to_jl_value_contents(empty(remote_sample_value)) + empty_sample_value_serialized::String = to_arrow_string(empty(remote_sample_value)) # Convert dataframe to a buffer storing Arrow-serialized data. # Then when we receive this on the client side we can simply @@ -254,12 +269,12 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv total_nbytes_res = if exact_sample_needed remote_sample_value_memory_usage else - ceil(Int64, remote_sample_value_memory_usage * session_sample_rate) + ceil(Int64, remote_sample_value_memory_usage * sample_rate) end remote_sample_value_nrows = nrow(remote_sample_value) if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE @show total_nrows_res remote_sample_value_nrows - @show remote_sample_value_memory_usage total_nbytes_res session_sample_rate + @show remote_sample_value_memory_usage total_nbytes_res sample_rate end remote_sample_res::Sample = if exact_sample_needed # Technically we don't need to be passing in `total_bytes_res` @@ -269,11 +284,11 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # constructors ExactSample(remote_sample_value_arrow, total_nbytes_res) else - Sample(remote_sample_value_arrow, total_nbytes_res) + Sample(remote_sample_value_arrow, total_nbytes_res, sample_rate) end meta_nrows_on_workers, total_nrows_res, total_nbytes_res, remote_sample_res, empty_sample_value_serialized else - Base.zeros(length(localpaths)), -1, -1, NOTHING_SAMPLE, to_jl_value_contents(DataFrames.DataFrame()) + Base.zeros(length(localpaths)), -1, -1, NOTHING_SAMPLE, to_arrow_string(DataFrames.DataFrame()) end else # This case is entered if we the format has metadata stored @@ -287,38 +302,72 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv if is_main meta_nrows_res::Base.Vector{Int64} = vcat(meta_nrows_per_worker...) 
- # Get the total # of bytes - cached_remote_sample_res::Sample = curr_location.sample - remote_sample_value_nrows = nrow(cached_remote_sample_res.value) - remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_res.value) - if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE - @show remote_sample_value_nbytes remote_sample_value_nrows total_nrows_res - end + # # Get the total # of bytes + # cached_remote_sample_res = Sample( + # DataFrames.DataFrame(Arrow.Table("s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)")), + # sample_rate + # ) + # remote_sample_value_nrows = nrow(cached_remote_sample_res.value) + # remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_res.value) + # if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE + # @show remote_sample_value_nbytes remote_sample_value_nrows total_nrows_res + # end + # total_nbytes_res = ceil(Int64, remote_sample_value_nbytes * total_nrows_res / remote_sample_value_nrows) + + # # Update the sample's sample rate and memory usage based on the + # # new # of rows (since the metadata with info about # of rows + # # has been invalidated) + # cached_remote_sample_res.rate = ceil(Int64, total_nrows_res / remote_sample_value_nrows) + # cached_remote_sample_res.memory_usage = ceil(Int64, total_nbytes_res / cached_remote_sample_res.rate)::Int64 + # if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE + # @show sample_rate total_nbytes_res cached_remote_sample_res.memory_usage + # end + + cached_remote_sample_value = DataFrames.DataFrame(Arrow.Table(sample_path)) + remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_value) + remote_sample_value_nrows = DataFrames.nrow(cached_remote_sample_value) total_nbytes_res = ceil(Int64, remote_sample_value_nbytes * total_nrows_res / remote_sample_value_nrows) + cached_remote_sample_res = NOTHING_SAMPLE - # Update the sample's sample rate and memory usage based on the - # new # of rows (since the metadata with info about # of rows - # has been invalidated) - cached_remote_sample_res.rate = ceil(Int64, total_nrows_res / remote_sample_value_nrows) - cached_remote_sample_res.memory_usage = ceil(Int64, total_nbytes_res / cached_remote_sample_res.rate)::Int64 - if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE - @show cached_remote_sample_res.rate total_nbytes_res cached_remote_sample_res.memory_usage - end - - meta_nrows_res, total_nrows_res, total_nbytes_res, cached_remote_sample_res, curr_location.src_parameters["empty_sample"] + meta_nrows_res, total_nrows_res, total_nbytes_res, cached_remote_sample_res, loc.src_parameters["empty_sample"] else - Base.zeros(length(localpaths)), -1, -1, NOTHING_SAMPLE, to_jl_value_contents(DataFrames.DataFrame()) + Base.zeros(length(localpaths)), -1, -1, NOTHING_SAMPLE, to_arrow_string(DataFrames.DataFrame()) end end # If a file does not exist, one of the get_metadata/get_sample functions # will error. - # Write the metadata to an Arrow file - meta_path = is_main ? 
get_meta_path(remotepath) : "" - if curr_parameters_invalid + # Get source parameters + src_params = + Dict( + "name" => "Remote", + "total_memory_usage" => string(total_nbytes), + # For dispatching the appropriate PF for this format + "format" => format_string, + # For constructing the `BanyanDataFrames.DataFrame`'s `nrows::Future` field + "nrows" => string(total_nrows), + # For diagnostics purposes in PFs (partitioning functions) + "path" => remotepath, + # For PFs to read from this source + # TODO + "empty_sample" => empty_sample + ) + + # Write the metadata to S3 cache if previously invalid + if curr_metadata_invalid # Write `NamedTuple` with metadata to `meta_path` with `Arrow.write` - Arrow.write(is_main ? meta_path : IOBuffer(), (path=remotepaths, nrows=meta_nrows), compress=:zstd) + Arrow.write( + is_main ? metadata_path : IOBuffer(), + (path=remotepaths, nrows=meta_nrows); + compress=:zstd, + metadata=src_params + ) + end + + # Write the sample to S3 cache if previously invalid + if curr_sample_invalid + write(sample_path, remote_sample.value.data) end if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND @@ -327,54 +376,73 @@ function _remote_table_source(remotepath, shuffled, metadata_invalid, sample_inv # println("At end of _remote_table_source on get_worker_idx()=$(MPI.Initialized() ? get_worker_idx() : -1)") - # Return LocationSource + # Return LocationSource to client side if is_main # Construct the `Location` to return if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE @show total_nbytes end - location_res = LocationSource( + LocationSource( "Remote", - Dict( - # For dispatching the appropriate PF for this format - "format" => format_string, - # For constructing the `BanyanDataFrames.DataFrame`'s `nrows::Future` field - "nrows" => total_nrows, - # For diagnostics purposes in PFs (partitioning functions) - "path" => remotepath, - # For location constructor to use as caching - "meta_path" => meta_path, - # For PFs to read from this source - "empty_sample" => empty_sample - ), + src_params, total_nbytes, remote_sample ) - - # Write out the updated `Location` - cache_location(remotepath, location_res, invalidate_sample, invalidate_metadata) - - location_res else NOTHING_LOCATION end end -RemoteTableSource(remotepath; shuffled=true, metadata_invalid = false, sample_invalid = false, invalidate_metadata = false, invalidate_sample = false, max_exact_sample_length = Banyan.get_max_exact_sample_length())::Location = - let loc = offloaded( - _remote_table_source, - remotepath, - shuffled, - metadata_invalid, - sample_invalid, - invalidate_metadata, - invalidate_sample, - max_exact_sample_length; - distributed=true - ) - loc.sample.value = loc.sample.value |> seekstart |> Arrow.Table |> DataFrames.DataFrame +load_arrow_sample(f) = f |> Arrow.Table |> DataFrames.DataFrame +load_arrow_sample_from_buf(iobuf) = iobuf |> seekstart |> load_arrow_sample + +# TODO: Modify offloaded function to: +# - Use get_sampling_config() to get sample rate, shuffled, max_num_bytes_exact +# - Use the passed in location to get info about validity of metdata and samples +# - Use the passed in location to avoid reading from S3 to get the location +# - Use the LocationPath to get_sample_rate properly here and elsewhere +# - Write sample file and metadata file to S3 if needed +# - Parse string values of location metadata +# - Keep empty_sample but make it be a string of Arrow data with a to/from_arrow_value +# - Return location with sample and metadata + +function 
RemoteTableSource(remotepath)::Location + lp = LocationPath(remotepath, "arrow", "2") + + # Look at local and S3 caches of metadata and samples to attempt to + # construct a Location. + loc, local_metadata_path, local_sample_path = get_location_source(lp) + + if !loc.metadata_invalid && !loc.sample_invalid + # Case where both sample and parameters are valid + loc.sample.value = load_arrow_sample(local_sample_path) loc + elseif loc.metadata_invalid && !loc.sample_invalid + # Case where parameters are invalid + new_loc = offloaded(_remote_table_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + new_loc.sample.value = load_arrow_sample(local_sample_path) + new_loc + else + # Case where sample is invalid + + # Get the Location with up-to-date metadata (source parameters) and sample + new_loc = offloaded(_remote_table_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + + if !loc.metadata_invalid + # Store the metadata locally. The local copy just has the source + # parameters but PFs can still access the S3 copy which will have the + # table of file names and #s of rows. + Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + end + + # Store the Arrow sample locally and update the returned Sample + write(local_sample_path, new_loc.sample.value.data) + new_loc.sample.value = load_arrow_sample_from_buf(new_loc.sample.value) + + new_loc end +end # Load metadata for writing # NOTE: `remotepath` should end with `.parquet` or `.csv` if Parquet diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index 98027383..0c877289 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -196,13 +196,14 @@ function ReadBlockHelper(@nospecialize(format_value)) end loc_params_path = loc_params[symbol_path]::String + lp = LocationPath(loc_params_path, "arrow", "2") balanced = params[symbol_balanced] - m_path = loc_name == symbol_Disk ? sync_across(is_main_worker(comm) ? get_meta_path(loc_params_path) : "", comm=comm) : loc_params["meta_path"]::String - loc_params = loc_name == symbol_Disk ? (Banyan.deserialize_retry(get_location_path(loc_params_path))::Location).src_parameters : loc_params + m_path = "s3/$(banyan_metadata_bucket_name())/$(Banyan.get_metadata_path(lp))" + loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND @show (m_path, loc_params, get_worker_idx()) end - meta = Arrow_Table_retry(m_path) + # meta = Arrow_Table_retry(m_path) filtering_op = get(params, symbol_filtering_op, identity) # Handle multi-file tabular datasets @@ -216,8 +217,8 @@ function ReadBlockHelper(@nospecialize(format_value)) # [1] https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing # Initialize - meta_nrows = meta.nrows - meta_path = meta.path + meta_nrows = loc_params["nrows"] + meta_path = loc_params["path"] nworkers = get_nworkers(comm) npartitions = nbatches * nworkers partition_idx = get_partition_idx(batch_idx, nbatches, comm) @@ -359,7 +360,7 @@ function ReadBlockHelper(@nospecialize(format_value)) res = if isempty(dfs) # When we construct the location, we store an empty data frame with The # correct schema. 
- from_jl_value_contents(loc_params["empty_sample"]) + from_arrow_string(loc_params["empty_sample"]) elseif length(dfs) == 1 dfs[1] else @@ -398,6 +399,7 @@ function WriteHelper(@nospecialize(format_value)) # Get path of directory to write to is_disk = loc_name == "Disk" loc_params_path = loc_params["path"]::String + lp = LocationPath(loc_params_path, "arrow", "2") path::String = loc_params_path if startswith(path, "http://") || startswith(path, "https://") error("Writing to http(s):// is not supported") @@ -483,9 +485,13 @@ function WriteHelper(@nospecialize(format_value)) # Get paths for reading in metadata and Location tmp_suffix = nbatches > 1 ? ".tmp" : "" - m_path = is_main ? get_meta_path(loc_params_path * tmp_suffix) : "" - location_path = is_main ? get_location_path(loc_params_path * tmp_suffix) : "" - m_path, location_path = sync_across((m_path, location_path), comm=comm) + lp_tmp = LocationPath(loc_params_path * tmp_suffix, "arrow", "2") + # m_path = is_main ? get_meta_path() : "" + # location_path = is_main ? get_location_path(loc_params_path * tmp_suffix) : "" + # m_path, location_path = sync_across((m_path, location_path), comm=comm) + m_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp_tmp))" + s_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp_tmp))$sample_rate" + # loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params # Read in meta path if it's there curr_remotepaths, curr_nrows = if nbatches > 1 && batch_idx > 1 @@ -498,31 +504,34 @@ function WriteHelper(@nospecialize(format_value)) # Read in the current location if it's there empty_df = DataFrames.DataFrame() - curr_location::Location = if nbatches > 1 && batch_idx > 1 - Banyan.deserialize_retry(location_path) + curr_metadata_tbl = if nbatches > 1 && batch_idx > 1 + Arrow.Table(m_path) else - LocationSource( - "Remote", - Dict( - "format" => format_string, - "nrows" => 0, - "path" => loc_params_path, - "meta_path" => m_path, - "empty_sample" => to_jl_value_contents(empty_df) - ), - 0, - ExactSample(empty_df, 0) + Arrow.Table() + end + curr_src_parameters = if nbatches > 1 && batch_idx > 1 + Dict{String,String}(Arrow.getmetadata(curr_metadata_tbl)) + else + Dict( + "name" => "Remote", + "total_memory_usage" => "0", + "format" => format_string, + "nrows" => "0", + "path" => loc_params_path, + "empty_sample" => to_arrow_string(empty_df), ) end # Gather # of rows, # of bytes, empty sample, and actual sample nbytes = part_res isa Empty ? 0 : Banyan.total_memory_usage(part_res) - sample_rate = get_sample_rate() + sampling_config = get_sampling_config(lp) + sample_rate = sampling_config.rate sampled_part = (part_res isa Empty || is_disk) ? empty_df : Banyan.get_sample_from_data(part_res, sample_rate, nrows) gathered_data = gather_across((nrows, nbytes, part_res isa Empty ? part_res : empty(part_res), sampled_part), comm) # On the main worker, finalize metadata and location info. 
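# NOTE: A minimal sketch (not part of this patch) of the parse/string round-trip used below; this patch keeps source-parameter values as strings so they can be written as Arrow file metadata, so counters like "nrows" and "total_memory_usage" are parsed, updated, and re-stringified. The helper name is hypothetical.
function bump_string_counter!(src_params::Dict{String,String}, key::String, delta::Int64)
    # Parse the stored string, add the delta, and store it back as a string.
    src_params[key] = string(parse(Int64, src_params[key]) + delta)
    src_params
end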
+ sample_invalid = false if is_main # Determine paths and #s of rows for metadata file for worker_i in 1:nworkers @@ -538,48 +547,43 @@ function WriteHelper(@nospecialize(format_value)) end # Update the # of bytes - total_nrows::Int64 = curr_location.src_parameters["nrows"] + total_nrows::Int64 = parse(Int64, curr_src_parameters["nrows"]) + total_memory_usage::Int64 = parse(Int64, curr_src_parameters["total_memory_usage"]) empty_sample_found = false for (new_nrows::Int64, new_nbytes::Int64, empty_part, sampled_part) in gathered_data # Update the total # of rows and the total # of bytes total_nrows += sum(new_nrows) push!(curr_nrows, new_nrows) - curr_location.total_memory_usage += new_nbytes + total_memory_usage += new_nbytes # Get the empty sample if !empty_sample_found && !(empty_part isa Empty) - curr_location.src_parameters["empty_sample"] = to_jl_value_contents(empty_part) + curr_src_parameters["empty_sample"] = to_arrow_string(empty_part) empty_sample_found = true end end - curr_location.src_parameters["nrows"] = total_nrows + curr_src_parameters["nrows"] = string(total_nrows) + curr_src_parameters["total_memory_usage"] = string(total_memory_usage) + + if !is_disk && batch_idx == nbatches && total_memory_usage <= sampling_config.max_num_bytes_exact + # If the total # of rows turns out to be inexact then we can simply mark it as + # stale so that it can be collected more efficiently later on + # We should be able to quickly recompute a more useful sample later + # on when we need to use this location. + sample_invalid = true + end # Get the actual sample by concatenating - curr_location.sample = if is_disk - Sample() - else + if !is_disk && !sample_invalid sampled_parts = [gathered[4] for gathered in gathered_data] if batch_idx > 1 push!(sampled_parts, curr_location.sample.value |> seekstart |> Arrow.Table |> DataFrames.DataFrame) end - new_sample_value_arrow = IOBuffer() - Arrow.write(new_sample_value_arrow, vcat(sampled_parts...), compress=:zstd) - Sample(new_sample_value_arrow, curr_location.total_memory_usage) + Arrow.write(s_path, vcat(sampled_parts...), compress=:zstd) end # Determine paths for this batch and gather # of rows - Arrow.write(m_path, (path=curr_remotepaths, nrows=curr_nrows), compress=:zstd) - - if !is_disk && batch_idx == nbatches && total_nrows <= get_max_exact_sample_length() - # If the total # of rows turns out to be inexact then we can simply mark it as - # stale so that it can be collected more efficiently later on - # We should be able to quickly recompute a more useful sample later - # on when we need to use this location. 
- curr_location.sample_invalid = true - end - - # Write out the updated `Location` - serialize(location_path, curr_location) + Arrow.write(m_path, (path=curr_remotepaths, nrows=curr_nrows); compress=:zstd, metadata=curr_src_parameters) end ################################### @@ -588,11 +592,11 @@ function WriteHelper(@nospecialize(format_value)) if nbatches > 1 && batch_idx == nbatches # Copy over location and meta path - actual_meta_path = get_meta_path(loc_params_path) - actual_location_path = get_location_path(loc_params_path) + actual_meta_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" + actual_sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))$sample_rate" if worker_idx == 1 cp(m_path, actual_meta_path, force=true) - cp(location_path, actual_location_path, force=true) + cp(s_path, actual_sample_path, force=true) end # Copy over files to actual location diff --git a/BanyanDataFrames/src/utils_pfs.jl b/BanyanDataFrames/src/utils_pfs.jl index 5ce0112d..1839e606 100644 --- a/BanyanDataFrames/src/utils_pfs.jl +++ b/BanyanDataFrames/src/utils_pfs.jl @@ -1,3 +1,12 @@ +function to_arrow_string(df::DataFrames.DataFrame)::String + io = IOBuffer() + Arrow.write(io, df) + base64encode(seekstart(io)) +end + +from_arrow_string(s::String)::DataFrames.DataFrame = + s |> base64decode |> Arrow.Table |> DataFrames.DataFrame + const AnyDataFrame = Union{ DataFrames.DataFrame, SubDataFrame{DataFrames.DataFrame, DataFrames.Index, Base.Vector{Int64}}, diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index dff0e175..1634501e 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -35,12 +35,17 @@ # Construct location if reusing != "nothing" RemoteTableSource(src_name, invalidate_metadata = true, invalidate_sample = true) + invalidate_location(src_name) RemoteTableSource(src_name, metadata_invalid = true, sample_invalid = true) end + if (reusing == "nothing" || reusing == "sample") + invalidate_metadata(src_name) + end + if (reusing == "nothing" || reusing == "location") + invalidate_sample(src_name) + end remote_source = RemoteTableSource( src_name, - metadata_invalid = (reusing == "nothing" || reusing == "sample"), - sample_invalid = (reusing == "nothing" || reusing == "location"), shuffled = with_or_without_shuffled == "with", max_exact_sample_length = max_exact_sample_length ) @@ -48,7 +53,7 @@ # Verify the location @test remote_source.total_memory_usage > 0 - @test !remote_source.parameters_invalid + @test !remote_source.metadata_invalid @test !remote_source.sample_invalid @test remote_source.src_parameters["nrows"] == src_nrows # if contains(src_name, "dir") diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 537bf573..75f7d5e9 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -33,8 +33,8 @@ function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sampl is_main = worker_idx == 1 # Get current location - curr_location, curr_sample_invalid, curr_parameters_invalid = get_cached_location(path_and_subpath, metadata_invalid, sample_invalid) - if !curr_parameters_invalid && !curr_sample_invalid + curr_location, curr_sample_invalid, curr_metadata_invalid = get_cached_location(path_and_subpath, metadata_invalid, sample_invalid) + if !curr_metadata_invalid && !curr_sample_invalid return curr_location end diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 
eb244b33..f1ab4144 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -173,7 +173,7 @@ # # Serialize generator # if isnothing(remote_source) -# files = remotepath isa Tuple ? Banyan.to_jl_value_contents(remotepath) : files_to_read_from +# files = remotepath isa Tuple ? Banyan.to_jl_string(remotepath) : files_to_read_from # end # empty_part_size = (0, (datasize[2:end])...) @@ -188,7 +188,7 @@ # "ndims" => ndims, # "size" => datasize, # "eltype" => dataeltype, -# "emptysample" => to_jl_value_contents(Base.Array{dataeltype}(undef, empty_part_size)), +# "emptysample" => to_jl_string(Base.Array{dataeltype}(undef, empty_part_size)), # "format" => format, # "add_channelview" => add_channelview # ), @@ -287,8 +287,8 @@ function _remote_image_source( is_main = worker_idx == 1 # Get current location - curr_location, curr_sample_invalid, curr_parameters_invalid = get_cached_location((remotepath, add_channelview), remotepath_id, metadata_invalid, sample_invalid) - if !curr_parameters_invalid && !curr_sample_invalid + curr_location, curr_sample_invalid, curr_metadata_invalid = get_cached_location((remotepath, add_channelview), remotepath_id, metadata_invalid, sample_invalid) + if !curr_metadata_invalid && !curr_sample_invalid return curr_location end @@ -302,12 +302,12 @@ function _remote_image_source( # other is each iterated element and return a single path # Iterable object that iterates over local paths - meta_path = if !curr_parameters_invalid + meta_path = if !curr_metadata_invalid curr_location.src_parameters["meta_path"]::String else is_main ? get_meta_path((remotepath, add_channelview), remotepath_id) : "" end - if is_main && curr_parameters_invalid + if is_main && curr_metadata_invalid localpaths::Base.Vector{String} = getpaths(remotepath) Arrow.write(meta_path, (path=localpaths,)) end @@ -361,7 +361,7 @@ function _remote_image_source( # Construct location with metadata location_res = LocationSource( "Remote", - if curr_parameters_invalid + if curr_metadata_invalid empty_part_size = (0, (datasize_res[2:end])...) Dict{String,Any}( "meta_path" => meta_path, @@ -370,7 +370,7 @@ function _remote_image_source( "ndims" => ndims_res, "size" => datasize_res, "eltype" => dataeltype_res, - "empty_sample" => to_jl_value_contents(Base.Array{dataeltype_res}(undef, empty_part_size)), + "empty_sample" => to_arrow_string(Base.Array{dataeltype_res}(undef, empty_part_size)), "add_channelview" => add_channelview, "format" => "image" ) diff --git a/BanyanImages/src/pfs.jl b/BanyanImages/src/pfs.jl index 97d7afe0..c8edf0e8 100644 --- a/BanyanImages/src/pfs.jl +++ b/BanyanImages/src/pfs.jl @@ -66,7 +66,7 @@ ReadBlockImage( loc_params["meta_path"]::String, loc_params["nimages"]::Int64, loc_params["size"], - Banyan.from_jl_value_contents(loc_params["empty_sample"]::String), + Banyan.from_jl_string(loc_params["empty_sample"]::String), loc_params["add_channelview"] ) diff --git a/BanyanImages/test/pfs.jl b/BanyanImages/test/pfs.jl index ecaa05e8..c53994e6 100644 --- a/BanyanImages/test/pfs.jl +++ b/BanyanImages/test/pfs.jl @@ -33,7 +33,7 @@ # datasize = add_channelview ? (nimages, 3, 100, 100) : (nimages, 100, 100) # empty_part_size = add_channelview ? (0, 3, 100, 100) : (0, 100, 100) # elseif format == "generator" -# files = Banyan.to_jl_value_contents(path) +# files = Banyan.to_jl_string(path) # datasize = add_channelview ? (nimages, 3, 512, 512) : (nimages, 512, 512) # empty_part_size = add_channelview ? 
(0, 3, 512, 512) : (0, 512, 512) # elseif format == "path" @@ -66,7 +66,7 @@ # "ndims" => 3, # "size" => datasize, # Inaccurate value # "eltype" => dataeltype, -# "empty_sample" => Banyan.to_jl_value_contents(Base.Array{dataeltype}(undef, empty_part_size)), +# "empty_sample" => Banyan.to_jl_string(Base.Array{dataeltype}(undef, empty_part_size)), # "format" => filetype, # "add_channelview" => add_channelview # ), From d5c11b38bc29ccbe196fbabf0ad90e7504650854 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Sun, 7 Aug 2022 14:49:41 -0700 Subject: [PATCH 05/25] Refactor RemoteTableSource into RemoteSource --- Banyan/src/Banyan.jl | 2 +- Banyan/src/locations.jl | 46 ++++++++++++++++ BanyanDataFrames/src/locations.jl | 89 +++++++++++-------------------- BanyanDataFrames/src/pfs.jl | 11 ++-- 4 files changed, 83 insertions(+), 65 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index f51bad16..2550c4db 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -92,7 +92,7 @@ export SamplingConfig # Locations export Location, LocationSource, LocationDestination, located, sourced, destined -export Value, Size, Client, Disk, None +export Value, Size, Client, Disk, None, RemoteSource export invalidate_all_locations, invalidate_metadata, invalidate_sample export NOTHING_LOCATION, INVALID_LOCATION export has_separate_metadata, get_sample, get_metadata, get_sample_and_metadata diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index f1564ed5..670f1d5c 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -427,4 +427,50 @@ end function get_sample_and_metadata(::Val{:jl}, p, sample_rate) data = deserialize_retry(p) get_sample_from_data(data, sample_rate, size(data, 1)), size(data, 1) +end + +function RemoteSource( + lp::LocationPath, + _remote_source::Function, + load_sample::Function, + load_sample_from_blob::Function, + write_sample::Function +)::Location + # _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int64)::Location + # load_sample accepts a file path + # load_sample_from_blob accepts an array of bytes + + # Look at local and S3 caches of metadata and samples to attempt to + # construct a Location. + loc, local_metadata_path, local_sample_path = get_location_source(lp) + + if !loc.metadata_invalid && !loc.sample_invalid + # Case where both sample and parameters are valid + loc.sample.value = load_sample(local_sample_path) + loc + elseif loc.metadata_invalid && !loc.sample_invalid + # Case where parameters are invalid + new_loc = offloaded(_remote_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + new_loc.sample.value = load_sample(local_sample_path) + new_loc + else + # Case where sample is invalid + + # Get the Location with up-to-date metadata (source parameters) and sample + new_loc = offloaded(_remote_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + + if !loc.metadata_invalid + # Store the metadata locally. The local copy just has the source + # parameters but PFs can still access the S3 copy which will have the + # table of file names and #s of rows. 
+ Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + end + + # Store the Arrow sample locally and update the returned Sample + write_sample(local_sample_path, new_loc.sample.value) + new_loc.sample.value = load_sample_from_blob(new_loc.sample.value) + + new_loc + end end \ No newline at end of file diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 36610a09..e29b73ee 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -262,7 +262,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int6 # latency for retrieving metadata/samples for BDF.jl. io = IOBuffer() Arrow.write(io, remote_sample_value, compress=:zstd) - remote_sample_value_arrow = io + remote_sample_value_arrow = io.data # Construct Sample with the concatenated value, memory usage, and sample rate remote_sample_value_memory_usage = total_memory_usage(remote_sample_value) @@ -354,30 +354,31 @@ function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int6 "empty_sample" => empty_sample ) - # Write the metadata to S3 cache if previously invalid - if curr_metadata_invalid - # Write `NamedTuple` with metadata to `meta_path` with `Arrow.write` - Arrow.write( - is_main ? metadata_path : IOBuffer(), - (path=remotepaths, nrows=meta_nrows); - compress=:zstd, - metadata=src_params - ) - end + if is_main + # Write the metadata to S3 cache if previously invalid + if curr_metadata_invalid + # Write `NamedTuple` with metadata to `meta_path` with `Arrow.write` + Arrow.write( + is_main ? metadata_path : IOBuffer(), + (path=remotepaths, nrows=meta_nrows); + compress=:zstd, + metadata=src_params + ) + end - # Write the sample to S3 cache if previously invalid - if curr_sample_invalid - write(sample_path, remote_sample.value.data) - end + # Write the sample to S3 cache if previously invalid + if curr_sample_invalid + write(sample_path, remote_sample.value.data) + end - if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND - @show (remotepath, meta_path) - end + if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND + @show (remotepath, meta_path) + end - # println("At end of _remote_table_source on get_worker_idx()=$(MPI.Initialized() ? get_worker_idx() : -1)") + # println("At end of _remote_table_source on get_worker_idx()=$(MPI.Initialized() ? get_worker_idx() : -1)") + + # Return LocationSource to client specified - # Return LocationSource to client side - if is_main # Construct the `Location` to return if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE @show total_nbytes @@ -394,7 +395,6 @@ function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int6 end load_arrow_sample(f) = f |> Arrow.Table |> DataFrames.DataFrame -load_arrow_sample_from_buf(iobuf) = iobuf |> seekstart |> load_arrow_sample # TODO: Modify offloaded function to: # - Use get_sampling_config() to get sample rate, shuffled, max_num_bytes_exact @@ -406,43 +406,14 @@ load_arrow_sample_from_buf(iobuf) = iobuf |> seekstart |> load_arrow_sample # - Keep empty_sample but make it be a string of Arrow data with a to/from_arrow_value # - Return location with sample and metadata -function RemoteTableSource(remotepath)::Location - lp = LocationPath(remotepath, "arrow", "2") - - # Look at local and S3 caches of metadata and samples to attempt to - # construct a Location. 
- loc, local_metadata_path, local_sample_path = get_location_source(lp) - - if !loc.metadata_invalid && !loc.sample_invalid - # Case where both sample and parameters are valid - loc.sample.value = load_arrow_sample(local_sample_path) - loc - elseif loc.metadata_invalid && !loc.sample_invalid - # Case where parameters are invalid - new_loc = offloaded(_remote_table_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) - Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) - new_loc.sample.value = load_arrow_sample(local_sample_path) - new_loc - else - # Case where sample is invalid - - # Get the Location with up-to-date metadata (source parameters) and sample - new_loc = offloaded(_remote_table_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) - - if !loc.metadata_invalid - # Store the metadata locally. The local copy just has the source - # parameters but PFs can still access the S3 copy which will have the - # table of file names and #s of rows. - Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) - end - - # Store the Arrow sample locally and update the returned Sample - write(local_sample_path, new_loc.sample.value.data) - new_loc.sample.value = load_arrow_sample_from_buf(new_loc.sample.value) - - new_loc - end -end +RemoteTableSource(remotepath)::Location = + RemoteSource( + LocationPath(remotepath, "arrow", "2"), + _remote_table_source, + load_arrow_sample, + load_arrow_sample, + write + ) # Load metadata for writing # NOTE: `remotepath` should end with `.parquet` or `.csv` if Parquet diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index 0c877289..aa4f453c 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -199,7 +199,8 @@ function ReadBlockHelper(@nospecialize(format_value)) lp = LocationPath(loc_params_path, "arrow", "2") balanced = params[symbol_balanced] m_path = "s3/$(banyan_metadata_bucket_name())/$(Banyan.get_metadata_path(lp))" - loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params + m_tbl = Arrow_Table_retry(m_path) + loc_params = loc_name == symbol_Disk ? 
Dict{String,String}(Arrow.getmetadata(m_tbl)) : loc_params if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND @show (m_path, loc_params, get_worker_idx()) end @@ -217,12 +218,12 @@ function ReadBlockHelper(@nospecialize(format_value)) # [1] https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing # Initialize - meta_nrows = loc_params["nrows"] - meta_path = loc_params["path"] + meta_nrows = m_tbl.nrows + meta_path = m_tbl.path nworkers = get_nworkers(comm) npartitions = nbatches * nworkers partition_idx = get_partition_idx(batch_idx, nbatches, comm) - nrows::Int64 = loc_params[symbol_nrows]::Int64 + nrows::Int64 = meta_nrows rows_per_partition = cld(nrows, npartitions) sorting_perm = sortperm(meta_nrows, rev=true) files_by_partition = Base.Vector{Int64}[] @@ -312,7 +313,7 @@ function ReadBlockHelper(@nospecialize(format_value)) ndfs = 0 rowsscanned = 0 files_to_read = [] - for file in Tables.rows(meta) + for file in Tables.rows(m_tbl) path = file[1] path_nrows = file[2] newrowsscanned = rowsscanned + path_nrows From 347d1f8f8789ccd42db236a2a2adfe33aca09cad Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Sun, 7 Aug 2022 16:20:46 -0700 Subject: [PATCH 06/25] Modify BanyanHDF5.jl to use new sample caching system --- Banyan/src/location.jl | 2 + Banyan/src/locations.jl | 14 +++--- BanyanDataFrames/src/df.jl | 2 +- BanyanDataFrames/src/locations.jl | 7 ++- BanyanHDF5/src/hdf5.jl | 5 +-- BanyanHDF5/src/locations.jl | 72 +++++++++++++++--------------- BanyanONNXRunTime/src/locations.jl | 1 - 7 files changed, 54 insertions(+), 49 deletions(-) diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 93e41110..457ed3b1 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -62,6 +62,8 @@ struct LocationPath format_version ) end + + LocationPath(path) = LocationPath(path, "jl", get_julia_version())`` end global TABLE_FORMATS = ["csv", "parquet", "arrow"] diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 670f1d5c..8379a0e5 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -433,16 +433,20 @@ function RemoteSource( lp::LocationPath, _remote_source::Function, load_sample::Function, - load_sample_from_blob::Function, + load_sample_after_offloaded::Function, write_sample::Function )::Location # _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int64)::Location # load_sample accepts a file path - # load_sample_from_blob accepts an array of bytes + # load_sample_after_offloaded accepts the sampled value returned by the offloaded function + # (for BDF.jl, this is an Arrow blob of bytes that needs to be converted into an actual + # dataframe once sent to the client side) # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. 
loc, local_metadata_path, local_sample_path = get_location_source(lp) + sc = get_sampling_config(lp) + sc.rate = parse_sample_rate(local_sample_path) if !loc.metadata_invalid && !loc.sample_invalid # Case where both sample and parameters are valid @@ -450,7 +454,7 @@ function RemoteSource( loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid - new_loc = offloaded(_remote_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + new_loc = offloaded(_remote_source, lp, loc, sc; distributed=true) Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) new_loc.sample.value = load_sample(local_sample_path) new_loc @@ -458,7 +462,7 @@ function RemoteSource( # Case where sample is invalid # Get the Location with up-to-date metadata (source parameters) and sample - new_loc = offloaded(_remote_source, lp, loc, parse_sample_rate(local_sample_path); distributed=true) + new_loc = offloaded(_remote_source, lp, loc, sc; distributed=true) if !loc.metadata_invalid # Store the metadata locally. The local copy just has the source @@ -469,7 +473,7 @@ function RemoteSource( # Store the Arrow sample locally and update the returned Sample write_sample(local_sample_path, new_loc.sample.value) - new_loc.sample.value = load_sample_from_blob(new_loc.sample.value) + new_loc.sample.value = load_sample_after_offloaded(new_loc.sample.value) new_loc end diff --git a/BanyanDataFrames/src/df.jl b/BanyanDataFrames/src/df.jl index 9ccede94..485a0d12 100644 --- a/BanyanDataFrames/src/df.jl +++ b/BanyanDataFrames/src/df.jl @@ -51,7 +51,7 @@ function read_table(path::String; kwargs...) @nospecialize df_loc = RemoteTableSource(path; kwargs...) df_loc.src_name == "Remote" || error("$path does not exist") - df_loc_nrows::Int64 = df_loc.src_parameters["nrows"] + df_loc_nrows::Int64 = parse(Int64, df_loc.src_parameters["nrows"]) df_nrows = Future(df_loc_nrows) DataFrame(Future(datatype="DataFrame", source=df_loc), df_nrows) end diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index e29b73ee..ab369514 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -2,10 +2,9 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) -function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int64)::Location +function _remote_table_source(lp::LocationPath, loc::Location, sampling_config::SamplingConfig)::Location # Setup for sampling remotepath = lp.path - sampling_config = get_sampling_config(lp) shuffled, max_num_bytes_exact = sampling_config.assume_shuffled, sampling_config.max_num_bytes_exact # TODO: Replace `max_exact_sample_length` with `max_num_bytes_exact` is_main = is_main_worker() @@ -359,7 +358,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int6 if curr_metadata_invalid # Write `NamedTuple` with metadata to `meta_path` with `Arrow.write` Arrow.write( - is_main ? 
metadata_path : IOBuffer(), + metadata_path, (path=remotepaths, nrows=meta_nrows); compress=:zstd, metadata=src_params @@ -423,7 +422,7 @@ RemoteTableDestination(remotepath)::Location = "Remote", Dict( "format" => get_file_ending(remotepath), - "nrows" => 0, + "nrows" => "0", "path" => remotepath, ), ) \ No newline at end of file diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index 30e20ad5..8f3d1979 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -2,9 +2,8 @@ function read_hdf5(path; kwargs...) A_loc = RemoteHDF5Source(path; kwargs...) A_loc.src_name == "Remote" || error("$path does not exist") A = Future(datatype="Array", source=A_loc) - A_loc_size = A_loc.src_parameters["size"] - A_loc_eltype = A_loc.src_parameters["eltype"] - A_loc_ndims = A_loc.src_parameters["ndims"] + A_loc_eltype, A_loc_size = Banyan.from_jl_string(A_loc.src_parameters["eltype_and_size"]) + A_loc_ndims = length(A_loc_size) BanyanArrays.Array{A_loc_eltype,A_loc_ndims}(A, Future(A_loc_size)) end diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 75f7d5e9..2ad32aa7 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -26,16 +26,20 @@ end HDF5_getindex_retry = retry(HDF5.getindex; delays=Base.ExponentialBackOff(; n=5)) -function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sample_invalid, invalidate_metadata, invalidate_sample, max_exact_sample_length) +function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig) + path_and_subpath = lp.path + shuffled = sc.assume_shuffled + curr_metadata_invalid = loc.metadata_invalid + curr_sample_invalid = loc.sample_invalid + # Get session information - session_sample_rate = get_sample_rate() + sample_rate = sc.rate worker_idx, nworkers = get_worker_idx(), get_nworkers() is_main = worker_idx == 1 # Get current location - curr_location, curr_sample_invalid, curr_metadata_invalid = get_cached_location(path_and_subpath, metadata_invalid, sample_invalid) if !curr_metadata_invalid && !curr_sample_invalid - return curr_location + return loc end # Download the path @@ -85,8 +89,8 @@ function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sampl # Read in the sample on each worker and # aggregate and concatenate it on the main worker rand_indices_range = split_len(datalength, worker_idx, nworkers) - rand_indices = sample_from_range(rand_indices_range, session_sample_rate) - exact_sample_needed = datalength < max_exact_sample_length + rand_indices = sample_from_range(rand_indices_range, sample_rate) + exact_sample_needed = nbytes < sc.max_num_bytes_exact remaining_colons = Base.fill(Colon(), datandims-1) dset_sample_value = if !exact_sample_needed samples_on_workers = gather_across( @@ -125,48 +129,47 @@ function _remote_hdf5_source(path_and_subpath, shuffled, metadata_invalid, sampl NOTHING_SAMPLE end else - curr_location.sample + NOTHING_SAMPLE end # Close HDF5 file close(f) if is_main - location_res = LocationSource( - "Remote", - Dict{String,Any}( - "path_and_subpath" => path_and_subpath, - "path" => remotepath, - "subpath" => datasetpath, - "size" => datasize, - "ndims" => datandims, - "eltype" => dataeltype, - "nbytes" => nbytes, - "format" => "hdf5" - ), - nbytes, - dset_sample, + # Construct parameters for Location + src_params = Dict{String,String}( + "name" => "Remote", + "path_and_subpath" => path_and_subpath, + "path" => remotepath, + "subpath" => datasetpath, + "eltype_and_size" => Banyan.to_jl_string((dataeltype, datasize)), + 
"total_memory_usage" => string(nbytes), + "format" => "hdf5" ) - cache_location(remotepath, location_res, invalidate_sample, invalidate_metadata) - location_res + + # Get paths to store metadata and sample in + metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" + sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" + + # Store metadata and sample in S3 + Arrow.write(metadata_path; metadata=src_params) + serialize(sample_path, dset_sample) + + # Return Location to client side + LocationSource("Remote", src_params, nbytes, dset_sample) else INVALID_LOCATION end end -function RemoteHDF5Source(remotepath; shuffled=false, metadata_invalid = false, sample_invalid = false, invalidate_metadata = false, invalidate_sample = false, max_exact_sample_length = Banyan.get_max_exact_sample_length())::Location - offloaded( +RemoteHDF5Source(remotepath)::Location = + RemoteSource( + LocationPath(remotepath), _remote_hdf5_source, - remotepath, - shuffled, - metadata_invalid, - sample_invalid, - invalidate_metadata, - invalidate_sample, - max_exact_sample_length; - distributed=true + deserialize, + identity, + serialize ) -end function RemoteHDF5Destination(remotepath)::Location path_and_subpath = remotepath @@ -178,7 +181,6 @@ function RemoteHDF5Destination(remotepath)::Location "path" => remotepath, "subpath" => datasetpath, "path_and_subpath" => path_and_subpath, - "nbytes" => 0, "format" => "hdf5" ) ) diff --git a/BanyanONNXRunTime/src/locations.jl b/BanyanONNXRunTime/src/locations.jl index 613b7a66..812ad8f1 100644 --- a/BanyanONNXRunTime/src/locations.jl +++ b/BanyanONNXRunTime/src/locations.jl @@ -19,7 +19,6 @@ function RemoteONNXSource(remotepath)::Location loc_for_reading = "Remote" metadata_for_reading = Dict{String,Any}( "path" => remotepath, - "nbytes" => nbytes, "format" => "onnx", "datatype" => "ONNX" ) From ac9d1d15eb16d47e479ef0cc74d1904ca9605f82 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Sun, 7 Aug 2022 18:19:58 -0700 Subject: [PATCH 07/25] Modify BanyanImages.jl to use new sample caching system --- Banyan/src/locations.jl | 7 +- Banyan/src/utils.jl | 53 ++++++++++++- BanyanHDF5/src/hdf5.jl | 2 + BanyanHDF5/src/locations.jl | 27 ++++--- BanyanImages/src/image.jl | 11 +-- BanyanImages/src/locations.jl | 138 +++++++++++++++++----------------- BanyanImages/src/pfs.jl | 16 ++-- 7 files changed, 158 insertions(+), 96 deletions(-) diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 8379a0e5..8c4a0378 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -434,7 +434,8 @@ function RemoteSource( _remote_source::Function, load_sample::Function, load_sample_after_offloaded::Function, - write_sample::Function + write_sample::Function, + args... 
)::Location # _remote_table_source(lp::LocationPath, loc::Location, sample_rate::Int64)::Location # load_sample accepts a file path @@ -454,7 +455,7 @@ function RemoteSource( loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid - new_loc = offloaded(_remote_source, lp, loc, sc; distributed=true) + new_loc = offloaded(_remote_source, lp, loc, sc, args...; distributed=true) Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) new_loc.sample.value = load_sample(local_sample_path) new_loc @@ -462,7 +463,7 @@ function RemoteSource( # Case where sample is invalid # Get the Location with up-to-date metadata (source parameters) and sample - new_loc = offloaded(_remote_source, lp, loc, sc; distributed=true) + new_loc = offloaded(_remote_source, lp, loc, sc, args...; distributed=true) if !loc.metadata_invalid # Store the metadata locally. The local copy just has the source diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 3b02e192..6a609027 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -637,4 +637,55 @@ exponential_backoff_1s = # 0.20068919503553564 # 0.29422854986603664 # 0.4414150248213825 -# ``` \ No newline at end of file +# ```` + +invert(my_dict::AbstractDict) = Dict(value => key for (key, value) in my_dict) + +TYPE_TO_STR = + Dict{DataType,String}( + Int8 => "int8", + Int16 => "int16", + Int32 => "int32", + Int64 => "int64", + Int128 => "int128", + Float16 => "float16", + Float32 => "float32", + Float64 => "float64", + String => "str", + Bool => "bool", + ) + +STR_TO_TYPE = invert(TYPE_TO_STR) + +function type_to_str(ty::DataType)::String + global TYPE_TO_STR + if haskey(TYPE_TO_STR, ty) + TYPE_TO_STR[ty] + else + "lang_jl_" * to_jl_string(ty) + end +end + +function type_from_str(s::String) + if startswith(s, "lang_") + if startswith(s, "lang_jl_") + from_jl_string(s[4:end]) + else + error("Cannot parse type $s from non-Julia language") + end + elseif haskey(TYPE_TO_STR, s) + TYPE_TO_STR[s] + else + error("Type not supported. You may need to update to the latest version of Banyan or declare the data/sample/metadata you are accessing invalid.") + end +end + +size_to_str(sz) = join(map(string, sz), ",") +size_from_str(s) = + let sz_strs = split(s, ",") + res = Vector{Int64}(undef, length(sz_strs)) + for (i, sz_str) in enumerate(sz_strs) + res[i] = parse(Int64, sz_str) + end + Tuple(res) + end \ No newline at end of file diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index 8f3d1979..d7f8959a 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -3,6 +3,8 @@ function read_hdf5(path; kwargs...) 
A_loc.src_name == "Remote" || error("$path does not exist") A = Future(datatype="Array", source=A_loc) A_loc_eltype, A_loc_size = Banyan.from_jl_string(A_loc.src_parameters["eltype_and_size"]) + A_loc_eltype = Banyan.type_from_str(A_loc.src_parameters["eltype"]) + A_loc_size = Banyan.size_from_str(A_loc.src_parameters["size"]) A_loc_ndims = length(A_loc_size) BanyanArrays.Array{A_loc_eltype,A_loc_ndims}(A, Future(A_loc_size)) end diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 2ad32aa7..1dc78be0 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -123,7 +123,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig if exact_sample_needed ExactSample(dset_sample_value, nbytes) else - Sample(dset_sample_value, nbytes) + Sample(dset_sample_value, nbytes, sample_rate) end else NOTHING_SAMPLE @@ -137,22 +137,27 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig if is_main # Construct parameters for Location - src_params = Dict{String,String}( - "name" => "Remote", - "path_and_subpath" => path_and_subpath, - "path" => remotepath, - "subpath" => datasetpath, - "eltype_and_size" => Banyan.to_jl_string((dataeltype, datasize)), - "total_memory_usage" => string(nbytes), - "format" => "hdf5" - ) + src_params = if curr_metadata_invalid + Dict{String,String}( + "name" => "Remote", + "path_and_subpath" => path_and_subpath, + "path" => remotepath, + "subpath" => datasetpath, + "eltype" => Banyan.size_to_str(dataszie), + "size" => Banyan.type_to_str(dataeltype), + "total_memory_usage" => string(nbytes), + "format" => "hdf5" + ) + else + loc.src_parameters + end # Get paths to store metadata and sample in metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" # Store metadata and sample in S3 - Arrow.write(metadata_path; metadata=src_params) + Arrow.write(metadata_path, Arrow.Table(); metadata=src_params) serialize(sample_path, dset_sample) # Return Location to client side diff --git a/BanyanImages/src/image.jl b/BanyanImages/src/image.jl index 34f63b5c..77e4cd1a 100644 --- a/BanyanImages/src/image.jl +++ b/BanyanImages/src/image.jl @@ -1,10 +1,11 @@ -function read_png(path; kwargs...) - image_loc = RemoteImageSource(path; kwargs...) +function read_png(path; add_channelview=false) + image_loc = RemoteImageSource(path, add_channelview) image_loc.src_name == "Remote" || error("$path does not exist") image = Future(;source=image_loc, datatype="Array") - image_loc_eltype = image_loc.src_parameters["eltype"] - image_loc_ndims = image_loc.src_parameters["ndims"] - BanyanArrays.Array{image_loc_eltype,image_loc_ndims}(image, Future(image_loc.src_parameters["size"])) + image_loc_eltype = type_from_str(image_loc.src_parameters["eltype"]) + image_loc_size = size_from_str(image_loc.src_parameters["size"]) + image_loc_ndims = length(image_loc_size) + BanyanArrays.Array{image_loc_eltype,image_loc_ndims}(image, Future(image_loc_size)) end read_jpg(p; kwargs...) = read_png(p; kwargs...) 
diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index f1ab4144..4d084f2a 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -272,24 +272,17 @@ _load_image_and_add_channelview(path_on_worker::String) = load_retry(path_on_wor _reshape_image(image) = reshape(image, (1, size(image)...)) -function _remote_image_source( - remotepath, - remotepath_id, - metadata_invalid, - sample_invalid, - invalidate_metadata, - invalidate_sample, - add_channelview -) +function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfig, remotepath, add_channelview::Bool) + curr_sample_invalid = loc.sample_invalid + curr_metadata_invalid = loc.metadata_invalid + # Get session information - session_sample_rate = get_sample_rate() worker_idx, nworkers = get_worker_idx(), get_nworkers() is_main = worker_idx == 1 # Get current location - curr_location, curr_sample_invalid, curr_metadata_invalid = get_cached_location((remotepath, add_channelview), remotepath_id, metadata_invalid, sample_invalid) if !curr_metadata_invalid && !curr_sample_invalid - return curr_location + return loc end # Remote path is either @@ -301,28 +294,23 @@ function _remote_image_source( # that operates on two arguments where one is the object and the # other is each iterated element and return a single path - # Iterable object that iterates over local paths - meta_path = if !curr_metadata_invalid - curr_location.src_parameters["meta_path"]::String - else - is_main ? get_meta_path((remotepath, add_channelview), remotepath_id) : "" - end - if is_main && curr_metadata_invalid - localpaths::Base.Vector{String} = getpaths(remotepath) - Arrow.write(meta_path, (path=localpaths,)) - end - meta_path = sync_across(meta_path) + # Get paths to store metadata and sample in + metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" + sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$(sc.rate))" - # Load in the metadata and get the # of images - meta_table = Arrow_Table_retry(meta_path) - nimages = Tables.rowcount(meta_table) + # Iterable object that iterates over local paths + localpaths = curr_metadata_invalid ? getpaths(remotepath) : Arrow.Table(metadata_path).path + nimages = length(localpaths) # Read in images on each worker. We need to read in at least one image # regardless of whether we want to get the sample or the metadata - exact_sample_needed = nimages < 10 + _load_img = add_channelview ? _load_image_and_add_channelview : _load_image + first_img = is_main ? (localpaths[1] |> _load_img |> _reshape_image) : nothing + exact_sample_needed = is_main ? ((total_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false + exact_sample_needed = sync_across(exact_sample_needed) need_to_parallelize = nimages >= 10 total_num_images_to_read_in = if curr_sample_invalid - exact_sample_needed ? nimages : cld(nimages, session_sample_rate) + exact_sample_needed ? nimages : cld(nimages, sc.rate) else # We still have to read in an image even if we have a valid sample # because to get the metadata we need at least one image. @@ -332,9 +320,21 @@ function _remote_image_source( # If we don't need to paralellize then we are only reading on the main # worker amd we don't gather across. images_range_on_worker = need_to_parallelize ? split_len(total_num_images_to_read_in, worker_idx, nworkers) : 1:1 - paths_on_worker = map(getpath, meta_table.path[images_range_on_worker]) - images = map(add_channelview ? 
_load_image_and_add_channelview : _load_image, paths_on_worker) - sample_on_worker = map(_reshape_image, images) + first_img_usable = false + if images_range_on_worker.start == 1 && !isnothing(first_img) + first_img_usable = true + images_range_on_worker = 2:(images_range_on_worker.stop) + end + sample_on_worker = if length(images_range_on_worker) > 0 + paths_on_worker = map(getpath, localpaths[images_range_on_worker]) + images = map(_load_img, paths_on_worker) + map(_reshape_image, images) + else + [] + end + if first_img_usable + push!(sample_on_worker, first_img) + end # sample_on_worker is an array of images need_to_parallelize ? gather_across(sample_on_worker) : [sample_on_worker] # result is an array of arrays of images @@ -348,58 +348,62 @@ function _remote_image_source( # though if we only need the sample we don't technically need the # metadata) remote_sample_value = cat(vcat(samples_on_workers...)..., dims=1) - ndims_res = ndims(remote_sample_value) dataeltype_res = eltype(remote_sample_value) nbytes_res = cld(length(remote_sample_value) * sizeof(dataeltype_res) * nimages, total_num_images_to_read_in) datasize_res = indexapply(nimages, size(remote_sample_value), 1) remote_sample = if curr_sample_invalid - exact_sample_needed ? ExactSample(remote_sample_value, nbytes_res) : Sample(remote_sample_value, nbytes_res) + if exact_sample_needed + ExactSample(remote_sample_value, nbytes_res) + else + Sample(remote_sample_value, nbytes_res, sc.rate) + end else - curr_location.sample + NOTHING_SAMPLE + end + + src_parameters = if curr_metadata_invalid + Dict{String,Any}( + "name" => "Remote", + "nimages" => string(nimages), + "total_memory_usage" => string(nbytes_res), # NOTE: We assume all files have same size + "size" => size_to_str(datasize_res), + "eltype" => type_to_str(dataeltype_res), + "add_channelview" => add_channelview ? "1" : "0", + "format" => "image" + ) + else + curr_location.src_parameters + end + + # Store metadata and sample in S3 + if curr_metadata_invalid + Arrow.write(metadata_path, (path=localpaths,); metadata=src_params) + end + if curr_sample_invalid + serialize(sample_path, remote_sample) end # Construct location with metadata - location_res = LocationSource( - "Remote", - if curr_metadata_invalid - empty_part_size = (0, (datasize_res[2:end])...) - Dict{String,Any}( - "meta_path" => meta_path, - "nimages" => nimages, - "nbytes" => nbytes_res, # NOTE: We assume all files have same size - "ndims" => ndims_res, - "size" => datasize_res, - "eltype" => dataeltype_res, - "empty_sample" => to_arrow_string(Base.Array{dataeltype_res}(undef, empty_part_size)), - "add_channelview" => add_channelview, - "format" => "image" - ) - else - curr_location.src_parameters - end, - nbytes_res, - remote_sample, - ) - cache_location(remotepath, remotepath_id, location_res, invalidate_sample, invalidate_metadata) - location_res + LocationSource("Remote", src_parameters, nbytes_res, remote_sample) else INVALID_LOCATION end end -function RemoteImageSource(remotepath; metadata_invalid = false, sample_invalid = false, invalidate_metadata = false, invalidate_sample = false, add_channelview=false)::Location - offloaded( +RemoteImageSource(remotepath, add_channelview)::Location = + RemoteSource( + LocationPath( + remotepath isa String ? remotepath : "lang_jl_$(hash(remotepath))", + add_channelview ? 
"jl_channelview" : "jl", + Banyan.get_julia_version() + ), _remote_image_source, + deserialize, + identity, + serialize, remotepath, - Banyan.get_remotepath_id(remotepath), - metadata_invalid, - sample_invalid, - invalidate_metadata, - invalidate_sample, - add_channelview; - distributed=true + add_channelview ) -end # TODO: Implement writing diff --git a/BanyanImages/src/pfs.jl b/BanyanImages/src/pfs.jl index c8edf0e8..a01989d8 100644 --- a/BanyanImages/src/pfs.jl +++ b/BanyanImages/src/pfs.jl @@ -12,8 +12,7 @@ function ReadBlockImageHelper( meta_path::String, nimages::Int64, datasize, - empty_sample, - add_channelview::Bool + add_channelview::Int64 ) # path = Banyan.getpath(loc_params["path"]) ? isa(loc_params["path"], String) : path # ndims = loc_params["ndims"] @@ -31,13 +30,13 @@ function ReadBlockImageHelper( files_sub = meta_table.path[filerange] part_size = (length(files_sub), (datasize)[2:end]...) - empty_sample_eltype = eltype(empty_sample) - images = Base.Array{empty_sample_eltype}(undef, part_size) + elty = Banyan.type_from_str(loc_params["eltype"]) + images = Base.Array{elty}(undef, part_size) # TODO: Make it so that the Arrow file only contains the paths and the local paths are computed here for (i, f) in enumerate(files_sub) filepath = Banyan.getpath(f) image = load_retry(filepath) - if add_channelview + if add_channelview == 1 image = ImageCore.channelview(image) images[i, :, :, :] = image else @@ -64,10 +63,9 @@ ReadBlockImage( loc_name, loc_params, loc_params["meta_path"]::String, - loc_params["nimages"]::Int64, - loc_params["size"], - Banyan.from_jl_string(loc_params["empty_sample"]::String), - loc_params["add_channelview"] + parse(Int64, loc_params["nimages"]), + Banyan.size_from_str(loc_params["size"]), + parse(Int64, loc_params["add_channelview"]) ) # function WriteImage( From 08c5fbfa9b66c8c09a6efeeb48b8bd0e4e962204 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Mon, 8 Aug 2022 06:13:57 -0700 Subject: [PATCH 08/25] Implement invalidation functions, update exports, add kwargs to all read functions --- Banyan/src/Banyan.jl | 15 +- Banyan/src/location.jl | 10 +- Banyan/src/locations.jl | 181 +++++++++++---------- Banyan/src/sample.jl | 2 +- Banyan/src/samples.jl | 11 +- Banyan/test/runtests.jl | 6 +- BanyanArrays/test/runtests.jl | 4 - BanyanDataFrames/src/df.jl | 2 + BanyanDataFrames/src/locations.jl | 2 +- BanyanDataFrames/test/sample_collection.jl | 9 +- BanyanDataFrames/test/utils_sessions.jl | 4 - BanyanHDF5/src/hdf5.jl | 2 + BanyanHDF5/src/pfs.jl | 9 +- BanyanHDF5/test/runtests.jl | 4 - BanyanHDF5/test/sample_collection.jl | 17 +- BanyanImages/src/image.jl | 2 + BanyanImages/test/runtests.jl | 4 - BanyanONNXRunTime/test/runtests.jl | 4 - 18 files changed, 142 insertions(+), 146 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 2550c4db..ce14b371 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -93,17 +93,12 @@ export SamplingConfig # Locations export Location, LocationSource, LocationDestination, located, sourced, destined export Value, Size, Client, Disk, None, RemoteSource -export invalidate_all_locations, invalidate_metadata, invalidate_sample -export NOTHING_LOCATION, INVALID_LOCATION +export invalidate_all_locations, invalidate_location, invalidate_metadata, invalidate_samples, invalidate +export NOTHING_LOCATION, INVALID_LOCATION, NO_LOCATION_PATH export has_separate_metadata, get_sample, get_metadata, get_sample_and_metadata -export get_remotepath_id, - get_meta_path, - get_location_path, - get_cached_location, - 
cache_location, - get_max_exact_sample_length, - set_max_exact_sample_length -export LocationPath +export LocationPath, SamplingConfig +export has_metadata, has_sample, get_sample_rate, configure_sampling +export type_to_str, str_to_type # Serialization export from_jl_string, to_jl_string diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 457ed3b1..8f52660e 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -133,7 +133,7 @@ function get_sample_rate(l_path::LocationPath) # Find a cached sample with a similar sample rate pre = get_sample_path_prefix(l_path) banyan_samples_objects = try - res = S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=pre)["Contents"] + res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"] res isa Base.Vector ? res : [res] catch return desired_sample_rate @@ -155,7 +155,7 @@ end function has_metadata(l_path:: LocationPath)::Bool try - !isempty(S3.list_objects_v2(Bucket=banyan_metadata_bucket_name(), prefix=get_metadata_path(l_path))["Contents"]) + !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) catch false end @@ -165,7 +165,7 @@ function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) pre = sc.force_new_sample_rate ? get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) try - !isempty(S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=pre)["Contents"]) + !isempty(S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"]) catch false end @@ -199,7 +199,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} metadata_local_path = joinpath(homedir(), ".banyan", "metadata", metadata_path) metadata_s3_path = "/$(banyan_metadata_bucket_name())/$metadata_path" src_params_not_stored_locally = false - src_params::Dict{String, String} = if exists(metadata_local_path) + src_params::Dict{String, String} = if isfile(metadata_local_path) lm = Dates.unix2datetime(mtime(metadata_local_path)) if_modified_since_string = "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" @@ -300,7 +300,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # If no such sample is found, search the S3 bucket banyan_samples_objects = try - res = S3.list_objects_v2(Bucket=banyan_samples_bucket_name(), prefix=sample_path_prefix)["Contents"] + res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => sample_path_prefix))["Contents"] res isa Base.Vector ? 
res : [res] catch e if is_debug_on() diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 8c4a0378..7c0a37a2 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -279,21 +279,10 @@ Disk()::Location = deepcopy(DISK) # - You might have lots of huge images # - You might have lots of workers so your sample rate is really large -MAX_EXACT_SAMPLE_LENGTH = parse(Int64, get(ENV, "BANYAN_MAX_EXACT_SAMPLE_LENGTH", "1024")::String) -get_max_exact_sample_length()::Int64 = MAX_EXACT_SAMPLE_LENGTH -function set_max_exact_sample_length(val) - global MAX_EXACT_SAMPLE_LENGTH - MAX_EXACT_SAMPLE_LENGTH = val -end - -getsamplenrows(totalnrows::Int64)::Int64 = - if totalnrows <= get_max_exact_sample_length() - # NOTE: This includes the case where the dataset is empty - # (totalnrows == 0) - totalnrows - else +getsamplenrows(totalnrows::Int64)::Int64 = begin + sc = get_sampling_config() # Must have at least 1 row - cld(totalnrows, get_sample_rate()) + cld(totalnrows, sc.always_exact ? 1 : sc.rate) end # We maintain a cache of locations and a cache of samples. Locations contain @@ -304,88 +293,114 @@ getsamplenrows(totalnrows::Int64)::Int64 = # Banyan is not aware of mutates the location. Locations should be # eventually stored and updated in S3 on each write. -_invalidate_all_locations() = begin - for dir_name in ["banyan_locations", "banyan_meta"] - rm("s3/$(get_cluster_s3_bucket_name())/$dir_name/", force=true, recursive=true) +function invalidate_metadata(p; kwargs...) + lp = get_location_path_with_format(p; kwargs...) + + # Delete locally + p = joinpath(homedir(), ".banyan", "metadata", get_metadata_path(lp)) + if isfile(p) + rm(p) end -end -_invalidate_metadata(remotepath) = - let p = get_location_path(remotepath) - if isfile(p) - loc = deserialize_retry(p) - loc.metadata_invalid = true - serialize(p, loc) + + # Delete from S3 + try + S3.delete_object(banyan_samples_bucket_name(), get_metadata_path(lp)) + catch e + if is_debug_on() + show(e) end end -_invalidate_sample(remotepath) = - let p = get_location_path(remotepath) - if isfile(p) - loc = deserialize_retry(p) - loc.sample_invalid = true - serialize(p, loc) +end +function invalidate_samples(p; kwargs...) + lp = get_location_path_with_format(p; kwargs...) 
+ + # Delete locally + samples_local_dir = joinpath(homedir(), ".banyan", "samples") + if isdir(samples_local_dir) + sample_path_prefix = get_sample_path_prefix(lp) + for local_sample_path in readdir(samples_local_dir, join=true) + if startswith(local_sample_path, sample_path_prefix) + rm(local_sample_path) + end end end -invalidate_all_locations() = offloaded(_invalidate_all_locations) -invalidate_metadata(p) = offloaded(_invalidate_metadata, p) -invalidate_sample(p) = offloaded(_invalidate_sample, p) - -@specialize -# Helper functions for location constructors; these should only be called from the main worker - -# TODO: Hash in a more general way so equivalent paths hash to same value -# This hashes such that an extra slash at the end won't make a difference`` -get_remotepath_id(remotepath::String) = - (get_julia_version(), (remotepath |> splitpath |> joinpath)) |> hash -get_remotepath_id(remotepath) = (get_julia_version(), remotepath) |> hash -function get_location_path(remotepath, remotepath_id) - session_s3_bucket_name = get_cluster_s3_bucket_name() - if !isdir("s3/$session_s3_bucket_name/banyan_locations/") - mkdir("s3/$session_s3_bucket_name/banyan_locations/") + # Delete from S3 + banyan_samples_objects = try + res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => sample_path_prefix))["Contents"] + res isa Base.Vector ? res : [res] + catch e + if is_debug_on() + show(e) + end + [] end - "s3/$session_s3_bucket_name/banyan_locations/$(remotepath_id)" -end -function get_meta_path(remotepath, remotepath_id) - session_s3_bucket_name = get_cluster_s3_bucket_name() - if !isdir("s3/$session_s3_bucket_name/banyan_meta/") - mkdir("s3/$session_s3_bucket_name/banyan_meta/") + if !isempty(banyan_samples_objects) + objects_to_delete = [] + for d in banyan_samples_objects + push!(objects_to_delete, Dict("Key" => d["Key"])) + end + S3.delete_objects( + banyan_samples_bucket_name(), + Dict("Objects" => objects_to_delete) + ) end - "s3/$session_s3_bucket_name/banyan_meta/$remotepath_id" end -get_location_path(remotepath) = - get_location_path(remotepath, get_remotepath_id(remotepath)) -get_meta_path(remotepath) = - get_meta_path(remotepath, get_remotepath_id(remotepath)) - -function get_cached_location(remotepath, remotepath_id, metadata_invalid, sample_invalid) - Random.seed!(hash((get_session_id(), remotepath_id))) - session_s3_bucket_name = get_cluster_s3_bucket_name() - location_path = "s3/$session_s3_bucket_name/banyan_locations/$remotepath_id" - - curr_location::Location = try - deserialize_retry(location_path) - catch - INVALID_LOCATION - end - curr_location.sample_invalid = curr_location.sample_invalid || sample_invalid - curr_location.metadata_invalid = curr_location.metadata_invalid || metadata_invalid - curr_sample_invalid = curr_location.sample_invalid - curr_metadata_invalid = curr_location.metadata_invalid - curr_location, curr_sample_invalid, curr_metadata_invalid +function invalidate_location(p; kwargs...) + invalidate_metadata(p; kwargs...) + invalidate_samples(p; kwargs...) end +function invalidate_all_locations(p; kwargs...) 
+ for subdir in ["samples", "metadata"] + local_dir = joinpath(homedir(), ".banyan", subdir) + if isdir(local_dir) + rm(local_dir; force=true, recursive=true) + end + end -get_cached_location(remotepath, metadata_invalid, sample_invalid) = - get_cached_location(remotepath, get_remotepath_id(remotepath), metadata_invalid, sample_invalid) + # Delete from S3 + for bucket_name in [banyan_samples_bucket_name(), banyan_metadata_bucket_name()] + banyan_samples_objects = try + res = S3.list_objects_v2(bucket_name)["Contents"] + res isa Base.Vector ? res : [res] + catch e + if is_debug_on() + show(e) + end + [] + end + if !isempty(banyan_samples_objects) + objects_to_delete = [] + for d in banyan_samples_objects + push!(objects_to_delete, Dict("Key" => d["Key"])) + end + try + S3.delete_objects( + bucket_name, + Dict("Objects" => objects_to_delete) + ) + catch e + if is_debug_on() + show(e) + end + end + end + end +end -function cache_location(remotepath, remotepath_id, location_res::Location, invalidate_sample, invalidate_metadata) - location_path = get_location_path(remotepath, remotepath_id) - location_to_write = deepcopy(location_res) - location_to_write.sample_invalid = location_to_write.sample_invalid || invalidate_sample - location_to_write.metadata_invalid = location_to_write.metadata_invalid || invalidate_metadata - serialize(location_path, location_to_write) +function invalidate(p; after=false, kwargs...) + if get(kwargs, after ? :invalidate_all_locations : :all_locations_invalid, false) + invalidate_all_locations(p; kwargs...) + elseif get(kwargs, after ? :invalidate_location : :location_invalid, false) + invalidate_location(p; kwargs...) + elseif get(kwargs, after ? :invalidate_metadata : :metadata_invalid, false) + invalidate_metadata(p; kwargs...) + elseif get(kwargs, after ? :invalidate_samples : :samples_invalid, false) + invalidate_samples(p; kwargs...) + end end -cache_location(remotepath, location_res::Location, invalidate_sample, invalidate_metadata) = - cache_location(remotepath, get_remotepath_id(remotepath), location_res, invalidate_sample, invalidate_metadata) + +@specialize # Functions to be extended for different data formats diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index 1837e244..da4f70f8 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -31,5 +31,5 @@ struct SamplingConfig assume_shuffled::Bool end -const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("256 MB"), false, true) +const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("32 MB"), false, true) session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 7d93572d..667f5fbd 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -6,17 +6,18 @@ function configure_sampling( force_new_sample_rate=nothing, assume_shuffled=nothing, for_all_locations=false, + default=false, kwargs... ) global session_sampling_configs sc = get_sampling_config(path; kwargs...) nsc = SamplingConfig( - !isnothing(sample_rate) ? rate : sc.rate, - !isnothing(always_exact) ? always_exact : sc.always_exact, - !isnothing(max_num_bytes_exact) ? max_num_bytes_exact : sc.max_num_bytes_exact, - !isnothing(force_new_sample_rate) ? force_new_sample_rate : sc.force_new_sample_rate, - !isnothing(assume_shuffled) ? assume_shuffled : sc.assume_shuffled, + (!isnothing(sample_rate) && !default) ? 
sample_rate : sc.rate, + (!isnothing(always_exact) && !default) ? always_exact : sc.always_exact, + (!isnothing(max_num_bytes_exact) && !default) ? max_num_bytes_exact : sc.max_num_bytes_exact, + (!isnothing(force_new_sample_rate) && !default) ? force_new_sample_rate : sc.force_new_sample_rate, + (!isnothing(assume_shuffled) && !default) ? assume_shuffled : sc.assume_shuffled, ) session_id = _get_session_id_no_error() diff --git a/Banyan/test/runtests.jl b/Banyan/test/runtests.jl index 911c5324..36ea9c3b 100644 --- a/Banyan/test/runtests.jl +++ b/Banyan/test/runtests.jl @@ -18,7 +18,6 @@ function use_session_for_testing( nworkers = parse(Int64, get(ENV, "BANYAN_NWORKERS", "2")), sample_rate = 2, nworkers = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -67,10 +66,7 @@ function use_session_for_testing( end ) # If selected session has already failed, this will throw an error. - sessions_for_testing[session_config_hash] = get_session_id() - - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) + sessions_for_testing[session_config_hash] = get_session_id() configure_scheduling(name = scheduling_config_name) diff --git a/BanyanArrays/test/runtests.jl b/BanyanArrays/test/runtests.jl index f0430d4d..a7b9bbc7 100644 --- a/BanyanArrays/test/runtests.jl +++ b/BanyanArrays/test/runtests.jl @@ -15,7 +15,6 @@ function use_session_for_testing( f::Function; sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -69,9 +68,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try diff --git a/BanyanDataFrames/src/df.jl b/BanyanDataFrames/src/df.jl index 485a0d12..b51eb4f0 100644 --- a/BanyanDataFrames/src/df.jl +++ b/BanyanDataFrames/src/df.jl @@ -49,8 +49,10 @@ Base.propertynames(df::DataFrame) = propertynames(sample(df)::DataFrames.DataFra function read_table(path::String; kwargs...) @nospecialize + invalidate(path; kwargs...) df_loc = RemoteTableSource(path; kwargs...) df_loc.src_name == "Remote" || error("$path does not exist") + invalidate(path; after=true, kwargs...) 
df_loc_nrows::Int64 = parse(Int64, df_loc.src_parameters["nrows"]) df_nrows = Future(df_loc_nrows) DataFrame(Future(datatype="DataFrame", source=df_loc), df_nrows) diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index ab369514..17bca8f0 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -206,7 +206,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: total_nbytes_res = reduce_and_sync_across(+, local_nbytes) # If the sample is too small, redo it, getting an exact sample - if !exact_sample_needed_res && total_nbytes_res < max_exact_sample_length + if !exact_sample_needed_res && total_nbytes_res <= sampling_config.max_num_bytes_exact exact_sample_needed = true exact_sample_needed_res = true else diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 1634501e..88a85235 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -26,7 +26,6 @@ reusing in ["nothing", "sample", "location", "sample and location"] # Use session with appropriate sample collection configuration - max_exact_sample_length = exact_or_inexact == "Exact" ? 1_024_000 : 0 use_session_for_testing(sample_rate = 2) do # Use data to collect a sample from @@ -44,11 +43,11 @@ if (reusing == "nothing" || reusing == "location") invalidate_sample(src_name) end - remote_source = RemoteTableSource( - src_name, - shuffled = with_or_without_shuffled == "with", - max_exact_sample_length = max_exact_sample_length + configure_sampling( + always_exact = exact_or_inexact, + assume_shuffled = with_or_without_shuffled == "with", ) + remote_source = RemoteTableSource(src_name) # Verify the location diff --git a/BanyanDataFrames/test/utils_sessions.jl b/BanyanDataFrames/test/utils_sessions.jl index 0abe03db..d569af08 100644 --- a/BanyanDataFrames/test/utils_sessions.jl +++ b/BanyanDataFrames/test/utils_sessions.jl @@ -11,7 +11,6 @@ end function use_session_for_testing( f::Function; sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -67,9 +66,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index d7f8959a..3bef15de 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -1,6 +1,8 @@ function read_hdf5(path; kwargs...) + invalidate(path; kwargs...) A_loc = RemoteHDF5Source(path; kwargs...) A_loc.src_name == "Remote" || error("$path does not exist") + invalidate(path; after=true, kwargs...) 
A = Future(datatype="Array", source=A_loc) A_loc_eltype, A_loc_size = Banyan.from_jl_string(A_loc.src_parameters["eltype_and_size"]) A_loc_eltype = Banyan.type_from_str(A_loc.src_parameters["eltype"]) diff --git a/BanyanHDF5/src/pfs.jl b/BanyanHDF5/src/pfs.jl index b6d551a0..73456013 100644 --- a/BanyanHDF5/src/pfs.jl +++ b/BanyanHDF5/src/pfs.jl @@ -154,10 +154,11 @@ function WriteHelperHDF5( is_main = worker_idx == 1 if is_main # We invalidate both the location and the metadata in this case - serialize( - Banyan.get_location_path(path_and_subpath), - INVALID_LOCATION - ) + invalidate_location(loc_params_path) + # serialize( + # Banyan.get_location_path(path_and_subpath), + # INVALID_LOCATION + # ) end # Invalidate location if diff --git a/BanyanHDF5/test/runtests.jl b/BanyanHDF5/test/runtests.jl index 739ed349..8b155e77 100644 --- a/BanyanHDF5/test/runtests.jl +++ b/BanyanHDF5/test/runtests.jl @@ -16,7 +16,6 @@ function use_session_for_testing( f::Function; sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -71,9 +70,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try diff --git a/BanyanHDF5/test/sample_collection.jl b/BanyanHDF5/test/sample_collection.jl index 82e849bb..d6544edc 100644 --- a/BanyanHDF5/test/sample_collection.jl +++ b/BanyanHDF5/test/sample_collection.jl @@ -19,7 +19,6 @@ reusing in ["nothing", "sample", "location", "sample and location"] # Use session with appropriate sample collection configuration - max_exact_sample_length = exact_or_inexact == "Exact" ? 1_024_000 : 0 use_session_for_testing(sample_rate = 2) do # Use data to collect a sample from @@ -30,13 +29,17 @@ RemoteHDF5Source(src_name, invalidate_metadata = true, invalidate_sample = true) RemoteHDF5Source(src_name, metadata_invalid = true, sample_invalid = true) end - remote_source = RemoteHDF5Source( - src_name, - metadata_invalid = (reusing == "nothing" || reusing == "sample"), - sample_invalid = (reusing == "nothing" || reusing == "location"), - shuffled = with_or_without_shuffled == "with", - max_exact_sample_length = max_exact_sample_length + configure_sampling( + always_exact = exact_or_inexact == "Exact", + assume_shuffled = with_or_without_shuffled == "with" ) + if (reusing == "nothing" || reusing == "sample") + invalidate_metadata(src_name) + end + if (reusing == "nothing" || reusing == "location") + invalidate_samples(src_name) + end + remote_source = RemoteHDF5Source(src_name) # Verify the location if contains(src_name, "h5") diff --git a/BanyanImages/src/image.jl b/BanyanImages/src/image.jl index 77e4cd1a..b413a9ce 100644 --- a/BanyanImages/src/image.jl +++ b/BanyanImages/src/image.jl @@ -1,6 +1,8 @@ -function read_png(path; add_channelview=false) +function read_png(path; add_channelview=false, kwargs...) + invalidate(path; kwargs...) image_loc = RemoteImageSource(path, add_channelview) image_loc.src_name == "Remote" || error("$path does not exist") + invalidate(path; after=true, kwargs...) 
image = Future(;source=image_loc, datatype="Array") image_loc_eltype = type_from_str(image_loc.src_parameters["eltype"]) image_loc_size = size_from_str(image_loc.src_parameters["size"]) diff --git a/BanyanImages/test/runtests.jl b/BanyanImages/test/runtests.jl index e8e485e3..695a4f22 100644 --- a/BanyanImages/test/runtests.jl +++ b/BanyanImages/test/runtests.jl @@ -24,7 +24,6 @@ end function use_session_for_testing( f::Function; sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -79,9 +78,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try diff --git a/BanyanONNXRunTime/test/runtests.jl b/BanyanONNXRunTime/test/runtests.jl index de03c809..cbe80928 100644 --- a/BanyanONNXRunTime/test/runtests.jl +++ b/BanyanONNXRunTime/test/runtests.jl @@ -23,7 +23,6 @@ function use_session_for_testing( f::Function; nworkers = 2, sample_rate = 2, - max_exact_sample_length = 50, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( @@ -78,9 +77,6 @@ function use_session_for_testing( # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() - # Set the maximum exact sample length - set_max_exact_sample_length(max_exact_sample_length) - configure_scheduling(name = scheduling_config_name) try From ee901a80f2c63e917cd18167c1d92aaa7de36d75 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Mon, 8 Aug 2022 12:52:50 -0700 Subject: [PATCH 09/25] Implement parallel SQS data transfer and eliminate AWSSQS.jl dependency --- Banyan/src/Banyan.jl | 9 +-- Banyan/src/queues.jl | 151 +++++++++++++++++++------------------ Banyan/src/requests.jl | 91 ++++++++++++---------- Banyan/src/utils_queues.jl | 3 +- Banyan/test/Project.toml | 1 - Project.toml | 2 + 6 files changed, 137 insertions(+), 120 deletions(-) create mode 100644 Project.toml diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index ce14b371..9dc9f186 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -21,10 +21,7 @@ global NOT_USING_MODULES = String["ProfileView", "SnoopCompileCore"] using FilePathsBase: joinpath, isempty using Base: notnothing, env_project_file -using AWSCore, - AWSS3, - AWSSQS, - Base64, +using Base64, DataStructures, Dates, Downloads, @@ -41,8 +38,10 @@ using AWSCore, TOML using AWS.AWSServices: s3 -using S3: @service +using AWS: @service @service S3 +@service SQS +using AWSS3 global BANYAN_API_ENDPOINT diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index 2cc6c23c..412aef57 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -2,59 +2,28 @@ # GET QUEUE URL # ################# -get_sqs_dict_from_url(url::String)::Dict{Symbol,Any} = - merge( - get_aws_config(), - Dict(:resource => "/" * replace(joinpath(splitpath(url)[end-1:end]), "\\"=>"/")) - ) - -get_scatter_queue()::Dict{Symbol,Any} = - get_sqs_dict_from_url(get_session().scatter_queue_url) - -get_gather_queue()::Dict{Symbol,Any} = - get_sqs_dict_from_url(get_session().gather_queue_url) - -get_execution_queue()::Dict{Symbol,Any} = - get_sqs_dict_from_url(get_session().execution_queue_url) +scatter_queue_url()::Dict{Symbol,Any} = get_session().scatter_queue_url 
+gather_queue_url()::Dict{Symbol,Any} = get_session().gather_queue_url +execution_queue_url()::Dict{Symbol,Any} = get_session().execution_queue_url ################### # RECEIVE MESSAGE # ################### -function sqs_receive_message_with_long_polling(queue) - r = AWSSQS.sqs(queue, "ReceiveMessage", MaxNumberOfMessages = "1") - r = r["messages"] - - if isnothing(r) - return nothing - end - - handle = r[1]["ReceiptHandle"] - id = r[1]["MessageId"] - message = r[1]["Body"] - md5 = r[1]["MD5OfBody"] - - Dict{Symbol,Any}( - :message => message, - :id => id, - :handle => handle - ) -end - function get_next_message( - queue, + queue_url, p::Union{Nothing,ProgressMeter.ProgressUnknown} = nothing; delete::Bool = true, error_for_main_stuck::Union{Nothing,String} = nothing, error_for_main_stuck_time::Union{Nothing,DateTime} = nothing )::Tuple{String,Union{Nothing,String}} -error_for_main_stuck = check_worker_stuck(error_for_main_stuck, error_for_main_stuck_time) - m = sqs_receive_message_with_long_polling(queue) + error_for_main_stuck = check_worker_stuck(error_for_main_stuck, error_for_main_stuck_time) + m = SQS.receive_message(queue_url, Dict("MaxNumberOfMessages" => "1")) i = 1 j = 1 - while (isnothing(m)) + while (!haskey(m, "ReceiveMessageResult") || !haskey(m["ReceiveMessageResult"], "Message")) error_for_main_stuck = check_worker_stuck(error_for_main_stuck, error_for_main_stuck_time) - m = sqs_receive_message_with_long_polling(queue) + m = SQS.receive_message(queue_url, Dict("MaxNumberOfMessages" => "1")) i += 1 if !isnothing(p) p::ProgressMeter.ProgressUnknown @@ -62,19 +31,21 @@ error_for_main_stuck = check_worker_stuck(error_for_main_stuck, error_for_main_s j += 1 end end + m_dict = m["ReceiveMessageResult"]["Message"] if delete - sqs_delete_message(queue, m) + SQS.delete_message(queue_url, m_dict["ReceiptHandle"]::String) end - return m[:message]::String, error_for_main_stuck + return m_dict["Body"]::String, error_for_main_stuck end -function receive_next_message( +function sqs_receive_next_message( queue_name, p=nothing, error_for_main_stuck=nothing, error_for_main_stuck_time=nothing )::Tuple{Dict{String,Any},Union{Nothing,String}} - content::String, error_for_main_stuck::Union{Nothing,String} = get_next_message(queue_name, p; error_for_main_stuck=error_for_main_stuck, error_for_main_stuck_time=error_for_main_stuck_time) + content::String, error_for_main_stuck::Union{Nothing,String} = + get_next_message(queue_name, p; error_for_main_stuck=error_for_main_stuck, error_for_main_stuck_time=error_for_main_stuck_time) res::Dict{String,Any} = if startswith(content, "JOB_READY") || startswith(content, "SESSION_READY") Dict{String,Any}( "kind" => "SESSION_READY" @@ -126,12 +97,9 @@ end function receive_from_client(value_id::ValueId) # Send scatter message to client message = Dict{String,String}("kind" => "SCATTER_REQUEST", "value_id" => value_id) - send_message( - get_gather_queue(), - JSON.json(message) - ) + sqs_send_message(gather_queue_url(), JSON.json(message)) # Receive response from client - m = JSON.parse(get_next_message(get_scatter_queue())[1]) + m = JSON.parse(get_next_message(scatter_queue_url())[1]) v = from_jl_string(m["contents"]::String) v end @@ -141,44 +109,81 @@ end # SEND MESSAGE # ################ -function send_message(queue_name, message) +function sqs_send_message(queue_url, message) generated_message_id = generate_message_id() - sqs_send_message( - queue_name, + SQS.send_message( + queue_url, message, - (:MessageGroupId, "1"), - (:MessageDeduplicationId, 
generated_message_id), + Dict( + "MessageGroupId" => "1", + "MessageDeduplicationId" => generated_message_id + ) ) end function send_to_client(value_id::ValueId, value, worker_memory_used = 0) MAX_MESSAGE_LENGTH = 220_000 message = to_jl_string(value)::String - i = 1 + + # Break the message down into chunk ranges + nmessages = 0 + message_length = length(message) + message_ranges = [] + message_i = 1 while true - is_last_message = length(message) <= MAX_MESSAGE_LENGTH + is_last_message = message_length <= MAX_MESSAGE_LENGTH + starti = message_i + if is_last_message + message_i += message_length + message_length = 0 + else + message_i += MAX_MESSAGE_LENGTH + message_length -= MAX_MESSAGE_LENGTH + end + push!(message_ranges, starti:message_i) + nmessages += 1 + if is_last_message + break + end + end + + # Launch asynchronous threads to send SQS messages + gather_q_url = gather_queue_url() + num_chunks = length(message_ranges) + if num_chunks > 1 + @sync for i = 1:message_ranges + @async begin + msg = Dict{String,Any}( + "kind" => "GATHER", + "value_id" => value_id, + "contents" => message[message_ranges[i]], + "worker_memory_used" => worker_memory_used, + "chunk_idx" => i, + "num_chunks" => num_chunks + ) + msg_json = JSON.json(msg) + SQS.send_message( + msg_json, + gather_q_url, + Dict("MessageGroupId" => string(i)) + ) + end + end + else + i = 1 msg = Dict{String,Any}( - "kind" => (is_last_message ? "GATHER_END" : "GATHER"), + "kind" => "GATHER", "value_id" => value_id, - "contents" => if is_last_message - message - else - msg = message[1:MAX_MESSAGE_LENGTH] - message = message[MAX_MESSAGE_LENGTH+1:end] - msg - end, + "contents" => message[message_ranges[i]], "worker_memory_used" => worker_memory_used, - "gather_page_idx" => i + "chunk_idx" => i, + "num_chunks" => num_chunks ) - send_message( - get_gather_queue(), - JSON.json( - msg - ) + msg_json = JSON.json(msg) + SQS.send_message( + msg_json, + gather_q_url, + Dict("MessageGroupId" => string(i)) ) - i += 1 - if is_last_message - break - end end end diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 6be89660..64a44981 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -18,13 +18,13 @@ ############################# function check_worker_stuck_error( - message::Dict{String,Any}, + value_id::ValueId, + contents::String, error_for_main_stuck::Union{Nothing,String}, error_for_main_stuck_time::Union{Nothing,DateTime} )::Tuple{Union{Nothing,String},Union{Nothing,DateTime}} - value_id = message["value_id"]::ValueId if value_id == "-2" && isnothing(error_for_main_stuck_time) - error_for_main_stuck_msg::String = from_jl_string(message["contents"]::String) + error_for_main_stuck_msg::String = from_jl_string(contents) if contains(error_for_main_stuck_msg, "session $(get_session_id())") error_for_main_stuck = error_for_main_stuck_msg error_for_main_stuck_time = Dates.now() @@ -230,8 +230,8 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n end # Get queues for moving data between client and cluster - scatter_queue = get_scatter_queue() - gather_queue = get_gather_queue() + scatter_queue = scatter_queue_url() + gather_queue = gather_queue_url() # There are two cases: either we # TODO: Maybe we don't need to wait_For_session @@ -261,10 +261,9 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n p = ProgressUnknown("Computing value with ID $(fut.value_id)", spinner=true) error_for_main_stuck::Union{Nothing,String} = nothing 
error_for_main_stuck_time::Union{Nothing,DateTime} = nothing - partial_gathers = Dict{ValueId,String}() while true # TODO: Use to_jl_value and from_jl_value to support Client - message, error_for_main_stuck = receive_next_message(gather_queue, p, error_for_main_stuck, error_for_main_stuck_time) + message, error_for_main_stuck = sqs_receive_next_message(gather_queue, p, error_for_main_stuck, error_for_main_stuck_time) message_type::String = message["kind"] if message_type == "SCATTER_REQUEST" # Send scatter @@ -272,7 +271,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n haskey(session.futures_on_client, value_id) || error("Expected future to be stored on client side") f = session.futures_on_client[value_id]::Future # @debug "Received scatter request for value with ID $value_id and value $(f.value) with location $(get_location(f))" - send_message( + sqs_send_message( scatter_queue, JSON.json( Dict{String,Any}( @@ -286,23 +285,33 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n elseif message_type == "GATHER" # Receive gather value_id = message["value_id"]::ValueId - if !haskey(partial_gathers, value_id) - partial_gathers[value_id] = message["contents"]::String + num_chunks = message["num_chunks"]::Int64 + num_remaining_chunks = num_chunks - 1 + + whole_message_contents = if num_chunks > 1 + partial_messages = Vector{String}(undef, num_chunks) + partial_messages[message["chunk_idx"]] = message["contents"] + @sync for i = 1:num_remaining_chunks + @async begin + partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) + chunk_idx = partial_message["chunk_idx"] + partial_messages[chunk_idx] = message["contents"] + end + end + join(partial_messages) else - partial_gathers[value_id] *= message["contents"]::String + message["contents"] end - elseif message_type == "GATHER_END" - value_id = message["value_id"]::ValueId - contents = get(partial_gathers, value_id, "") * message["contents"]::String - # @debug "Received gather request for $value_id" + if haskey(session.futures_on_client, value_id) - value = from_jl_string(contents) + value = from_jl_string(whole_message_contents) f = session.futures_on_client[value_id]::Future f.value = value # TODO: Update stale/mutated here to avoid costly # call to `send_evaluation` end - error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(message, error_for_main_stuck, error_for_main_stuck_time) + + error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, contents, error_for_main_stuck, error_for_main_stuck_time) elseif message_type == "EVALUATION_END" if message["end"]::Bool == true break @@ -683,39 +692,43 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) p = ProgressUnknown("Running offloaded code", spinner=true) session = get_session() - gather_queue = get_gather_queue() + gather_queue = gather_queue_url() stored_message = nothing error_for_main_stuck, error_for_main_stuck_time = nothing, nothing partial_gathers = Dict{ValueId,String}() while true - message, error_for_main_stuck = receive_next_message(gather_queue, p, error_for_main_stuck, error_for_main_stuck_time) + message, error_for_main_stuck = sqs_receive_next_message(gather_queue, p, error_for_main_stuck, error_for_main_stuck_time) message_type = message["kind"]::String if message_type == "GATHER" # Receive gather value_id = message["value_id"]::ValueId - contents = message["contents"]::String - if !haskey(partial_gathers, 
value_id) - partial_gathers[value_id] = contents + num_chunks = message["num_chunks"]::Int64 + num_remaining_chunks = num_chunks - 1 + + whole_message_contents = if num_chunks > 1 + partial_messages = Vector{String}(undef, num_chunks) + partial_messages[message["chunk_idx"]] = message["contents"] + @sync for i = 1:num_remaining_chunks + @async begin + partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) + chunk_idx = partial_message["chunk_idx"] + partial_messages[chunk_idx] = message["contents"] + end + end + join(partial_messages) else - partial_gathers[value_id] *= contents + message["contents"] end - elseif message_type == "GATHER_END" - value_id = message["value_id"]::ValueId - contents = get(partial_gathers, value_id, "") * message["contents"]::String - if (value_id == "-1") - memory_used = message["worker_memory_used"]::Int64 - if Banyan.INVESTIGATING_MEMORY_USAGE - @show get_session().worker_memory_used - @show memory_used - end - # Note that while the memory usage from offloaded computation does get - # reset with each session even if it reuses the same job, we do - # recompute the initial available memory every time we start a session - # and this should presumably include the offloaded memory usage. - get_session().worker_memory_used = get_session().worker_memory_used + memory_used - stored_message = from_jl_string(contents) + + if haskey(session.futures_on_client, value_id) + value = from_jl_string(whole_message_contents) + f = session.futures_on_client[value_id]::Future + f.value = value + # TODO: Update stale/mutated here to avoid costly + # call to `send_evaluation` end - error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(message, error_for_main_stuck, error_for_main_stuck_time) + + error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, contents, error_for_main_stuck, error_for_main_stuck_time) elseif (message_type == "EVALUATION_END") if message["end"]::Bool == true return stored_message diff --git a/Banyan/src/utils_queues.jl b/Banyan/src/utils_queues.jl index 17755bf6..da73b514 100644 --- a/Banyan/src/utils_queues.jl +++ b/Banyan/src/utils_queues.jl @@ -1,5 +1,4 @@ using Dates -using AWSSQS @nospecialize @@ -33,7 +32,7 @@ function run_with_retries( end sqs_get_queue_with_retries(args...; kwargs...) = run_with_retries( - sqs_get_queue, + SQS.get_queue_url, args...; failure_message = "Queue for communicating results is nonexistent", kwargs... 
diff --git a/Banyan/test/Project.toml b/Banyan/test/Project.toml index 229e8c72..dd440603 100644 --- a/Banyan/test/Project.toml +++ b/Banyan/test/Project.toml @@ -2,7 +2,6 @@ AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" -AWSSQS = "6e80b5ca-5733-51f9-999e-c18680912812" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Banyan = "706d138b-e922-45b9-a636-baf8ae0d5317" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" diff --git a/Project.toml b/Project.toml new file mode 100644 index 00000000..8aa62950 --- /dev/null +++ b/Project.toml @@ -0,0 +1,2 @@ +[deps] +AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" From 8daff0a7bd2d0aa33193c2abc4a5f25907a86994 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 9 Aug 2022 07:25:04 -0700 Subject: [PATCH 10/25] Remove redundant total_memory_usage --- Banyan/src/Banyan.jl | 6 +- Banyan/src/annotation.jl | 16 ++--- Banyan/src/clusters.jl | 6 +- Banyan/src/future.jl | 4 +- Banyan/src/futures.jl | 2 +- Banyan/src/location.jl | 50 ++++++-------- Banyan/src/locations.jl | 22 +++--- Banyan/src/precompile.jl | 5 +- Banyan/src/sample.jl | 15 ++-- Banyan/src/samples.jl | 5 -- Banyan/src/sessions.jl | 18 ++--- Banyan/src/utils.jl | 58 ++-------------- Banyan/src/utils_s3fs.jl | 6 +- Banyan/test/clusters.jl | 16 ++--- Banyan/test/sessions.jl | 2 +- BanyanDataFrames/src/BanyanDataFrames.jl | 2 +- BanyanDataFrames/src/gdf.jl | 2 +- BanyanDataFrames/src/locations.jl | 14 ++-- BanyanDataFrames/src/pfs.jl | 12 ++-- BanyanDataFrames/test/latency.jl | 4 +- BanyanDataFrames/test/runtests.jl | 2 +- .../test/runtests_without_retest.jl | 6 +- BanyanDataFrames/test/sample_collection.jl | 69 +++++++++++++++++-- BanyanDataFrames/test/utils_data.jl | 38 +++++----- BanyanHDF5/src/locations.jl | 2 +- BanyanHDF5/test/runtests.jl | 4 +- BanyanImages/src/locations.jl | 4 +- BanyanImages/test/locations.jl | 2 +- BanyanImages/test/pfs.jl | 2 +- BanyanImages/test/utils_data.jl | 10 +-- BanyanONNXRunTime/src/locations.jl | 2 +- 31 files changed, 203 insertions(+), 203 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 9dc9f186..640175cf 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -29,7 +29,6 @@ using Base64, FilePathsBase, HTTP, JSON, - IniFile, LibGit2, MPI, ProgressMeter, @@ -37,6 +36,7 @@ using Base64, Serialization, TOML +using AWS using AWS.AWSServices: s3 using AWS: @service @service S3 @@ -85,7 +85,7 @@ export AbstractFuture, Future, partitioned_computation, compute_inplace, compute # Samples export Sample, ExactSample, sample, sample_for_grouping, SampleForGrouping, setsample! -export sample_memory_usage, total_memory_usage, sample_axes, sample_keys, sample_by_key +export sample_memory_usage, sample_memory_usage, sample_axes, sample_keys, sample_by_key export NOTHING_SAMPLE export SamplingConfig @@ -189,7 +189,7 @@ export is_debug_on, export Empty, EMPTY, nonemptytype, disallowempty, empty_handler # Utilities for location constructors -export get_cached_location, cache_location, get_sample_from_data, sample_from_range +export get_sample_from_data, sample_from_range # Partitioning functions for usage in sessions that run on the cluster; dispatched # based on `res/pf_dispatch_table.json`. 
diff --git a/Banyan/src/annotation.jl b/Banyan/src/annotation.jl index 7a8a0891..9c5d3243 100644 --- a/Banyan/src/annotation.jl +++ b/Banyan/src/annotation.jl @@ -611,8 +611,8 @@ function apply_mutation(old::Future, new::Future) new.mutated, old.stale, new.stale, - old.total_memory_usage, - new.total_memory_usage, + old.sample_memory_usage, + new.sample_memory_usage, session_locations[old.value_id], session_locations[new.value_id] = new.value, @@ -623,8 +623,8 @@ function apply_mutation(old::Future, new::Future) old.mutated, new.stale, old.stale, - new.total_memory_usage, - old.total_memory_usage, + new.sample_memory_usage, + old.sample_memory_usage, session_locations[new.value_id], session_locations[old.value_id] end @@ -675,11 +675,11 @@ function finish_partitioned_code_region(splatted_futures::Vector{Future}) # Get the initial memory usage for fut in splatted_futures - fut_initial_memory_usage::Int64 = if is_total_memory_usage_known(fut) - fut.total_memory_usage + fut_initial_memory_usage::Int64 = if is_sample_memory_usage_known(fut) + fut.sample_memory_usage else tmu::Int64 = try - get_location(fut).total_memory_usage + get_location(fut).sample_memory_usage catch e if e isa MethodError error("Future with value ID $(fut.value_id) has no initial memory usage even in location with source name $(get_location(fut).src_name)") @@ -877,7 +877,7 @@ function finish_partitioned_code_region(splatted_futures::Vector{Future}) # Destroy value IDs that are no longer needed because of mutation for fut in splatted_futures - fut.total_memory_usage = task.memory_usage[fut.value_id]["final"] + fut.sample_memory_usage = task.memory_usage[fut.value_id]["final"] # Issue destroy request for mutated futures that are no longer # going to be used diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 91c63217..c96efa47 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -75,7 +75,7 @@ function create_cluster(; end if isnothing(s3_bucket_arn) s3_bucket_arn = "" - elseif !(s3_bucket_name in s3_list_buckets(get_aws_config())) + elseif !(s3_bucket_name in s3_list_buckets(global_aws_config())) error("Bucket $s3_bucket_name does not exist in the connected AWS account") end @@ -294,7 +294,7 @@ end function upload_to_s3(src_path; dst_name=basename(src_path), cluster_name=get_cluster_name(), kwargs...) configure(; kwargs...) 
bucket_name = get_cluster_s3_bucket_name(cluster_name) - s3_dst_path = S3Path("s3://$bucket_name/$dst_name", config=get_aws_config()) + s3_dst_path = S3Path("s3://$bucket_name/$dst_name", config=global_aws_config()) if startswith(src_path, "http://") || startswith(src_path, "https://") Downloads.download( src_path, @@ -320,7 +320,7 @@ function upload_to_s3(src_path; dst_name=basename(src_path), cluster_name=get_cl Path("$src_path/$f_name"), S3Path( "s3://$bucket_name/$(basename(src_path))/$(f_name)", - config=get_aws_config() + config=global_aws_config() ) ) end diff --git a/Banyan/src/future.jl b/Banyan/src/future.jl index 74318306..7fc1e3f0 100644 --- a/Banyan/src/future.jl +++ b/Banyan/src/future.jl @@ -4,7 +4,7 @@ mutable struct Future <: AbstractFuture value_id::ValueId mutated::Bool stale::Bool - total_memory_usage::Int64 + sample_memory_usage::Int64 end const NOTHING_FUTURE = Future("", nothing, "", false, false, -1) @@ -12,7 +12,7 @@ Base.isnothing(f::Future) = isempty(f.value_id) Base.hash(f::Future) = hash(f.value_id) -is_total_memory_usage_known(f::Future) = f.total_memory_usage != -1 +is_sample_memory_usage_known(f::Future) = f.sample_memory_usage != -1 isview(f::AbstractFuture) = false diff --git a/Banyan/src/futures.jl b/Banyan/src/futures.jl index 01cbde61..c1e769a1 100644 --- a/Banyan/src/futures.jl +++ b/Banyan/src/futures.jl @@ -46,7 +46,7 @@ function create_new_future(source::Location, mutate_from::Future, datatype::Stri end function create_future_from_sample(value::T, datatype::String)::Future where T - location::Location = if total_memory_usage(value) ≤ 4 * 1024 + location::Location = if sample_memory_usage(value) ≤ 4 * 1024 Value(value) else # TODO: Store values in S3 instead so that we can read from there diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 8f52660e..85a79dc7 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -8,36 +8,10 @@ mutable struct Location dst_name::String src_parameters::LocationParameters dst_parameters::LocationParameters - total_memory_usage::Int64 + sample_memory_usage::Int64 sample::Sample metadata_invalid::Bool sample_invalid::Bool - - # function Location( - # src_name::String, - # dst_name::String, - # src_parameters::Dict{String,<:Any}, - # dst_parameters::Dict{String,<:Any}, - # total_memory_usage::Union{Int64,Nothing} = nothing, - # sample::Sample = Sample(), - # ) - # # NOTE: A file might be None and None if it is simply to be cached on - # # disk and then read from - # # if src_name == "None" && dst_name == "None" - # # error( - # # "Location must either be usable as a source or as a destination for data", - # # ) - # # end - - # new( - # src_name, - # dst_name, - # src_parameters, - # dst_parameters, - # total_memory_usage, - # sample - # ) - # end end struct LocationPath @@ -66,8 +40,10 @@ struct LocationPath LocationPath(path) = LocationPath(path, "jl", get_julia_version())`` end +# Functions with `LocationPath`s` + global TABLE_FORMATS = ["csv", "parquet", "arrow"] -z + function get_location_path_with_format(p::String, kwargs...)::LocationPath if isempty(p) return NO_LOCATION_PATH @@ -102,6 +78,16 @@ Base.hash(lp::LocationPath) = lp.path_hash_uint const NO_LOCATION_PATH = LocationPath("", "", "") +# Sample config management + +const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("32 MB"), false, true) +session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) + +function 
set_session_sampling_configs(d::Dict{SessionId,Dict{LocationPath,SamplingConfig}}) + global session_sampling_configs + session_sampling_configs = d +end + get_sampling_config(path="", kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) function get_sampling_configs() global session_sampling_configs @@ -112,6 +98,8 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = get(scs, l_path, scs[NO_LOCATION_PATH]) end +# Getting sample rate + get_sample_rate(p::String=""; kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) parse_sample_rate(object_key) = @@ -153,6 +141,8 @@ function get_sample_rate(l_path::LocationPath) sample_rate != -1 ? sample_rate : desired_sample_rate end +# Checking for having metadata, samples + function has_metadata(l_path:: LocationPath)::Bool try !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) @@ -171,6 +161,8 @@ function has_sample(l_path:: LocationPath)::Bool end end +# Helper function for getting `Location` for location constructors + twodigit(i::Int64) = i < 10 ? ("0" * string(i)) : string(i) get_src_params_dict(d::Union{Nothing,Base.ImmutableDict{String, String}}) = @@ -331,7 +323,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} res_location = LocationSource( get(src_params, "name", "Remote"), src_params, - parse(Int64, get(src_params, "total_memory_usage", "0")), + parse(Int64, get(src_params, "sample_memory_usage", "0")), NOTHING_SAMPLE ) res_location.metadata_invalid = isempty(src_params) diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 7c0a37a2..15a7e442 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -6,13 +6,13 @@ const NOTHING_LOCATION = Location("None", "None", LocationParameters(), Location const INVALID_LOCATION = Location("None", "None", LocationParameters(), LocationParameters(), Int64(-1), NOTHING_SAMPLE, true, true) -Location(name::String, parameters::LocationParameters, total_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = - Location(name, name, parameters, parameters, total_memory_usage, sample, false, false) +Location(name::String, parameters::LocationParameters, sample_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = + Location(name, name, parameters, parameters, sample_memory_usage, sample, false, false) Base.isnothing(l::Location) = isnothing(l.sample) -LocationSource(name::String, parameters::LocationParameters, total_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = - Location(name, "None", parameters, LocationParameters(), total_memory_usage, sample, false, false) +LocationSource(name::String, parameters::LocationParameters, sample_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = + Location(name, "None", parameters, LocationParameters(), sample_memory_usage, sample, false, false) LocationDestination( name::String, @@ -31,7 +31,7 @@ function to_jl(lt::Location) # TODO: Instead of computing the total memory usage here, compute it # at the end of each `@partitioned`. That way we will count twice for # mutation - "total_memory_usage" => lt.total_memory_usage == -1 ? nothing : lt.total_memory_usage, + "sample_memory_usage" => lt.sample_memory_usage == -1 ? 
nothing : lt.sample_memory_usage, ) end @@ -59,7 +59,7 @@ function sourced(fut::Future, loc::Location) "None", loc.src_parameters, Dict{String,Any}(), - loc.total_memory_usage, + loc.sample_memory_usage, if !isnothing(loc.sample.value) # If this location is like some remote location, then we need # a sample from it. @@ -81,7 +81,7 @@ function sourced(fut::Future, loc::Location) fut_location.dst_name, loc.src_parameters, fut_location.dst_parameters, - loc.total_memory_usage, + loc.sample_memory_usage, if !isnothing(loc.sample.value) # If this location is like some remote location, then we need # a sample from it. @@ -114,7 +114,7 @@ function destined(fut::Future, loc::Location) loc.dst_name, EMPTY_DICT, loc.dst_parameters, - fut_location.total_memory_usage, + fut_location.sample_memory_usage, Sample(), loc.metadata_invalid, loc.sample_invalid @@ -129,7 +129,7 @@ function destined(fut::Future, loc::Location) loc.dst_name, fut_location.src_parameters, loc.dst_parameters, - fut_location.total_memory_usage, + fut_location.sample_memory_usage, fut_location.sample, fut_location.metadata_invalid, fut_location.sample_invalid @@ -211,7 +211,7 @@ get_dst_parameters(fut)::LocationParameters = get_location(fut).dst_parameters #################### function Value(val::T)::Location where {T} - LocationSource("Value", Dict{String,Any}("value" => to_jl_value(val)), total_memory_usage(val), ExactSample(val)) + LocationSource("Value", Dict{String,Any}("value" => to_jl_value(val)), sample_memory_usage(val), ExactSample(val)) end # TODO: Implement Size @@ -223,7 +223,7 @@ Size(val)::Location = LocationSource( ) function Client(val::T)::Location where {T} - LocationSource("Client", Dict{String,Any}(), total_memory_usage(val), ExactSample(val)) + LocationSource("Client", Dict{String,Any}(), sample_memory_usage(val), ExactSample(val)) end const CLIENT = Location("None", "Client", LocationParameters(), LocationParameters(), Int64(0), Sample(nothing, Int64(0), Int64(1)), false, false) Client()::Location = deepcopy(CLIENT) diff --git a/Banyan/src/precompile.jl b/Banyan/src/precompile.jl index e3e89a2f..c87e406e 100644 --- a/Banyan/src/precompile.jl +++ b/Banyan/src/precompile.jl @@ -285,8 +285,9 @@ function _precompile_() end # locations.jl - precompile(get_cached_location, (String, Bool, Bool)) - precompile(cache_location, (String, Location, Bool, Bool)) + for lp_func in [get_sample_rate, get_location_source, has_metadata, has_sample] + precompile(lp_func, (LocationPath,)) + end precompile(sample_from_range, (UnitRange{Int64}, Int64)) # utils.jl, utils_s3fs.jl diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index da4f70f8..7db1bfdc 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -12,24 +12,25 @@ mutable struct Sample new(nothing, objectid(nothing), 0, get_sample_rate(), Any[]) # Sample(value::Any) = # new(value, objectid(value), sample_memory_usage(value), get_sample_rate(), Any[]) - function Sample(value::Any, total_memory_usage::Int64, sample_rate::Int64) + function Sample(value::Any, sample_memory_usage::Int64, sample_rate::Int64) # sample_rate = get_sample_rate() - memory_usage = convert(Int64, round(total_memory_usage / sample_rate))::Int64 + memory_usage = convert(Int64, round(sample_memory_usage / sample_rate))::Int64 new(value, objectid(value), memory_usage, sample_rate, Any[]) end function Sample(value::Any, sample_rate::Int64) # This is only for the NOTHING_SAMPLE and ExactSample new(value, objectid(value), sample_memory_usage(value), sample_rate, Any[]) - end + end end +const 
NOTHING_SAMPLE = Sample(nothing, Int64(-1)) + +Base.isnothing(s::Sample) = s.rate == -1 + struct SamplingConfig rate::Int64 always_exact::Bool max_num_bytes_exact::Int64 force_new_sample_rate::Bool assume_shuffled::Bool -end - -const DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("32 MB"), false, true) -session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) \ No newline at end of file +end \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 667f5fbd..2e6e6d39 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -68,7 +68,6 @@ impl_error(fn_name, as) = error("$fn_name not implemented for $(typeof(as))") sample_by_key(as::Any, key::Any) = impl_error("sample_by_key", as) sample_axes(as::Any)::Vector{Int64} = impl_error("sample_axes", as) sample_keys(as::Any) = impl_error("sample_keys", as) -sample_memory_usage(as::Any)::Int64 = total_memory_usage(as) # Sample computation functions @@ -200,10 +199,6 @@ function sample_max(A::T, key::K) where {T,K} isempty(A) ? nothing : _maximum(orderinghashes(A, key)) end -const NOTHING_SAMPLE = Sample(nothing, UInt(0), Int64(-1), Int64(-1), Int64[]) - -Base.isnothing(s::Sample) = s.rate == -1 - # Caching samples with same statistics # A sample with memoized statistics for diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index ea1945e5..d0361657 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -175,15 +175,15 @@ function _start_session( environment_hash = get_hash(project_toml * manifest_toml * version) environment_info["environment_hash"] = environment_hash environment_info["project_toml"] = "$(environment_hash)/Project.toml" - file_already_in_s3 = isfile(S3Path("s3://$(s3_bucket_name)/$(environment_hash)/Project.toml", config=get_aws_config())) + file_already_in_s3 = isfile(S3Path("s3://$(s3_bucket_name)/$(environment_hash)/Project.toml", config=global_aws_config())) if !file_already_in_s3 - s3_put(get_aws_config(), s3_bucket_name, "$(environment_hash)/Project.toml", project_toml) + s3_put(global_aws_config(), s3_bucket_name, "$(environment_hash)/Project.toml", project_toml) end if manifest_toml != "" environment_info["manifest_toml"] = "$(environment_hash)/Manifest.toml" - file_already_in_s3 = isfile(S3Path("s3://$(s3_bucket_name)/$(environment_hash)/Manifest.toml", config=get_aws_config())) + file_already_in_s3 = isfile(S3Path("s3://$(s3_bucket_name)/$(environment_hash)/Manifest.toml", config=global_aws_config())) if !file_already_in_s3 - s3_put(get_aws_config(), s3_bucket_name, "$(environment_hash)/Manifest.toml", manifest_toml) + s3_put(global_aws_config(), s3_bucket_name, "$(environment_hash)/Manifest.toml", manifest_toml) end end else @@ -208,9 +208,9 @@ function _start_session( # Upload files to S3 for f in vcat(files, code_files) - s3_path = S3Path("s3://$(s3_bucket_name)/$(basename(f))", config=get_aws_config()) + s3_path = S3Path("s3://$(s3_bucket_name)/$(basename(f))", config=global_aws_config()) if !isfile(s3_path) || force_update_files - s3_put(get_aws_config(), s3_bucket_name, basename(f), load_file(f)) + s3_put(global_aws_config(), s3_bucket_name, basename(f), load_file(f)) end end # TODO: Optimize so that we only upload (and download onto cluster) the files if the filename doesn't already exist @@ -488,7 +488,7 @@ function download_session_logs(session_id::SessionId, cluster_name::String, file mkdir(joinpath(homedir(), ".banyan", "logs")) end filename = 
!isnothing(filename) ? filename : joinpath(homedir(), ".banyan", "logs", log_file_name) - s3_get_file(get_aws_config(), s3_bucket_name, log_file_name, filename) + s3_get_file(global_aws_config(), s3_bucket_name, log_file_name, filename) @info "Downloaded logs for session with ID $session_id to $filename" return filename end @@ -496,10 +496,10 @@ end function print_session_logs(session_id, cluster_name, delete_file=true) s3_bucket_name = get_cluster_s3_bucket_name(cluster_name) log_file_name = "banyan-log-for-session-$(session_id)" - logs = s3_get(get_aws_config(), s3_bucket_name, log_file_name) + logs = s3_get(global_aws_config(), s3_bucket_name, log_file_name) println(String(logs)) if delete_file - s3_delete(get_aws_config(), s3_bucket_name, log_file_name) + s3_delete(global_aws_config(), s3_bucket_name, log_file_name) end end diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 6a609027..6daafc17 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -21,7 +21,7 @@ json_to_jl(j) = JSON.parse(j) key_to_jl(key) = reinterpret(UInt8, hash(string(key))) |> String axis_to_jl(axis) = reinterpret(UInt8, hash(string(key))) |> String -total_memory_usage(val)::Int64 = +sample_memory_usage(val::Any)::Int64 = begin size = Base.summarysize(val) # TODO: Maybe make this larger @@ -97,7 +97,6 @@ end # Banyan.jl may be being used). However, wrapping this in a mutex to ensure # synchronized mutation in this module would be a good TODO. global banyan_config = nothing -global aws_config_in_usage = nothing @nospecialize @@ -220,56 +219,7 @@ end @specialize -""" -Get the value for `key` in the `ini` file for a given `profile`. -""" -function _get_ini_value( - ini::Inifile, profile::String, key::String; default_value=nothing -) - value = get(ini, "profile $profile", key) - value === :notfound && (value = get(ini, profile, key)) - value === :notfound && (value = default_value) - - return value -end - -function get_aws_config()::Dict{Symbol,Any} - global aws_config_in_usage - - # Get AWS configuration - if isnothing(aws_config_in_usage) - # Get region according to ENV, then credentials, then config files - profile = get(ENV, "AWS_DEFAULT_PROFILE", get(ENV, "AWS_DEFAULT_PROFILE", "default")) - region::String = get(ENV, "AWS_DEFAULT_REGION", "") - if region == "" - try - configfile = read(Inifile(), joinpath(homedir(), ".aws", "config")) - region = convert(String, _get_ini_value(configfile, profile, "region", default_value=""))::String - catch - end - end - if region == "" - try - credentialsfile = read(Inifile(), joinpath(homedir(), ".aws", "credentials")) - region = convert(String, _get_ini_value(credentialsfile, profile, "region", default_value=""))::String - catch - end - end - - if region == "" - throw(ErrorException("Could not discover AWS region to use from looking at AWS_PROFILE, AWS_DEFAULT_PROFILE, AWS_DEFAULT_REGION, HOME/.aws/credentials, and HOME/.aws/config")) - end - - aws_config_in_usage = Dict{Symbol,Any}( - :creds => AWSCredentials(), - :region => region - ) - end - - aws_config_in_usage -end - -get_aws_config_region() = get_aws_config()[:region]::String +get_aws_config_region() = global_aws_config().region ######################### # ENVIRONMENT VARIABLES # @@ -445,7 +395,7 @@ function load_json(path::String) elseif startswith(path, "s3://") error("S3 path not currently supported") # TODO: Maybe support with - # `JSON.parsefile(S3Path(path, config=get_aws_config()))` and also down + # `JSON.parsefile(S3Path(path, config=global_aws_config()))` and also down # in `load_toml` elseif 
startswith(path, "http://") || startswith(path, "https://") JSON.parse(request_body(path)[2]) @@ -462,7 +412,7 @@ function load_toml(path::String) TOML.parsefile(path[8:end]) elseif startswith(path, "s3://") error("S3 path not currently supported") - # JSON.parsefile(S3Path(path, config=get_aws_config())) + # JSON.parsefile(S3Path(path, config=global_aws_config())) elseif startswith(path, "http://") || startswith(path, "https://") TOML.parse(request_body(path)[2]) else diff --git a/Banyan/src/utils_s3fs.jl b/Banyan/src/utils_s3fs.jl index 2878c05c..f3861bc9 100644 --- a/Banyan/src/utils_s3fs.jl +++ b/Banyan/src/utils_s3fs.jl @@ -55,7 +55,7 @@ function download_remote_s3_path(path::String) global failed_to_use_s3fs # Get information about requested object - s3path = S3Path(path, config = get_aws_config()) + s3path = S3Path(path, config = global_aws_config()) bucket = s3path.bucket key = s3path.key # bucket = "banyan-cluster-data-myfirstcluster" @@ -96,8 +96,8 @@ function download_remote_s3_path(path::String) # TODO: Store buckets from different accounts/IAMs/etc. seperately try - ACCESS_KEY_ID = get_aws_config()[:creds].access_key_id - SECRET_ACCESS_KEY = get_aws_config()[:creds].secret_key + ACCESS_KEY_ID = global_aws_config()[:creds].access_key_id + SECRET_ACCESS_KEY = global_aws_config()[:creds].secret_key passwd_s3fs_contents = ACCESS_KEY_ID * ":" * SECRET_ACCESS_KEY HOME = homedir() region = get_aws_config_region() diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index 6531f9b0..61af636e 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -28,7 +28,7 @@ end end function bucket_exists(s3_bucket_name) - ispath(S3Path("s3://$(s3_bucket_name)", config=Banyan.get_aws_config())) + ispath(S3Path("s3://$(s3_bucket_name)", config=Banyan.global_aws_config())) end @testset "Create clusters" begin @@ -55,7 +55,7 @@ end s3_bucket = nothing elseif s3_bucket == "user-provided" s3_bucket = Random.randstring(['a':'z'; '0':'9'], 6) - s3_create_bucket(Banyan.get_aws_config(), s3_bucket) + s3_create_bucket(Banyan.global_aws_config(), s3_bucket) end # Create a cluster (at least initiate) and check that S3 bucket exists @@ -142,8 +142,8 @@ end dst_name = "data_from_s3" src_path = "s3://$s3_bucket/$dst_name" # Create a bucket and upload data - s3_create_bucket(Banyan.get_aws_config(), s3_bucket) - s3_put(Banyan.get_aws_config(), s3_bucket, dst_name, "some file contents") + s3_create_bucket(Banyan.global_aws_config(), s3_bucket) + s3_put(Banyan.global_aws_config(), s3_bucket, dst_name, "some file contents") end cluster_name = ENV["BANYAN_CLUSTER_NAME"] @@ -153,10 +153,10 @@ end @test ispath(S3Path("s3://$cluster_s3_bucket/$dst_name")) # Cleanup - s3_delete(Banyan.get_aws_config(), cluster_s3_bucket, dst_name) + s3_delete(Banyan.global_aws_config(), cluster_s3_bucket, dst_name) if src_type == "s3" - s3_delete(Banyan.get_aws_config(), s3_bucket, dst_name) - s3_delete_bucket(Banyan.get_aws_config(), s3_bucket) + s3_delete(Banyan.global_aws_config(), s3_bucket, dst_name) + s3_delete_bucket(Banyan.global_aws_config(), s3_bucket) end end @@ -178,6 +178,6 @@ end # Cleanup for f_name in readdir(src_path) - s3_delete(Banyan.get_aws_config(), cluster_s3_bucket, "$dst_name/$f_name") + s3_delete(Banyan.global_aws_config(), cluster_s3_bucket, "$dst_name/$f_name") end end \ No newline at end of file diff --git a/Banyan/test/sessions.jl b/Banyan/test/sessions.jl index 6c6e9bbc..738f9837 100644 --- a/Banyan/test/sessions.jl +++ b/Banyan/test/sessions.jl @@ -177,7 +177,7 @@ end 
println("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)") @test store_logs_in_s3 == isfile( S3Path("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)", - config=Banyan.get_aws_config()) + config=Banyan.global_aws_config()) ) end diff --git a/BanyanDataFrames/src/BanyanDataFrames.jl b/BanyanDataFrames/src/BanyanDataFrames.jl index 69ef44ec..fae29bce 100644 --- a/BanyanDataFrames/src/BanyanDataFrames.jl +++ b/BanyanDataFrames/src/BanyanDataFrames.jl @@ -20,7 +20,7 @@ using Arrow, export DataFrame, GroupedDataFrame # I/O -export read_csv, write_csv, read_parquet, write_parquet, read_arrow, write_arrow +export read_table, write_table, read_csv, write_csv, read_parquet, write_parquet, read_arrow, write_arrow # Dataframe properties export nrow, ncol, size, names, propertynames diff --git a/BanyanDataFrames/src/gdf.jl b/BanyanDataFrames/src/gdf.jl index 7bbe12cf..979f1b46 100644 --- a/BanyanDataFrames/src/gdf.jl +++ b/BanyanDataFrames/src/gdf.jl @@ -9,7 +9,7 @@ end Banyan.convert(::Type{Future}, gdf::GroupedDataFrame) = gdf.data Banyan.isview(gdf::GroupedDataFrame) = true Banyan.sample_memory_usage(gdf::DataFrames.GroupedDataFrame)::Int64 = - total_memory_usage(gdf) - total_memory_usage(parent(gdf)) + sample_memory_usage(gdf) - sample_memory_usage(parent(gdf)) Base.length(gdf::GroupedDataFrame) = compute(gdf.length) Base.size(gdf::GroupedDataFrame) = Tuple(length(gdf)) diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 17bca8f0..c16c9e33 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -102,7 +102,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: else parse(Int64, loc.src_parameters["nrows"]) end - total_nbytes = curr_metadata_invalid ? -1 : parse(Int64, loc.src_parameters["total_memory_usage"]) + total_nbytes = curr_metadata_invalid ? 
-1 : parse(Int64, loc.src_parameters["sample_memory_usage"]) exact_sample_needed = sampling_config.always_exact || total_nbytes <= max_num_bytes_exact # inv: (a) `meta_nrows_on_worker`, (b) `total_nrows_res`, and @@ -200,7 +200,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: meta_nrows_on_worker[i] = path_nrows push!(local_samples, path_sample) local_nrows += path_nrows - local_nbytes += ceil(Int64, total_memory_usage(path_sample) * path_sample_rate) + local_nbytes += ceil(Int64, sample_memory_usage(path_sample) * path_sample_rate) end total_nrows_res = reduce_and_sync_across(+, local_nrows) total_nbytes_res = reduce_and_sync_across(+, local_nbytes) @@ -264,7 +264,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: remote_sample_value_arrow = io.data # Construct Sample with the concatenated value, memory usage, and sample rate - remote_sample_value_memory_usage = total_memory_usage(remote_sample_value) + remote_sample_value_memory_usage = sample_memory_usage(remote_sample_value) total_nbytes_res = if exact_sample_needed remote_sample_value_memory_usage else @@ -278,7 +278,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: remote_sample_res::Sample = if exact_sample_needed # Technically we don't need to be passing in `total_bytes_res` # here but we do it because we are anyway computing it to - # return as the `total_memory_usage` for the `Location` and so + # return as the `sample_memory_usage` for the `Location` and so # we might as well avoid recomputing it in the `Sample` # constructors ExactSample(remote_sample_value_arrow, total_nbytes_res) @@ -307,7 +307,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # sample_rate # ) # remote_sample_value_nrows = nrow(cached_remote_sample_res.value) - # remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_res.value) + # remote_sample_value_nbytes = sample_memory_usage(cached_remote_sample_res.value) # if Banyan.INVESTIGATING_COLLECTING_SAMPLES || Banyan.INVESTIGATING_MEMORY_USAGE # @show remote_sample_value_nbytes remote_sample_value_nrows total_nrows_res # end @@ -323,7 +323,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # end cached_remote_sample_value = DataFrames.DataFrame(Arrow.Table(sample_path)) - remote_sample_value_nbytes = total_memory_usage(cached_remote_sample_value) + remote_sample_value_nbytes = sample_memory_usage(cached_remote_sample_value) remote_sample_value_nrows = DataFrames.nrow(cached_remote_sample_value) total_nbytes_res = ceil(Int64, remote_sample_value_nbytes * total_nrows_res / remote_sample_value_nrows) cached_remote_sample_res = NOTHING_SAMPLE @@ -341,7 +341,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: src_params = Dict( "name" => "Remote", - "total_memory_usage" => string(total_nbytes), + "sample_memory_usage" => string(total_nbytes), # For dispatching the appropriate PF for this format "format" => format_string, # For constructing the `BanyanDataFrames.DataFrame`'s `nrows::Future` field diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index aa4f453c..f9fe0981 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -515,7 +515,7 @@ function WriteHelper(@nospecialize(format_value)) else Dict( "name" => "Remote", - "total_memory_usage" => "0", + "sample_memory_usage" => "0", "format" => format_string, "nrows" => "0", "path" => loc_params_path, 
@@ -524,7 +524,7 @@ function WriteHelper(@nospecialize(format_value)) end # Gather # of rows, # of bytes, empty sample, and actual sample - nbytes = part_res isa Empty ? 0 : Banyan.total_memory_usage(part_res) + nbytes = part_res isa Empty ? 0 : Banyan.sample_memory_usage(part_res) sampling_config = get_sampling_config(lp) sample_rate = sampling_config.rate sampled_part = (part_res isa Empty || is_disk) ? empty_df : Banyan.get_sample_from_data(part_res, sample_rate, nrows) @@ -549,13 +549,13 @@ function WriteHelper(@nospecialize(format_value)) # Update the # of bytes total_nrows::Int64 = parse(Int64, curr_src_parameters["nrows"]) - total_memory_usage::Int64 = parse(Int64, curr_src_parameters["total_memory_usage"]) + sample_memory_usage::Int64 = parse(Int64, curr_src_parameters["sample_memory_usage"]) empty_sample_found = false for (new_nrows::Int64, new_nbytes::Int64, empty_part, sampled_part) in gathered_data # Update the total # of rows and the total # of bytes total_nrows += sum(new_nrows) push!(curr_nrows, new_nrows) - total_memory_usage += new_nbytes + sample_memory_usage += new_nbytes # Get the empty sample if !empty_sample_found && !(empty_part isa Empty) @@ -564,9 +564,9 @@ function WriteHelper(@nospecialize(format_value)) end end curr_src_parameters["nrows"] = string(total_nrows) - curr_src_parameters["total_memory_usage"] = string(total_memory_usage) + curr_src_parameters["sample_memory_usage"] = string(sample_memory_usage) - if !is_disk && batch_idx == nbatches && total_memory_usage <= sampling_config.max_num_bytes_exact + if !is_disk && batch_idx == nbatches && sample_memory_usage <= sampling_config.max_num_bytes_exact # If the total # of rows turns out to be inexact then we can simply mark it as # stale so that it can be collected more efficiently later on # We should be able to quickly recompute a more useful sample later diff --git a/BanyanDataFrames/test/latency.jl b/BanyanDataFrames/test/latency.jl index 1ac18269..d0e50128 100644 --- a/BanyanDataFrames/test/latency.jl +++ b/BanyanDataFrames/test/latency.jl @@ -41,13 +41,13 @@ end function test_csv_from_s3_latency() use_session_for_testing(scheduling_config_name = "default scheduling", sample_rate=2048*4) do s3_bucket_name = get_cluster_s3_bucket_name() - if !s3_exists(Banyan.get_aws_config(), s3_bucket_name, "nyc_tripdata_small.csv") + if !s3_exists(Banyan.global_aws_config(), s3_bucket_name, "nyc_tripdata_small.csv") data_path = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv" offloaded(s3_bucket_name, data_path) do s3_bucket_name, data_path temp_path = Downloads.download(data_path) cp( Path(temp_path), - S3Path("s3://$s3_bucket_name/nyc_tripdata_small.csv", config=Banyan.get_aws_config()) + S3Path("s3://$s3_bucket_name/nyc_tripdata_small.csv", config=Banyan.global_aws_config()) ) end end diff --git a/BanyanDataFrames/test/runtests.jl b/BanyanDataFrames/test/runtests.jl index ff77fb8d..d81b3c59 100644 --- a/BanyanDataFrames/test/runtests.jl +++ b/BanyanDataFrames/test/runtests.jl @@ -74,7 +74,7 @@ function use_data(file_extension, remote_kind, single_file) ".$file_extension" testing_dataset_s3_path = S3Path( "s3://$(get_cluster_s3_bucket_name())/$testing_dataset_s3_name", - config = Banyan.get_aws_config(), + config = Banyan.global_aws_config(), ) # Create the file if not already created diff --git a/BanyanDataFrames/test/runtests_without_retest.jl b/BanyanDataFrames/test/runtests_without_retest.jl index 7772e755..3c57728a 100644 --- a/BanyanDataFrames/test/runtests_without_retest.jl +++ 
b/BanyanDataFrames/test/runtests_without_retest.jl @@ -112,11 +112,11 @@ end # path - path to write file to in bucket # download_path - either http(s) link to a file or a local Path indicating the source of the file function verify_file_in_s3(bucket, path, download_path) - if !s3_exists(Banyan.get_aws_config(), bucket, path) + if !s3_exists(Banyan.global_aws_config(), bucket, path) if typeof(download_path) == String && (startswith(download_path, "https://") || startswith(download_path, "http://")) - Downloads.download(download_path, S3Path("s3://$(bucket)/$(path)", config=Banyan.get_aws_config())) + Downloads.download(download_path, S3Path("s3://$(bucket)/$(path)", config=Banyan.global_aws_config())) else # upload local file - cp(Path(download_path), S3Path("s3://$(bucket)/$(path)", config=Banyan.get_aws_config())) + cp(Path(download_path), S3Path("s3://$(bucket)/$(path)", config=Banyan.global_aws_config())) end end end diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 88a85235..b20813c1 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -33,9 +33,9 @@ # Construct location if reusing != "nothing" - RemoteTableSource(src_name, invalidate_metadata = true, invalidate_sample = true) + RemoteTableSource(src_name) invalidate_location(src_name) - RemoteTableSource(src_name, metadata_invalid = true, sample_invalid = true) + RemoteTableSource(src_name) end if (reusing == "nothing" || reusing == "sample") invalidate_metadata(src_name) @@ -51,10 +51,10 @@ # Verify the location - @test remote_source.total_memory_usage > 0 + @test remote_source.sample_memory_usage > 0 @test !remote_source.metadata_invalid @test !remote_source.sample_invalid - @test remote_source.src_parameters["nrows"] == src_nrows + @test remote_source.src_parameters["nrows"] == string(src_nrows) # if contains(src_name, "dir") # @test length(remote_source.files) == 10 # for f in remote_source.files @@ -78,3 +78,64 @@ end end end + +@testset "Reading/writing $(shuffled ? 
"shuffle " : " ")$format data and sampling it with $scheduling_config and maximum # of bytes for exact sample" for scheduling_config in + [ + "default scheduling", + "parallelism encouraged", + "parallelism and batches encouraged", + ], + format in ["csv", "parquet"], + max_num_bytes in [0, Banyan.parse_bytes("100 GB")], + shuffled in [true, false] + + use_session_for_testing(scheduling_config_name = scheduling_config) do + use_basic_data() + + bucket = get_cluster_s3_bucket_name() + + invalidate_all_locations() + + p1 = "s3://$(bucket)/iris_large_$format.$format" + p2 = "s3://$(bucket)/iris_large_tmp_$format.$format" + + df = read_table(p1; metadata_invalid=true, invalidate_samples=true) + sample(df) + @show get_sample_rate(p1) + + configure_sampling(p2; sample_rate=5) + write_table(p2, df) + @test get_sample_rate(p2) == 5 + @test has_metadata(p2) + @test has_sample(p2) + invalidate_metadata(p2) + @test !has_metadata(p2) + @test has_sample(p2) + innvalidate_location(p2) + @test !has_metadata(p2) + @test !has_sample(p2) + + df2 = read_table(df2) + @show get_sample_rate(p2) + sample(df2) + @show get_sample_rate(p2) + df2 = read_table(df2; samples_invalid=true) + sample(df2) + configure_sampling(sample_rate=7, for_all_locations=true) + df2 = read_table(df2; metadata_invalid=true) + sample(df2) + @test get_sample_rate() == 5 + configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + @test get_sample_rate(p2) == 5 + df2 = read_table(df2) + @test get_sample_rate() == 7 + @test get_sample_rate() == 5 + df2 = read_table(df2; location_invalid=true) + sample(df2) + @test has_metadata(p2) + @test has_sample(p2) + @show get_sample_rate(p2) + configure_sampling(p2; always_exact=tru) + sample(df2) + end +end diff --git a/BanyanDataFrames/test/utils_data.jl b/BanyanDataFrames/test/utils_data.jl index 24cfadf6..0f8e0463 100644 --- a/BanyanDataFrames/test/utils_data.jl +++ b/BanyanDataFrames/test/utils_data.jl @@ -2,17 +2,17 @@ # path - path to write file to in bucket # download_path - either http(s) link to a file or a local Path indicating the source of the file function verify_file_in_s3(bucket, path, download_path) - if !s3_exists(Banyan.get_aws_config(), bucket, path) + if !s3_exists(Banyan.global_aws_config(), bucket, path) if typeof(download_path) == String && (startswith(download_path, "https://") || startswith(download_path, "http://")) Downloads.download( download_path, - S3Path("s3://$(bucket)/$(path)", config = Banyan.get_aws_config()), + S3Path("s3://$(bucket)/$(path)", config = Banyan.global_aws_config()), ) else # upload local file cp( Path(download_path), - S3Path("s3://$(bucket)/$(path)", config = Banyan.get_aws_config()), + S3Path("s3://$(bucket)/$(path)", config = Banyan.global_aws_config()), ) end end @@ -58,11 +58,11 @@ function setup_basic_tests(bucket_name=get_cluster_s3_bucket_name()) "iris_species_info.parquet", "iris_species_info.arrow", ] - bucket_contents = s3_list_keys(Banyan.get_aws_config(), bucket_name) + bucket_contents = s3_list_keys(Banyan.global_aws_config(), bucket_name) to_be_downloaded = [ iris_s3_path for iris_s3_path in iris_s3_paths if # TODO: Use the following when AWSS3.jl supports folders - # !s3_exists(Banyan.get_aws_config(), bucket_name, iris_s3_path) + # !s3_exists(Banyan.global_aws_config(), bucket_name, iris_s3_path) !(iris_s3_path in bucket_contents) ] if !isempty(to_be_downloaded) @@ -146,7 +146,7 @@ function setup_empty_tests(bucket_name=get_cluster_s3_bucket_name()) # Write empty dataframe empty_df = 
DataFrames.DataFrame() println("At start of setup_empty_tests") - if !ispath(S3Path("s3://$bucket_name/empty_df.csv", config = Banyan.get_aws_config())) + if !ispath(S3Path("s3://$bucket_name/empty_df.csv", config = Banyan.global_aws_config())) write_df_to_csv_to_s3( empty_df, "empty_df.csv", @@ -156,7 +156,7 @@ function setup_empty_tests(bucket_name=get_cluster_s3_bucket_name()) ) end println("After first setup_empty_tests") - if !ispath(S3Path("s3://$bucket_name/empty_df.arrow", config = Banyan.get_aws_config())) + if !ispath(S3Path("s3://$bucket_name/empty_df.arrow", config = Banyan.global_aws_config())) write_df_to_arrow_to_s3( empty_df, "empty_df.arrow", @@ -168,7 +168,7 @@ function setup_empty_tests(bucket_name=get_cluster_s3_bucket_name()) # Write empty dataframe with two columns empty_df2 = DataFrames.DataFrame(x = [], y = []) - if !ispath(S3Path("s3://$bucket_name/empty_df2.csv", config = Banyan.get_aws_config())) + if !ispath(S3Path("s3://$bucket_name/empty_df2.csv", config = Banyan.global_aws_config())) write_df_to_csv_to_s3( empty_df2, "empty_df2.csv", @@ -177,7 +177,7 @@ function setup_empty_tests(bucket_name=get_cluster_s3_bucket_name()) "empty_df2.csv", ) end - if !ispath(S3Path("s3://$bucket_name/empty_df2.arrow", config = Banyan.get_aws_config())) + if !ispath(S3Path("s3://$bucket_name/empty_df2.arrow", config = Banyan.global_aws_config())) write_df_to_arrow_to_s3( empty_df2, "empty_df2.arrow", @@ -197,13 +197,13 @@ end # idx = 0 # part_names = [] # while num_bytes_so_far < num_bytes -# dst_path = S3Path("s3://$bucket_name/nyc_tripdata_large.csv/part$idx.csv", config = Banyan.get_aws_config()) +# dst_path = S3Path("s3://$bucket_name/nyc_tripdata_large.csv/part$idx.csv", config = Banyan.global_aws_config()) # if Banyan.INVESTIGATING_SETUP_NYC_TAXI_STRESS_TEST # println("In while loop in setup_nyc_taxi_stress_test") # @show dst_path -# @show !s3_exists(Banyan.get_aws_config(), bucket_name, "nyc_tripdata_large.csv/part$idx.csv") +# @show !s3_exists(Banyan.global_aws_config(), bucket_name, "nyc_tripdata_large.csv/part$idx.csv") # end -# if !s3_exists(Banyan.get_aws_config(), bucket_name, "nyc_tripdata_large.csv/part$idx.csv") +# if !s3_exists(Banyan.global_aws_config(), bucket_name, "nyc_tripdata_large.csv/part$idx.csv") # if isnothing(nyc_trip_data_120_mb_path) # nyc_trip_data_120_mb_path = Path(download("https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv")) # end @@ -218,13 +218,13 @@ end # println("Outside while loop in setup_nyc_taxi_stress_test") # @show part_names # end -# for p in s3_list_keys(Banyan.get_aws_config(), bucket_name, "nyc_tripdata_large.csv/") +# for p in s3_list_keys(Banyan.global_aws_config(), bucket_name, "nyc_tripdata_large.csv/") # p_str = string(p) # if !any((endswith(p_str, part_name) for part_name in part_names)) # if Banyan.INVESTIGATING_SETUP_NYC_TAXI_STRESS_TEST # println("In final for loop in setup_nyc_taxi_stress_test with p=$p") # end -# s3_delete(Banyan.get_aws_config(), bucket_name, p) +# s3_delete(Banyan.global_aws_config(), bucket_name, p) # end # end # end @@ -277,7 +277,7 @@ function setup_stress_tests(bucket_name=get_cluster_s3_bucket_name()) for filetype in ["csv", "parquet", "arrow"] for ncopy = 1:n_repeats dst_path = "s3://$(bucket_name)/tripdata_large_$(filetype).$(filetype)/tripdata_$(month)_copy$(ncopy).$(filetype)" - dst_s3_path = S3Path(dst_path, config = Banyan.get_aws_config()) + dst_s3_path = S3Path(dst_path, config = Banyan.global_aws_config()) push!(dst_s3_paths, dst_s3_path) if !isfile(dst_s3_path) 
push!(dst_s3_paths_missing, dst_s3_path) @@ -309,7 +309,7 @@ function setup_stress_tests(bucket_name=get_cluster_s3_bucket_name()) cp( Path(get_local_path_tripdata(s3_path)), s3_path, - config = Banyan.get_aws_config(), + config = Banyan.global_aws_config(), ) end end @@ -319,10 +319,10 @@ end function cleanup_tests(bucket_name=get_cluster_s3_bucket_name()) # Delete all temporary test files that are prepended with "test-tmp__" @show bucket_name - for p in s3_list_keys(Banyan.get_aws_config(), bucket_name) + for p in s3_list_keys(Banyan.global_aws_config(), bucket_name) if contains(string(p), "test-tmp_") - # s3_path = S3Path("s3://$bucket_name/$p", config = Banyan.get_aws_config()) - rm(S3Path("s3://$bucket_name/$p", config = Banyan.get_aws_config()), recursive=true) + # s3_path = S3Path("s3://$bucket_name/$p", config = Banyan.global_aws_config()) + rm(S3Path("s3://$bucket_name/$p", config = Banyan.global_aws_config()), recursive=true) end end end diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 1dc78be0..38371070 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -145,7 +145,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig "subpath" => datasetpath, "eltype" => Banyan.size_to_str(dataszie), "size" => Banyan.type_to_str(dataeltype), - "total_memory_usage" => string(nbytes), + "sample_memory_usage" => string(nbytes), "format" => "hdf5" ) else diff --git a/BanyanHDF5/test/runtests.jl b/BanyanHDF5/test/runtests.jl index 8b155e77..56b16b58 100644 --- a/BanyanHDF5/test/runtests.jl +++ b/BanyanHDF5/test/runtests.jl @@ -101,7 +101,7 @@ function use_data(data_src = "S3") ), ) f_dst = joinpath( - S3Path("s3://$(get_cluster_s3_bucket_name())", config = Banyan.get_aws_config()), + S3Path("s3://$(get_cluster_s3_bucket_name())", config = Banyan.global_aws_config()), "fillval.h5", ) f = get_downloaded_path(f_dst, only_for_writing=true) @@ -115,7 +115,7 @@ function use_data(data_src = "S3") # rm(get_s3fs_path(joinpath(get_cluster_s3_bucket_name(), "fillval_copy.h5")), force=true) rm( joinpath( - S3Path("s3://$(get_cluster_s3_bucket_name())", config = Banyan.get_aws_config()), + S3Path("s3://$(get_cluster_s3_bucket_name())", config = Banyan.global_aws_config()), "fillval_copy.h5", ), force = true, diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 4d084f2a..783b858f 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -306,7 +306,7 @@ function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfi # regardless of whether we want to get the sample or the metadata _load_img = add_channelview ? _load_image_and_add_channelview : _load_image first_img = is_main ? (localpaths[1] |> _load_img |> _reshape_image) : nothing - exact_sample_needed = is_main ? ((total_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false + exact_sample_needed = is_main ? 
((sample_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false exact_sample_needed = sync_across(exact_sample_needed) need_to_parallelize = nimages >= 10 total_num_images_to_read_in = if curr_sample_invalid @@ -365,7 +365,7 @@ function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfi Dict{String,Any}( "name" => "Remote", "nimages" => string(nimages), - "total_memory_usage" => string(nbytes_res), # NOTE: We assume all files have same size + "sample_memory_usage" => string(nbytes_res), # NOTE: We assume all files have same size "size" => size_to_str(datasize_res), "eltype" => type_to_str(dataeltype_res), "add_channelview" => add_channelview ? "1" : "0", diff --git a/BanyanImages/test/locations.jl b/BanyanImages/test/locations.jl index 416ae0a6..274077ab 100644 --- a/BanyanImages/test/locations.jl +++ b/BanyanImages/test/locations.jl @@ -14,7 +14,7 @@ # s = RemoteImageSource(path; metadata_invalid=metadata_invalid, sample_invalid=sample_invalid) # @test s.src_parameters["nimages"] == 1 -# @test s.total_memory_usage == sizeof(ImageCore.RGB{N0f8}) * image_size # exact sample +# @test s.sample_memory_usage == sizeof(ImageCore.RGB{N0f8}) * image_size # exact sample # @test s.src_parameters["nbytes"] == sizeof(ImageCore.RGB{N0f8}) * image_size # @test s.src_parameters["ndims"] == 3 # @test s.src_parameters["size"] == (1, sqrt(image_size), sqrt(image_size)) diff --git a/BanyanImages/test/pfs.jl b/BanyanImages/test/pfs.jl index c53994e6..f41207f2 100644 --- a/BanyanImages/test/pfs.jl +++ b/BanyanImages/test/pfs.jl @@ -29,7 +29,7 @@ # # Construct files # if format == "directory" -# files = readdir(S3Path(path, config=Banyan.get_aws_config())) +# files = readdir(S3Path(path, config=Banyan.global_aws_config())) # datasize = add_channelview ? (nimages, 3, 100, 100) : (nimages, 100, 100) # empty_part_size = add_channelview ? 
(0, 3, 100, 100) : (0, 100, 100) # elseif format == "generator" diff --git a/BanyanImages/test/utils_data.jl b/BanyanImages/test/utils_data.jl index 863a19c5..0fbe8479 100644 --- a/BanyanImages/test/utils_data.jl +++ b/BanyanImages/test/utils_data.jl @@ -7,7 +7,7 @@ img_len = 100 function write_png_files_to_s3(bucket_name=get_cluster_s3_bucket_name(), nimages=1) global s3_dirs s3_dir_png = s3_dirs["png"] - if length(readdir(S3Path("s3://$bucket_name/$s3_dir_png/", config=Banyan.get_aws_config()))) < nimages + if length(readdir(S3Path("s3://$bucket_name/$s3_dir_png/", config=Banyan.global_aws_config()))) < nimages for i in 1:nimages println("Writing image $i to S3") rand_image = rand(ImageCore.RGB, img_len, img_len) @@ -20,7 +20,7 @@ end function write_jpg_files_to_s3(bucket_name=get_cluster_s3_bucket_name(), nimages=1) global s3_dirs s3_dir_jpg = s3_dirs["jpg"] - if length(readdir(S3Path("s3://$bucket_name/$s3_dir_jpg/", config=Banyan.get_aws_config()))) < nimages + if length(readdir(S3Path("s3://$bucket_name/$s3_dir_jpg/", config=Banyan.global_aws_config()))) < nimages for i in 1:nimages println("Writing image $i to S3") rand_image = rand(ImageCore.RGB, img_len, img_len) @@ -34,8 +34,8 @@ function cleanup_s3_test_files(bucket_name=get_cluster_s3_bucket_name()) global s3_dirs # Delete all files in test_images for (filetype, s3_dir) in s3_dirs - for p in s3_list_keys(Banyan.get_aws_config(), bucket_name, "$s3_dir") - rm(S3Path("s3://$bucket_name/$p", config=Banyan.get_aws_config()), recursive=true) + for p in s3_list_keys(Banyan.global_aws_config(), bucket_name, "$s3_dir") + rm(S3Path("s3://$bucket_name/$p", config=Banyan.global_aws_config()), recursive=true) end end end @@ -79,7 +79,7 @@ function get_test_path(src, format, filetype, nimages, bucket_name) if format == "path" "s3://$bucket_name/$s3_dir/test_image_1.$filetype" elseif format == "directory" || format == "generator" - p = S3Path("s3://$bucket_name/earthdata_jpg_$nimages/", config=Banyan.get_aws_config()) + p = S3Path("s3://$bucket_name/earthdata_jpg_$nimages/", config=Banyan.global_aws_config()) if !isdir(p) mkdir(p) end diff --git a/BanyanONNXRunTime/src/locations.jl b/BanyanONNXRunTime/src/locations.jl index 812ad8f1..35968586 100644 --- a/BanyanONNXRunTime/src/locations.jl +++ b/BanyanONNXRunTime/src/locations.jl @@ -10,7 +10,7 @@ function RemoteONNXSource(remotepath)::Location if p_exists pp = get_downloaded_path(p) model = ONNXRunTime.load_inference(pp) - nbytes = Banyan.total_memory_usage(model) + nbytes = Banyan.sample_memory_usage(model) destroy_downloaded_path(pp) end From f6bb46f860d9fb08be84fadb1a096ffbe9328e82 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 9 Aug 2022 07:55:30 -0700 Subject: [PATCH 11/25] Add BanyanONNXRunTime tests for sampling --- BanyanONNXRunTime/test/onnxruntime.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/BanyanONNXRunTime/test/onnxruntime.jl b/BanyanONNXRunTime/test/onnxruntime.jl index 829cedba..876482d8 100644 --- a/BanyanONNXRunTime/test/onnxruntime.jl +++ b/BanyanONNXRunTime/test/onnxruntime.jl @@ -17,6 +17,13 @@ @test res_size == (120, 2, 3) all_incremented = all(res .== 2) @test all_incremented + + model_sample = sample(model) + res_sample = model_sample(Dict("input" => sample(data)))["output"] + res_size = size(res_sample) + @test res_size == (120, 2, 3) + all_incremented = all(res_sample .== 2) + @test all_incremented end end From 1a3832dc98f0f30c5d1c63e1c26abfb4d78b49d2 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 9 Aug 2022 16:02:24 -0700 Subject: 
[PATCH 12/25] Fix some bugs and update Arrow version --- Banyan/Project.toml | 10 +-- Banyan/src/Banyan.jl | 8 ++- Banyan/src/location.jl | 26 +++++--- Banyan/src/locations.jl | 26 ++++---- Banyan/src/queues.jl | 8 +-- Banyan/src/requests.jl | 8 +++ Banyan/src/sample.jl | 2 +- Banyan/src/samples.jl | 2 +- Banyan/src/sessions.jl | 6 +- Banyan/src/utils.jl | 6 +- Banyan/test/Project.toml | 4 +- BanyanDataFrames/Project.toml | 2 +- BanyanDataFrames/src/df.jl | 2 +- BanyanDataFrames/test/sample_collection.jl | 18 +++--- BanyanDataFrames/test/utils_data.jl | 1 - BanyanHDF5/src/hdf5.jl | 2 +- BanyanHDF5/src/locations.jl | 2 +- BanyanHDF5/test/hdf5.jl | 73 ++++++++++++++++++++-- BanyanImages/Project.toml | 2 +- BanyanImages/test/jpg.jl | 64 ++++++++++++++++++- 20 files changed, 209 insertions(+), 63 deletions(-) diff --git a/Banyan/Project.toml b/Banyan/Project.toml index 02ad86b3..ef6505fd 100644 --- a/Banyan/Project.toml +++ b/Banyan/Project.toml @@ -7,7 +7,6 @@ version = "0.4.1" AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" -AWSSQS = "6e80b5ca-5733-51f9-999e-c18680912812" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" @@ -16,7 +15,6 @@ Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" -IniFile = "83e8ac13-25f8-5344-8a64-a9f2b223428f" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LibGit2 = "76f85450-5226-5b5a-8eaa-529ad045b433" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" @@ -31,16 +29,14 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53" [compat] -AWSCore = "0.6" -AWSS3 = "0.7" -AWSSQS = "0.6" +AWS = "1" +AWSS3 = "0.8" Arrow = "2" DataStructures = "0.18" -Downloads = "^1.4" +Downloads = "1.4" FileIO = "1.9.1" FilePathsBase = "^0.9.15" HTTP = "^0.9.17" -IniFile = "0.5" JSON = "0.21" MPI = "^0.19.0" MethodAnalysis = "0.4" diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 640175cf..4e31eeda 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -37,10 +37,12 @@ using Base64, TOML using AWS -using AWS.AWSServices: s3 +AWS.DEFAULT_BACKEND[] = AWS.DownloadsBackend() +s3 = set_features(AWS.AWSServices.s3; use_response_type=true) +using AWS.AWSExceptions using AWS: @service -@service S3 -@service SQS +@service S3 use_response_type = true +@service SQS use_response_type = true using AWSS3 global BANYAN_API_ENDPOINT diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 85a79dc7..07c665a7 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -44,7 +44,7 @@ end global TABLE_FORMATS = ["csv", "parquet", "arrow"] -function get_location_path_with_format(p::String, kwargs...)::LocationPath +function get_location_path_with_format(p::String; kwargs...)::LocationPath if isempty(p) return NO_LOCATION_PATH end @@ -88,7 +88,7 @@ function set_session_sampling_configs(d::Dict{SessionId,Dict{LocationPath,Sampli session_sampling_configs = d end -get_sampling_config(path="", kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) +get_sampling_config(path=""; kwargs...) 
= get_sampling_config(get_location_path_with_format(path; kwargs...)) function get_sampling_configs() global session_sampling_configs session_sampling_configs[_get_session_id_no_error()] @@ -102,8 +102,13 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = get_sample_rate(p::String=""; kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) -parse_sample_rate(object_key) = - parse(Int64, object_key[(findlast("_", object_key).start+1):end]) +function parse_sample_rate(object_key) + lastpos = findlast("_", object_key) + if isnothing(lastpos) + error("Object name \"$object_key\" doesn't contain a sample rate") + end + parse(Int64, object_key[(lastpos.start+1):end]) +end function get_sample_rate(l_path::LocationPath) # Get the desired sample rate desired_sample_rate = get_sampling_config(l_path).rate @@ -182,6 +187,8 @@ struct AWSExceptionInfo end function get_location_source(lp::LocationPath)::Tuple{Location,String,String} + global s3 + # This checks local cache and S3 cache for sample and metadata files. # It then returns a Location object (with a null sample) and the local file names # to read/write the metadata and sample from/to. @@ -246,7 +253,8 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} found_local_samples = Tuple{String,Int64}[] found_local_sample_rate_diffs = Int64[] samples_local_dir = joinpath(homedir(), ".banyan", "samples") - for local_sample_path in readdir(samples_local_dir, join=true) + local_sample_paths = isdir(samples_local_dir) ? readdir(samples_local_dir, join=true) : String[] + for local_sample_path in local_sample_paths if startswith(local_sample_path, sample_path_prefix) local_sample_rate = parse_sample_rate(object_key) diff_sample_rate = abs(local_sample_rate - desired_sample_rate) @@ -330,7 +338,11 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} res_location.sample_invalid = isempty(final_local_sample_path) ( res_location, - metaata_local_path, - isempty(final_local_sample_path) ? final_local_sample_path : "sample_path_prefix$desired_sample_rate" + metadata_local_path, + if !isempty(final_local_sample_path) + final_local_sample_path + else + joinpath(samples_local_dir, "$sample_path_prefix$desired_sample_rate") + end ) end \ No newline at end of file diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 15a7e442..565e45ac 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -11,7 +11,7 @@ Location(name::String, parameters::LocationParameters, sample_memory_usage::Int6 Base.isnothing(l::Location) = isnothing(l.sample) -LocationSource(name::String, parameters::LocationParameters, sample_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = +LocationSource(name::String, parameters::Union{Dict{String,Any},Dict{String,String}}, sample_memory_usage::Int64 = -1, sample::Sample = Sample())::Location = Location(name, "None", parameters, LocationParameters(), sample_memory_usage, sample, false, false) LocationDestination( @@ -350,11 +350,11 @@ function invalidate_location(p; kwargs...) invalidate_metadata(p; kwargs...) invalidate_samples(p; kwargs...) end -function invalidate_all_locations(p; kwargs...) 
+function invalidate_all_locations() for subdir in ["samples", "metadata"] local_dir = joinpath(homedir(), ".banyan", subdir) - if isdir(samples_local_dir) - rm(local_dir; force=true, recrusive=true) + if isdir(local_dir) + rm(local_dir; force=true, recursive=true) end end @@ -374,14 +374,16 @@ function invalidate_all_locations(p; kwargs...) for d in banyan_samples_objects push!(objects_to_delete, Dict("Key" => d["Key"])) end - try - S3.delete_objects( - banyan_samples_bucket_name(), - Dict("Objects" => objects_to_delete) - ) - catch e - if is_debug_on() - show(e) + if !isempty(objects_to_delete) + try + S3.delete_objects( + banyan_samples_bucket_name(), + Dict("Objects" => objects_to_delete) + ) + catch e + if is_debug_on() + show(e) + end end end end diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index 412aef57..8ced21e5 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -2,9 +2,9 @@ # GET QUEUE URL # ################# -scatter_queue_url()::Dict{Symbol,Any} = get_session().scatter_queue_url -gather_queue_url()::Dict{Symbol,Any} = get_session().gather_queue_url -execution_queue_url()::Dict{Symbol,Any} = get_session().execution_queue_url +scatter_queue_url()::String = get_session().scatter_queue_url +gather_queue_url()::String = get_session().gather_queue_url +execution_queue_url()::String = get_session().execution_queue_url ################### # RECEIVE MESSAGE # @@ -112,8 +112,8 @@ end function sqs_send_message(queue_url, message) generated_message_id = generate_message_id() SQS.send_message( - queue_url, message, + queue_url, Dict( "MessageGroupId" => "1", "MessageDeduplicationId" => generated_message_id diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 64a44981..a8d28ce9 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -287,6 +287,10 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n value_id = message["value_id"]::ValueId num_chunks = message["num_chunks"]::Int64 num_remaining_chunks = num_chunks - 1 + + if is_debug_on() + printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") + end whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) @@ -704,6 +708,10 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) value_id = message["value_id"]::ValueId num_chunks = message["num_chunks"]::Int64 num_remaining_chunks = num_chunks - 1 + + if is_debug_on() + printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") + end whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) diff --git a/Banyan/src/sample.jl b/Banyan/src/sample.jl index 7db1bfdc..81a5d6b1 100644 --- a/Banyan/src/sample.jl +++ b/Banyan/src/sample.jl @@ -27,7 +27,7 @@ const NOTHING_SAMPLE = Sample(nothing, Int64(-1)) Base.isnothing(s::Sample) = s.rate == -1 -struct SamplingConfig +mutable struct SamplingConfig rate::Int64 always_exact::Bool max_num_bytes_exact::Int64 diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 2e6e6d39..47d3a8fd 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -13,7 +13,7 @@ function configure_sampling( sc = get_sampling_config(path; kwargs...) nsc = SamplingConfig( - (!isnothing(sample_rate) && !default) ? rate : sc.rate, + (!isnothing(sample_rate) && !default) ? sample_rate : sc.rate, (!isnothing(always_exact) && !default) ? always_exact : sc.always_exact, (!isnothing(max_num_bytes_exact) && !default) ? 
max_num_bytes_exact : sc.max_num_bytes_exact, (!isnothing(force_new_sample_rate) && !default) ? force_new_sample_rate : sc.force_new_sample_rate, diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index d0361657..dd6cd25d 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -549,7 +549,7 @@ function get_session_status(session_id::String=get_session_id(); kwargs...)::Str session_status end -function _wait_for_session(session_id::SessionId=get_session_id(), kwargs...) +function _wait_for_session(session_id::SessionId=get_session_id(); kwargs...) sessions_dict = get_sessions_dict() session_status = get_session_status(session_id; kwargs...) p = ProgressUnknown("Preparing session with ID $session_id", spinner=true) @@ -580,7 +580,7 @@ function _wait_for_session(session_id::SessionId=get_session_id(), kwargs...) end end -function wait_for_session(session_id::SessionId=get_session_id(), kwargs...) +function wait_for_session(session_id::SessionId=get_session_id(); kwargs...) sessions_dict = get_sessions_dict() is_session_ready = if haskey(sessions_dict, session_id) session_info::Session = sessions_dict[session_id] @@ -592,7 +592,7 @@ function wait_for_session(session_id::SessionId=get_session_id(), kwargs...) false end if !is_session_ready - _wait_for_session(session_id, kwargs...) + _wait_for_session(session_id; kwargs...) end end diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 6daafc17..4001469a 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -123,7 +123,7 @@ end get_banyanconfig_path()::String = joinpath(homedir(), ".banyan", "banyanconfig.toml") -configure(; user_id=nothing, api_key=nothing, ec2_key_pair_name=nothing, banyanconfig_path=nothing) = +configure(; user_id=nothing, api_key=nothing, ec2_key_pair_name=nothing, banyanconfig_path=nothing, kwargs...) = configure( isnothing(user_id) ? "" : user_id, isnothing(api_key) ? 
"" : api_key, @@ -200,7 +200,7 @@ end # Getting organization IDs -organization_ids = Dict{String,String} +organization_ids = Dict{String,String}() function get_organization_id() global organization_ids global sessions @@ -209,7 +209,7 @@ function get_organization_id() if haskey(organization_ids, user_id) organization_ids[user_id] elseif haskey(sessions, session_id) - sessions[session_id].organization_ids + sessions[session_id].organization_id else organization_id = send_request_get_response(:describe_users, Dict())["organization_id"] organization_ids[user_id] = organization_id diff --git a/Banyan/test/Project.toml b/Banyan/test/Project.toml index dd440603..61c14273 100644 --- a/Banyan/test/Project.toml +++ b/Banyan/test/Project.toml @@ -10,7 +10,6 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f" -IniFile = "83e8ac13-25f8-5344-8a64-a9f2b223428f" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" Parquet = "626c502c-15b0-58ad-a749-f091afb673ae" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" @@ -23,13 +22,12 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53" [compat] -Arrow = "1.5.0" +Arrow = "2" CSV = "0.9.5" DataFrames = "1" Downloads = "1.4" FileIO = "1.9.1" FilePathsBase = "^0.9.15" -IniFile = "0.5.0" JSON = "0.21.1" Parquet = "0.8.3" ReTest = "0.3.2" diff --git a/BanyanDataFrames/Project.toml b/BanyanDataFrames/Project.toml index 64dcbab1..0de977b7 100644 --- a/BanyanDataFrames/Project.toml +++ b/BanyanDataFrames/Project.toml @@ -21,7 +21,7 @@ Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] -Arrow = "^1.6" +Arrow = "2" Banyan = "0.4.1" BanyanArrays = "0.4.1" DataFrames = "1" diff --git a/BanyanDataFrames/src/df.jl b/BanyanDataFrames/src/df.jl index b51eb4f0..3c604963 100644 --- a/BanyanDataFrames/src/df.jl +++ b/BanyanDataFrames/src/df.jl @@ -50,7 +50,7 @@ Base.propertynames(df::DataFrame) = propertynames(sample(df)::DataFrames.DataFra function read_table(path::String; kwargs...) @nospecialize invalidate(path; kwargs...) - df_loc = RemoteTableSource(path; kwargs...) + df_loc = RemoteTableSource(path) df_loc.src_name == "Remote" || error("$path does not exist") invalidate(path; after=true, kwargs...) 
df_loc_nrows::Int64 = parse(Int64, df_loc.src_parameters["nrows"]) diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index b20813c1..3b772ff8 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -115,27 +115,29 @@ end @test !has_metadata(p2) @test !has_sample(p2) - df2 = read_table(df2) + df2 = read_table(p2) @show get_sample_rate(p2) sample(df2) @show get_sample_rate(p2) - df2 = read_table(df2; samples_invalid=true) + df2 = read_table(p2; samples_invalid=true) sample(df2) configure_sampling(sample_rate=7, for_all_locations=true) - df2 = read_table(df2; metadata_invalid=true) + df2 = read_table(p2; metadata_invalid=true) sample(df2) - @test get_sample_rate() == 5 + @test get_sample_rate(p2) == 5 + @test get_sample_rate() == 7 configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) @test get_sample_rate(p2) == 5 - df2 = read_table(df2) @test get_sample_rate() == 7 - @test get_sample_rate() == 5 - df2 = read_table(df2; location_invalid=true) + df2 = read_table(p2) + @test get_sample_rate(p2) == 7 + @test get_sample_rate() == 7 + df2 = read_table(p2; location_invalid=true) sample(df2) @test has_metadata(p2) @test has_sample(p2) @show get_sample_rate(p2) - configure_sampling(p2; always_exact=tru) + configure_sampling(p2; always_exact=true) sample(df2) end end diff --git a/BanyanDataFrames/test/utils_data.jl b/BanyanDataFrames/test/utils_data.jl index 0f8e0463..4bae4562 100644 --- a/BanyanDataFrames/test/utils_data.jl +++ b/BanyanDataFrames/test/utils_data.jl @@ -318,7 +318,6 @@ end function cleanup_tests(bucket_name=get_cluster_s3_bucket_name()) # Delete all temporary test files that are prepended with "test-tmp__" - @show bucket_name for p in s3_list_keys(Banyan.global_aws_config(), bucket_name) if contains(string(p), "test-tmp_") # s3_path = S3Path("s3://$bucket_name/$p", config = Banyan.global_aws_config()) diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index 3bef15de..ecd529b4 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -1,6 +1,6 @@ function read_hdf5(path; kwargs...) invalidate(path; kwargs...) - A_loc = RemoteHDF5Source(path; kwargs...) + A_loc = RemoteHDF5Source(path) A_loc.src_name == "Remote" || error("$path does not exist") invalidate(path; after=true, kwargs...) A = Future(datatype="Array", source=A_loc) diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 38371070..82142481 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -138,7 +138,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig if is_main # Construct parameters for Location src_params = if curr_metadata_invalid - Dict{String,String}( + Dict{String,Any}( "name" => "Remote", "path_and_subpath" => path_and_subpath, "path" => remotepath, diff --git a/BanyanHDF5/test/hdf5.jl b/BanyanHDF5/test/hdf5.jl index 41b6d969..8080db4a 100644 --- a/BanyanHDF5/test/hdf5.jl +++ b/BanyanHDF5/test/hdf5.jl @@ -6,7 +6,7 @@ src in ["Internet", "S3"] use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do use_data() - set_max_exact_sample_length(128) + configure_sampling(max_num_bytes_exact=0) for _ in 1:2 src_is_s3 = src == "S3" @@ -27,7 +27,72 @@ src in ["Internet", "S3"] @test x_sum_collect == (src_is_s3 ? 
12840 : 32100000)
         end
 
-        set_max_exact_sample_length(2048)
+        configure_sampling(default=true)
     end
 end
 
+
+# TODO: Add tests here modeled after BDF.jl
+
+@testset "Reading and sampling HDF5 in $src with $scheduling_config with max_num_bytes_exact=$max_num_bytes and shuffled=$shuffled" for scheduling_config in [
+    "default scheduling",
+    "parallelism encouraged",
+    "parallelism and batches encouraged",
+],
+src in ["Internet", "S3"],
+max_num_bytes in [0, Banyan.parse_bytes("100 GB")],
+shuffled in [true, false]
+    get_organization_id()
+    use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do
+        invalidate_all_locations()
+        use_data()
+        configure_sampling(max_num_bytes_exact=max_num_bytes, assume_shuffled=shuffled)
+
+        p = if src == "S3"
+            joinpath("s3://", get_cluster_s3_bucket_name(), "fillval.h5/DS1")
+        else
+            joinpath("https://github.com/banyan-team/banyan-julia/raw/v0.1.1/BanyanArrays/test/res", "fillval.h5/DS1")
+        end
+
+        x = read_hdf5(p)
+        sample(x)
+        @show get_sample_rate(x)
+
+        configure_sampling(p; sample_rate=5)
+        x = read_hdf5(p)
+        @test get_sample_rate(p) == 5
+        @test has_metadata(p)
+        @test has_sample(p)
+        invalidate_metadata(p)
+        @test !has_metadata(p)
+        @test has_sample(p)
+        invalidate_location(p)
+        @test !has_metadata(p)
+        @test !has_sample(p)
+
+        x = read_hdf5(p)
+        @show get_sample_rate(p)
+        sample(x)
+        @show get_sample_rate(p)
+        x = read_hdf5(p; samples_invalid=true)
+        sample(x)
+        configure_sampling(sample_rate=7, for_all_locations=true)
+        x = read_hdf5(p; metadata_invalid=true)
+        sample(x)
+        @test get_sample_rate(p) == 5
+        @test get_sample_rate() == 7
+        configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true)
+        @test get_sample_rate(p) == 5
+        @test get_sample_rate() == 7
+        x = read_hdf5(p)
+        @test get_sample_rate(p) == 7
+        @test get_sample_rate() == 7
+        x = read_hdf5(p; location_invalid=true)
+        sample(x)
+        @test has_metadata(p)
+        @test has_sample(p)
+        @show get_sample_rate(p)
+        configure_sampling(p; always_exact=true)
+        sample(x)
+    end
+end
@@ -41,7 +106,7 @@ end
     use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do
         use_data(src)
 
-        set_max_exact_sample_length(128)
+        configure_sampling(max_num_bytes_exact=0)
 
         # Determine where to read from
@@ -169,6 +234,6 @@ end
 #     end
 # end
 
-        set_max_exact_sample_length(2048)
+        configure_sampling(default=true)
     end
 end
diff --git a/BanyanImages/Project.toml b/BanyanImages/Project.toml
index 1fb3dc44..cf7cbabd 100644
--- a/BanyanImages/Project.toml
+++ b/BanyanImages/Project.toml
@@ -17,7 +17,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-Arrow = "^1.6"
+Arrow = "2"
 Banyan = "0.4.1"
 BanyanArrays = "0.4.1"
 FileIO = "1.9.1"
diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl
index e8a9a2f4..bdf0ae26 100644
--- a/BanyanImages/test/jpg.jl
+++ b/BanyanImages/test/jpg.jl
@@ -54,7 +54,7 @@ invalid_bool_to_str(metadata_invalid) = metadata_invalid ? "invalid" : "valid"
     add_channelview in [true, false],
     metadata_invalid in [true, false],
     sample_invalid in [true, false],
-    nimages in [75, 5]
+    nimages in [75, 50]
 
     # TODO: Test exact sample collection and also replicated with batch image computation
     use_session_for_testing(sample_rate = 75) do
         bucket_name = get_cluster_s3_bucket_name()
@@ -79,6 +79,68 @@ invalid_bool_to_str(metadata_invalid) ? "invalid" : "valid"
     end
 end
+@testset "Reading and sampling $nimages JPG images on $loc with $format and add_channelview=$add_channelview, max_num_bytes=$max_num_bytes, shuffled=$shuffled" for
+    (loc, format) in [
+        ("Internet", "generator"),
+        ("S3", "generator"),
+        ("S3", "directory")
+    ],
+    max_num_bytes in [0, Banyan.parse_bytes("100 GB")],
+    shuffled in [true, false],
+    nimages in [1, 50],
+    add_channelview in [true, false]
+    get_organization_id()
+    use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do
+        bucket_name = get_cluster_s3_bucket_name()
+        invalidate_all_locations()
+        configure_sampling(max_num_bytes_exact=max_num_bytes, assume_shuffled=shuffled)
+
+        p = get_test_path(loc, "generator", "jpg", nimages, bucket_name)
+
+        x = read_jpg(p; add_channelview=add_channelview)
+        sample(x)
+        @show get_sample_rate(x)
+
+        # TODO: Ensure that this triggers parallel cluster<->client data transfer
+        configure_sampling(p; sample_rate=20)
+        x = read_jpg(p; add_channelview=add_channelview)
+        @test get_sample_rate(p) == 20
+        @test has_metadata(p)
+        @test has_sample(p)
+        invalidate_metadata(p)
+        @test !has_metadata(p)
+        @test has_sample(p)
+        invalidate_location(p)
+        @test !has_metadata(p)
+        @test !has_sample(p)
+
+        x = read_jpg(p; add_channelview=add_channelview)
+        @show get_sample_rate(p)
+        sample(x)
+        @show get_sample_rate(p)
+        x = read_jpg(p; add_channelview=add_channelview, samples_invalid=true)
+        sample(x)
+        configure_sampling(sample_rate=75, for_all_locations=true)
+        x = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true)
+        sample(x)
+        @test get_sample_rate(p) == 50
+        @test get_sample_rate() == 75
+        configure_sampling(sample_rate=75, force_new_sample_rate=true, for_all_locations=true)
+        @test get_sample_rate(p) == 50
+        @test get_sample_rate() == 75
+        x = read_jpg(p; add_channelview=add_channelview)
+        @test get_sample_rate(p) == 75
+        @test get_sample_rate() == 75
+        x = read_jpg(p; add_channelview=add_channelview, location_invalid=true)
+        sample(x)
+        @test has_metadata(p)
+        @test has_sample(p)
+        @show get_sample_rate(p)
+        configure_sampling(p; always_exact=true)
+        sample(x)
+    end
+end
+
 # @testset "Reading/writing JPG $src through $format" for (src, format) in
 # ]
 #     # TODO: read

From 47a48234127f67c6c9523f8485113d53dadee7d9 Mon Sep 17 00:00:00 2001
From: Caleb Winston
Date: Tue, 9 Aug 2022 19:24:01 -0700
Subject: [PATCH 13/25] Fix send_to_client

---
 Banyan/src/queues.jl   | 17 ++++++++++++++---
 Banyan/src/requests.jl |  6 ++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl
index 8ced21e5..f565277c 100644
--- a/Banyan/src/queues.jl
+++ b/Banyan/src/queues.jl
@@ -124,6 +124,7 @@ end
 function send_to_client(value_id::ValueId, value, worker_memory_used = 0)
     MAX_MESSAGE_LENGTH = 220_000
     message = to_jl_string(value)::String
+    generated_message_id = generate_message_id()
 
     # Break the message down into chunk ranges
     nmessages = 0
@@ -140,7 +141,7 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0)
             message_i += MAX_MESSAGE_LENGTH
             message_length -= MAX_MESSAGE_LENGTH
         end
-        push!(message_ranges, starti:message_i)
+        push!(message_ranges, starti:(message_i-1))
         nmessages += 1
         if is_last_message
             break
@@ -150,6 +151,7 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0)
     # Launch asynchronous threads to send SQS messages
     gather_q_url = gather_queue_url()
     num_chunks = length(message_ranges)
+    @show num_chunks
     if num_chunks > 1
         @sync for i = 
1:message_ranges @async begin @@ -165,8 +167,13 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) SQS.send_message( msg_json, gather_q_url, - Dict("MessageGroupId" => string(i)) + Dict( + "MessageGroupId" => string(i), + "MessageDeduplicationId" => generated_message_id * string(i) + ) ) + @show msg + @show i end end else @@ -179,11 +186,15 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) "chunk_idx" => i, "num_chunks" => num_chunks ) + @show msg msg_json = JSON.json(msg) SQS.send_message( msg_json, gather_q_url, - Dict("MessageGroupId" => string(i)) + Dict( + "MessageGroupId" => string(i), + "MessageDeduplicationId" => generated_message_id * string(i) + ) ) end end diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index a8d28ce9..07d50e60 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -291,6 +291,8 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n if is_debug_on() printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end + + @show num_chunks whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) @@ -299,6 +301,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n @async begin partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) chunk_idx = partial_message["chunk_idx"] + @show chunk_idx partial_messages[chunk_idx] = message["contents"] end end @@ -713,6 +716,8 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end + @show num_chunks + whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) partial_messages[message["chunk_idx"]] = message["contents"] @@ -720,6 +725,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) @async begin partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) chunk_idx = partial_message["chunk_idx"] + @show chunk_idx partial_messages[chunk_idx] = message["contents"] end end From 91245c527292d8ddb35987779f5ed03439007faf Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Wed, 10 Aug 2022 09:53:06 -0700 Subject: [PATCH 14/25] Fix SamplingConfig serialization --- Banyan/src/Banyan.jl | 1 + Banyan/src/location.jl | 4 ++-- Banyan/src/requests.jl | 4 ++-- Banyan/src/session.jl | 6 +++--- Banyan/src/utils.jl | 18 ++++++++++-------- BanyanDataFrames/Project.toml | 1 + BanyanDataFrames/src/BanyanDataFrames.jl | 1 + BanyanDataFrames/src/locations.jl | 5 +++-- BanyanDataFrames/test/sample_collection.jl | 4 ++-- 9 files changed, 25 insertions(+), 19 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 4e31eeda..388c0083 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -100,6 +100,7 @@ export has_separate_metadata, get_sample, get_metadata, get_sample_and_metadata export LocationPath, SamplingConfig export has_metadata, has_sample, get_sample_rate, configure_sampling export type_to_str, str_to_type +export banyan_metadata_bucket_name, banyan_samples_bucket_name, get_metadata_path, get_sample_path_prefix, get_sample_path # Serialization export from_jl_string, to_jl_string diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 07c665a7..2b2f7f61 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -83,9 +83,9 @@ const NO_LOCATION_PATH = LocationPath("", "", "") const 
DEFAULT_SAMPLING_CONFIG = SamplingConfig(1024, false, parse_bytes("32 MB"), false, true) session_sampling_configs = Dict{SessionId,Dict{LocationPath,SamplingConfig}}("" => Dict(NO_LOCATION_PATH => DEFAULT_SAMPLING_CONFIG)) -function set_session_sampling_configs(d::Dict{SessionId,Dict{LocationPath,SamplingConfig}}) +function set_sampling_configs(d::Dict{LocationPath,SamplingConfig}) global session_sampling_configs - session_sampling_configs = d + session_sampling_configs[_get_session_id_no_error()] = d end get_sampling_config(path=""; kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 07d50e60..6da8d9c4 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -713,7 +713,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) num_remaining_chunks = num_chunks - 1 if is_debug_on() - printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") + println("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end @show num_chunks @@ -742,7 +742,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) # call to `send_evaluation` end - error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, contents, error_for_main_stuck, error_for_main_stuck_time) + error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, whole_message_contents, error_for_main_stuck, error_for_main_stuck_time) elseif (message_type == "EVALUATION_END") if message["end"]::Bool == true return stored_message diff --git a/Banyan/src/session.jl b/Banyan/src/session.jl index cbec24ec..9e781299 100644 --- a/Banyan/src/session.jl +++ b/Banyan/src/session.jl @@ -61,13 +61,13 @@ mutable struct Session end function sampling_configs_to_jl(sampling_configs::Dict{LocationPath,SamplingConfig}) - res = Tuple{Tuple{String,String,String},Tuple{Int64,Bool,Int64,Bool}}[] + res = Tuple{Tuple{String,String,String},Tuple{Int64,Bool,Int64,Bool,Bool}}[] for (l::LocationPath, s::SamplingConfig) in sampling_configs push!( res, ( (l.original_path, l.format_name, l.format_version), - (s.rate, s.always_exact, s.max_num_bytes_exact, s.force_new_sample_rate), + (s.rate, s.always_exact, s.max_num_bytes_exact, s.force_new_sample_rate, s.assume_shuffled), ), ) end @@ -77,7 +77,7 @@ end function sampling_configs_from_jl(sampling_configs) res = Dict{LocationPath,SamplingConfig}() for (l, s) in sampling_configs - res[LocationPath(l[1], l[2], l[3])] = SamplingConfig(s[1], s[2], s[3], s[4]) + res[LocationPath(l[1], l[2], l[3])] = SamplingConfig(s[1], s[2], s[3], s[4], s[5]) end res end \ No newline at end of file diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index 4001469a..cec3e57c 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -165,7 +165,7 @@ function configure(user_id, api_key, ec2_key_pair_name, banyanconfig_path) end # Check banyanconfig file - banyan_config_has_info = !(isempty(banyan_config) || isempty(banyan_config)) + banyan_config_has_info = !isnothing(banyan_config) && !isempty(banyan_config) if isempty(user_id) && banyan_config_has_info && haskey(banyan_config, "banyan") && haskey(banyan_config["banyan"], "user_id") user_id = banyan_config["banyan"]["user_id"] end @@ -204,16 +204,18 @@ organization_ids = Dict{String,String}() function get_organization_id() global organization_ids global sessions - user_id = configure()["banyan"]["user_id"] session_id = _get_session_id_no_error() - 
if haskey(organization_ids, user_id) - organization_ids[user_id] - elseif haskey(sessions, session_id) + if haskey(sessions, session_id) sessions[session_id].organization_id else - organization_id = send_request_get_response(:describe_users, Dict())["organization_id"] - organization_ids[user_id] = organization_id - organization_id + user_id = configure()["banyan"]["user_id"] + if haskey(organization_ids, user_id) + organization_ids[user_id] + else + organization_id = send_request_get_response(:describe_users, Dict())["organization_id"] + organization_ids[user_id] = organization_id + organization_id + end end end diff --git a/BanyanDataFrames/Project.toml b/BanyanDataFrames/Project.toml index 0de977b7..963f2ecd 100644 --- a/BanyanDataFrames/Project.toml +++ b/BanyanDataFrames/Project.toml @@ -7,6 +7,7 @@ version = "0.4.1" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Banyan = "706d138b-e922-45b9-a636-baf8ae0d5317" BanyanArrays = "369465de-032e-4609-9dcf-82b89c370a7b" +Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" diff --git a/BanyanDataFrames/src/BanyanDataFrames.jl b/BanyanDataFrames/src/BanyanDataFrames.jl index fae29bce..2c6b1a85 100644 --- a/BanyanDataFrames/src/BanyanDataFrames.jl +++ b/BanyanDataFrames/src/BanyanDataFrames.jl @@ -3,6 +3,7 @@ module BanyanDataFrames using Arrow, Banyan, BanyanArrays, + Base64, DataFrames, Dates, Downloads, diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index c16c9e33..398d4dc9 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -8,6 +8,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: shuffled, max_num_bytes_exact = sampling_config.assume_shuffled, sampling_config.max_num_bytes_exact # TODO: Replace `max_exact_sample_length` with `max_num_bytes_exact` is_main = is_main_worker() + sample_rate = sampling_config.rate # Get cached Location and if it has valid parameters and sample, return curr_metadata_invalid, curr_sample_invalid = loc.metadata_invalid, loc.sample_invalid @@ -21,7 +22,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # Get paths for writing sample and metadata metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" + sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))$sample_rate" # Get metadata if it is still valid curr_meta::Arrow.Table = if !curr_metadata_invalid @@ -367,7 +368,7 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # Write the sample to S3 cache if previously invalid if curr_sample_invalid - write(sample_path, remote_sample.value.data) + write(sample_path, remote_sample.value) end if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 3b772ff8..4a18f3b1 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -96,8 +96,8 @@ end invalidate_all_locations() - p1 = "s3://$(bucket)/iris_large_$format.$format" - p2 = "s3://$(bucket)/iris_large_tmp_$format.$format" + p1 = "s3://$(bucket)/iris_large.$format" + p2 = "s3://$(bucket)/iris_large_tmp.$format" df = read_table(p1; metadata_invalid=true, 
invalidate_samples=true) sample(df) From cbcdd7c8b1c84427c4b6335a5de9d1734c8c9486 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Wed, 10 Aug 2022 22:53:51 -0700 Subject: [PATCH 15/25] Fix bugs --- Banyan/src/Banyan.jl | 2 +- Banyan/src/location.jl | 28 +++++++++++++++++++--- Banyan/src/locations.jl | 6 ++--- Banyan/src/queues.jl | 2 -- Banyan/src/requests.jl | 16 +++++-------- BanyanDataFrames/src/locations.jl | 2 +- BanyanDataFrames/src/pfs.jl | 12 ++++++---- BanyanDataFrames/test/sample_collection.jl | 19 +++++++++++---- BanyanHDF5/src/locations.jl | 2 +- BanyanHDF5/test/hdf5.jl | 2 +- BanyanImages/test/jpg.jl | 2 +- 11 files changed, 62 insertions(+), 31 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 388c0083..1cdba8ed 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -98,7 +98,7 @@ export invalidate_all_locations, invalidate_location, invalidate_metadata, inval export NOTHING_LOCATION, INVALID_LOCATION, NO_LOCATION_PATH export has_separate_metadata, get_sample, get_metadata, get_sample_and_metadata export LocationPath, SamplingConfig -export has_metadata, has_sample, get_sample_rate, configure_sampling +export has_metadata, has_sample, get_sample_rate, configure_sampling, get_sampling_config, get_sampling_configs, set_sampling_configs export type_to_str, str_to_type export banyan_metadata_bucket_name, banyan_samples_bucket_name, get_metadata_path, get_sample_path_prefix, get_sample_path diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 2b2f7f61..1af470c9 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -128,7 +128,8 @@ function get_sample_rate(l_path::LocationPath) banyan_samples_objects = try res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"] res isa Base.Vector ? res : [res] - catch + catch e + @show e return desired_sample_rate end sample_rate = -1 @@ -148,7 +149,10 @@ end # Checking for having metadata, samples +has_metadata(p::String=""; kwargs...) = + has_metadata(get_location_path_with_format(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool + println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path))") try !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) catch @@ -156,6 +160,8 @@ function has_metadata(l_path:: LocationPath)::Bool end end +has_sample(p::String=""; kwargs...) = + has_sample(get_location_path_with_format(p; kwargs...)) function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) pre = sc.force_new_sample_rate ? 
get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) @@ -186,6 +192,22 @@ struct AWSExceptionInfo end end +function get_metadata_local_path() + p = joinpath(homedir(), ".banyan", "metadata") + if !isdir(p) + mkpath(p) + end + p +end + +function get_samples_local_path() + p = joinpath(homedir(), ".banyan", "metadata") + if !isdir(p) + mkpath(p) + end + p +end + function get_location_source(lp::LocationPath)::Tuple{Location,String,String} global s3 @@ -195,7 +217,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # Load in metadata metadata_path = get_metadata_path(lp) - metadata_local_path = joinpath(homedir(), ".banyan", "metadata", metadata_path) + metadata_local_path = joinpath(get_metadata_local_path(), metadata_path) metadata_s3_path = "/$(banyan_metadata_bucket_name())/$metadata_path" src_params_not_stored_locally = false src_params::Dict{String, String} = if isfile(metadata_local_path) @@ -252,7 +274,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # Find local samples found_local_samples = Tuple{String,Int64}[] found_local_sample_rate_diffs = Int64[] - samples_local_dir = joinpath(homedir(), ".banyan", "samples") + samples_local_dir = get_samples_local_path() local_sample_paths = isdir(samples_local_dir) ? readdir(samples_local_dir, join=true) : String[] for local_sample_path in local_sample_paths if startswith(local_sample_path, sample_path_prefix) diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 565e45ac..7e4b8e6e 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -16,7 +16,7 @@ LocationSource(name::String, parameters::Union{Dict{String,Any},Dict{String,Stri LocationDestination( name::String, - parameters::LocationParameters + parameters::Union{Dict{String,Any},Dict{String,String}} )::Location = Location("None", name, LocationParameters(), parameters, -1, Sample(), false, false) function to_jl(lt::Location) @@ -316,8 +316,8 @@ function invalidate_samples(p; kwargs...) # Delete locally samples_local_dir = joinpath(homedir(), ".banyan", "samples") + sample_path_prefix = get_sample_path_prefix(lp) if isdir(samples_local_dir) - sample_path_prefix = get_sample_path_prefix(lp) for local_sample_path in readdir(samples_local_dir, join=true) if startswith(local_sample_path, sample_path_prefix) rm(local_sample_path) @@ -463,7 +463,7 @@ function RemoteSource( # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. 
loc, local_metadata_path, local_sample_path = get_location_source(lp) - sc = get_sampling_config(lp) + sc = deepcopy(get_sampling_config(lp)) sc.rate = parse_sample_rate(local_sample_path) if !loc.metadata_invalid && !loc.sample_invalid diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index f565277c..466ff88c 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -172,7 +172,6 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) "MessageDeduplicationId" => generated_message_id * string(i) ) ) - @show msg @show i end end @@ -186,7 +185,6 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) "chunk_idx" => i, "num_chunks" => num_chunks ) - @show msg msg_json = JSON.json(msg) SQS.send_message( msg_json, diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 6da8d9c4..32d67630 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -289,7 +289,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n num_remaining_chunks = num_chunks - 1 if is_debug_on() - printlng("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") + println("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end @show num_chunks @@ -318,7 +318,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n # call to `send_evaluation` end - error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, contents, error_for_main_stuck, error_for_main_stuck_time) + error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, whole_message_contents, error_for_main_stuck, error_for_main_stuck_time) elseif message_type == "EVALUATION_END" if message["end"]::Bool == true break @@ -700,7 +700,7 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) session = get_session() gather_queue = gather_queue_url() - stored_message = nothing + stored_res = nothing error_for_main_stuck, error_for_main_stuck_time = nothing, nothing partial_gathers = Dict{ValueId,String}() while true @@ -734,18 +734,14 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) message["contents"] end - if haskey(session.futures_on_client, value_id) - value = from_jl_string(whole_message_contents) - f = session.futures_on_client[value_id]::Future - f.value = value - # TODO: Update stale/mutated here to avoid costly - # call to `send_evaluation` + if value_id == "-1" + stored_res = from_jl_string(whole_message_contents) end error_for_main_stuck, error_for_main_stuck_time = check_worker_stuck_error(value_id, whole_message_contents, error_for_main_stuck, error_for_main_stuck_time) elseif (message_type == "EVALUATION_END") if message["end"]::Bool == true - return stored_message + return stored_res end end end diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 398d4dc9..e1a71efb 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -424,6 +424,6 @@ RemoteTableDestination(remotepath)::Location = Dict( "format" => get_file_ending(remotepath), "nrows" => "0", - "path" => remotepath, + "path" => remotepath ), ) \ No newline at end of file diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index f9fe0981..b324c4ba 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -223,7 +223,7 @@ function ReadBlockHelper(@nospecialize(format_value)) nworkers = get_nworkers(comm) npartitions = nbatches * 
nworkers partition_idx = get_partition_idx(batch_idx, nbatches, comm) - nrows::Int64 = meta_nrows + nrows::Int64 = length(meta_nrows) rows_per_partition = cld(nrows, npartitions) sorting_perm = sortperm(meta_nrows, rev=true) files_by_partition = Base.Vector{Int64}[] @@ -335,7 +335,7 @@ function ReadBlockHelper(@nospecialize(format_value)) dfs = Base.Vector{Any}(undef, ndfs) if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND - @show (files_to_read, get_worker_idx()) + @show (filezs_to_read, get_worker_idx()) end # Iterate through files and identify which ones correspond to the range of @@ -484,6 +484,10 @@ function WriteHelper(@nospecialize(format_value)) # SAMPLE/METADATA COLLECTIOM AND STORAGE # ########################################## + # Get sampling configuration + sampling_config = get_sampling_config(lp) + sample_rate = sampling_config.rate + # Get paths for reading in metadata and Location tmp_suffix = nbatches > 1 ? ".tmp" : "" lp_tmp = LocationPath(loc_params_path * tmp_suffix, "arrow", "2") @@ -525,8 +529,6 @@ function WriteHelper(@nospecialize(format_value)) # Gather # of rows, # of bytes, empty sample, and actual sample nbytes = part_res isa Empty ? 0 : Banyan.sample_memory_usage(part_res) - sampling_config = get_sampling_config(lp) - sample_rate = sampling_config.rate sampled_part = (part_res isa Empty || is_disk) ? empty_df : Banyan.get_sample_from_data(part_res, sample_rate, nrows) gathered_data = gather_across((nrows, nbytes, part_res isa Empty ? part_res : empty(part_res), sampled_part), comm) @@ -574,6 +576,8 @@ function WriteHelper(@nospecialize(format_value)) sample_invalid = true end + println("In Write with sample_invalid=$sample_invalid and sample_memory_usage=$sample_memory_usage while sampling_config=$sampling_config, writing to $m_path") + # Get the actual sample by concatenating if !is_disk && !sample_invalid sampled_parts = [gathered[4] for gathered in gathered_data] diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 4a18f3b1..72c0ba7f 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -94,6 +94,9 @@ end bucket = get_cluster_s3_bucket_name() + configure_sampling(max_num_bytes=max_num_bytes, always_shuffled=shuffled) + exact_sample = max_num_bytes > 0 + invalidate_all_locations() p1 = "s3://$(bucket)/iris_large.$format" @@ -104,24 +107,32 @@ end @show get_sample_rate(p1) configure_sampling(p2; sample_rate=5) - write_table(p2, df) + @show get_sampling_configs() + write_table(df, p2) + @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) - @test has_sample(p2) + @test has_sample(p2) == !exact_sample invalidate_metadata(p2) @test !has_metadata(p2) - @test has_sample(p2) - innvalidate_location(p2) + @test has_sample(p2) == !exact_sample + invalidate_location(p2) @test !has_metadata(p2) @test !has_sample(p2) + @show get_sample_rate(p2) df2 = read_table(p2) + @show Banyan.get_location_path_with_format(p2) + @show get_sampling_configs() + @show get_sampling_config(p2) @show get_sample_rate(p2) sample(df2) @show get_sample_rate(p2) df2 = read_table(p2; samples_invalid=true) sample(df2) + @test get_sample_rate(p2) == 5 configure_sampling(sample_rate=7, for_all_locations=true) + @test get_sample_rate(p2) == 5 df2 = read_table(p2; metadata_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 82142481..ee214963 100644 --- a/BanyanHDF5/src/locations.jl 
+++ b/BanyanHDF5/src/locations.jl @@ -186,7 +186,7 @@ function RemoteHDF5Destination(remotepath)::Location "path" => remotepath, "subpath" => datasetpath, "path_and_subpath" => path_and_subpath, - "format" => "hdf5" + "format" => "hdf5", ) ) end diff --git a/BanyanHDF5/test/hdf5.jl b/BanyanHDF5/test/hdf5.jl index 8080db4a..23ac8254 100644 --- a/BanyanHDF5/test/hdf5.jl +++ b/BanyanHDF5/test/hdf5.jl @@ -65,7 +65,7 @@ shuffled in [true, false] invalidate_metadata(p) @test !has_metadata(p) @test has_sample(p) - innvalidate_location(p) + invalidate_location(p) @test !has_metadata(p) @test !has_sample(p) diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl index bdf0ae26..ab982b79 100644 --- a/BanyanImages/test/jpg.jl +++ b/BanyanImages/test/jpg.jl @@ -110,7 +110,7 @@ end invalidate_metadata(p) @test !has_metadata(p) @test has_sample(p) - innvalidate_location(p) + invalidate_location(p) @test !has_metadata(p) @test !has_sample(p) From dfe98d10c2e3f66e59049c09fec3ad73070c9ac1 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Mon, 15 Aug 2022 09:57:19 -0400 Subject: [PATCH 16/25] Add more options for updating cluster --- Banyan/some_file | 1 + Banyan/src/clusters.jl | 17 +++++++++---- Banyan/src/location.jl | 18 +++++++++----- Banyan/src/locations.jl | 29 ++++++++++++++++------ BanyanDataFrames/src/locations.jl | 12 +++++++++ BanyanDataFrames/src/pfs.jl | 26 +++++++++++++++++-- BanyanDataFrames/test/sample_collection.jl | 8 ++++-- 7 files changed, 88 insertions(+), 23 deletions(-) create mode 100644 Banyan/some_file diff --git a/Banyan/some_file b/Banyan/some_file new file mode 100644 index 00000000..3b18e512 --- /dev/null +++ b/Banyan/some_file @@ -0,0 +1 @@ +hello world diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index c96efa47..2e7572c3 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -35,13 +35,14 @@ function create_cluster(; vpc_id = nothing, subnet_id = nothing, nowait=false, + force_create=false, kwargs..., ) # Configure using parameters c = configure(; kwargs...) - clusters = get_clusters(; kwargs...) + clusters = get_clusters(name; kwargs...) if isnothing(name) name = "Cluster " * string(length(clusters) + 1) end @@ -52,11 +53,11 @@ function create_cluster(; # Check if the configuration for this cluster name already exists # If it does, then recreate cluster if haskey(clusters, name) - if clusters[name].status == :terminated + if force_create || clusters[name].status == :terminated @info "Started re-creating cluster named $name" send_request_get_response( :create_cluster, - Dict("cluster_name" => name, "recreate" => true), + Dict("cluster_name" => name, "recreate" => true, "force_create" => true), ) if !nowait wait_for_cluster(name; kwargs...) @@ -139,12 +140,17 @@ function delete_cluster(name::String; kwargs...) ) end -function update_cluster(name::String; nowait=false, kwargs...) +function update_cluster(name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, nowait=false, kwargs...) configure(; kwargs...) 
@info "Updating cluster named $name" send_request_get_response( :update_cluster, - Dict{String, Any}("cluster_name" => name) + Dict{String, Any}( + "cluster_name" => name, + "force_update" => force_update, + "update_linux_packages" => update_linux_packages, + "reinstall_julia" => reinstall_julia + ) ) if !nowait wait_for_cluster(name) @@ -189,6 +195,7 @@ function _get_clusters(cluster_name::String)::Dict{String,Cluster} if !isempty(cluster_name) filters["cluster_name"] = cluster_name end + @show filters response = send_request_get_response(:describe_clusters, Dict{String,Any}("filters"=>filters)) clusters_dict::Dict{String,Cluster} = Dict{String,Cluster}() for (name::String, c::Dict{String,Any}) in response["clusters"]::Dict{String,Any} diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 1af470c9..a5bdf0b2 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -152,8 +152,12 @@ end has_metadata(p::String=""; kwargs...) = has_metadata(get_location_path_with_format(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool - println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path))") + println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path)) and banyan_metadata_bucket_name()=$(banyan_metadata_bucket_name())") try + @show propertynames(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))) + @show keys(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))) + @show S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["KeyCount"] + @show S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) catch false @@ -166,8 +170,10 @@ function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) pre = sc.force_new_sample_rate ? 
get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) try + @show S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre)) !isempty(S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"]) - catch + catch e + @show e false end end @@ -333,11 +339,11 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} banyan_samples_object_sample_rate = -1 for banyan_samples_object in banyan_samples_objects object_key = banyan_samples_object["Key"] - if startswith(object_key, banyan_samples_object_prefix) + if startswith(object_key, sample_path_prefix) object_sample_rate = parse_sample_rate(object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) - curr_sample_rate_diff = abs(object_sample_rate - sample_rate) - if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff + curr_sample_rate_diff = abs(object_sample_rate - banyan_samples_object_sample_rate) + if banyan_samples_object_sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff banyan_samples_object_sample_rate = object_sample_rate end end @@ -346,7 +352,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} sample_path_suffix = "$sample_path_prefix$banyan_samples_object_sample_rate" blob = s3("GET", "/$(banyan_samples_bucket_name())/$sample_path_suffix") final_local_sample_path = joinpath(samples_local_dir, sample_path_suffix) - write(final_local_sample_path, blob) + write(final_local_sample_path, seekstart(blob.io)) end # Construct and return LocationSource diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 7e4b8e6e..628c77e5 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -303,6 +303,7 @@ function invalidate_metadata(p; kwargs...) end # Delete from S3 + println("Deleting get_metadata_path(lp)=$(get_metadata_path(lp))") try S3.delete_object(banyan_samples_bucket_name(), get_metadata_path(lp)) catch e @@ -335,11 +336,17 @@ function invalidate_samples(p; kwargs...) end [] end + @show banyan_samples_objects if !isempty(banyan_samples_objects) objects_to_delete = [] for d in banyan_samples_objects push!(objects_to_delete, Dict("Key" => d["Key"])) end + S3.delete_objects( + banyan_samples_bucket_name(), + Dict("objects" => objects_to_delete) + ) + @show objects_to_delete S3.delete_objects( banyan_samples_bucket_name(), Dict("Objects" => objects_to_delete) @@ -350,6 +357,9 @@ function invalidate_location(p; kwargs...) invalidate_metadata(p; kwargs...) invalidate_samples(p; kwargs...) 
end +function partition(series, partition_size) + (series[i:min(i+(partition_size-1),end)] for i in 1:partition_size:length(series)) +end function invalidate_all_locations() for subdir in ["samples", "metadata"] local_dir = joinpath(homedir(), ".banyan", subdir) @@ -369,20 +379,23 @@ function invalidate_all_locations() end [] end + println("Deleting banyan_samples_objects=$banyan_samples_objects from bucket_name=$bucket_name") if !isempty(banyan_samples_objects) objects_to_delete = [] for d in banyan_samples_objects push!(objects_to_delete, Dict("Key" => d["Key"])) end if !isempty(objects_to_delete) - try - S3.delete_objects( - banyan_samples_bucket_name(), - Dict("Objects" => objects_to_delete) - ) - catch e - if is_debug_on() - show(e) + for objects_to_delete_partition in partition(objects_to_delete, 1000) + try + S3.delete_objects( + bucket_name, + Dict("Objects" => objects_to_delete_partition) + ) + catch e + if is_debug_on() + show(e) + end end end end diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index e1a71efb..a53abb79 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -3,6 +3,12 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) function _remote_table_source(lp::LocationPath, loc::Location, sampling_config::SamplingConfig)::Location + metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") + haskey(s3_res, "Contents") ? s3_res["Contents"] : [] + end + println("In _remote_table_source at start with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") + # Setup for sampling remotepath = lp.path shuffled, max_num_bytes_exact = sampling_config.assume_shuffled, sampling_config.max_num_bytes_exact @@ -338,6 +344,12 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # If a file does not exist, one of the get_metadata/get_sample functions # will error. + metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") + haskey(s3_res, "Contents") ? s3_res["Contents"] : [] + end + println("In _remote_table_source at end with metadata_dir=$metadata_dir and metadata_bucket_dir=$metadata_bucket_dir and metadata_path=$metadata_path and curr_metadata_invalid=$curr_metadata_invalid") + # Get source parameters src_params = Dict( diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index b324c4ba..9ee72d52 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -388,6 +388,12 @@ function WriteHelper(@nospecialize(format_value)) loc_name::String, loc_params::Dict{String,Any}, ) + metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") + haskey(s3_res, "Contents") ? s3_res["Contents"] : [] + end + println("In Write at start with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") + # Get rid of splitting divisions if they were used to split this data into # groups splitting_divisions = Banyan.get_splitting_divisions() @@ -494,8 +500,10 @@ function WriteHelper(@nospecialize(format_value)) # m_path = is_main ? 
get_meta_path() : "" # location_path = is_main ? get_location_path(loc_params_path * tmp_suffix) : "" # m_path, location_path = sync_across((m_path, location_path), comm=comm) - m_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp_tmp))" - s_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp_tmp))$sample_rate" + m_dir = "s3/$(banyan_metadata_bucket_name())" + s_dir = "s3/$(banyan_samples_bucket_name())" + m_path = "$m_dir/$(get_metadata_path(lp_tmp))" + s_path = "$s_dir/$(get_sample_path_prefix(lp_tmp))$sample_rate" # loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params # Read in meta path if it's there @@ -536,6 +544,12 @@ function WriteHelper(@nospecialize(format_value)) # On the main worker, finalize metadata and location info. sample_invalid = false if is_main + metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") + haskey(s3_res, "Contents") ? s3_res["Contents"] : [] + end + println("In Write with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") + # Determine paths and #s of rows for metadata file for worker_i in 1:nworkers push!( @@ -588,9 +602,17 @@ function WriteHelper(@nospecialize(format_value)) end # Determine paths for this batch and gather # of rows + @show m_path + @show readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b/") + @show readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") + bucket_dir = readdir("s3/$(banyan_metadata_bucket_name())") + println("On main in $(banyan_metadata_bucket_name()): $bucket_dir") Arrow.write(m_path, (path=curr_remotepaths, nrows=curr_nrows); compress=:zstd, metadata=curr_src_parameters) end + @show readdir("s3/$(banyan_metadata_bucket_name())") + @show Banyan.S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] + ################################### # Handling Final Batch by Copying # ################################### diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 72c0ba7f..b4688eb0 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -79,7 +79,7 @@ end end -@testset "Reading/writing $(shuffled ? "shuffle " : " ")$format data and sampling it with $scheduling_config and maximum # of bytes for exact sample" for scheduling_config in +@testset "Reading/writing $(shuffled ? 
"shuffle " : " ")$format data and sampling it with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for scheduling_config in [ "default scheduling", "parallelism encouraged", @@ -94,7 +94,7 @@ end bucket = get_cluster_s3_bucket_name() - configure_sampling(max_num_bytes=max_num_bytes, always_shuffled=shuffled) + configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled) exact_sample = max_num_bytes > 0 invalidate_all_locations() @@ -104,6 +104,8 @@ end df = read_table(p1; metadata_invalid=true, invalidate_samples=true) sample(df) + @show max_num_bytes + @show exact_sample @show get_sample_rate(p1) configure_sampling(p2; sample_rate=5) @@ -112,6 +114,8 @@ end @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) + sleep(5) + @test has_metadata(p2) @test has_sample(p2) == !exact_sample invalidate_metadata(p2) @test !has_metadata(p2) From 52be36886b8889be6163be56160aac509390fec3 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 16 Aug 2022 06:53:05 -0400 Subject: [PATCH 17/25] Add Arrow import --- Banyan/src/Banyan.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 1cdba8ed..ea4107ed 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -21,7 +21,8 @@ global NOT_USING_MODULES = String["ProfileView", "SnoopCompileCore"] using FilePathsBase: joinpath, isempty using Base: notnothing, env_project_file -using Base64, +using Arrow, + Base64, DataStructures, Dates, Downloads, From 80e23298d888a522d0310b64c8eca84c218b8789 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 16 Aug 2022 07:03:33 -0400 Subject: [PATCH 18/25] Fix bugs --- Banyan/src/location.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index a5bdf0b2..06d3c016 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -231,7 +231,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} if_modified_since_string = "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" try - d = get_src_params_dict_from_arrow(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string)))) + d = get_src_params_dict_from_arrow(seekstart(s3("GET", metadata_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))).io)) src_params_not_stored_locally = true d catch e @@ -251,7 +251,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} end else try - d = get_src_params_dict_from_arrow(s3("GET", metadata_s3_path)) + d = get_src_params_dict_from_arrow(seekstart(s3("GET", metadata_s3_path).io)) src_params_not_stored_locally = true d catch e @@ -259,7 +259,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} show(e) end if !AWSExceptionInfo(e).not_found - @warn "Assumming metadata isn't copied in the cloud because of following error in attempted access" + @warn "Assuming metadata isn't copied in the cloud because of following error in attempted access" show(e) end Dict{String, String}() @@ -306,7 +306,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix$sample_rate" try blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) - 
write(sample_local_path, blob) # This overwrites the existing file + write(sample_local_path, seekstart(blob.io)) # This overwrites the existing file final_local_sample_path = sample_local_path break catch e From 734f8b7f68412eb3bc32c1288dc5e7ec45633c78 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Tue, 16 Aug 2022 08:44:19 -0400 Subject: [PATCH 19/25] Switch to using S3Path where possible --- Banyan/src/Banyan.jl | 1 + Banyan/src/futures.jl | 1 + Banyan/src/location.jl | 173 +++++++++++---------- Banyan/src/locations.jl | 90 ++++------- Banyan/src/samples.jl | 12 +- Banyan/src/utils.jl | 27 +++- BanyanDataFrames/src/df.jl | 3 +- BanyanDataFrames/src/locations.jl | 9 +- BanyanDataFrames/src/pfs.jl | 39 +++-- BanyanDataFrames/test/sample_collection.jl | 16 +- BanyanHDF5/src/locations.jl | 7 +- BanyanImages/src/locations.jl | 7 +- 12 files changed, 206 insertions(+), 179 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index ea4107ed..8bde1ae3 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -42,6 +42,7 @@ AWS.DEFAULT_BACKEND[] = AWS.DownloadsBackend() s3 = set_features(AWS.AWSServices.s3; use_response_type=true) using AWS.AWSExceptions using AWS: @service +# TODO: Remove @service S3 since we just use AWSS3 and s3 @service S3 use_response_type = true @service SQS use_response_type = true using AWSS3 diff --git a/Banyan/src/futures.jl b/Banyan/src/futures.jl index c1e769a1..0274f081 100644 --- a/Banyan/src/futures.jl +++ b/Banyan/src/futures.jl @@ -19,6 +19,7 @@ function create_new_future(source::Location, mutate_from::Future, datatype::Stri sourced(new_future, source) destined(new_future, None()) + # TODO: Add Size location here if needed # Handle locations that have an associated value source_src_name = source.src_name diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 06d3c016..6ec0f39a 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -65,11 +65,8 @@ end function get_sample_path_prefix(lp::LocationPath) format_name_sep = !isempty(lp.format_name) ? "_" : "" - format_version_sep = !isempty(lp.format_version) ? "_" : "" - lp.path_hash * "_" * lp.format_name * format_name_sep * lp.format_version * format_version_sep + lp.path_hash * "_" * lp.format_name * format_name_sep * lp.format_version end -get_sample_path(lp::LocationPath, sample_rate::Int64) = - get_sample_path_prefix(lp) * string(sample_rate) get_metadata_path(lp::LocationPath) = lp.path_hash banyan_samples_bucket_name() = "banyan-samples-$(get_organization_id())" banyan_metadata_bucket_name() = "banyan-metadata-$(get_organization_id())" @@ -103,45 +100,37 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = get_sample_rate(p::String=""; kwargs...) = get_sample_rate(get_location_path_with_format(p; kwargs...)) function parse_sample_rate(object_key) - lastpos = findlast("_", object_key) - if isnothing(lastpos) - error("Object name \"$object_key\" doesn't contain a sample rate") - end - parse(Int64, object_key[(lastpos.start+1):end]) + parse(Int64, last(splitpath(object_key))) end function get_sample_rate(l_path::LocationPath) + sc = get_sampling_config(l_path) + @show sc + # Get the desired sample rate - desired_sample_rate = get_sampling_config(l_path).rate + desired_sample_rate = sc.rate # If we just want the default sample rate or if a new sample rate is being # forced, then just return that. 
if isempty(l_path.path) return desired_sample_rate end - sc = get_sampling_config(l_path) if sc.force_new_sample_rate return desired_sample_rate end # Find a cached sample with a similar sample rate - pre = get_sample_path_prefix(l_path) - banyan_samples_objects = try - res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"] - res isa Base.Vector ? res : [res] - catch e - @show e - return desired_sample_rate - end + banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") + banyan_samples_object_dir = joinpath(banyan_samples_bucket, get_sample_path_prefix(l_path)) sample_rate = -1 - for banyan_samples_object in banyan_samples_objects - object_key = banyan_samples_object["Key"] - if startswith(object_key, pre) - object_sample_rate = parse_sample_rate(object_key) - object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) - curr_sample_rate_diff = abs(object_sample_rate - sample_rate) - if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff - sample_rate = object_sample_rate - end + @show banyan_samples_object_dir + @show readdir(banyan_samples_bucket) + @show readdir_no_error(banyan_samples_object_dir) + for object_key in readdir_no_error(banyan_samples_object_dir) + object_sample_rate = parse(Int64, object_key) + object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) + curr_sample_rate_diff = abs(sample_rate - desired_sample_rate) + if sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff + sample_rate = object_sample_rate end end sample_rate != -1 ? sample_rate : desired_sample_rate @@ -153,28 +142,32 @@ has_metadata(p::String=""; kwargs...) = has_metadata(get_location_path_with_format(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path)) and banyan_metadata_bucket_name()=$(banyan_metadata_bucket_name())") - try - @show propertynames(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))) - @show keys(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))) - @show S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["KeyCount"] - @show S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] - !isempty(S3.list_objects_v2(banyan_metadata_bucket_name(), Dict("prefix" => get_metadata_path(l_path)))["Contents"]) - catch - false - end + isfile(S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(l_path))")) end has_sample(p::String=""; kwargs...) = has_sample(get_location_path_with_format(p; kwargs...)) function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) - pre = sc.force_new_sample_rate ? 
get_sample_path(l_path, sc.rate) : get_sample_path_prefix(l_path) - try - @show S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre)) - !isempty(S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => pre))["Contents"]) - catch e - @show e - false + banyan_sample_dir = S3Path("s3://$(banyan_samples_bucket_name())/$(get_sample_path_prefix(l_path))") + println("In has_sample") + @show sc + @show sc.force_new_sample_rate + @show joinpath(banyan_sample_dir, string(sc.rate)) + @show isdir_no_error(banyan_sample_dir) + @show isdir_no_error(banyan_sample_dir) && !isempty(readdir(banyan_sample_dir)) + @show readdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/")) + @show isdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2")) + @show isdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2/")) + @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2")) + @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arr/")) + @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_200/")) + @show banyan_sample_dir + @show readdir_no_error(banyan_sample_dir) + if sc.force_new_sample_rate + isfile(joinpath(banyan_sample_dir, string(sc.rate))) + else + !isempty(readdir_no_error(banyan_sample_dir)) end end @@ -266,13 +259,13 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} end end # Store metadata locally - if src_params_not_stored_locally && !isempty(d) + if src_params_not_stored_locally && !isempty(src_params) Arrow.write(metadata_local_path, Arrow.Table(); metadata=src_params) end # Load in sample - sc = get_sampling_config() + sc = get_sampling_config(lp) force_new_sample_rate = sc.force_new_sample_rate desired_sample_rate = sc.rate sample_path_prefix = get_sample_path_prefix(lp) @@ -280,16 +273,16 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # Find local samples found_local_samples = Tuple{String,Int64}[] found_local_sample_rate_diffs = Int64[] - samples_local_dir = get_samples_local_path() - local_sample_paths = isdir(samples_local_dir) ? readdir(samples_local_dir, join=true) : String[] - for local_sample_path in local_sample_paths - if startswith(local_sample_path, sample_path_prefix) - local_sample_rate = parse_sample_rate(object_key) - diff_sample_rate = abs(local_sample_rate - desired_sample_rate) - if !force_new_sample_rate || sample_rate_diff == 0 - push!(found_local_samples, (local_sample_path, local_sample_rate)) - push!(found_local_sample_rate_diffs, diff_sample_rate) - end + sample_local_dir = joinpath(get_samples_local_path(), sample_path_prefix) + mkpath(sample_local_dir) + local_sample_paths = isdir(sample_local_dir) ? 
readdir(sample_local_dir) : String[] + for local_sample_path_suffix in local_sample_paths + local_sample_path = joinpath(sample_local_dir, local_sample_path_suffix) + local_sample_rate = parse(Int64, local_sample_path_suffix) + diff_sample_rate = abs(local_sample_rate - desired_sample_rate) + if !force_new_sample_rate || diff_sample_rate == 0 + push!(found_local_samples, (local_sample_path, local_sample_rate)) + push!(found_local_sample_rate_diffs, diff_sample_rate) end end @@ -297,17 +290,26 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} # rate closest to the desired sample rate) found_local_samples = found_local_samples[sortperm(found_local_sample_rate_diffs)] - # Find a local sample that is up-to-date + # Find a local sample that is up-to-date. NOTE: The data itself might have + # changed in which case the cached samples are out-of-date and we don't + # currently capture that. This doesn't even check if there is a more recent + # sample of a different sample rate (although that is kind of a bug/limitation + # that could be resolved though the best way to resolve it would be by + # comparing to the last modified date for the data itself). It just checks that the remote sample + # hasn't been manually invalidated by the user or a Banyan writing function + # and that there isn't a newer sample for this specific sample rate. final_local_sample_path = "" + final_sample_rate = -1 for (sample_local_path, sample_rate) in found_local_samples lm = Dates.unix2datetime(mtime(sample_local_path)) if_modified_since_string = "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" - sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix$sample_rate" + sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix/$sample_rate" try blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) write(sample_local_path, seekstart(blob.io)) # This overwrites the existing file final_local_sample_path = sample_local_path + final_sample_rate = sample_rate break catch e if is_debug_on() @@ -318,6 +320,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} @warn "Assumming locally stored metadata is invalid because it is not backed up to the cloud" elseif ei.unmodified_since final_local_sample_path = sample_local_path + final_sample_rate = sample_rate break else @warn "Assumming locally stored metadata is invalid because of following error in accessing the metadata copy in the cloud" @@ -327,32 +330,29 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} end # If no such sample is found, search the S3 bucket - banyan_samples_objects = try - res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => sample_path_prefix))["Contents"] - res isa Base.Vector ? 
res : [res] - catch e - if is_debug_on() - show(e) - end - [] - end - banyan_samples_object_sample_rate = -1 - for banyan_samples_object in banyan_samples_objects - object_key = banyan_samples_object["Key"] - if startswith(object_key, sample_path_prefix) - object_sample_rate = parse_sample_rate(object_key) + if isempty(final_local_sample_path) + banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") + final_sample_rate = -1 + banyan_samples_object_dir = joinpath(banyan_samples_bucket, sample_path_prefix) + for object_key in readdir_no_error(banyan_samples_object_dir) + object_sample_rate = parse(Int64, object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) - curr_sample_rate_diff = abs(object_sample_rate - banyan_samples_object_sample_rate) - if banyan_samples_object_sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff - banyan_samples_object_sample_rate = object_sample_rate + curr_sample_rate_diff = abs(final_sample_rate - desired_sample_rate) + if force_new_sample_rate ? (object_sample_rate_diff == 0) : (final_sample_rate == -1 || object_sample_rate_diff < curr_sample_rate_diff) + final_sample_rate = object_sample_rate + final_local_sample_path = joinpath(sample_local_dir, object_key) end end - end - if banyan_samples_object_sample_rate != -1 - sample_path_suffix = "$sample_path_prefix$banyan_samples_object_sample_rate" - blob = s3("GET", "/$(banyan_samples_bucket_name())/$sample_path_suffix") - final_local_sample_path = joinpath(samples_local_dir, sample_path_suffix) - write(final_local_sample_path, seekstart(blob.io)) + if final_sample_rate != -1 + cp( + joinpath( + banyan_samples_bucket, + sample_path_prefix, + string(final_sample_rate) + ), + Path(final_local_sample_path) + ) + end end # Construct and return LocationSource @@ -364,13 +364,14 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} ) res_location.metadata_invalid = isempty(src_params) res_location.sample_invalid = isempty(final_local_sample_path) + @show final_sample_rate + @show final_local_sample_path + final_sample_rate = isempty(final_local_sample_path) ? desired_sample_rate : final_sample_rate + @show desired_sample_rate + @show sample_local_dir ( res_location, metadata_local_path, - if !isempty(final_local_sample_path) - final_local_sample_path - else - joinpath(samples_local_dir, "$sample_path_prefix$desired_sample_rate") - end + joinpath(sample_local_dir, string(final_sample_rate)) ) end \ No newline at end of file diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 628c77e5..d7decd31 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -304,12 +304,9 @@ function invalidate_metadata(p; kwargs...) # Delete from S3 println("Deleting get_metadata_path(lp)=$(get_metadata_path(lp))") - try - S3.delete_object(banyan_samples_bucket_name(), get_metadata_path(lp)) - catch e - if is_debug_on() - show(e) - end + s3p = S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))") + if isfile(s3p) + rm(s3p) end end function invalidate_samples(p; kwargs...) @@ -327,31 +324,17 @@ function invalidate_samples(p; kwargs...) end # Delete from S3 - banyan_samples_objects = try - res = S3.list_objects_v2(banyan_samples_bucket_name(), Dict("prefix" => sample_path_prefix))["Contents"] - res isa Base.Vector ? 
res : [res] - catch e - if is_debug_on() - show(e) - end - [] - end - @show banyan_samples_objects - if !isempty(banyan_samples_objects) - objects_to_delete = [] - for d in banyan_samples_objects - push!(objects_to_delete, Dict("Key" => d["Key"])) - end - S3.delete_objects( - banyan_samples_bucket_name(), - Dict("objects" => objects_to_delete) - ) - @show objects_to_delete - S3.delete_objects( - banyan_samples_bucket_name(), - Dict("Objects" => objects_to_delete) - ) + s3p = S3Path("s3://$(banyan_samples_bucket_name())/$sample_path_prefix") + @show readdir_no_error(s3p) + @show s3p + @show path_as_dir(s3p) + @show readdir(S3Path("s3://$(banyan_samples_bucket_name())")) + if !isempty(readdir_no_error(s3p)) + rm(path_as_dir(s3p), recursive=true) end + @show readdir_no_error(s3p) + @show s3p + @show readdir(S3Path("s3://$(banyan_samples_bucket_name())")) end function invalidate_location(p; kwargs...) invalidate_metadata(p; kwargs...) @@ -370,34 +353,10 @@ function invalidate_all_locations() # Delete from S3 for bucket_name in [banyan_samples_bucket_name(), banyan_metadata_bucket_name()] - banyan_samples_objects = try - res = S3.list_objects_v2(bucket_name)["Contents"] - res isa Base.Vector ? res : [res] - catch e - if is_debug_on() - show(e) - end - [] - end - println("Deleting banyan_samples_objects=$banyan_samples_objects from bucket_name=$bucket_name") - if !isempty(banyan_samples_objects) - objects_to_delete = [] - for d in banyan_samples_objects - push!(objects_to_delete, Dict("Key" => d["Key"])) - end - if !isempty(objects_to_delete) - for objects_to_delete_partition in partition(objects_to_delete, 1000) - try - S3.delete_objects( - bucket_name, - Dict("Objects" => objects_to_delete_partition) - ) - catch e - if is_debug_on() - show(e) - end - end - end + s3p = S3Path("s3://$bucket_name") + if isdir_no_error(s3p) + for p in readdir(s3p, join=true) + rm(p, force=true, recursive=true) end end end @@ -476,24 +435,28 @@ function RemoteSource( # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. loc, local_metadata_path, local_sample_path = get_location_source(lp) - sc = deepcopy(get_sampling_config(lp)) - sc.rate = parse_sample_rate(local_sample_path) + @show lp + @show get_sampling_configs() + @show local_sample_path - if !loc.metadata_invalid && !loc.sample_invalid + res = if !loc.metadata_invalid && !loc.sample_invalid # Case where both sample and parameters are valid loc.sample.value = load_sample(local_sample_path) + loc.sample.rate = parse_sample_rate(local_sample_path) loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid - new_loc = offloaded(_remote_source, lp, loc, sc, args...; distributed=true) + new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) + @show new_loc new_loc.sample.value = load_sample(local_sample_path) new_loc else # Case where sample is invalid # Get the Location with up-to-date metadata (source parameters) and sample - new_loc = offloaded(_remote_source, lp, loc, sc, args...; distributed=true) + new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) + @show new_loc if !loc.metadata_invalid # Store the metadata locally. 
The local copy just has the source @@ -508,4 +471,5 @@ function RemoteSource( new_loc end + res end \ No newline at end of file diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 47d3a8fd..66eb51af 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -11,13 +11,13 @@ function configure_sampling( ) global session_sampling_configs - sc = get_sampling_config(path; kwargs...) + sc = default ? DEFAULT_SAMPLING_CONFIG : get_sampling_config(path; kwargs...) nsc = SamplingConfig( - (!isnothing(sample_rate) && !default) ? sample_rate : sc.rate, - (!isnothing(always_exact) && !default) ? always_exact : sc.always_exact, - (!isnothing(max_num_bytes_exact) && !default) ? max_num_bytes_exact : sc.max_num_bytes_exact, - (!isnothing(force_new_sample_rate) && !default) ? force_new_sample_rate : sc.force_new_sample_rate, - (!isnothing(assume_shuffled) && !default) ? assume_shuffled : sc.assume_shuffled, + (!isnothing(sample_rate)) ? sample_rate : sc.rate, + (!isnothing(always_exact)) ? always_exact : sc.always_exact, + (!isnothing(max_num_bytes_exact)) ? max_num_bytes_exact : sc.max_num_bytes_exact, + (!isnothing(force_new_sample_rate)) ? force_new_sample_rate : sc.force_new_sample_rate, + (!isnothing(assume_shuffled)) ? assume_shuffled : sc.assume_shuffled, ) session_id = _get_session_id_no_error() diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index cec3e57c..bcc38d90 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -640,4 +640,29 @@ size_from_str(s) = res[i] = parse(Int64, sz_str) end Tuple(res) - end \ No newline at end of file + end + +function isdir_no_error(p) + try + isdir(p) + catch e + if is_debug_on() + print("Failed to check isdir because of e=$e") + end + false + end +end +function path_as_dir(p) + p_sep = p.separator + endswith(string(p), p_sep) ? p : (p * p_sep) +end +function readdir_no_error(p) + try + readdir(path_as_dir(p)) + catch e + if is_debug_on() + print("Failed to readdir of p=$p because of e=$e") + end + String[] + end +end \ No newline at end of file diff --git a/BanyanDataFrames/src/df.jl b/BanyanDataFrames/src/df.jl index 3c604963..64654773 100644 --- a/BanyanDataFrames/src/df.jl +++ b/BanyanDataFrames/src/df.jl @@ -55,7 +55,8 @@ function read_table(path::String; kwargs...) invalidate(path; after=true, kwargs...) df_loc_nrows::Int64 = parse(Int64, df_loc.src_parameters["nrows"]) df_nrows = Future(df_loc_nrows) - DataFrame(Future(datatype="DataFrame", source=df_loc), df_nrows) + res = DataFrame(Future(datatype="DataFrame", source=df_loc), df_nrows) + res end # TODO: For writing functions, if a file is specified, enforce Replicated diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index a53abb79..3581365e 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -2,7 +2,8 @@ get_file_ending(remotepath::String)::String = splitext(remotepath)[2][2:end] Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) -function _remote_table_source(lp::LocationPath, loc::Location, sampling_config::SamplingConfig)::Location +function _remote_table_source(lp::LocationPath, loc::Location)::Location + sampling_config = get_sampling_config(lp) metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") haskey(s3_res, "Contents") ? 
s3_res["Contents"] : [] @@ -28,7 +29,11 @@ function _remote_table_source(lp::LocationPath, loc::Location, sampling_config:: # Get paths for writing sample and metadata metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))$sample_rate" + sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" + mkpath(sample_dir) + sample_path = "$sample_dir/$sample_rate" + @show sample_path + @show sample_rate # Get metadata if it is still valid curr_meta::Arrow.Table = if !curr_metadata_invalid diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index 9ee72d52..c1ec11be 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -495,15 +495,16 @@ function WriteHelper(@nospecialize(format_value)) sample_rate = sampling_config.rate # Get paths for reading in metadata and Location - tmp_suffix = nbatches > 1 ? ".tmp" : "" - lp_tmp = LocationPath(loc_params_path * tmp_suffix, "arrow", "2") + lp_tmp = LocationPath(path, "arrow", "2") # m_path = is_main ? get_meta_path() : "" # location_path = is_main ? get_location_path(loc_params_path * tmp_suffix) : "" # m_path, location_path = sync_across((m_path, location_path), comm=comm) m_dir = "s3/$(banyan_metadata_bucket_name())" s_dir = "s3/$(banyan_samples_bucket_name())" m_path = "$m_dir/$(get_metadata_path(lp_tmp))" - s_path = "$s_dir/$(get_sample_path_prefix(lp_tmp))$sample_rate" + s_sample_dir = "$s_dir/$(get_sample_path_prefix(lp_tmp))" + mkpath(s_sample_dir) + s_path = "$s_sample_dir/$sample_rate" # loc_params = loc_name == symbol_Disk ? Dict{String,String}(Arrow.getmetadata(Arrow.Table(m_path))) : loc_params # Read in meta path if it's there @@ -582,7 +583,7 @@ function WriteHelper(@nospecialize(format_value)) curr_src_parameters["nrows"] = string(total_nrows) curr_src_parameters["sample_memory_usage"] = string(sample_memory_usage) - if !is_disk && batch_idx == nbatches && sample_memory_usage <= sampling_config.max_num_bytes_exact + if is_disk || sample_memory_usage <= sampling_config.max_num_bytes_exact # If the total # of rows turns out to be inexact then we can simply mark it as # stale so that it can be collected more efficiently later on # We should be able to quickly recompute a more useful sample later @@ -590,15 +591,25 @@ function WriteHelper(@nospecialize(format_value)) sample_invalid = true end - println("In Write with sample_invalid=$sample_invalid and sample_memory_usage=$sample_memory_usage while sampling_config=$sampling_config, writing to $m_path") + println("In Write with sample_invalid=$sample_invalid (because sample_memory_usage=$sample_memory_usage and sampling_config.max_num_bytes_exact=$(sampling_config.max_num_bytes_exact)) and while sampling_config=$sampling_config, writing to $m_path and $s_path, on batch_idx=$batch_idx with curr_src_parameters=$curr_src_parameters") + + @show get_sampling_configs() + @show lp + @show get_sampling_config(lp) + @show s_path + @show s_sample_dir # Get the actual sample by concatenating - if !is_disk && !sample_invalid + if !sample_invalid sampled_parts = [gathered[4] for gathered in gathered_data] if batch_idx > 1 - push!(sampled_parts, curr_location.sample.value |> seekstart |> Arrow.Table |> DataFrames.DataFrame) + push!(sampled_parts, Arrow.Table(s_path) |> DataFrames.DataFrame) end + println("Writing to s_path=$s_path") Arrow.write(s_path, vcat(sampled_parts...), compress=:zstd) + else + println("Removing s_path=$s_path") + 
rm(s_path, force=true, recursive=true) end # Determine paths for this batch and gather # of rows @@ -613,6 +624,9 @@ function WriteHelper(@nospecialize(format_value)) @show readdir("s3/$(banyan_metadata_bucket_name())") @show Banyan.S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] + println("In Write") + @show readdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/")) + ################################### # Handling Final Batch by Copying # ################################### @@ -620,10 +634,15 @@ function WriteHelper(@nospecialize(format_value)) if nbatches > 1 && batch_idx == nbatches # Copy over location and meta path actual_meta_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - actual_sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))$sample_rate" - if worker_idx == 1 + actual_sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" + actual_sample_path = "$actual_sample_dir/$sample_rate" + if is_main cp(m_path, actual_meta_path, force=true) - cp(s_path, actual_sample_path, force=true) + if !sample_invalid + mkpath(actual_sample_dir) + println("Copying from s_path=$s_path to actual_sample_path=$actual_sample_path") + cp(s_path, actual_sample_path, force=true) + end end # Copy over files to actual location diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index b4688eb0..e9aec975 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -79,14 +79,14 @@ end end -@testset "Reading/writing $(shuffled ? "shuffle " : " ")$format data and sampling it with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for scheduling_config in +@testset "Reading/writing $(shuffled ? 
"shuffled " : "")$format data and sampling it with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for scheduling_config in [ "default scheduling", "parallelism encouraged", "parallelism and batches encouraged", ], format in ["csv", "parquet"], - max_num_bytes in [0, Banyan.parse_bytes("100 GB")], + max_num_bytes in [0, 100_000_000_000], shuffled in [true, false] use_session_for_testing(scheduling_config_name = scheduling_config) do @@ -94,13 +94,14 @@ end bucket = get_cluster_s3_bucket_name() - configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled) + configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled, for_all_locations=true, default=true) exact_sample = max_num_bytes > 0 invalidate_all_locations() p1 = "s3://$(bucket)/iris_large.$format" p2 = "s3://$(bucket)/iris_large_tmp.$format" + println("has_sample(p2)=$(has_sample(p2)) after invalidation") df = read_table(p1; metadata_invalid=true, invalidate_samples=true) sample(df) @@ -109,13 +110,12 @@ end @show get_sample_rate(p1) configure_sampling(p2; sample_rate=5) + println("Before write_table") @show get_sampling_configs() write_table(df, p2) @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) - sleep(5) - @test has_metadata(p2) @test has_sample(p2) == !exact_sample invalidate_metadata(p2) @test !has_metadata(p2) @@ -135,14 +135,18 @@ end df2 = read_table(p2; samples_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 + println("After bad get_sample_rate") configure_sampling(sample_rate=7, for_all_locations=true) @test get_sample_rate(p2) == 5 + println("After bad get_sample_rate") df2 = read_table(p2; metadata_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 @test get_sample_rate() == 7 - configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + configure_sampling(sample_rate=7, for_all_locations=true) @test get_sample_rate(p2) == 5 + configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + @test get_sample_rate(p2) == 7 @test get_sample_rate() == 7 df2 = read_table(p2) @test get_sample_rate(p2) == 7 diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index ee214963..585da17c 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -26,7 +26,8 @@ end HDF5_getindex_retry = retry(HDF5.getindex; delays=Base.ExponentialBackOff(; n=5)) -function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig) +function _remote_hdf5_source(lp::LocationPath, loc::Location) + sc = get_sampling_config(lp) path_and_subpath = lp.path shuffled = sc.assume_shuffled curr_metadata_invalid = loc.metadata_invalid @@ -154,7 +155,9 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location, sc::SamplingConfig # Get paths to store metadata and sample in metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$sample_rate)" + sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" + mkpath(sample_dir) + sample_path = "$sample_dir/$sample_rate" # Store metadata and sample in S3 Arrow.write(metadata_path, Arrow.Table(); metadata=src_params) diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 783b858f..3bd07e40 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -272,7 +272,8 @@ _load_image_and_add_channelview(path_on_worker::String) = 
load_retry(path_on_wor _reshape_image(image) = reshape(image, (1, size(image)...)) -function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfig, remotepath, add_channelview::Bool) +function _remote_image_source(lp::LocationPath, loc::Location, remotepath, add_channelview::Bool) + sc = get_sampling_config(lp) curr_sample_invalid = loc.sample_invalid curr_metadata_invalid = loc.metadata_invalid @@ -296,7 +297,9 @@ function _remote_image_source(lp::LocationPath, loc::Location, sc::SamplingConfi # Get paths to store metadata and sample in metadata_path = "s3/$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))" - sample_path = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp)$(sc.rate))" + sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" + mkpath(sample_dir) + sample_path = "$sample_dir/$(sc.rate)" # Iterable object that iterates over local paths localpaths = curr_metadata_invalid ? getpaths(remotepath) : Arrow.Table(metadata_path).path From 498890a19e455659cf9855da02843c585322ee84 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Wed, 17 Aug 2022 11:41:51 -0400 Subject: [PATCH 20/25] Fix new sampling system for HDF5 --- Banyan/src/Banyan.jl | 2 +- Banyan/src/location.jl | 92 ++++++++++++------- Banyan/src/locations.jl | 30 ++++-- Banyan/src/queues.jl | 33 ++++--- Banyan/src/requests.jl | 30 ++++-- Banyan/src/samples.jl | 2 +- Banyan/src/utils.jl | 10 +- Banyan/src/utils_pfs.jl | 2 +- BanyanDataFrames/src/locations.jl | 4 +- BanyanDataFrames/src/precompile.jl | 5 +- BanyanDataFrames/test/sample_collection.jl | 3 +- BanyanHDF5/Project.toml | 2 + BanyanHDF5/src/BanyanHDF5.jl | 3 +- BanyanHDF5/src/hdf5.jl | 1 - BanyanHDF5/src/locations.jl | 15 +-- BanyanHDF5/test/hdf5.jl | 70 ++++++++------ BanyanImages/Project.toml | 1 + BanyanImages/src/BanyanImages.jl | 2 +- BanyanImages/src/image.jl | 6 +- BanyanImages/src/locations.jl | 14 +-- BanyanImages/test/jpg.jl | 101 +++++++++++++-------- BanyanImages/test/runtests.jl | 2 +- 22 files changed, 270 insertions(+), 160 deletions(-) diff --git a/Banyan/src/Banyan.jl b/Banyan/src/Banyan.jl index 8bde1ae3..da4a6c94 100644 --- a/Banyan/src/Banyan.jl +++ b/Banyan/src/Banyan.jl @@ -50,7 +50,7 @@ using AWSS3 global BANYAN_API_ENDPOINT # Account management -export configure +export configure, get_organization_id # Cluster management export Cluster, diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 6ec0f39a..4b9458d2 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -14,6 +14,8 @@ mutable struct Location sample_invalid::Bool end +LOCATION_PATH_KWARG_NAMES = ["add_channelview"] + struct LocationPath original_path::String path::String @@ -22,47 +24,61 @@ struct LocationPath format_name::String format_version::String - function LocationPath(path, format_name, format_version) + function LocationPath(path::Any, format_name::String, format_version::String; kwargs...) + LocationPath("lang_jl_$(hash(path))", format_name, format_version; kwargs...) + end + function LocationPath(path::String, format_name::String, format_version::String; kwargs...) # This function is responsible for "normalizing" the path. # If there are multiple path strings that are technically equivalent, # this function should map them to the same string. 
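        # Illustrative example (an editorial sketch, not code from this patch; the path,
        # format version, and kwarg value are hypothetical): with
        # LOCATION_PATH_KWARG_NAMES = ["add_channelview"], a call such as
        #     LocationPath("s3://some-bucket/images", "jl", "1.8"; add_channelview=true)
        # is intended to append the recognized kwarg and hash the resulting string
        # "s3://some-bucket/images_add_channelview=true" (as the kwarg-appending loop added
        # just below is meant to do), so equivalent requests map to the same cached
        # metadata/sample entry.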
- path_hash = hash(path) + + # Add the kwargs to the path + path_res = deepcopy(path) + for (kwarg_name, kwarg_value) in kwargs + if kwarg_name in LOCATION_PATH_KWARG_NAMES + path_res *= "_$kwarg_name=$kwarg_value" + end + end + + # Return the LocationPath + path_hash = hash(path_res) new( - path, - path, + path_res, + path_res, path_hash, string(path_hash), format_name, format_version ) end + + LocationPath(p; kwargs...) = LocationPath("lang_jl_$(hash(path))"; kwargs...) + function LocationPath(p::String; kwargs...)::LocationPath + if isempty(p) + return NO_LOCATION_PATH + end + + format_name = get(kwargs, :format, "jl") + is_sample_format_arrow = format_name == "arrow" + if is_sample_format_arrow + return LocationPath(p, "arrow", get(kwargs, :format_version, "2"); kwargs...) + else + for table_format in TABLE_FORMATS + if occursin(table_format, p) || format_name == p + return LocationPath(p, "arrow", "2"; kwargs...) + end + end + end + LocationPath(p, "jl", get_julia_version(); kwargs...) + end - LocationPath(path) = LocationPath(path, "jl", get_julia_version())`` + # TODO: Maybe make end # Functions with `LocationPath`s` global TABLE_FORMATS = ["csv", "parquet", "arrow"] -function get_location_path_with_format(p::String; kwargs...)::LocationPath - if isempty(p) - return NO_LOCATION_PATH - end - - format_name = get(kwargs, :format, "jl") - is_sample_format_arrow = format_name == "arrow" - if is_sample_format_arrow - return LocationPath(p, "arrow", get(kwargs, :format_version, "2")) - else - for table_format in TABLE_FORMATS - if occursin(table_format, p) || format_name == p - return LocationPath(p, "arrow", "2") - end - end - end - LocationPath(p, "jl", get_julia_version()) -end - function get_sample_path_prefix(lp::LocationPath) format_name_sep = !isempty(lp.format_name) ? "_" : "" lp.path_hash * "_" * lp.format_name * format_name_sep * lp.format_version @@ -85,7 +101,7 @@ function set_sampling_configs(d::Dict{LocationPath,SamplingConfig}) session_sampling_configs[_get_session_id_no_error()] = d end -get_sampling_config(path=""; kwargs...) = get_sampling_config(get_location_path_with_format(path; kwargs...)) +get_sampling_config(path=""; kwargs...) = get_sampling_config(LocationPath(path; kwargs...)) function get_sampling_configs() global session_sampling_configs session_sampling_configs[_get_session_id_no_error()] @@ -97,8 +113,8 @@ get_sampling_config(l_path::LocationPath)::SamplingConfig = # Getting sample rate -get_sample_rate(p::String=""; kwargs...) = - get_sample_rate(get_location_path_with_format(p; kwargs...)) +get_sample_rate(p=""; kwargs...) = + get_sample_rate(LocationPath(p; kwargs...)) function parse_sample_rate(object_key) parse(Int64, last(splitpath(object_key))) end @@ -138,15 +154,15 @@ end # Checking for having metadata, samples -has_metadata(p::String=""; kwargs...) = - has_metadata(get_location_path_with_format(p; kwargs...)) +has_metadata(p=""; kwargs...) = + has_metadata(LocationPath(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path)) and banyan_metadata_bucket_name()=$(banyan_metadata_bucket_name())") isfile(S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(l_path))")) end -has_sample(p::String=""; kwargs...) = - has_sample(get_location_path_with_format(p; kwargs...)) +has_sample(p=""; kwargs...) 
= + has_sample(LocationPath(p; kwargs...)) function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) banyan_sample_dir = S3Path("s3://$(banyan_samples_bucket_name())/$(get_sample_path_prefix(l_path))") @@ -200,7 +216,7 @@ function get_metadata_local_path() end function get_samples_local_path() - p = joinpath(homedir(), ".banyan", "metadata") + p = joinpath(homedir(), ".banyan", "samples") if !isdir(p) mkpath(p) end @@ -306,6 +322,8 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix/$sample_rate" try + @show sample_local_path + @show sample_s3_path blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) write(sample_local_path, seekstart(blob.io)) # This overwrites the existing file final_local_sample_path = sample_local_path @@ -330,10 +348,11 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} end # If no such sample is found, search the S3 bucket + banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") + banyan_samples_object_dir = joinpath(banyan_samples_bucket, sample_path_prefix) if isempty(final_local_sample_path) - banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") final_sample_rate = -1 - banyan_samples_object_dir = joinpath(banyan_samples_bucket, sample_path_prefix) + @show readdir_no_error(banyan_samples_object_dir) for object_key in readdir_no_error(banyan_samples_object_dir) object_sample_rate = parse(Int64, object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) @@ -353,6 +372,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} Path(final_local_sample_path) ) end + @show readdir_no_error(banyan_samples_object_dir) end # Construct and return LocationSource @@ -364,11 +384,15 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} ) res_location.metadata_invalid = isempty(src_params) res_location.sample_invalid = isempty(final_local_sample_path) + @show res_location @show final_sample_rate @show final_local_sample_path final_sample_rate = isempty(final_local_sample_path) ? desired_sample_rate : final_sample_rate @show desired_sample_rate @show sample_local_dir + @show readdir(sample_local_dir) + println("At end of get_location_source with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") + ( res_location, metadata_local_path, diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index d7decd31..0a3e3525 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -294,7 +294,7 @@ getsamplenrows(totalnrows::Int64)::Int64 = begin # eventually stored and updated in S3 on each write. function invalidate_metadata(p; kwargs...) - lp = get_location_path_with_format(p; kwargs...) + lp = LocationPath(p; kwargs...) # Delete locally p = joinpath(homedir(), ".banyan", "metadata", get_metadata_path(lp)) @@ -310,7 +310,7 @@ function invalidate_metadata(p; kwargs...) end end function invalidate_samples(p; kwargs...) - lp = get_location_path_with_format(p; kwargs...) + lp = LocationPath(p; kwargs...) 
# Delete locally samples_local_dir = joinpath(homedir(), ".banyan", "samples") @@ -344,11 +344,8 @@ function partition(series, partition_size) (series[i:min(i+(partition_size-1),end)] for i in 1:partition_size:length(series)) end function invalidate_all_locations() - for subdir in ["samples", "metadata"] - local_dir = joinpath(homedir(), ".banyan", subdir) - if isdir(local_dir) - rm(local_dir; force=true, recursive=true) - end + for local_dir in [get_samples_local_path(), get_metadata_local_path()] + rm(local_dir; force=true, recursive=true) end # Delete from S3 @@ -435,9 +432,13 @@ function RemoteSource( # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. loc, local_metadata_path, local_sample_path = get_location_source(lp) + let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") + println("Before get_location_source with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir)) and loc.metadata_invalid=$(loc.metadata_invalid) and loc.sample_invalid=$(loc.sample_invalid)") + end @show lp @show get_sampling_configs() @show local_sample_path + @show loc res = if !loc.metadata_invalid && !loc.sample_invalid # Case where both sample and parameters are valid @@ -446,7 +447,19 @@ function RemoteSource( loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid + let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") + println("Before offloaded with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") + end + let banyan_samples_bucket = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b") + println("Before offloaded with readdir_no_error(banyan_samples_bucket)=$(readdir_no_error(banyan_samples_bucket))") + end new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) + let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") + println("After offloaded with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") + end + let banyan_samples_bucket = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b") + println("After offloaded with readdir_no_error(banyan_samples_bucket)=$(readdir_no_error(banyan_samples_bucket))") + end Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) @show new_loc new_loc.sample.value = load_sample(local_sample_path) @@ -471,5 +484,8 @@ function RemoteSource( new_loc end + let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") + println("At end of RemoteSource with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") + end res end \ No newline at end of file diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index 466ff88c..df6ba120 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -32,6 +32,8 @@ function get_next_message( end end m_dict = m["ReceiveMessageResult"]["Message"] + @show m_dict["MessageId"] + @show m_dict["ReceiptHandle"] if delete SQS.delete_message(queue_url, m_dict["ReceiptHandle"]::String) end @@ -148,24 +150,31 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) end end + for (i, pm) in enumerate(message_ranges) + if i > 1 + println("pm 
== partial_messages[i-1] = $(message[pm] == message[message_ranges[i-1]])") + end + end + # Launch asynchronous threads to send SQS messages gather_q_url = gather_queue_url() num_chunks = length(message_ranges) @show num_chunks if num_chunks > 1 - @sync for i = 1:message_ranges + @sync for i = 1:num_chunks @async begin - msg = Dict{String,Any}( - "kind" => "GATHER", - "value_id" => value_id, - "contents" => message[message_ranges[i]], - "worker_memory_used" => worker_memory_used, - "chunk_idx" => i, - "num_chunks" => num_chunks - ) - msg_json = JSON.json(msg) SQS.send_message( - msg_json, + JSON.json( + Dict{String,Any}( + "kind" => "GATHER", + "value_id" => value_id, + "contents" => message[message_ranges[i]], + "contents_length" => length(message[message_ranges[i]]), + "worker_memory_used" => worker_memory_used, + "chunk_idx" => i, + "num_chunks" => num_chunks + ) + ), gather_q_url, Dict( "MessageGroupId" => string(i), @@ -173,6 +182,8 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) ) ) @show i + @show message_ranges[i] + @show length(message[message_ranges[i]]) end end else diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 32d67630..66720fa9 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -302,9 +302,10 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) chunk_idx = partial_message["chunk_idx"] @show chunk_idx - partial_messages[chunk_idx] = message["contents"] + partial_messages[chunk_idx] = partial_message["contents"] end end + @show length.(partial_messages) join(partial_messages) else message["contents"] @@ -719,14 +720,29 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) @show num_chunks whole_message_contents = if num_chunks > 1 - partial_messages = Vector{String}(undef, num_chunks) + partial_messages = fill("", num_chunks) partial_messages[message["chunk_idx"]] = message["contents"] - @sync for i = 1:num_remaining_chunks + @show message["chunk_idx"] + @sync for _ = 1:num_remaining_chunks @async begin - partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) - chunk_idx = partial_message["chunk_idx"] - @show chunk_idx - partial_messages[chunk_idx] = message["contents"] + let partial_message = sqs_receive_next_message(gather_queue, p, nothing, nothing)[1] + chunk_idx = partial_message["chunk_idx"] + partial_messages[chunk_idx] = partial_message["contents"] + @show chunk_idx + @show length(partial_message["contents"]) + @show partial_message["contents_length"] + @show length(partial_messages[chunk_idx]) + @show last(partial_message["contents"], 20) + @show last(partial_messages[chunk_idx], 20) + @show length.(partial_messages) + end + end + end + # TODO: Fix this so that it gets the partial messages which are different lengths + @show length.(partial_messages) + for (i, pm) in enumerate(partial_messages) + if i > 1 + println("pm == partial_messages[i-1] = $(pm == partial_messages[i-1])") end end join(partial_messages) diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 66eb51af..2ce34517 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -21,7 +21,7 @@ function configure_sampling( ) session_id = _get_session_id_no_error() - lp = get_location_path_with_format(path; kwargs...) + lp = LocationPath(path; kwargs...) 
sampling_configs = session_sampling_configs[session_id] if for_all_locations empty!(sampling_configs) diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index bcc38d90..ac93c999 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -610,6 +610,8 @@ TYPE_TO_STR = STR_TO_TYPE = invert(TYPE_TO_STR) function type_to_str(ty::DataType)::String + @show ty + @show TYPE_TO_STR global TYPE_TO_STR if haskey(TYPE_TO_STR, ty) TYPE_TO_STR[ty] @@ -619,14 +621,16 @@ function type_to_str(ty::DataType)::String end function type_from_str(s::String) + @show s + @show STR_TO_TYPE if startswith(s, "lang_") if startswith(s, "lang_jl_") - from_jl_string(s[4:end]) + from_jl_string(s[9:end]) else error("Cannot parse type $s from non-Julia language") end - elseif haskey(TYPE_TO_STR, s) - TYPE_TO_STR[s] + elseif haskey(STR_TO_TYPE, s) + STR_TO_TYPE[s] else error("Type not supported. You may need to update to the latest version of Banyan or declare the data/sample/metadata you are accessing invalid.") end diff --git a/Banyan/src/utils_pfs.jl b/Banyan/src/utils_pfs.jl index 9def168c..fe0aaabb 100644 --- a/Banyan/src/utils_pfs.jl +++ b/Banyan/src/utils_pfs.jl @@ -510,7 +510,7 @@ function getpath(path::String)::String # disk if it doesn't fit in free memory # TODO: Add option for Internet locations as to whether or not to # cache on disk - hashed_path = get_remotepath_id(path) + hashed_path = LocationPath(path).path_hash joined_path = "efs/job_$(Banyan.get_session().resource_id)_dataset_$(hashed_path)_$(MPI.Comm_rank(MPI.COMM_WORLD))" # @info "Downloading $path to $joined_path" # if MPI.Comm_rank(comm) == 0 diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 3581365e..1300aea7 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -32,8 +32,7 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" mkpath(sample_dir) sample_path = "$sample_dir/$sample_rate" - @show sample_path - @show sample_rate + println("In _remote_table_source at start with readdir_no_error(sample_dir)=$(readdir_no_error(sample_dir))") # Get metadata if it is still valid curr_meta::Arrow.Table = if !curr_metadata_invalid @@ -383,6 +382,7 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location ) end + println("In _remote_table_source with curr_sample_invalid=$curr_sample_invalid for writing to $sample_path and readdir_no_error(sample_dir)=$(readdir_no_error(sample_dir))") # Write the sample to S3 cache if previously invalid if curr_sample_invalid write(sample_path, remote_sample.value) diff --git a/BanyanDataFrames/src/precompile.jl b/BanyanDataFrames/src/precompile.jl index 17ecc87a..30d5d270 100644 --- a/BanyanDataFrames/src/precompile.jl +++ b/BanyanDataFrames/src/precompile.jl @@ -191,7 +191,7 @@ function _precompile_() end # locations.jl - precompile(_remote_table_source, (String, Bool, Bool, Bool, Bool, Bool)) + precompile(_remote_table_source, (LocationPath, Location)) # df.jl precompile(Banyan.orderinghashes, (DataFrames.DataFrame, String)) @@ -298,9 +298,6 @@ function _precompile_() precompile(Arrow.write, (String,)) precompile(Arrow.write, (DataFrames.DataFrame,)) - # locations.jl - precompile(_remote_table_source, (String, Bool, Bool, Bool, Bool, Bool, Int64)) - # TODO: Maybe run code here to precompile # df = Future() diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 
e9aec975..4f8b102a 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -116,6 +116,7 @@ end @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) + # NOTE: We don't compute _exact_ samples on writing @test has_sample(p2) == !exact_sample invalidate_metadata(p2) @test !has_metadata(p2) @@ -126,7 +127,7 @@ end @show get_sample_rate(p2) df2 = read_table(p2) - @show Banyan.get_location_path_with_format(p2) + @show Banyan.LocationPath(p2) @show get_sampling_configs() @show get_sampling_config(p2) @show get_sample_rate(p2) diff --git a/BanyanHDF5/Project.toml b/BanyanHDF5/Project.toml index 4b89c068..425f90e7 100644 --- a/BanyanHDF5/Project.toml +++ b/BanyanHDF5/Project.toml @@ -4,6 +4,7 @@ authors = ["Caleb Winston "] version = "0.2.1" [deps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" Banyan = "706d138b-e922-45b9-a636-baf8ae0d5317" BanyanArrays = "369465de-032e-4609-9dcf-82b89c370a7b" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" @@ -12,6 +13,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [compat] +Arrow = "2" Banyan = "0.4.1" BanyanArrays = "0.4.1" HDF5 = "^0.16" diff --git a/BanyanHDF5/src/BanyanHDF5.jl b/BanyanHDF5/src/BanyanHDF5.jl index 10d13816..a215f710 100644 --- a/BanyanHDF5/src/BanyanHDF5.jl +++ b/BanyanHDF5/src/BanyanHDF5.jl @@ -1,6 +1,7 @@ module BanyanHDF5 -using Banyan, +using Arrow, + Banyan, BanyanArrays, HDF5, MPI, diff --git a/BanyanHDF5/src/hdf5.jl b/BanyanHDF5/src/hdf5.jl index ecd529b4..63446fa1 100644 --- a/BanyanHDF5/src/hdf5.jl +++ b/BanyanHDF5/src/hdf5.jl @@ -4,7 +4,6 @@ function read_hdf5(path; kwargs...) A_loc.src_name == "Remote" || error("$path does not exist") invalidate(path; after=true, kwargs...) 
A = Future(datatype="Array", source=A_loc) - A_loc_eltype, A_loc_size = Banyan.from_jl_string(A_loc.src_parameters["eltype_and_size"]) A_loc_eltype = Banyan.type_from_str(A_loc.src_parameters["eltype"]) A_loc_size = Banyan.size_from_str(A_loc.src_parameters["size"]) A_loc_ndims = length(A_loc_size) diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index 585da17c..fd85b1d4 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -30,8 +30,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location) sc = get_sampling_config(lp) path_and_subpath = lp.path shuffled = sc.assume_shuffled - curr_metadata_invalid = loc.metadata_invalid - curr_sample_invalid = loc.sample_invalid + curr_metadata_invalid, curr_sample_invalid = loc.metadata_invalid, loc.sample_invalid # Get session information sample_rate = sc.rate @@ -144,8 +143,8 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location) "path_and_subpath" => path_and_subpath, "path" => remotepath, "subpath" => datasetpath, - "eltype" => Banyan.size_to_str(dataszie), - "size" => Banyan.type_to_str(dataeltype), + "eltype" => Banyan.type_to_str(dataeltype), + "size" => Banyan.size_to_str(datasize), "sample_memory_usage" => string(nbytes), "format" => "hdf5" ) @@ -160,8 +159,12 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location) sample_path = "$sample_dir/$sample_rate" # Store metadata and sample in S3 - Arrow.write(metadata_path, Arrow.Table(); metadata=src_params) - serialize(sample_path, dset_sample) + if curr_metadata_invalid + Arrow.write(metadata_path, Arrow.Table(); metadata=src_params) + end + if curr_sample_invalid + serialize(sample_path, dset_sample) + end # Return Location to client side LocationSource("Remote", src_params, nbytes, dset_sample) diff --git a/BanyanHDF5/test/hdf5.jl b/BanyanHDF5/test/hdf5.jl index 23ac8254..0cd0c566 100644 --- a/BanyanHDF5/test/hdf5.jl +++ b/BanyanHDF5/test/hdf5.jl @@ -33,19 +33,23 @@ end # TODO: Add tests here modeled after BDF.jl -@testset "Reading and sampling HDF5 in $src with $scheduling_config with max_num_bytes_exact=$max_num_bytes and shuffled=$shuffled" for scheduling_config in [ - "default scheduling", - "parallelism encouraged", - "parallelism and batches encouraged", -], -src in ["Internet", "S3"], -max_num_bytes in [0, Banyan.parse_bytes("100 GB")], -shuffled in [true, false] +@testset "Reading $(shuffled ? 
"shuffled " : "")$src data and sampling it with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for + scheduling_config in [ + "default scheduling", + "parallelism encouraged", + "parallelism and batches encouraged", + ], + src in ["Internet", "S3"], + max_num_bytes in [0, 100_000_000_000], + shuffled in [true, false] + get_organization_id() - use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do - invalidate_all_locations() + use_session_for_testing(scheduling_config_name = scheduling_config) do use_data() - configure_sampling(max_num_bytes_exact=max_num_bytes, assume_shuffled=shuffled) + configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled, for_all_locations=true, default=true) + exact_sample = max_num_bytes > 0 + + invalidate_all_locations() p = if src == "S3" joinpath("s3://", get_cluster_s3_bucket_name(), "fillval.h5/DS1") @@ -53,12 +57,16 @@ shuffled in [true, false] joinpath("https://github.com/banyan-team/banyan-julia/raw/v0.1.1/BanyanArrays/test/res", "fillval.h5/DS1") end - x = read_hdf5(p) - sample(x) - @show get_sample_rate(x) + df = read_hdf5(p; metadata_invalid=true, invalidate_samples=true) + sample(df) + @show max_num_bytes + @show exact_sample + @show get_sample_rate(p) configure_sampling(p; sample_rate=5) - x = read_hdf5(p) + @show get_sampling_configs() + read_hdf5(p) + @show get_sampling_configs() @test get_sample_rate(p) == 5 @test has_metadata(p) @test has_sample(p) @@ -69,30 +77,40 @@ shuffled in [true, false] @test !has_metadata(p) @test !has_sample(p) - x = read_hdf5(p) @show get_sample_rate(p) - sample(x) + df2 = read_hdf5(p) + @show Banyan.LocationPath(p) + @show get_sampling_configs() + @show get_sampling_config(p) @show get_sample_rate(p) - x = read_hdf5(p; samples_invalid=true) - sample(x) + sample(df2) + @show get_sample_rate(p) + df2 = read_hdf5(p; samples_invalid=true) + sample(df2) + @test get_sample_rate(p) == 5 configure_sampling(sample_rate=7, for_all_locations=true) - x = read_hdf5(p; metadata_invalid=true) - sample(x) @test get_sample_rate(p) == 5 + df2 = read_hdf5(p; metadata_invalid=true) + sample(df2) + @test get_sample_rate(p) == 5 + println("Bad get_sample_rate") @test get_sample_rate() == 7 - configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + configure_sampling(sample_rate=7, for_all_locations=true) @test get_sample_rate(p) == 5 + println("Bad get_sample_rate") + configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) + @test get_sample_rate(p) == 7 @test get_sample_rate() == 7 - x = read_hdf5(p) + df2 = read_hdf5(p) @test get_sample_rate(p) == 7 @test get_sample_rate() == 7 - x = read_hdf5(p; location_invalid=true) - sample(x) + df2 = read_hdf5(p; location_invalid=true) + sample(df2) @test has_metadata(p) @test has_sample(p) @show get_sample_rate(p) configure_sampling(p; always_exact=true) - sample(x) + sample(df2) end end diff --git a/BanyanImages/Project.toml b/BanyanImages/Project.toml index cf7cbabd..87afc272 100644 --- a/BanyanImages/Project.toml +++ b/BanyanImages/Project.toml @@ -14,6 +14,7 @@ ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] diff --git a/BanyanImages/src/BanyanImages.jl b/BanyanImages/src/BanyanImages.jl 
index 3d08dc2e..4bebbe4d 100644 --- a/BanyanImages/src/BanyanImages.jl +++ b/BanyanImages/src/BanyanImages.jl @@ -2,7 +2,7 @@ module BanyanImages using Banyan, BanyanArrays -using Arrow, FileIO, ImageCore, ImageIO, MPI, ProgressMeter, Random, Tables +using Arrow, FileIO, ImageCore, ImageIO, MPI, ProgressMeter, Random, Serialization, Tables export read_png, # write_png, read_jpg #, write_jpg diff --git a/BanyanImages/src/image.jl b/BanyanImages/src/image.jl index b413a9ce..b309ea3b 100644 --- a/BanyanImages/src/image.jl +++ b/BanyanImages/src/image.jl @@ -1,11 +1,11 @@ -function read_png(path; add_channelview=false) +function read_png(path; add_channelview=false, kwargs...) invalidate(path; kwargs...) image_loc = RemoteImageSource(path, add_channelview) image_loc.src_name == "Remote" || error("$path does not exist") invalidate(path; after=true, kwargs...) image = Future(;source=image_loc, datatype="Array") - image_loc_eltype = type_from_str(image_loc.src_parameters["eltype"]) - image_loc_size = size_from_str(image_loc.src_parameters["size"]) + image_loc_eltype = Banyan.type_from_str(image_loc.src_parameters["eltype"]) + image_loc_size = Banyan.size_from_str(image_loc.src_parameters["size"]) image_loc_ndims = length(image_loc_size) BanyanArrays.Array{image_loc_eltype,image_loc_ndims}(image, Future(image_loc_size)) end diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index 3bd07e40..ac6a180c 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -369,18 +369,18 @@ function _remote_image_source(lp::LocationPath, loc::Location, remotepath, add_c "name" => "Remote", "nimages" => string(nimages), "sample_memory_usage" => string(nbytes_res), # NOTE: We assume all files have same size - "size" => size_to_str(datasize_res), - "eltype" => type_to_str(dataeltype_res), + "size" => Banyan.size_to_str(datasize_res), + "eltype" => Banyan.type_to_str(dataeltype_res), "add_channelview" => add_channelview ? "1" : "0", "format" => "image" ) else - curr_location.src_parameters + Banyan.get_src_params_dict_from_arrow(metadata_path) end # Store metadata and sample in S3 if curr_metadata_invalid - Arrow.write(metadata_path, (path=localpaths,); metadata=src_params) + Arrow.write(metadata_path, (path=localpaths,); metadata=src_parameters) end if curr_sample_invalid serialize(sample_path, remote_sample) @@ -395,11 +395,7 @@ end RemoteImageSource(remotepath, add_channelview)::Location = RemoteSource( - LocationPath( - remotepath isa String ? remotepath : "lang_jl_$(hash(remotepath))", - add_channelview ? "jl_channelview" : "jl", - Banyan.get_julia_version() - ), + LocationPath(remotepath; add_channelview=add_channelview), _remote_image_source, deserialize, identity, diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl index ab982b79..b19bdba5 100644 --- a/BanyanImages/test/jpg.jl +++ b/BanyanImages/test/jpg.jl @@ -79,65 +79,86 @@ invalid_bool_to_str(metadata_invalid) = metadata_invalid ? "invalid" : "valid" end end -@testset "Reading and sampling $nimage JPG images on $loc with $format and add_channelview=$add_channelview, max_num_bytes=$max_num_bytes, shuffled=$shuffled" for +@testset "Reading and sampling $nimages $(shuffled ? 
"shuffled" : "") JPG images on $loc with $format and add_channelview=$add_channelview with $scheduling_config and a maximum of $max_num_bytes bytes for exact sample" for + scheduling_config in [ + "default scheduling", + "parallelism encouraged", + "parallelism and batches encouraged", + ], (loc, format) in [ ("Internet", "generator"), ("S3", "generator"), ("S3", "directory") ], - max_num_bytes in [0, Banyan.parse_bytes("100 GB")], + max_num_bytes in [0, 100_000_000_000], shuffled in [true, false], nimages in [1, 50], add_channelview in [true, false] + get_organization_id() use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do - bucket_name = get_cluster_s3_bucket_name() + configure_sampling(max_num_bytes_exact=max_num_bytes, always_shuffled=shuffled, for_all_locations=true, default=true) + exact_sample = max_num_bytes > 0 + invalidate_all_locations() - configure_sampling(max_num_bytes_exact=max_num_bytes, assume_shuffled=shuffled) + bucket_name = get_cluster_s3_bucket_name() p = get_test_path(loc, "generator", "jpg", nimages, bucket_name) - x = read_jpg(p; add_channelview=add_channelview) - sample(x) - @show get_sample_rate(x) - - # TODO: Ensure that this triggers parallel cluster<->client data transfer - configure_sampling(p; sample_rate=20) - x = read_jpg(p; add_channelview=add_channelview) - @test get_sample_rate(p) == 20 - @test has_metadata(p) - @test has_sample(p) - invalidate_metadata(p) - @test !has_metadata(p) - @test has_sample(p) - invalidate_location(p) - @test !has_metadata(p) - @test !has_sample(p) - - x = read_jpg(p; add_channelview=add_channelview) - @show get_sample_rate(p) - sample(x) - @show get_sample_rate(p) - x = read_jpg(p; add_channelview=add_channelview, samples_invalid=true) - sample(x) + df = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true, invalidate_samples=true) + sample(df) + @show max_num_bytes + @show exact_sample + @show get_sample_rate(p; add_channelview=add_channelview) + + configure_sampling(p; sample_rate=50) + @show get_sampling_configs() + read_jpg(p; add_channelview=add_channelview) + @show get_sampling_configs() + @test get_sample_rate(p; add_channelview=add_channelview) == 50 + @test has_metadata(p; add_channelview=add_channelview) + @test has_sample(p; add_channelview=add_channelview) + invalidate_metadata(p; add_channelview=add_channelview) + @test !has_metadata(p; add_channelview=add_channelview) + @test has_sample(p; add_channelview=add_channelview) + invalidate_location(p; add_channelview=add_channelview) + @test !has_metadata(p; add_channelview=add_channelview) + @test !has_sample(p; add_channelview=add_channelview) + + @show get_sample_rate(p; add_channelview=add_channelview) + df2 = read_jpg(p; add_channelview=add_channelview) + @show Banyan.LocationPath(p; add_channelview=add_channelview) + @show get_sampling_configs() + @show get_sampling_config(p; add_channelview=add_channelview) + @show get_sample_rate(p; add_channelview=add_channelview) + sample(df2) + @show get_sample_rate(p; add_channelview=add_channelview) + df2 = read_jpg(p; add_channelview=add_channelview, samples_invalid=true) + sample(df2) + @test get_sample_rate(p; add_channelview=add_channelview) == 50 configure_sampling(sample_rate=75, for_all_locations=true) - x = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true) - sample(x) - @test get_sample_rate(p) == 50 + @test get_sample_rate(p; add_channelview=add_channelview) == 50 + df2 = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true) + 
sample(df2) + @test get_sample_rate(p; add_channelview=add_channelview) == 50 + println("Bad get_sample_rate") @test get_sample_rate() == 75 + configure_sampling(sample_rate=75, for_all_locations=true) + @test get_sample_rate(p; add_channelview=add_channelview) == 50 + println("Bad get_sample_rate") configure_sampling(sample_rate=75, force_new_sample_rate=true, for_all_locations=true) - @test get_sample_rate(p) == 50 + @test get_sample_rate(p; add_channelview=add_channelview) == 75 @test get_sample_rate() == 75 - x = read_jpg(p; add_channelview=add_channelview) - @test get_sample_rate(p) == 75 + df2 = read_jpg(p; add_channelview=add_channelview) + @test get_sample_rate(p; add_channelview=add_channelview) == 75 @test get_sample_rate() == 75 - x = read_jpg(p; add_channelview=add_channelview, location_invalid=true) - sample(x) - @test has_metadata(p) - @test has_sample(p) - @show get_sample_rate(p) - configure_sampling(p; always_exact=true) - sample(x) + df2 = read_jpg(p; add_channelview=add_channelview, location_invalid=true) + sample(df2) + @test has_metadata(p; add_channelview=add_channelview) + @test has_sample(p; add_channelview=add_channelview) + @show get_sample_rate(p; add_channelview=add_channelview) + configure_sampling(p; add_channelview=add_channelview, always_exact=true) + sample(df2) end end diff --git a/BanyanImages/test/runtests.jl b/BanyanImages/test/runtests.jl index 695a4f22..58b1eb6b 100644 --- a/BanyanImages/test/runtests.jl +++ b/BanyanImages/test/runtests.jl @@ -9,7 +9,7 @@ MPI.Init() # Create a dummy test session for unit tests test_session_id = "test_session_id" test_resource_id = "test_resource_id" -Banyan.sessions[test_session_id] = Session(ENV["BANYAN_CLUSTER_NAME"], test_session_id, test_resource_id, 2, 2) +Banyan.sessions[test_session_id] = Session(ENV["BANYAN_CLUSTER_NAME"], test_session_id, test_resource_id, 2) global sessions_for_testing = Dict() From c627846d2530c533f6ccdc103b482f3dd51f9400 Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Thu, 18 Aug 2022 12:32:05 -0400 Subject: [PATCH 21/25] Fix new sampling system for BanyanImages --- BanyanHDF5/src/locations.jl | 2 +- BanyanImages/src/locations.jl | 10 +++++++--- BanyanImages/test/jpg.jl | 10 ++++++---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/BanyanHDF5/src/locations.jl b/BanyanHDF5/src/locations.jl index fd85b1d4..a725b6b6 100644 --- a/BanyanHDF5/src/locations.jl +++ b/BanyanHDF5/src/locations.jl @@ -90,7 +90,7 @@ function _remote_hdf5_source(lp::LocationPath, loc::Location) # aggregate and concatenate it on the main worker rand_indices_range = split_len(datalength, worker_idx, nworkers) rand_indices = sample_from_range(rand_indices_range, sample_rate) - exact_sample_needed = nbytes < sc.max_num_bytes_exact + exact_sample_needed = nbytes < sc.max_num_bytes_exact || sc.always_exact remaining_colons = Base.fill(Colon(), datandims-1) dset_sample_value = if !exact_sample_needed samples_on_workers = gather_across( diff --git a/BanyanImages/src/locations.jl b/BanyanImages/src/locations.jl index ac6a180c..463fcae2 100644 --- a/BanyanImages/src/locations.jl +++ b/BanyanImages/src/locations.jl @@ -308,9 +308,13 @@ function _remote_image_source(lp::LocationPath, loc::Location, remotepath, add_c # Read in images on each worker. We need to read in at least one image # regardless of whether we want to get the sample or the metadata _load_img = add_channelview ? _load_image_and_add_channelview : _load_image - first_img = is_main ? 
(localpaths[1] |> _load_img |> _reshape_image) : nothing - exact_sample_needed = is_main ? ((sample_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false - exact_sample_needed = sync_across(exact_sample_needed) + first_img = is_main ? (localpaths[1] |> getpath |> _load_img |> _reshape_image) : nothing + exact_sample_needed = if sc.always_exact + true + else + esn = is_main ? ((sample_memory_usage(first_img) * length(localpaths)) < sc.max_num_bytes_exact) : false + sync_across(esn) + end need_to_parallelize = nimages >= 10 total_num_images_to_read_in = if curr_sample_invalid exact_sample_needed ? nimages : cld(nimages, sc.rate) diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl index b19bdba5..32861306 100644 --- a/BanyanImages/test/jpg.jl +++ b/BanyanImages/test/jpg.jl @@ -90,10 +90,12 @@ end ("S3", "generator"), ("S3", "directory") ], - max_num_bytes in [0, 100_000_000_000], - shuffled in [true, false], - nimages in [1, 50], - add_channelview in [true, false] + (max_num_bytes, nimages, add_channelview) in [ + (0, 1, false), + (0, 50, true), + (100_000_000_000, 1, true) + ], + shuffled in [true, false] get_organization_id() use_session_for_testing(scheduling_config_name = scheduling_config, sample_rate = 20) do From 2ae6477b0fae63f51eccd30cc7383cfdf99ae26e Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Thu, 18 Aug 2022 14:09:58 -0400 Subject: [PATCH 22/25] Remove print statements --- Banyan/src/clusters.jl | 1 - Banyan/src/location.jl | 31 ------------------ Banyan/src/locations.jl | 32 ------------------ Banyan/src/queues.jl | 12 ------- Banyan/src/requests.jl | 21 ------------ Banyan/src/utils.jl | 4 --- BanyanDataFrames/src/locations.jl | 15 --------- BanyanDataFrames/src/pfs.jl | 38 ++-------------------- BanyanDataFrames/test/sample_collection.jl | 16 --------- BanyanHDF5/src/pfs.jl | 4 --- BanyanHDF5/test/hdf5.jl | 14 -------- BanyanImages/test/jpg.jl | 31 +++++------------- BanyanImages/test/runtests.jl | 1 + 13 files changed, 11 insertions(+), 209 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 2e7572c3..eabb1215 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -195,7 +195,6 @@ function _get_clusters(cluster_name::String)::Dict{String,Cluster} if !isempty(cluster_name) filters["cluster_name"] = cluster_name end - @show filters response = send_request_get_response(:describe_clusters, Dict{String,Any}("filters"=>filters)) clusters_dict::Dict{String,Cluster} = Dict{String,Cluster}() for (name::String, c::Dict{String,Any}) in response["clusters"]::Dict{String,Any} diff --git a/Banyan/src/location.jl b/Banyan/src/location.jl index 4b9458d2..b713b55c 100644 --- a/Banyan/src/location.jl +++ b/Banyan/src/location.jl @@ -120,7 +120,6 @@ function parse_sample_rate(object_key) end function get_sample_rate(l_path::LocationPath) sc = get_sampling_config(l_path) - @show sc # Get the desired sample rate desired_sample_rate = sc.rate @@ -138,9 +137,6 @@ function get_sample_rate(l_path::LocationPath) banyan_samples_bucket = S3Path("s3://$(banyan_samples_bucket_name())") banyan_samples_object_dir = joinpath(banyan_samples_bucket, get_sample_path_prefix(l_path)) sample_rate = -1 - @show banyan_samples_object_dir - @show readdir(banyan_samples_bucket) - @show readdir_no_error(banyan_samples_object_dir) for object_key in readdir_no_error(banyan_samples_object_dir) object_sample_rate = parse(Int64, object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) @@ -157,7 +153,6 @@ end 
has_metadata(p=""; kwargs...) = has_metadata(LocationPath(p; kwargs...)) function has_metadata(l_path:: LocationPath)::Bool - println("In has_metadata, checking get_metadata_path(l_path)=$(get_metadata_path(l_path)) and banyan_metadata_bucket_name()=$(banyan_metadata_bucket_name())") isfile(S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(l_path))")) end @@ -166,20 +161,6 @@ has_sample(p=""; kwargs...) = function has_sample(l_path:: LocationPath)::Bool sc = get_sampling_config(l_path) banyan_sample_dir = S3Path("s3://$(banyan_samples_bucket_name())/$(get_sample_path_prefix(l_path))") - println("In has_sample") - @show sc - @show sc.force_new_sample_rate - @show joinpath(banyan_sample_dir, string(sc.rate)) - @show isdir_no_error(banyan_sample_dir) - @show isdir_no_error(banyan_sample_dir) && !isempty(readdir(banyan_sample_dir)) - @show readdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/")) - @show isdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2")) - @show isdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2/")) - @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_2")) - @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arr/")) - @show isdir_no_error(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/17268367127015750092_arrow_200/")) - @show banyan_sample_dir - @show readdir_no_error(banyan_sample_dir) if sc.force_new_sample_rate isfile(joinpath(banyan_sample_dir, string(sc.rate))) else @@ -322,8 +303,6 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} "$(dayabbr(lm)), $(twodigit(day(lm))) $(monthabbr(lm)) $(year(lm)) $(twodigit(hour(lm))):$(twodigit(minute(lm))):$(twodigit(second(lm))) GMT" sample_s3_path = "/$(banyan_samples_bucket_name())/$sample_path_prefix/$sample_rate" try - @show sample_local_path - @show sample_s3_path blob = s3("GET", sample_s3_path, Dict("headers" => Dict("If-Modified-Since" => if_modified_since_string))) write(sample_local_path, seekstart(blob.io)) # This overwrites the existing file final_local_sample_path = sample_local_path @@ -352,7 +331,6 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} banyan_samples_object_dir = joinpath(banyan_samples_bucket, sample_path_prefix) if isempty(final_local_sample_path) final_sample_rate = -1 - @show readdir_no_error(banyan_samples_object_dir) for object_key in readdir_no_error(banyan_samples_object_dir) object_sample_rate = parse(Int64, object_key) object_sample_rate_diff = abs(object_sample_rate - desired_sample_rate) @@ -372,7 +350,6 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} Path(final_local_sample_path) ) end - @show readdir_no_error(banyan_samples_object_dir) end # Construct and return LocationSource @@ -384,15 +361,7 @@ function get_location_source(lp::LocationPath)::Tuple{Location,String,String} ) res_location.metadata_invalid = isempty(src_params) res_location.sample_invalid = isempty(final_local_sample_path) - @show res_location - @show final_sample_rate - @show final_local_sample_path final_sample_rate = isempty(final_local_sample_path) ? 
desired_sample_rate : final_sample_rate - @show desired_sample_rate - @show sample_local_dir - @show readdir(sample_local_dir) - println("At end of get_location_source with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") - ( res_location, metadata_local_path, diff --git a/Banyan/src/locations.jl b/Banyan/src/locations.jl index 0a3e3525..4e5a091b 100644 --- a/Banyan/src/locations.jl +++ b/Banyan/src/locations.jl @@ -303,7 +303,6 @@ function invalidate_metadata(p; kwargs...) end # Delete from S3 - println("Deleting get_metadata_path(lp)=$(get_metadata_path(lp))") s3p = S3Path("s3://$(banyan_metadata_bucket_name())/$(get_metadata_path(lp))") if isfile(s3p) rm(s3p) @@ -325,16 +324,9 @@ function invalidate_samples(p; kwargs...) # Delete from S3 s3p = S3Path("s3://$(banyan_samples_bucket_name())/$sample_path_prefix") - @show readdir_no_error(s3p) - @show s3p - @show path_as_dir(s3p) - @show readdir(S3Path("s3://$(banyan_samples_bucket_name())")) if !isempty(readdir_no_error(s3p)) rm(path_as_dir(s3p), recursive=true) end - @show readdir_no_error(s3p) - @show s3p - @show readdir(S3Path("s3://$(banyan_samples_bucket_name())")) end function invalidate_location(p; kwargs...) invalidate_metadata(p; kwargs...) @@ -432,13 +424,6 @@ function RemoteSource( # Look at local and S3 caches of metadata and samples to attempt to # construct a Location. loc, local_metadata_path, local_sample_path = get_location_source(lp) - let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") - println("Before get_location_source with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir)) and loc.metadata_invalid=$(loc.metadata_invalid) and loc.sample_invalid=$(loc.sample_invalid)") - end - @show lp - @show get_sampling_configs() - @show local_sample_path - @show loc res = if !loc.metadata_invalid && !loc.sample_invalid # Case where both sample and parameters are valid @@ -447,21 +432,8 @@ function RemoteSource( loc elseif loc.metadata_invalid && !loc.sample_invalid # Case where parameters are invalid - let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") - println("Before offloaded with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") - end - let banyan_samples_bucket = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b") - println("Before offloaded with readdir_no_error(banyan_samples_bucket)=$(readdir_no_error(banyan_samples_bucket))") - end new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) - let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") - println("After offloaded with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") - end - let banyan_samples_bucket = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b") - println("After offloaded with readdir_no_error(banyan_samples_bucket)=$(readdir_no_error(banyan_samples_bucket))") - end Arrow.write(local_metadata_path, Arrow.Table(); metadata=new_loc.src_parameters) - @show new_loc new_loc.sample.value = load_sample(local_sample_path) new_loc else @@ -469,7 +441,6 @@ function RemoteSource( # Get the Location with up-to-date metadata (source parameters) and sample new_loc = offloaded(_remote_source, lp, loc, args...; distributed=true) - @show 
new_loc if !loc.metadata_invalid # Store the metadata locally. The local copy just has the source @@ -484,8 +455,5 @@ function RemoteSource( new_loc end - let banyan_samples_object_dir = S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/15117355623592221474_jl_1.8.0-beta3") - println("At end of RemoteSource with readdir_no_error(banyan_samples_object_dir)=$(readdir_no_error(banyan_samples_object_dir))") - end res end \ No newline at end of file diff --git a/Banyan/src/queues.jl b/Banyan/src/queues.jl index df6ba120..6e2c16d9 100644 --- a/Banyan/src/queues.jl +++ b/Banyan/src/queues.jl @@ -32,8 +32,6 @@ function get_next_message( end end m_dict = m["ReceiveMessageResult"]["Message"] - @show m_dict["MessageId"] - @show m_dict["ReceiptHandle"] if delete SQS.delete_message(queue_url, m_dict["ReceiptHandle"]::String) end @@ -150,16 +148,9 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) end end - for (i, pm) in enumerate(message_ranges) - if i > 1 - println("pm == partial_messages[i-1] = $(message[pm] == message[message_ranges[i-1]])") - end - end - # Launch asynchronous threads to send SQS messages gather_q_url = gather_queue_url() num_chunks = length(message_ranges) - @show num_chunks if num_chunks > 1 @sync for i = 1:num_chunks @async begin @@ -181,9 +172,6 @@ function send_to_client(value_id::ValueId, value, worker_memory_used = 0) "MessageDeduplicationId" => generated_message_id * string(i) ) ) - @show i - @show message_ranges[i] - @show length(message[message_ranges[i]]) end end else diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 66720fa9..8578786d 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -291,8 +291,6 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n if is_debug_on() println("Gathering $num_chunks chunk$(num_chunks > 1 ? "s" : "") to client") end - - @show num_chunks whole_message_contents = if num_chunks > 1 partial_messages = Vector{String}(undef, num_chunks) @@ -301,11 +299,9 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n @async begin partial_message, _ = sqs_receive_next_message(gather_queue, p, nothing, nothing) chunk_idx = partial_message["chunk_idx"] - @show chunk_idx partial_messages[chunk_idx] = partial_message["contents"] end end - @show length.(partial_messages) join(partial_messages) else message["contents"] @@ -717,34 +713,17 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) println("Gathering $num_chunks chunk$(num_chunks > 1 ? 
"s" : "") to client") end - @show num_chunks - whole_message_contents = if num_chunks > 1 partial_messages = fill("", num_chunks) partial_messages[message["chunk_idx"]] = message["contents"] - @show message["chunk_idx"] @sync for _ = 1:num_remaining_chunks @async begin let partial_message = sqs_receive_next_message(gather_queue, p, nothing, nothing)[1] chunk_idx = partial_message["chunk_idx"] partial_messages[chunk_idx] = partial_message["contents"] - @show chunk_idx - @show length(partial_message["contents"]) - @show partial_message["contents_length"] - @show length(partial_messages[chunk_idx]) - @show last(partial_message["contents"], 20) - @show last(partial_messages[chunk_idx], 20) - @show length.(partial_messages) end end end - # TODO: Fix this so that it gets the partial messages which are different lengths - @show length.(partial_messages) - for (i, pm) in enumerate(partial_messages) - if i > 1 - println("pm == partial_messages[i-1] = $(pm == partial_messages[i-1])") - end - end join(partial_messages) else message["contents"] diff --git a/Banyan/src/utils.jl b/Banyan/src/utils.jl index ac93c999..cb6504a4 100644 --- a/Banyan/src/utils.jl +++ b/Banyan/src/utils.jl @@ -610,8 +610,6 @@ TYPE_TO_STR = STR_TO_TYPE = invert(TYPE_TO_STR) function type_to_str(ty::DataType)::String - @show ty - @show TYPE_TO_STR global TYPE_TO_STR if haskey(TYPE_TO_STR, ty) TYPE_TO_STR[ty] @@ -621,8 +619,6 @@ function type_to_str(ty::DataType)::String end function type_from_str(s::String) - @show s - @show STR_TO_TYPE if startswith(s, "lang_") if startswith(s, "lang_jl_") from_jl_string(s[9:end]) diff --git a/BanyanDataFrames/src/locations.jl b/BanyanDataFrames/src/locations.jl index 1300aea7..b67d08d0 100644 --- a/BanyanDataFrames/src/locations.jl +++ b/BanyanDataFrames/src/locations.jl @@ -4,11 +4,6 @@ Arrow_Table_retry = retry(Arrow.Table; delays=Base.ExponentialBackOff(; n=5)) function _remote_table_source(lp::LocationPath, loc::Location)::Location sampling_config = get_sampling_config(lp) - metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") - haskey(s3_res, "Contents") ? s3_res["Contents"] : [] - end - println("In _remote_table_source at start with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") # Setup for sampling remotepath = lp.path @@ -32,7 +27,6 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location sample_dir = "s3/$(banyan_samples_bucket_name())/$(get_sample_path_prefix(lp))" mkpath(sample_dir) sample_path = "$sample_dir/$sample_rate" - println("In _remote_table_source at start with readdir_no_error(sample_dir)=$(readdir_no_error(sample_dir))") # Get metadata if it is still valid curr_meta::Arrow.Table = if !curr_metadata_invalid @@ -348,12 +342,6 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location # If a file does not exist, one of the get_metadata/get_sample functions # will error. - metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") - haskey(s3_res, "Contents") ? 
s3_res["Contents"] : [] - end - println("In _remote_table_source at end with metadata_dir=$metadata_dir and metadata_bucket_dir=$metadata_bucket_dir and metadata_path=$metadata_path and curr_metadata_invalid=$curr_metadata_invalid") - # Get source parameters src_params = Dict( @@ -382,7 +370,6 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location ) end - println("In _remote_table_source with curr_sample_invalid=$curr_sample_invalid for writing to $sample_path and readdir_no_error(sample_dir)=$(readdir_no_error(sample_dir))") # Write the sample to S3 cache if previously invalid if curr_sample_invalid write(sample_path, remote_sample.value) @@ -392,8 +379,6 @@ function _remote_table_source(lp::LocationPath, loc::Location)::Location @show (remotepath, meta_path) end - # println("At end of _remote_table_source on get_worker_idx()=$(MPI.Initialized() ? get_worker_idx() : -1)") - # Return LocationSource to client specified # Construct the `Location` to return diff --git a/BanyanDataFrames/src/pfs.jl b/BanyanDataFrames/src/pfs.jl index c1ec11be..bac01b39 100644 --- a/BanyanDataFrames/src/pfs.jl +++ b/BanyanDataFrames/src/pfs.jl @@ -335,7 +335,7 @@ function ReadBlockHelper(@nospecialize(format_value)) dfs = Base.Vector{Any}(undef, ndfs) if Banyan.INVESTIGATING_BDF_INTERNET_FILE_NOT_FOUND - @show (filezs_to_read, get_worker_idx()) + @show (files_to_read, get_worker_idx()) end # Iterate through files and identify which ones correspond to the range of @@ -387,13 +387,7 @@ function WriteHelper(@nospecialize(format_value)) comm::MPI.Comm, loc_name::String, loc_params::Dict{String,Any}, - ) - metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") - haskey(s3_res, "Contents") ? s3_res["Contents"] : [] - end - println("In Write at start with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") - + ) # Get rid of splitting divisions if they were used to split this data into # groups splitting_divisions = Banyan.get_splitting_divisions() @@ -545,12 +539,6 @@ function WriteHelper(@nospecialize(format_value)) # On the main worker, finalize metadata and location info. sample_invalid = false if is_main - metadata_dir = readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - metadata_bucket_dir = let s3_res = Banyan.S3.list_objects_v2("banyan-metadata-75c0f7151604587a83055278b28db83b") - haskey(s3_res, "Contents") ? 
s3_res["Contents"] : [] - end - println("In Write with metadata_dir=$metadata_dir, metadata_bucket_dir=$metadata_bucket_dir") - # Determine paths and #s of rows for metadata file for worker_i in 1:nworkers push!( @@ -591,42 +579,21 @@ function WriteHelper(@nospecialize(format_value)) sample_invalid = true end - println("In Write with sample_invalid=$sample_invalid (because sample_memory_usage=$sample_memory_usage and sampling_config.max_num_bytes_exact=$(sampling_config.max_num_bytes_exact)) and while sampling_config=$sampling_config, writing to $m_path and $s_path, on batch_idx=$batch_idx with curr_src_parameters=$curr_src_parameters") - - @show get_sampling_configs() - @show lp - @show get_sampling_config(lp) - @show s_path - @show s_sample_dir - # Get the actual sample by concatenating if !sample_invalid sampled_parts = [gathered[4] for gathered in gathered_data] if batch_idx > 1 push!(sampled_parts, Arrow.Table(s_path) |> DataFrames.DataFrame) end - println("Writing to s_path=$s_path") Arrow.write(s_path, vcat(sampled_parts...), compress=:zstd) else - println("Removing s_path=$s_path") rm(s_path, force=true, recursive=true) end # Determine paths for this batch and gather # of rows - @show m_path - @show readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b/") - @show readdir("s3/banyan-metadata-75c0f7151604587a83055278b28db83b") - bucket_dir = readdir("s3/$(banyan_metadata_bucket_name())") - println("On main in $(banyan_metadata_bucket_name()): $bucket_dir") Arrow.write(m_path, (path=curr_remotepaths, nrows=curr_nrows); compress=:zstd, metadata=curr_src_parameters) end - @show readdir("s3/$(banyan_metadata_bucket_name())") - @show Banyan.S3.list_objects_v2(banyan_metadata_bucket_name())["Contents"] - - println("In Write") - @show readdir(Banyan.AWSS3.S3Path("s3://banyan-samples-75c0f7151604587a83055278b28db83b/")) - ################################### # Handling Final Batch by Copying # ################################### @@ -640,7 +607,6 @@ function WriteHelper(@nospecialize(format_value)) cp(m_path, actual_meta_path, force=true) if !sample_invalid mkpath(actual_sample_dir) - println("Copying from s_path=$s_path to actual_sample_path=$actual_sample_path") cp(s_path, actual_sample_path, force=true) end end diff --git a/BanyanDataFrames/test/sample_collection.jl b/BanyanDataFrames/test/sample_collection.jl index 4f8b102a..f160f9fd 100644 --- a/BanyanDataFrames/test/sample_collection.jl +++ b/BanyanDataFrames/test/sample_collection.jl @@ -101,19 +101,12 @@ end p1 = "s3://$(bucket)/iris_large.$format" p2 = "s3://$(bucket)/iris_large_tmp.$format" - println("has_sample(p2)=$(has_sample(p2)) after invalidation") df = read_table(p1; metadata_invalid=true, invalidate_samples=true) sample(df) - @show max_num_bytes - @show exact_sample - @show get_sample_rate(p1) configure_sampling(p2; sample_rate=5) - println("Before write_table") - @show get_sampling_configs() write_table(df, p2) - @show get_sampling_configs() @test get_sample_rate(p2) == 5 @test has_metadata(p2) # NOTE: We don't compute _exact_ samples on writing @@ -125,21 +118,13 @@ end @test !has_metadata(p2) @test !has_sample(p2) - @show get_sample_rate(p2) df2 = read_table(p2) - @show Banyan.LocationPath(p2) - @show get_sampling_configs() - @show get_sampling_config(p2) - @show get_sample_rate(p2) sample(df2) - @show get_sample_rate(p2) df2 = read_table(p2; samples_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 - println("After bad get_sample_rate") configure_sampling(sample_rate=7, for_all_locations=true) @test 
get_sample_rate(p2) == 5 - println("After bad get_sample_rate") df2 = read_table(p2; metadata_invalid=true) sample(df2) @test get_sample_rate(p2) == 5 @@ -156,7 +141,6 @@ end sample(df2) @test has_metadata(p2) @test has_sample(p2) - @show get_sample_rate(p2) configure_sampling(p2; always_exact=true) sample(df2) end diff --git a/BanyanHDF5/src/pfs.jl b/BanyanHDF5/src/pfs.jl index 73456013..4c69264d 100644 --- a/BanyanHDF5/src/pfs.jl +++ b/BanyanHDF5/src/pfs.jl @@ -603,10 +603,6 @@ function WriteHelperHDF5( fsync_file(path) MPI.Barrier(comm) end - if true#is_main - f = h5open("/home/ec2-user/s3/banyan-cluster-data-test-lustre-0ce21f27/fillval.h5", "r+", comm, info) - close(f) - end nothing end diff --git a/BanyanHDF5/test/hdf5.jl b/BanyanHDF5/test/hdf5.jl index 0cd0c566..c1575f0a 100644 --- a/BanyanHDF5/test/hdf5.jl +++ b/BanyanHDF5/test/hdf5.jl @@ -59,14 +59,9 @@ end df = read_hdf5(p; metadata_invalid=true, invalidate_samples=true) sample(df) - @show max_num_bytes - @show exact_sample - @show get_sample_rate(p) configure_sampling(p; sample_rate=5) - @show get_sampling_configs() read_hdf5(p) - @show get_sampling_configs() @test get_sample_rate(p) == 5 @test has_metadata(p) @test has_sample(p) @@ -77,14 +72,8 @@ end @test !has_metadata(p) @test !has_sample(p) - @show get_sample_rate(p) df2 = read_hdf5(p) - @show Banyan.LocationPath(p) - @show get_sampling_configs() - @show get_sampling_config(p) - @show get_sample_rate(p) sample(df2) - @show get_sample_rate(p) df2 = read_hdf5(p; samples_invalid=true) sample(df2) @test get_sample_rate(p) == 5 @@ -93,11 +82,9 @@ end df2 = read_hdf5(p; metadata_invalid=true) sample(df2) @test get_sample_rate(p) == 5 - println("Bad get_sample_rate") @test get_sample_rate() == 7 configure_sampling(sample_rate=7, for_all_locations=true) @test get_sample_rate(p) == 5 - println("Bad get_sample_rate") configure_sampling(sample_rate=7, force_new_sample_rate=true, for_all_locations=true) @test get_sample_rate(p) == 7 @test get_sample_rate() == 7 @@ -108,7 +95,6 @@ end sample(df2) @test has_metadata(p) @test has_sample(p) - @show get_sample_rate(p) configure_sampling(p; always_exact=true) sample(df2) end diff --git a/BanyanImages/test/jpg.jl b/BanyanImages/test/jpg.jl index 32861306..fff3793a 100644 --- a/BanyanImages/test/jpg.jl +++ b/BanyanImages/test/jpg.jl @@ -85,15 +85,14 @@ end "parallelism encouraged", "parallelism and batches encouraged", ], - (loc, format) in [ - ("Internet", "generator"), - ("S3", "generator"), - ("S3", "directory") - ], - (max_num_bytes, nimages, add_channelview) in [ - (0, 1, false), - (0, 50, true), - (100_000_000_000, 1, true) + (loc, format, max_num_bytes, nimages, add_channelview) in [ + ("Internet", "generator", 0, 1, false), + ("Internet", "generator", 0, 50, true), + ("Internet", "generator", 100_000_000_000, 1, true), + ("S3", "generator", 100_000_000_000, 1, true), + ("S3", "directory", 0, 1, false), + ("S3", "directory", 0, 50, true), + ("S3", "directory", 100_000_000_000, 1, true) ], shuffled in [true, false] @@ -109,14 +108,9 @@ end df = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true, invalidate_samples=true) sample(df) - @show max_num_bytes - @show exact_sample - @show get_sample_rate(p; add_channelview=add_channelview) configure_sampling(p; sample_rate=50) - @show get_sampling_configs() read_jpg(p; add_channelview=add_channelview) - @show get_sampling_configs() @test get_sample_rate(p; add_channelview=add_channelview) == 50 @test has_metadata(p; add_channelview=add_channelview) @test has_sample(p; 
add_channelview=add_channelview) @@ -127,14 +121,8 @@ end @test !has_metadata(p; add_channelview=add_channelview) @test !has_sample(p; add_channelview=add_channelview) - @show get_sample_rate(p; add_channelview=add_channelview) df2 = read_jpg(p; add_channelview=add_channelview) - @show Banyan.LocationPath(p; add_channelview=add_channelview) - @show get_sampling_configs() - @show get_sampling_config(p; add_channelview=add_channelview) - @show get_sample_rate(p; add_channelview=add_channelview) sample(df2) - @show get_sample_rate(p; add_channelview=add_channelview) df2 = read_jpg(p; add_channelview=add_channelview, samples_invalid=true) sample(df2) @test get_sample_rate(p; add_channelview=add_channelview) == 50 @@ -143,11 +131,9 @@ end df2 = read_jpg(p; add_channelview=add_channelview, metadata_invalid=true) sample(df2) @test get_sample_rate(p; add_channelview=add_channelview) == 50 - println("Bad get_sample_rate") @test get_sample_rate() == 75 configure_sampling(sample_rate=75, for_all_locations=true) @test get_sample_rate(p; add_channelview=add_channelview) == 50 - println("Bad get_sample_rate") configure_sampling(sample_rate=75, force_new_sample_rate=true, for_all_locations=true) @test get_sample_rate(p; add_channelview=add_channelview) == 75 @test get_sample_rate() == 75 @@ -158,7 +144,6 @@ end sample(df2) @test has_metadata(p; add_channelview=add_channelview) @test has_sample(p; add_channelview=add_channelview) - @show get_sample_rate(p; add_channelview=add_channelview) configure_sampling(p; add_channelview=add_channelview, always_exact=true) sample(df2) end diff --git a/BanyanImages/test/runtests.jl b/BanyanImages/test/runtests.jl index 58b1eb6b..3d650ff1 100644 --- a/BanyanImages/test/runtests.jl +++ b/BanyanImages/test/runtests.jl @@ -107,4 +107,5 @@ finally # Destroy jobs to clean up. # destroy_all_jobs_for_testing() cleanup_s3_test_files(get_cluster_s3_bucket_name(ENV["BANYAN_CLUSTER_NAME"])) + end_all_sessions_for_testing() end \ No newline at end of file From 28d5894b0971803457669ef423827798572fcb6f Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Fri, 19 Aug 2022 08:28:20 -0400 Subject: [PATCH 23/25] Make start_session create cluster if needed and change default # of workers to 150 --- Banyan/src/clusters.jl | 112 +++++++++++++++++++-------------------- Banyan/src/sessions.jl | 117 +++++++++++++++++++++++++++++++++++------ 2 files changed, 158 insertions(+), 71 deletions(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index eabb1215..8ec83cc1 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -1,5 +1,5 @@ struct Cluster - name::String + cluster_name::String status::Symbol status_explanation::String s3_bucket_arn::String @@ -21,7 +21,7 @@ end @nospecialize function create_cluster(; - name::Union{String,Nothing} = nothing, + cluster_name::Union{String,Nothing} = nothing, instance_type::Union{String,Nothing} = "m4.4xlarge", max_num_workers::Union{Int,Nothing} = 2048, initial_num_workers::Union{Int,Nothing} = 16, @@ -42,29 +42,29 @@ function create_cluster(; # Configure using parameters c = configure(; kwargs...) - clusters = get_clusters(name; kwargs...) - if isnothing(name) - name = "Cluster " * string(length(clusters) + 1) + clusters = get_clusters(cluster_name; kwargs...) 
+ if isnothing(cluster_name) + cluster_name = "cluster-" * string(length(clusters) + 1) end if isnothing(region) region = get_aws_config_region() end - # Check if the configuration for this cluster name already exists + # Check if the configuration for this cluster cluster_name already exists # If it does, then recreate cluster - if haskey(clusters, name) - if force_create || clusters[name].status == :terminated - @info "Started re-creating cluster named $name" + if haskey(clusters, cluster_name) + if force_create || clusters[cluster_name].status == :terminated + @info "Started re-creating cluster named $cluster_name" send_request_get_response( :create_cluster, - Dict("cluster_name" => name, "recreate" => true, "force_create" => true), + Dict("cluster_name" => cluster_name, "recreate" => true, "force_create" => true), ) if !nowait - wait_for_cluster(name; kwargs...) + wait_for_cluster(cluster_name; kwargs...) end - return get_cluster(name; kwargs...) + return get_cluster(cluster_name; kwargs...) else - error("Cluster with name $name already exists and its current status is $(string(clusters[name].status))") + error("Cluster with cluster_name $cluster_name already exists and its current status is $(string(clusters[cluster_name].status))") end end @@ -82,7 +82,7 @@ function create_cluster(; # Construct cluster creation cluster_config = Dict{String,Any}( - "cluster_name" => name, + "cluster_name" => cluster_name, "instance_type" => instance_type, "max_num_workers" => max_num_workers, "initial_num_workers" => initial_num_workers, @@ -110,60 +110,60 @@ function create_cluster(; cluster_config["subnet_id"] = subnet_id end - @info "Started creating cluster named $name" + @info "Started creating cluster named $cluster_name" # Send request to create cluster send_request_get_response(:create_cluster, cluster_config) if !nowait - wait_for_cluster(name; kwargs...) + wait_for_cluster(cluster_name; kwargs...) end # Cache info - get_cluster(name; kwargs...) + get_cluster(cluster_name; kwargs...) - return get_clusters_dict()[name] + return get_clusters_dict()[cluster_name] end -function destroy_cluster(name::String; kwargs...) +function destroy_cluster(cluster_name::String; kwargs...) configure(; kwargs...) - @info "Destroying cluster named $name" - send_request_get_response(:destroy_cluster, Dict{String,Any}("cluster_name" => name)) + @info "Destroying cluster named $cluster_name" + send_request_get_response(:destroy_cluster, Dict{String,Any}("cluster_name" => cluster_name)) end -function delete_cluster(name::String; kwargs...) +function delete_cluster(cluster_name::String; kwargs...) configure(; kwargs...) - @info "Deleting cluster named $name" + @info "Deleting cluster named $cluster_name" send_request_get_response( :destroy_cluster, - Dict{String,Any}("cluster_name" => name, "permanently_delete" => true), + Dict{String,Any}("cluster_name" => cluster_name, "permanently_delete" => true), ) end -function update_cluster(name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, nowait=false, kwargs...) +function update_cluster(cluster_name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, nowait=false, kwargs...) configure(; kwargs...) 
- @info "Updating cluster named $name" + @info "Updating cluster named $cluster_name" send_request_get_response( :update_cluster, Dict{String, Any}( - "cluster_name" => name, + "cluster_name" => cluster_name, "force_update" => force_update, "update_linux_packages" => update_linux_packages, "reinstall_julia" => reinstall_julia ) ) if !nowait - wait_for_cluster(name) + wait_for_cluster(cluster_name) end end -function assert_cluster_is_ready(name::String; kwargs...) - @info "Setting status of cluster named $name to running" +function assert_cluster_is_ready(cluster_name::String; kwargs...) + @info "Setting status of cluster named $cluster_name to running" # Configure configure(; kwargs...) - send_request_get_response(:set_cluster_ready, Dict{String,Any}("cluster_name" => name)) + send_request_get_response(:set_cluster_ready, Dict{String,Any}("cluster_name" => cluster_name)) end parsestatus(status::String)::Symbol = @@ -197,9 +197,9 @@ function _get_clusters(cluster_name::String)::Dict{String,Cluster} end response = send_request_get_response(:describe_clusters, Dict{String,Any}("filters"=>filters)) clusters_dict::Dict{String,Cluster} = Dict{String,Cluster}() - for (name::String, c::Dict{String,Any}) in response["clusters"]::Dict{String,Any} - clusters_dict[name] = Cluster( - name, + for (cluster_name::String, c::Dict{String,Any}) in response["clusters"]::Dict{String,Any} + clusters_dict[cluster_name] = Cluster( + cluster_name, parsestatus(c["status"]::String), haskey(c, "status_explanation") ? c["status_explanation"]::String : "", c["s3_read_write_resource"]::String, @@ -212,16 +212,16 @@ function _get_clusters(cluster_name::String)::Dict{String,Cluster} # Cache info curr_clusters_dict = get_clusters_dict() - for (name, c) in clusters_dict - curr_clusters_dict[name] = c + for (cluster_name, c) in clusters_dict + curr_clusters_dict[cluster_name] = c end clusters_dict end -function get_clusters(cluster_name=nothing; kwargs...)::Dict{String,Cluster} +function get_clusters(cluster_name=""; kwargs...)::Dict{String,Cluster} configure(; kwargs...) - _get_clusters(isnothing(cluster_name) ? "" : cluster_name) + _get_clusters(cluster_name) end function get_cluster_s3_bucket_arn(cluster_name=get_cluster_name(); kwargs...) @@ -236,19 +236,19 @@ end get_cluster_s3_bucket_name(cluster_name=get_cluster_name(); kwargs...) = s3_bucket_arn_to_name(get_cluster_s3_bucket_arn(cluster_name; kwargs...)) -get_cluster(name::String=get_cluster_name(); kwargs...)::Cluster = get_clusters(name; kwargs...)[name] +get_cluster(cluster_name::String=get_cluster_name(); kwargs...)::Cluster = get_clusters(cluster_name; kwargs...)[cluster_name] get_running_clusters(args...; kwargs...) 
= filter(entry -> entry[2].status == :running, get_clusters(args...; kwargs...)) -function get_cluster_status(name::String)::Symbol +function get_cluster_status(cluster_name::String)::Symbol clusters_dict = get_clusters_dict() clusters::Dict{String,Cluster} - if haskey(clusters_dict, name) - if clusters_dict[name].status == :failed - @error clusters_dict[name].status_explanation + if haskey(clusters_dict, cluster_name) + if clusters_dict[cluster_name].status == :failed + @error clusters_dict[cluster_name].status_explanation end end - c::Cluster = get_clusters(name)[name] + c::Cluster = get_clusters(cluster_name)[cluster_name] if c.status == :failed @error c.status_explanation end @@ -256,16 +256,16 @@ function get_cluster_status(name::String)::Symbol end get_cluster_status() = get_cluster_status(get_cluster_name()) -function _wait_for_cluster(name::String) +function _wait_for_cluster(cluster_name::String) t::Int64 = 5 - cluster_status::Symbol = get_cluster_status(name) - p::ProgressUnknown = ProgressUnknown("Finding status of cluster $name", enabled=false) + cluster_status::Symbol = get_cluster_status(cluster_name) + p::ProgressUnknown = ProgressUnknown("Finding status of cluster $cluster_name", enabled=false) while (cluster_status == :creating || cluster_status == :updating) if !p.enabled if cluster_status == :creating - p = ProgressUnknown("Setting up cluster $name", spinner=true) + p = ProgressUnknown("Setting up cluster $cluster_name", spinner=true) else - p = ProgressUnknown("Updating cluster $name", spinner=true) + p = ProgressUnknown("Updating cluster $cluster_name", spinner=true) end end sleep(t) @@ -273,28 +273,28 @@ function _wait_for_cluster(name::String) if t < 80 t *= 2 end - cluster_status = get_cluster_status(name) + cluster_status = get_cluster_status(cluster_name) end if p.enabled finish!(p, spinner = (cluster_status == :running ? '✓' : '✗')) end if cluster_status == :running - # @info "Cluster $name is ready" + # @info "Cluster $cluster_name is ready" elseif cluster_status == :terminated - error("Cluster $name no longer exists") + error("Cluster $cluster_name no longer exists") elseif cluster_status != :creating && cluster_status != :updating - error("Failed to set up cluster named $name") + error("Failed to set up cluster named $cluster_name") else - error("Cluster $name has unexpected status: $cluster_status") + error("Cluster $cluster_name has unexpected status: $cluster_status") end end function wait_for_cluster(;kwargs...) configure(;kwargs...) _wait_for_cluster(get_cluster_name()) end -function wait_for_cluster(name::String; kwargs...) +function wait_for_cluster(cluster_name::String; kwargs...) configure(;kwargs...) - _wait_for_cluster(name) + _wait_for_cluster(cluster_name) end function upload_to_s3(src_path; dst_name=basename(src_path), cluster_name=get_cluster_name(), kwargs...) 
diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index dd6cd25d..b9feed7b 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -80,6 +80,7 @@ const NOTHING_STRING = "NOTHING_STRING" function _start_session( cluster_name::String, + c::Cluster, nworkers::Int64, release_resources_after::Integer, print_logs::Bool, @@ -112,17 +113,6 @@ function _start_session( ) global session_sampling_configs - # Construct parameters for starting session - cluster_name = if cluster_name == NOTHING_STRING - running_clusters = get_running_clusters() - if length(running_clusters) == 0 - error("Failed to start session: you don't have any clusters created") - end - first(keys(running_clusters)) - else - cluster_name - end - version = get_julia_version() not_in_modules = m -> !(m in not_using_modules) @@ -151,7 +141,7 @@ function _start_session( if !no_email session_configuration["email_when_ready"] = email_when_ready end - c::Cluster = get_cluster(cluster_name) + s3_bucket_name = s3_bucket_arn_to_name(c.s3_bucket_arn) organization_id = c.organization_id curr_cluster_instance_id = c.curr_cluster_instance_id @@ -292,9 +282,105 @@ function _start_session( session_id end +function start_session_with_cluster( + cluster_name::String, + nworkers::Int64, + release_resources_after::Integer, + print_logs::Bool, + store_logs_in_s3::Bool, + store_logs_on_cluster::Bool, + log_initialization::Bool, + session_name::String, + files::Vector{String}, + code_files::Vector{String}, + force_update_files::Bool, + pf_dispatch_table::Vector{String}, + no_pf_dispatch_table::Bool, + using_modules::Vector{String}, + # We currently can't use modules that require GUI + not_using_modules::Vector{String}, + url::String, + branch::String, + directory::String, + dev_paths::Vector{String}, + force_sync::Bool, + force_pull::Bool, + force_install::Bool, + estimate_available_memory::Bool, + nowait::Bool, + email_when_ready::Bool, + no_email::Bool, + for_running::Bool, + sessions::Dict{String,Session}, + sampling_configs::Dict{LocationPath,SamplingConfig}, + kwargs... +) + # Construct parameters for starting session + cluster_name::String, c::Cluster = if cluster_name == NOTHING_STRING + running_clusters = get_running_clusters() + if isempty(running_clusters) + new_c = create_cluster(; + nowait=false, + initial_num_workers=nworkers, + kwargs... + ) + new_c.cluster_name, new_c + else + first(running_clusters) + end + else + c_dict::Dict{String,Cluster} = get_running_clusters(cluster_name) + cluster_name, if haskey(c_dict, cluster_name) + c_dict[cluster_name] + else + create_cluster(; + cluster_name=cluster_name, + nowait=false, + initial_num_workers=nworkers, + kwargs... 
+ ) + end + end + + _start_session( + cluster_name::String, + c::Cluster, + nworkers::Int64, + release_resources_after::Integer, + print_logs::Bool, + store_logs_in_s3::Bool, + store_logs_on_cluster::Bool, + log_initialization::Bool, + session_name::String, + files::Vector{String}, + code_files::Vector{String}, + force_update_files::Bool, + pf_dispatch_table::Vector{String}, + no_pf_dispatch_table::Bool, + using_modules::Vector{String}, + # We currently can't use modules that require GUI + not_using_modules::Vector{String}, + url::String, + branch::String, + directory::String, + dev_paths::Vector{String}, + force_sync::Bool, + force_pull::Bool, + force_install::Bool, + estimate_available_memory::Bool, + nowait::Bool, + email_when_ready::Bool, + no_email::Bool, + for_running::Bool, + sessions::Dict{String,Session}, + sampling_configs::Dict{LocationPath,SamplingConfig} + ) +end + function start_session(; cluster_name::String = NOTHING_STRING, - nworkers::Int64 = 16, + # Default 100x speedup + nworkers::Int64 = 150, release_resources_after::Union{Integer,Nothing} = 20, print_logs::Bool = false, store_logs_in_s3::Bool = true, @@ -334,7 +420,7 @@ function start_session(; configure(; kwargs...) configure_sampling(; kwargs...) - current_session_id = _start_session( + current_session_id = start_session_with_cluster( cluster_name, nworkers, isnothing(release_resources_after) ? -1 : release_resources_after, @@ -364,7 +450,8 @@ function start_session(; isnothing(email_when_ready), for_running, sessions, - get_sampling_configs() + get_sampling_configs(), + kwargs... ) current_session_id end From a99563855b4bd8586b6ea69911dcddcf83f8d53f Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Fri, 19 Aug 2022 12:39:16 -0400 Subject: [PATCH 24/25] Add automatic destruction of idle clusters --- Banyan/src/clusters.jl | 4 +++- Banyan/src/sessions.jl | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 8ec83cc1..1d9ee6d9 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -36,6 +36,7 @@ function create_cluster(; subnet_id = nothing, nowait=false, force_create=false, + destroy_cluster_after = -1, kwargs..., ) @@ -95,7 +96,8 @@ function create_cluster(; # by size of 1 GiB and then round up. Then the backend will determine how to adjust the # disk capacity to an allowable increment (e.g., 1200 GiB or an increment of 2400 GiB # for AWS FSx Lustre filesystems) - "disk_capacity" => disk_capacity == "auto" ? -1 : ceil(Int64, parse_bytes(disk_capacity) / 1.073741824e7) + "disk_capacity" => disk_capacity == "auto" ? -1 : ceil(Int64, parse_bytes(disk_capacity) / 1.073741824e7), + "destroy_cluster_after" => destroy_cluster_after ) if haskey(c["aws"], "ec2_key_pair_name") cluster_config["ec2_key_pair"] = c["aws"]["ec2_key_pair_name"] diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index b9feed7b..5a35c4c2 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -319,9 +319,12 @@ function start_session_with_cluster( cluster_name::String, c::Cluster = if cluster_name == NOTHING_STRING running_clusters = get_running_clusters() if isempty(running_clusters) + # If the user is not separately creating a cluster, we should + # by default destroy it after 12 hours. new_c = create_cluster(; nowait=false, initial_num_workers=nworkers, + destroy_after=(12 * 60) kwargs... 
) new_c.cluster_name, new_c @@ -337,6 +340,7 @@ function start_session_with_cluster( cluster_name=cluster_name, nowait=false, initial_num_workers=nworkers, + destroy_after=(12 * 60), kwargs... ) end From 045f2e01ab1a93268b84484c705fe800fef97dff Mon Sep 17 00:00:00 2001 From: Caleb Winston Date: Mon, 22 Aug 2022 06:15:39 -0700 Subject: [PATCH 25/25] Implement all changes to make starting sessions lazy --- Banyan/src/clusters.jl | 54 ++- Banyan/src/requests.jl | 7 +- Banyan/src/samples.jl | 3 +- Banyan/src/sessions.jl | 364 ++++++++++------ Banyan/test/Project.toml | 1 - Banyan/test/clusters.jl | 6 +- Banyan/test/run_session_test_script.jl | 1 + Banyan/test/runtests.jl | 2 - Banyan/test/sessions.jl | 581 ++++++++++++++----------- BanyanArrays/test/Project.toml | 2 +- BanyanHDF5/test/Project.toml | 4 +- BanyanHDF5/test/runtests.jl | 2 + BanyanImages/test/Project.toml | 2 +- 13 files changed, 608 insertions(+), 421 deletions(-) create mode 100644 Banyan/test/run_session_test_script.jl diff --git a/Banyan/src/clusters.jl b/Banyan/src/clusters.jl index 1d9ee6d9..c74cc0b0 100644 --- a/Banyan/src/clusters.jl +++ b/Banyan/src/clusters.jl @@ -34,9 +34,10 @@ function create_cluster(; region = nothing, vpc_id = nothing, subnet_id = nothing, - nowait=false, + wait_now=true, force_create=false, destroy_cluster_after = -1, + show_progress = true, kwargs..., ) @@ -55,17 +56,24 @@ function create_cluster(; # If it does, then recreate cluster if haskey(clusters, cluster_name) if force_create || clusters[cluster_name].status == :terminated - @info "Started re-creating cluster named $cluster_name" + if show_progress + @info "Started re-creating cluster named $cluster_name" + end send_request_get_response( :create_cluster, Dict("cluster_name" => cluster_name, "recreate" => true, "force_create" => true), ) - if !nowait + if wait_now + wait_for_cluster(cluster_name; kwargs...) + end + return get_cluster(cluster_name; kwargs...) + elseif clusters[cluster_name].status == :creating + if wait_now wait_for_cluster(cluster_name; kwargs...) end return get_cluster(cluster_name; kwargs...) else - error("Cluster with cluster_name $cluster_name already exists and its current status is $(string(clusters[cluster_name].status))") + error("Cluster with name $cluster_name already exists and its current status is $(string(clusters[cluster_name].status))") end end @@ -112,12 +120,14 @@ function create_cluster(; cluster_config["subnet_id"] = subnet_id end - @info "Started creating cluster named $cluster_name" + if show_progress + @info "Started creating cluster named $cluster_name" + end # Send request to create cluster send_request_get_response(:create_cluster, cluster_config) - if !nowait + if wait_now wait_for_cluster(cluster_name; kwargs...) end @@ -131,6 +141,7 @@ function destroy_cluster(cluster_name::String; kwargs...) configure(; kwargs...) @info "Destroying cluster named $cluster_name" send_request_get_response(:destroy_cluster, Dict{String,Any}("cluster_name" => cluster_name)) + ; end function delete_cluster(cluster_name::String; kwargs...) @@ -140,9 +151,10 @@ function delete_cluster(cluster_name::String; kwargs...) :destroy_cluster, Dict{String,Any}("cluster_name" => cluster_name, "permanently_delete" => true), ) + ; end -function update_cluster(cluster_name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, nowait=false, kwargs...) +function update_cluster(cluster_name::String; force_update=false, update_linux_packages=true, reinstall_julia=false, wait_now=true, kwargs...) 
configure(; kwargs...) @info "Updating cluster named $cluster_name" send_request_get_response( @@ -154,9 +166,10 @@ function update_cluster(cluster_name::String; force_update=false, update_linux_p "reinstall_julia" => reinstall_julia ) ) - if !nowait + if wait_now wait_for_cluster(cluster_name) end + ; end function assert_cluster_is_ready(cluster_name::String; kwargs...) @@ -166,6 +179,7 @@ function assert_cluster_is_ready(cluster_name::String; kwargs...) configure(; kwargs...) send_request_get_response(:set_cluster_ready, Dict{String,Any}("cluster_name" => cluster_name)) + ; end parsestatus(status::String)::Symbol = @@ -258,26 +272,28 @@ function get_cluster_status(cluster_name::String)::Symbol end get_cluster_status() = get_cluster_status(get_cluster_name()) -function _wait_for_cluster(cluster_name::String) +function _wait_for_cluster(cluster_name::String, show_progress::Bool) t::Int64 = 5 cluster_status::Symbol = get_cluster_status(cluster_name) - p::ProgressUnknown = ProgressUnknown("Finding status of cluster $cluster_name", enabled=false) + p::ProgressUnknown = ProgressUnknown("Finding status of cluster $cluster_name", enabled=show_progress) while (cluster_status == :creating || cluster_status == :updating) - if !p.enabled + if show_progress && !p.enabled if cluster_status == :creating - p = ProgressUnknown("Setting up cluster $cluster_name", spinner=true) + p = ProgressUnknown("Setting up cluster $cluster_name", spinner=true, enabled=show_progress) else - p = ProgressUnknown("Updating cluster $cluster_name", spinner=true) + p = ProgressUnknown("Updating cluster $cluster_name", spinner=true, enabled=show_progress) end end sleep(t) - next!(p) + if show_progress + next!(p) + end if t < 80 t *= 2 end cluster_status = get_cluster_status(cluster_name) end - if p.enabled + if show_progress finish!(p, spinner = (cluster_status == :running ? '✓' : '✗')) end if cluster_status == :running @@ -290,13 +306,13 @@ function _wait_for_cluster(cluster_name::String) error("Cluster $cluster_name has unexpected status: $cluster_status") end end -function wait_for_cluster(;kwargs...) +function wait_for_cluster(show_progress=true; kwargs...) configure(;kwargs...) - _wait_for_cluster(get_cluster_name()) + _wait_for_cluster(get_cluster_name(), show_progress) end -function wait_for_cluster(cluster_name::String; kwargs...) +function wait_for_cluster(cluster_name::String, show_progress=true; kwargs...) configure(;kwargs...) - _wait_for_cluster(cluster_name) + _wait_for_cluster(cluster_name, show_progress) end function upload_to_s3(src_path; dst_name=basename(src_path), cluster_name=get_cluster_name(), kwargs...) diff --git a/Banyan/src/requests.jl b/Banyan/src/requests.jl index 8578786d..36218d52 100644 --- a/Banyan/src/requests.jl +++ b/Banyan/src/requests.jl @@ -236,7 +236,7 @@ function _partitioned_computation_concrete(fut::Future, destination::Location, n # There are two cases: either we # TODO: Maybe we don't need to wait_For_session - # There is a problem where we start a session with nowait=true and then it + # There is a problem where we start a session with wait_now=false and then it # reuses a resource that is in a creating state. Since the session is still # creating and we have not yet waited for it to start, if we have # `estimate_available_memory=false` then we will end up with job info not @@ -524,7 +524,7 @@ end function send_evaluation(value_id::ValueId, session_id::SessionId) # First we ensure that the session is ready. 
This way, we can get a good # estimate of available worker memory before calling evaluate. - wait_for_session(session_id) + session = get_session(session_id) encourage_parallelism = get_encourage_parallelism() encourage_parallelism_with_batches = get_encourage_parallelism_with_batches() @@ -690,12 +690,11 @@ function offloaded(given_function::Function, args...; distributed::Bool = false) # We must wait for session because otherwise we will slurp up the session # ready message on the gather queue. - wait_for_session(session_id) + session = get_session(session_id) # job_id = Banyan.get_job_id() p = ProgressUnknown("Running offloaded code", spinner=true) - session = get_session() gather_queue = gather_queue_url() stored_res = nothing error_for_main_stuck, error_for_main_stuck_time = nothing, nothing diff --git a/Banyan/src/samples.jl b/Banyan/src/samples.jl index 2ce34517..66df1558 100644 --- a/Banyan/src/samples.jl +++ b/Banyan/src/samples.jl @@ -1,5 +1,6 @@ function configure_sampling( path=""; + nworkers=nothing, sample_rate=nothing, always_exact=nothing, max_num_bytes_exact=nothing, @@ -13,7 +14,7 @@ function configure_sampling( sc = default ? DEFAULT_SAMPLING_CONFIG : get_sampling_config(path; kwargs...) nsc = SamplingConfig( - (!isnothing(sample_rate)) ? sample_rate : sc.rate, + (!isnothing(sample_rate)) ? sample_rate : (!isnothing(nworkers) ? (nworkers * 8) : sc.rate), (!isnothing(always_exact)) ? always_exact : sc.always_exact, (!isnothing(max_num_bytes_exact)) ? max_num_bytes_exact : sc.max_num_bytes_exact, (!isnothing(force_new_sample_rate)) ? force_new_sample_rate : sc.force_new_sample_rate, diff --git a/Banyan/src/sessions.jl b/Banyan/src/sessions.jl index 5a35c4c2..d92a63ec 100644 --- a/Banyan/src/sessions.jl +++ b/Banyan/src/sessions.jl @@ -16,6 +16,9 @@ global sessions = Dict{SessionId,Session}() # ergonomic. global current_session_id = "" +# Tasks for starting sessions +global start_session_tasks = Dict{SessionId,Task}() + function set_session(session_id::SessionId) global current_session_id current_session_id = session_id @@ -23,17 +26,49 @@ end function _get_session_id_no_error()::SessionId global current_session_id - current_session_id + global sessions + !haskey(sessions, current_session_id) ? "" : current_session_id end -function get_session_id()::SessionId +function get_session_id(session_id="")::SessionId global current_session_id - if isempty(current_session_id) - error( - "No session started or selected using `start_session` or `with_session` or `set_session`. 
The current session may have been destroyed or no session started yet.", - ) + global sessions + global start_session_tasks + global session_sampling_configs + + if isempty(session_id) + session_id = current_session_id + end + + if haskey(sessions, session_id) + session_id + elseif haskey(start_session_tasks, session_id) + start_session_task = start_session_tasks[session_id] + if istaskdone(start_session_task) && length(start_session_task.result) == 2 + e, bt = start_session_task.result + showerror(stderr, e, bt) + error("Failed to start session with ID $session_id") + session_id + elseif istaskdone(start_session_task) && length(start_session_task.result) == 3 + new_session_id, session, sampling_configs = start_session_task.result + sessions[new_session_id] = session + session_sampling_configs[new_session_id] = sampling_configs + if session_id == current_session_id + current_session_id = new_session_id + end + new_session_id + else + # Otherwise, the task is still running or hasn't yet been started + # in which case we will just return the ID of the start_session task + session_id + end + elseif isempty(session_id) + start_session() + elseif startswith(session_id, "start-session-") + error("The session with ID $session_id was not created in this Julia session") + else + session_id end - current_session_id end function get_sessions_dict()::Dict{SessionId,Session} @@ -41,19 +76,46 @@ function get_sessions_dict()::Dict{SessionId,Session} sessions end -function get_session()::Session - session_id = get_session_id() +function get_session(session_id=get_session_id(), show_progress=true)::Session sessions_dict = get_sessions_dict() - if !haskey(sessions_dict, session_id) - error("The selected session does not have any information; if it was started by this process, it has either failed or been destroyed.") + global start_session_tasks + if haskey(sessions_dict, session_id) + sessions_dict[session_id] + elseif haskey(start_session_tasks, session_id) + # Schedule the task if not yet scheduled + start_session_task = start_session_tasks[session_id] + if !istaskstarted(start_session_task) + yield(start_session_task) + end + + # Keep looping till the task is created + + p = ProgressUnknown("Preparing session with ID $session_id", spinner=true, enabled=show_progress) + try + while !haskey(get_sessions_dict(), get_session_id(session_id)) + if p.enabled + next!(p) + end + end + catch e + if p.enabled + finish!(p, spinner = '✗') + end + rethrow() + end + if p.enabled + finish!(p, spinner = '✓') + end + get_sessions_dict()[get_session_id(session_id)] + else + error("The current session ID $session_id is not stored as a session starting task in progress or a running session") end - sessions_dict[session_id] end get_cluster_name()::String = get_session().cluster_name function get_loaded_packages() - global current_session_id + current_session_id = _get_session_id_no_error() loaded_packages::Set{String} = if !isempty(current_session_id) get_sessions_dict()[current_session_id].loaded_packages else @@ -78,6 +140,8 @@ end const NOTHING_STRING = "NOTHING_STRING" +const StartSessionResult = Tuple{SessionId,Session,Dict{LocationPath,SamplingConfig}} + function _start_session( cluster_name::String, c::Cluster, @@ -104,13 +168,12 @@ function _start_session( force_pull::Bool, force_install::Bool, estimate_available_memory::Bool, - nowait::Bool, email_when_ready::Bool, no_email::Bool, for_running::Bool, sessions::Dict{String,Session}, sampling_configs::Dict{LocationPath,SamplingConfig} -) +)::StartSessionResult global 
session_sampling_configs version = get_julia_version() @@ -254,9 +317,9 @@ function _start_session( end message end - @info msg + # @info msg # Store in global state - sessions[session_id] = Session( + new_session = Session( cluster_name, session_id, resource_id, @@ -270,16 +333,15 @@ function _start_session( gather_queue_url=gather_queue_url, execution_queue_url=execution_queue_url ) - session_sampling_configs[session_id] = sampling_configs - if !nowait - wait_for_session(session_id) - elseif !reusing_resources - @warn "Starting this session requires creating new cloud computing resources which will take 10-30 minutes for the first computation." - end + # if !nowait + wait_for_session(session_id, false) + # elseif !reusing_resources + # @warn "Starting this session requires creating new cloud computing resources which will take 10-30 minutes for the first computation." + # end @debug "Finished call to start_session with ID $session_id" - session_id + session_id, new_session, sampling_configs end function start_session_with_cluster( @@ -307,14 +369,13 @@ function start_session_with_cluster( force_pull::Bool, force_install::Bool, estimate_available_memory::Bool, - nowait::Bool, email_when_ready::Bool, no_email::Bool, for_running::Bool, sessions::Dict{String,Session}, sampling_configs::Dict{LocationPath,SamplingConfig}, kwargs... -) +)::StartSessionResult # Construct parameters for starting session cluster_name::String, c::Cluster = if cluster_name == NOTHING_STRING running_clusters = get_running_clusters() @@ -322,9 +383,10 @@ function start_session_with_cluster( # If the user is not separately creating a cluster, we should # by default destroy it after 12 hours. new_c = create_cluster(; - nowait=false, + wait_now=true, initial_num_workers=nworkers, - destroy_after=(12 * 60) + destroy_after=(12 * 60), + show_progress=false, kwargs... ) new_c.cluster_name, new_c @@ -338,9 +400,10 @@ function start_session_with_cluster( else create_cluster(; cluster_name=cluster_name, - nowait=false, + wait_now=true, initial_num_workers=nworkers, destroy_after=(12 * 60), + show_progress=false, kwargs... ) end @@ -372,7 +435,6 @@ function start_session_with_cluster( force_pull::Bool, force_install::Bool, estimate_available_memory::Bool, - nowait::Bool, email_when_ready::Bool, no_email::Bool, for_running::Bool, @@ -384,7 +446,7 @@ end function start_session(; cluster_name::String = NOTHING_STRING, # Default 100x speedup - nworkers::Int64 = 150, + nworkers::Int64 = -1, release_resources_after::Union{Integer,Nothing} = 20, print_logs::Bool = false, store_logs_in_s3::Bool = true, @@ -406,9 +468,10 @@ function start_session(; force_pull::Bool = false, force_install::Bool = false, estimate_available_memory::Bool = true, - nowait::Bool = true, email_when_ready::Union{Bool,Nothing} = nothing, for_running::Bool = false, + start_now::Bool = false, + wait_now::Bool = false, kwargs..., )::SessionId # Should save 5ms of overhead @@ -418,60 +481,91 @@ function start_session(; global BANYAN_JULIA_PACKAGES sessions = get_sessions_dict() - global current_session_id + global start_session_tasks # Configure configure(; kwargs...) - configure_sampling(; kwargs...) + nworkers = nworkers == -1 ? (is_debug_on() ? 2 : 150) : nworkers + configure_sampling(; nworkers=nworkers, kwargs...) - current_session_id = start_session_with_cluster( - cluster_name, - nworkers, - isnothing(release_resources_after) ? 
-1 : release_resources_after,
-        print_logs,
-        store_logs_in_s3,
-        store_logs_on_cluster,
-        log_initialization,
-        session_name,
-        files,
-        code_files,
-        force_update_files,
-        isnothing(pf_dispatch_table) ? String[] : pf_dispatch_table,
-        isnothing(pf_dispatch_table),
-        using_modules,
-        # We currently can't use modules that require GUI
-        not_using_modules,
-        url,
-        branch,
-        directory,
-        dev_paths,
-        force_sync,
-        force_pull,
-        force_install,
-        estimate_available_memory,
-        nowait,
-        isnothing(email_when_ready) ? false : email_when_ready,
-        isnothing(email_when_ready),
-        for_running,
-        sessions,
-        get_sampling_configs(),
-        kwargs...
-    )
-    current_session_id
+    # Create task for starting session
+    new_start_session_task_id = "start-session-$(length(start_session_tasks) + 1)"
+    new_start_session_task =
+        Task(
+            () -> try
+                start_session_with_cluster(
+                    cluster_name,
+                    nworkers,
+                    isnothing(release_resources_after) ? -1 : release_resources_after,
+                    print_logs,
+                    store_logs_in_s3,
+                    store_logs_on_cluster,
+                    log_initialization,
+                    session_name,
+                    files,
+                    code_files,
+                    force_update_files,
+                    isnothing(pf_dispatch_table) ? String[] : pf_dispatch_table,
+                    isnothing(pf_dispatch_table),
+                    using_modules,
+                    # We currently can't use modules that require GUI
+                    not_using_modules,
+                    url,
+                    branch,
+                    directory,
+                    dev_paths,
+                    force_sync,
+                    force_pull,
+                    force_install,
+                    estimate_available_memory,
+                    isnothing(email_when_ready) ? false : email_when_ready,
+                    isnothing(email_when_ready),
+                    for_running,
+                    sessions,
+                    get_sampling_configs(),
+                    kwargs...
+                )
+            catch e
+                bt = catch_backtrace()
+                (e, bt)
+            end
+        )
+    start_session_tasks[new_start_session_task_id] = new_start_session_task
+    set_session(new_start_session_task_id)
+
+    # Start now or wait now if requested
+    if start_now || wait_now
+        yield(new_start_session_task)
+    end
+    if wait_now
+        get_session(new_start_session_task_id)
+    end
+
+    # Return the current session ID
+    get_session_id()
 end
 
-function end_session(session_id::SessionId = get_session_id(); failed = false, release_resources_now = false, release_resources_after = nothing, kwargs...)
+function end_session(session_id::SessionId = get_session_id(); failed = false, release_resources_now = false, release_resources_after = nothing, destroy_cluster=false, kwargs...)
     sessions = get_sessions_dict()
     global current_session_id
+    global start_session_tasks
 
     # Configure using parameters
     configure(; kwargs...)
 
+    # Ensure that the session ID is not of a creating task
+    # TODO: Get the session ID before the task begins wait_for_session
+    # so that it can be ended sooner. (maybe use local storage of the task)
+    if haskey(start_session_tasks, session_id)
+        @warn "Session with ID $session_id is still being created; waiting for it to start before ending it"
+        session_id = get_session(session_id).id
+    end
+
     request_params = Dict{String,Any}("session_id" => session_id, "failed" => failed, "release_resources_now" => release_resources_now)
     if !isnothing(release_resources_after)
         request_params["release_resources_after"] = release_resources_after
     end
-    send_request_get_response(
+    resp = send_request_get_response(
         :end_session,
         request_params,
     )
@@ -480,6 +574,16 @@ function end_session(session_id::SessionId = get_session_id(); failed = false, r
     # Remove from global state
     set_session("")
     delete!(sessions, session_id)
+
+    # Destroy cluster if desired; qualify with the module so the `destroy_cluster` keyword argument does not shadow the function
+    if destroy_cluster
+        if isnothing(resp) || !haskey(resp, "cluster_name")
+            @warn "Unable to destroy cluster for session with ID $session_id"
+        else
+            Banyan.destroy_cluster(resp["cluster_name"])
+        end
+    end
+
     session_id
 end
 
@@ -573,6 +677,7 @@ get_running_sessions(args...; kwargs...) = get_sessions(args...; status="running
 function download_session_logs(session_id::SessionId, cluster_name::String, filename::Union{String,Nothing}=nothing; kwargs...)
     @debug "Downloading logs for session"
     configure(; kwargs...)
+    session_id = get_session_id(session_id)
     s3_bucket_name = get_cluster_s3_bucket_name(cluster_name; kwargs...)
     log_file_name = "banyan-log-for-session-$(session_id)"
     if isnothing(filename) & !isdir(joinpath(homedir(), ".banyan", "logs"))
@@ -584,7 +689,9 @@ function download_session_logs(session_id::SessionId, cluster_name::String, file
     return filename
 end
 
-function print_session_logs(session_id, cluster_name, delete_file=true)
+function print_session_logs(session_id, cluster_name, delete_file=true; kwargs...)
+    configure(; kwargs...)
+    session_id = get_session_id(session_id)
     s3_bucket_name = get_cluster_s3_bucket_name(cluster_name)
     log_file_name = "banyan-log-for-session-$(session_id)"
     logs = s3_get(global_aws_config(), s3_bucket_name, log_file_name)
@@ -614,8 +721,12 @@ function end_all_sessions(cluster_name::String; release_resources_now = false, r
     end
 end
 
-function get_session_status(session_id::String=get_session_id(); kwargs...)::String
+function get_session_status(session_id::String=_get_session_id_no_error(); kwargs...)::String
+    global start_session_tasks
     sessions = get_sessions_dict()
+    if !haskey(sessions, session_id) && haskey(start_session_tasks, session_id) && !istaskdone(start_session_tasks[session_id])
+        return "creating"
+    end
     configure(; kwargs...)
     filters = Dict{String,Any}("session_id" => session_id)
     params = Dict{String,Any}("filters"=>filters)
@@ -640,10 +751,10 @@ function get_session_status(session_id::String=get_session_id(); kwargs...)::Str
     session_status
 end
 
-function _wait_for_session(session_id::SessionId=get_session_id(); kwargs...)
+function _wait_for_session(session_id::SessionId, show_progress; kwargs...)
     sessions_dict = get_sessions_dict()
     session_status = get_session_status(session_id; kwargs...)
-    p = ProgressUnknown("Preparing session with ID $session_id", spinner=true)
+    p = ProgressUnknown("Preparing session with ID $session_id", spinner=true, enabled=show_progress)
     t = 0
     st = time()
     while session_status == "creating"
@@ -653,10 +764,14 @@ function _wait_for_session(session_id::SessionId=get_session_id(); kwargs...)
         else
             7
         end
-        next!(p)
+        if p.enabled
+            next!(p)
+        end
         session_status = get_session_status(session_id; kwargs...)
     end
-    finish!(p, spinner = session_status == "running" ? '✓' : '✗')
+    if p.enabled
+        finish!(p, spinner = session_status == "running" ? '✓' : '✗')
+    end
     if session_status == "running"
         @debug "Session with ID $session_id is ready"
         if haskey(sessions_dict, session_id)
@@ -671,26 +786,33 @@ function _wait_for_session(session_id::SessionId=get_session_id(); kwargs...)
     end
 end
 
-function wait_for_session(session_id::SessionId=get_session_id(); kwargs...)
+function wait_for_session(session_id::SessionId=get_session_id(), show_progress=true; kwargs...)
+    global start_session_tasks
     sessions_dict = get_sessions_dict()
-    is_session_ready = if haskey(sessions_dict, session_id)
-        session_info::Session = sessions_dict[session_id]
-        if !session_info.is_cluster_ready
-            wait_for_cluster(session_info.cluster_name, kwargs...)
-        end
-        session_info.is_session_ready
+
+    if haskey(start_session_tasks, session_id)
+        get_session(session_id, show_progress)
     else
-        false
-    end
-    if !is_session_ready
-        _wait_for_session(session_id; kwargs...)
+        is_session_ready = if haskey(sessions_dict, session_id)
+            session_info::Session = sessions_dict[session_id]
+            if !session_info.is_cluster_ready
+                wait_for_cluster(session_info.cluster_name, show_progress; kwargs...)
+            end
+            session_info.is_session_ready
+        else
+            false
+        end
+        if !is_session_ready
+            _wait_for_session(session_id, show_progress; kwargs...)
+        end
     end
+    ;
 end
 
 function with_session(f::Function; kwargs...)
     # This is not a constructor; this is just a function that ensures that
     # every session is always destroyed even in the case of an error
-    use_existing_session = :session in keys(kwargs)
+    use_existing_session = haskey(kwargs, :session)
     end_session_on_error = get(kwargs, :end_session_on_error, true)::Bool
     end_session_on_exit = get(kwargs, :end_session_on_exit, true)::Bool
     j = use_existing_session ? kwargs[:session] : start_session(; kwargs...)
@@ -716,71 +838,49 @@ function with_session(f::Function; kwargs...)
         end
     end
-
-function run_session(;
-    cluster_name::String = NOTHING_STRING,
-    nworkers::Int64 = 16,
-    release_resources_after::Union{Integer,Nothing} = 20,
+function run_session(code_files::Union{String,Vector{String}};
     print_logs::Bool = false,
     store_logs_in_s3::Bool = true,
-    store_logs_on_cluster::Bool = false,
-    sample_rate::Int64 = nworkers,
-    session_name::String = NOTHING_STRING,
-    files::Vector{String} = String[],
-    code_files::Vector{String} = String[],
-    force_update_files::Bool = true,
-    pf_dispatch_table::Union{Vector{String},Nothing} = nothing,
-    using_modules::Vector{String} = String[],
-    url::String = NOTHING_STRING,
-    branch::String = NOTHING_STRING,
-    directory::String = NOTHING_STRING,
-    dev_paths::Vector{String} = String[],
-    force_sync::Bool = false,
-    force_pull::Bool = false,
-    force_install::Bool = false,
-    estimate_available_memory::Bool = true,
-    email_when_ready::Union{Bool,Nothing}=nothing,
     kwargs...,)::SessionId
-    force_update_files = true
     store_logs_in_s3_orig = store_logs_in_s3
+    cluster_name = ""
     try
         if print_logs
            # If logs need to be printed, ensure that we save logs in S3.
If # store_logs_in_s3==False, then delete logs in S3 later store_logs_in_s3 = true end - start_session(;cluster_name = cluster_name, nworkers = nworkers, release_resources_after = release_resources_after, - print_logs = print_logs, store_logs_in_s3 = store_logs_in_s3, store_logs_on_cluster = store_logs_on_cluster, - sample_rate = sample_rate, session_name = session_name, files = files, code_files = code_files, force_update_files = force_update_files, - pf_dispatch_table = pf_dispatch_table, using_modules = using_modules, url = url, branch = branch, - directory = directory, dev_paths = dev_paths, force_sync = force_sync, force_pull = force_pull, force_install = force_install, - estimate_available_memory = estimate_available_memory, nowait = false, email_when_ready = email_when_ready, for_running = true) + s = start_session(; + print_logs = print_logs, + store_logs_in_s3 = store_logs_in_s3, + wait_now = true, + for_running = true, + force_update_files = true, + code_files = code_files isa String ? String[code_files] : code_files, + kwargs... + ) + cluster_name = get_session().cluster_name + s catch - session_id = try - get_session_id() - catch - nothing - end - if !isnothing(session_id) + session_id = _get_session_id_no_error() + if !isempty(session_id) end_session(session_id, failed=true, release_resources_now=true) - if print_logs + if print_logs && !isempty(cluster_name) print_session_logs(session_id, cluster_name, !store_logs_in_s3_orig) end end rethrow() + session_id finally - session_id = try - get_session_id() - catch - nothing - end - if !isnothing(session_id) + session_id = _get_session_id_no_error() + if !isempty(session_id) end_session(session_id, failed=false, release_resources_now=true) - if print_logs + if print_logs && !isempty(cluster_name) print_session_logs(session_id, cluster_name, !store_logs_in_s3_orig) end - end + end + session_id end end diff --git a/Banyan/test/Project.toml b/Banyan/test/Project.toml index 61c14273..7fd73758 100644 --- a/Banyan/test/Project.toml +++ b/Banyan/test/Project.toml @@ -3,7 +3,6 @@ AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc" AWSCore = "4f1ea46c-232b-54a6-9b17-cc2d0f3e6598" AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95" Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" -Banyan = "706d138b-e922-45b9-a636-baf8ae0d5317" Base64 = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" diff --git a/Banyan/test/clusters.jl b/Banyan/test/clusters.jl index 61af636e..bea02fc4 100644 --- a/Banyan/test/clusters.jl +++ b/Banyan/test/clusters.jl @@ -63,7 +63,7 @@ end name=cluster_name, instance_type="t3.large", s3_bucket_name=s3_bucket, - nowait=true + wait_now=false ) sleep(30) # Just to ensure that cluster creation has initiated s3_bucket_name = get_cluster_s3_bucket_name(cluster_name) @@ -85,7 +85,7 @@ end end c_r = create_cluster( name=cluster_name, - nowait=true + wait_now=false ) s3_bucket_name_r = get_cluster_s3_bucket_name(cluster_name) s3_bucket_exists = bucket_exists(s3_bucket_name_r) @@ -99,7 +99,7 @@ end @test !s3_bucket_exists # Check that the cluster cannot be created again - @test_throws ErrorException create_cluster(name=cluster_name, nowait=true) + @test_throws ErrorException create_cluster(name=cluster_name, wait_now=false) end @testset "Benchmark create_cluster with $instance_type instance type" for instance_type in [ diff --git a/Banyan/test/run_session_test_script.jl b/Banyan/test/run_session_test_script.jl new file mode 100644 index 00000000..0b817e83 --- 
/dev/null +++ b/Banyan/test/run_session_test_script.jl @@ -0,0 +1 @@ +@show get_worker_idx() \ No newline at end of file diff --git a/Banyan/test/runtests.jl b/Banyan/test/runtests.jl index 36ea9c3b..81139ec2 100644 --- a/Banyan/test/runtests.jl +++ b/Banyan/test/runtests.jl @@ -16,8 +16,6 @@ end function use_session_for_testing( f::Function; nworkers = parse(Int64, get(ENV, "BANYAN_NWORKERS", "2")), - sample_rate = 2, - nworkers = 2, scheduling_config_name = "default scheduling", ) haskey(ENV, "BANYAN_CLUSTER_NAME") || error( diff --git a/Banyan/test/sessions.jl b/Banyan/test/sessions.jl index 738f9837..55d13b97 100644 --- a/Banyan/test/sessions.jl +++ b/Banyan/test/sessions.jl @@ -1,282 +1,353 @@ -# Tests for Sessions: -# Start a session that creates a new job -# Start a session that reuses a job -# Previous session was successfully ended (by calling end_session with delayed destruction) -# Previous session had a session failure - -@testset "Get sessions with status $status" for status in [ - "all", - "creating", - "running", - "failed", - "completed", - "invalid_status" -] - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - - if status == "all" - sessions = get_sessions(cluster_name) - else - filtered_sessions = get_sessions(cluster_name, status=status) - @test all(s -> s[2]["status"] == status, filtered_sessions) - end -end - -@testset "Get running sessions" begin - # Start a session - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - - session_id = start_session(cluster_name=cluster_name, nworkers=2) - running_sessions = get_running_sessions(cluster_name) - end_session(session_id, release_resources_now=true) - sessions = get_sessions(cluster_name) - - @test all(s -> s[2]["status"] == "running", running_sessions) - @test any(s -> s[1] == session_id, running_sessions) - @test any(s -> (s[1] == session_id && s[2]["status"] == "completed"), sessions) -end - -# Test that starting a second session after one has been ended -# reuses the same job, if the parameters match. 
-@testset "Start and end multiple sessions" begin - # Pkg.activate("envs/DataAnalysisProject/") - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - delay_time = 5 +# # Tests for Sessions: +# # Start a session that creates a new job +# # Start a session that reuses a job +# # Previous session was successfully ended (by calling end_session with delayed destruction) +# # Previous session had a session failure + +# @testset "Get sessions with status $status" for status in [ +# "all", +# "creating", +# "running", +# "failed", +# "completed", +# "invalid_status" +# ] +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] - # Start a session and end it - session_id_1 = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - force_synce = true, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - release_resources_after=delay_time - ) - resource_id_1 = get_session().resource_id - session_status = get_session_status(session_id_1) - @test session_status == "running" +# if status == "all" +# sessions = get_sessions(cluster_name) +# else +# filtered_sessions = get_sessions(cluster_name, status=status) +# @test all(s -> s[2]["status"] == status, filtered_sessions) +# end +# end - end_session(session_id_1) - sleep(60) # To ensure session gets ended - session_status = get_session_status(session_id_1) - @test session_status == "completed" +# @testset "Get running sessions" begin +# # Start a session +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] + +# session_id = start_session(cluster_name=cluster_name, nworkers=2) +# running_sessions = get_running_sessions(cluster_name) +# end_session(session_id, release_resources_now=true) +# sessions = get_sessions(cluster_name) + +# @test all(s -> s[2]["status"] == "running", running_sessions) +# @test any(s -> s[1] == session_id, running_sessions) +# @test any(s -> (s[1] == session_id && s[2]["status"] == "completed"), sessions) +# end - # Start another session with same nworkers and verify the job ID matches - session_id_2 = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - release_resources_after=delay_time - ) - resource_id_2 = get_session().resource_id - session_status = get_session_status(session_id_2) - @test session_status == "running" - @test resource_id_2 == resource_id_1 # it should have reused resource +# # Test that starting a second session after one has been ended +# # reuses the same job, if the parameters match. 
+# @testset "Start and end multiple sessions" begin +# # Pkg.activate("envs/DataAnalysisProject/") +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] +# delay_time = 5 + +# # Start a session and end it +# session_id_1 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# force_synce = true, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# release_resources_after=delay_time +# ) +# resource_id_1 = get_session().resource_id +# session_status = get_session_status(session_id_1) +# @test session_status == "running" + +# end_session(session_id_1) +# sleep(60) # To ensure session gets ended +# session_status = get_session_status(session_id_1) +# @test session_status == "completed" + +# # Start another session with same nworkers and verify the job ID matches +# session_id_2 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# release_resources_after=delay_time +# ) +# resource_id_2 = get_session().resource_id +# session_status = get_session_status(session_id_2) +# @test session_status == "running" +# @test resource_id_2 == resource_id_1 # it should have reused resource - end_session(session_id_2) - sleep(60) - session_status = get_session_status(session_id_2) - @test session_status == "completed" - - # Start another session with different nworkers and verify the job ID - # is different - session_id_3 = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 4, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - release_resources_after=delay_time - ) - resource_id_3 = get_session().resource_id - session_status = get_session_status(session_id_3) - @test session_status == "running" - @test resource_id_3 != resource_id_1 +# end_session(session_id_2) +# sleep(60) +# session_status = get_session_status(session_id_2) +# @test session_status == "completed" + +# # Start another session with different nworkers and verify the job ID +# # is different +# session_id_3 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 4, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# release_resources_after=delay_time +# ) +# resource_id_3 = get_session().resource_id +# session_status = get_session_status(session_id_3) +# @test session_status == "running" +# @test resource_id_3 != resource_id_1 - end_session(session_id_3) - sleep(60) - session_status = get_session_status(session_id_3) - @test session_status == "completed" - - # Sleep for the delay_time and check that the underlying resources are destroyed - # by creating a new session and ensuring that it uses different resources - sleep(delay_time * 60) - session_id_4 = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - release_resources_after=delay_time, - nowait=true - ) - resource_id_4 = get_session().resource_id - @test resource_id_4 != resource_id_1 +# end_session(session_id_3) +# sleep(60) +# session_status = get_session_status(session_id_3) +# @test session_status == "completed" + +# # Sleep for the delay_time and check that the underlying resources are destroyed +# # by creating a new session and ensuring that it uses different resources +# sleep(delay_time * 60) +# session_id_4 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# 
store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# release_resources_after=delay_time, +# wait_now=false +# ) +# resource_id_4 = get_session().resource_id +# @test resource_id_4 != resource_id_1 - end_session(session_id_4, release_resources_now=true) -end +# end_session(session_id_4, release_resources_now=true) +# end + +# @testset "Start a session with dev paths" begin +# session_id = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# url = "https://github.com/banyan-team/banyan-julia.git", +# branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), +# directory = "banyan-julia/Banyan/test", +# dev_paths = [ +# "banyan-julia/Banyan", +# ], +# force_pull = true, +# force_sync = true, +# force_install = true, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# ) +# session_status = get_session_status(session_id) +# end_session(session_id, release_resources_now=true) +# @test session_status == "running" +# end + +# @testset "Create sessions with nowait=$nowait" for +# nowait in [true, false] +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] + +# session_id = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# nowait=nowait +# ) -@testset "Start a session with dev paths" begin - session_id = start_session( +# session_status = get_session_status(session_id) +# if !nowait +# @test session_status == "running" +# else +# @test session_status == "creating" +# while session_status == "creating" +# sleep(20) +# session_status = get_session_status(session_id) +# end +# @test session_status == "running" +# end + +# end_session(session_id, release_resources_now=true) +# end + +# @testset "Create sessions where store_logs_in_s3=$store_logs_in_s3" for +# store_logs_in_s3 in [true, false] +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] + +# session_id = start_session( +# cluster_name=cluster_name, +# nworkers = 2, +# store_logs_in_s3=store_logs_in_s3, +# ) +# end_session(session_id, release_resources_now=true) +# sleep(60) + +# log_file = "banyan-log-for-session-$session_id" +# println("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)") +# @test store_logs_in_s3 == isfile( +# S3Path("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)", +# config=Banyan.global_aws_config()) +# ) +# end + +# @testset "Starting session with failure in $scenario" for scenario in [ +# "invalid julia version", +# "invalid branch name", +# "invalid dev paths" +# ] +# Pkg.activate("./") + +# try +# if scenario == "invalid julia version" +# # Temporarily overwrite `get_julia_version` +# Banyan.get_julia_version() = "invalidversion" +# @test_throws begin +# session_id = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# ) +# end ErrorException +# elseif scenario == "invalid branch name" +# @test_throws begin +# session_id = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# url = "https://github.com/banyan-team/banyan-julia.git", +# branch = "nonexistant-branch", +# directory = "banyan-julia/Banyan/test", +# dev_paths = [ +# "banyan-julia/Banyan", +# ], +# force_pull = true, +# force_sync = true, +# force_install = true, +# ) +# end ErrorException +# elseif scenario == "invalid dev paths" +# @test_throws begin +# session_id = 
start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# url = "https://github.com/banyan-team/banyan-julia.git", +# branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), +# directory = "banyan-julia/Banyan/test", +# dev_paths = [ +# "banyan-julia/Banyan", +# "banyan-julia/NonExistantPackage" +# ], +# force_pull = true, +# force_sync = true, +# force_install = true, +# ) +# end ErrorException +# end +# catch +# end +# end + +# @testset "Reusing session that fails" begin +# Pkg.activate("./") +# cluster_name = ENV["BANYAN_CLUSTER_NAME"] + +# # Start a session +# session_id_1 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# force_sync=true +# ) +# resource_id_1 = get_session().resource_id +# session_status_1 = get_session_status(session_id_1) + +# # Trigger a failure in the session that will end the session +# try +# @test_throws begin +# offloaded(distributed=true) do +# error("Oops sorry this is an error") +# end +# end ErrorException +# catch +# end +# session_status_1_after_failure = get_session_status(session_id_1) + +# # Start a new session (it should reuse the resources of the failed session) and then end it +# session_id_2 = start_session( +# cluster_name = ENV["BANYAN_CLUSTER_NAME"], +# nworkers = 2, +# store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", +# wait_now=false +# ) +# resource_id_2 = get_session().resource_id +# session_status_2 = get_session_status(session_id_2) +# end_session(session_id_2, release_resources_now=true) + +# # Assert +# @test session_status_1 == "running" +# @test session_status_1_after_failure == "failed" +# @test resource_id_2 == resource_id_1 +# end + +@testset "Running session with print_logs=$print_logs and store_logs_in_s3=$store_logs_in_s3" for + print_logs in [true, false], + store_logs_in_s3 in [true, false] + + println("Before run_session") + run_session( + "file://run_session_test_script.jl", cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, + nworkers=1, url = "https://github.com/banyan-team/banyan-julia.git", branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), directory = "banyan-julia/Banyan/test", dev_paths = [ "banyan-julia/Banyan", ], - force_pull = true, - force_sync = true, - force_install = true, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", + print_logs = print_logs, + store_logs_in_s3 = store_logs_in_s3, + instance_type="t3.large", + disk_capacity="auto" ) - session_status = get_session_status(session_id) - end_session(session_id, release_resources_now=true) - @test session_status == "running" end -@testset "Create sessions with nowait=$nowait" for - nowait in [true, false] - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - - session_id = start_session( +@testset "Starting session" begin + s = start_session( cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - nowait=nowait - ) - - session_status = get_session_status(session_id) - if !nowait - @test session_status == "running" - else - @test session_status == "creating" - while session_status == "creating" - sleep(20) - session_status = get_session_status(session_id) - end - @test session_status == "running" - end - - end_session(session_id, release_resources_now=true) -end - -@testset "Create sessions where store_logs_in_s3=$store_logs_in_s3" for - 
store_logs_in_s3 in [true, false] - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] - - session_id = start_session( - cluster_name=cluster_name, - nworkers = 2, - store_logs_in_s3=store_logs_in_s3, - ) - end_session(session_id, release_resources_now=true) - sleep(60) - - log_file = "banyan-log-for-session-$session_id" - println("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)") - @test store_logs_in_s3 == isfile( - S3Path("s3://$(get_cluster_s3_bucket_name(cluster_name))/$(log_file)", - config=Banyan.global_aws_config()) + nworkers=1, + url = "https://github.com/banyan-team/banyan-julia.git", + branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), + directory = "banyan-julia/Banyan/test", + dev_paths = [ + "banyan-julia/Banyan", + ], + instance_type="t3.large", + disk_capacity="auto" ) -end - -@testset "Starting session with failure in $scenario" for scenario in [ - "invalid julia version", - "invalid branch name", - "invalid dev paths" -] - Pkg.activate("./") - - try - if scenario == "invalid julia version" - # Temporarily overwrite `get_julia_version` - Banyan.get_julia_version() = "invalidversion" - @test_throws begin - session_id = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - ) - end ErrorException - elseif scenario == "invalid branch name" - @test_throws begin - session_id = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - url = "https://github.com/banyan-team/banyan-julia.git", - branch = "nonexistant-branch", - directory = "banyan-julia/Banyan/test", - dev_paths = [ - "banyan-julia/Banyan", - ], - force_pull = true, - force_sync = true, - force_install = true, - ) - end ErrorException - elseif scenario == "invalid dev paths" - @test_throws begin - session_id = start_session( - cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - url = "https://github.com/banyan-team/banyan-julia.git", - branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), - directory = "banyan-julia/Banyan/test", - dev_paths = [ - "banyan-julia/Banyan", - "banyan-julia/NonExistantPackage" - ], - force_pull = true, - force_sync = true, - force_install = true, - ) - end ErrorException - end - catch - end -end - -@testset "Reusing session that fails" begin - Pkg.activate("./") - cluster_name = ENV["BANYAN_CLUSTER_NAME"] + @test get_session().id == get_session_id() + end_session(s) - # Start a session - session_id_1 = start_session( + s = start_session( cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - force_sync=true + nworkers=1, + url = "https://github.com/banyan-team/banyan-julia.git", + branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), + directory = "banyan-julia/Banyan/test", + dev_paths = [ + "banyan-julia/Banyan", + ], + start_now=true, + instance_type="t3.large", + disk_capacity="auto" ) - resource_id_1 = get_session().resource_id - session_status_1 = get_session_status(session_id_1) - - # Trigger a failure in the session that will end the session - try - @test_throws begin - offloaded(distributed=true) do - error("Oops sorry this is an error") - end - end ErrorException - catch - end - session_status_1_after_failure = get_session_status(session_id_1) - - # Start a new session (it should reuse the resources of the failed session) and then end it - session_id_2 = start_session( + @test get_session().id == 
get_session_id() + end_session(s) + + s = start_session( cluster_name = ENV["BANYAN_CLUSTER_NAME"], - nworkers = 2, - store_logs_on_cluster=get(ENV, "BANYAN_STORE_LOGS_ON_CLUSTER", "0") == "1", - nowait=true + nworkers=1, + url = "https://github.com/banyan-team/banyan-julia.git", + branch = get(ENV, "BANYAN_JULIA_BRANCH", Banyan.get_branch_name()), + directory = "banyan-julia/Banyan/test", + dev_paths = [ + "banyan-julia/Banyan", + ], + wait_now=true, + instance_type="t3.large", + disk_capacity="auto" ) - resource_id_2 = get_session().resource_id - session_status_2 = get_session_status(session_id_2) - end_session(session_id_2, release_resources_now=true) - - # Assert - @test session_status_1 == "running" - @test session_status_1_after_failure == "failed" - @test resource_id_2 == resource_id_1 + @test get_session().id == get_session_id() + end_session(s) end # Outdated testset...revisit later...probably alread tested through above tests diff --git a/BanyanArrays/test/Project.toml b/BanyanArrays/test/Project.toml index 1ed9683e..9d71cb8c 100644 --- a/BanyanArrays/test/Project.toml +++ b/BanyanArrays/test/Project.toml @@ -10,6 +10,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] -Banyan = "0.4.0" +Banyan = "0.4.1" ReTest = "0.3.2" julia = "^1.6" diff --git a/BanyanHDF5/test/Project.toml b/BanyanHDF5/test/Project.toml index 21ece6ad..6fe1cb9b 100644 --- a/BanyanHDF5/test/Project.toml +++ b/BanyanHDF5/test/Project.toml @@ -13,7 +13,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] -Banyan = "0.4.0" -BanyanArrays = "0.4.0" +Banyan = "0.4.1" +BanyanArrays = "0.4.1" ReTest = "0.3.2" julia = "^1.6" diff --git a/BanyanHDF5/test/runtests.jl b/BanyanHDF5/test/runtests.jl index 56b16b58..52a8d861 100644 --- a/BanyanHDF5/test/runtests.jl +++ b/BanyanHDF5/test/runtests.jl @@ -30,6 +30,7 @@ function use_session_for_testing( # Set the session and create a new one if needed global sessions_for_testing + println("sessions_for_testing=$(sessions_for_testing)") set_session( if haskey(sessions_for_testing, session_config_hash) sessions_for_testing[session_config_hash] @@ -69,6 +70,7 @@ function use_session_for_testing( ) # If selected session has already failed, this will throw an error. sessions_for_testing[session_config_hash] = get_session_id() + println("Set sessions_for_testing[session_config_hash] to get_session_id() for $(sessions_for_testing[session_config_hash])") configure_scheduling(name = scheduling_config_name) diff --git a/BanyanImages/test/Project.toml b/BanyanImages/test/Project.toml index 699fe6ac..5739e8e9 100644 --- a/BanyanImages/test/Project.toml +++ b/BanyanImages/test/Project.toml @@ -13,6 +13,6 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ReTest = "e0db7c4e-2690-44b9-bad6-7687da720f89" [compat] -Banyan = "0.4.0" +Banyan = "0.4.1" ReTest = "0.3.2" julia = "^1.6"
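Reviewer note (not part of the patch): a minimal sketch of how the reworked session lifecycle above is intended to be used. start_session now returns immediately and creates the session lazily in a background Task unless start_now or wait_now is passed; the cluster name and worker count below are placeholders, and only the start_session, wait_for_session, and end_session signatures introduced in this commit are assumed.

    using Banyan

    # Returns right away; the ID initially refers to the start-session task and the
    # actual session is created in the background.
    session_id = start_session(cluster_name = "my-cluster", nworkers = 2)

    # Block until the session status is "running"; pass false as the second
    # argument to suppress the progress spinner.
    wait_for_session(session_id)

    try
        # ... offloaded computation would go here ...
    finally
        # destroy_cluster=true would additionally tear down the underlying cluster.
        end_session(session_id, release_resources_now = true)
    end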
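Similarly, a hedged sketch of the new run_session entry point, which now takes the code files as its first positional argument and forwards the remaining keyword arguments (cluster_name, nworkers, and so on) to start_session, mirroring the test added in this patch; the script path and cluster name are placeholders.

    using Banyan

    # Starts a session, runs the script on it, and ends the session when done.
    # With print_logs=true the logs are temporarily stored in S3 so they can be
    # printed, then deleted afterwards if store_logs_in_s3=false.
    run_session(
        "file://run_session_test_script.jl";
        cluster_name = "my-cluster",
        nworkers = 2,
        print_logs = true,
        store_logs_in_s3 = false,
    )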